In [9]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Fetching details of all 50 pages

In [10]:
# Build the listing-page URLs for pages 1 through 50 in a single pass
url = [
    "https://www.themoviedb.org/movie?page=" + str(page_number)
    for page_number in range(1, 51)
]

In [None]:
# Module-level accumulator: extract_data appends one dict per scraped movie here
moviesdata = list()

# Function to extract data from each URL
def extract_data(url, timeout=10):
    """Scrape movie details from every listing page in ``url``.

    For each movie card on a listing page the card's detail page is fetched
    as well, and one dictionary per movie is appended to the module-level
    ``moviesdata`` list.

    Parameters
    ----------
    url : iterable of str
        Listing-page URLs to scrape.
    timeout : float or tuple, optional
        Timeout in seconds passed to every ``requests.get`` call so a stalled
        connection cannot hang the notebook forever.

    Returns
    -------
    list of dict
        The module-level ``moviesdata`` list (the same object, not a copy),
        with keys ``Name``, ``Release Date``, ``Ratings``, ``Duration``,
        ``Genre`` and ``Director``. Fields missing from a detail page are
        ``None`` (``[]`` for ``Genre``).

    Raises
    ------
    requests.exceptions.RequestException
        Propagated unchanged when a page cannot be downloaded.
    """
    # Browser-like user-agent to avoid being rejected; built once, reused for all requests
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36'}

    for page_url in url:
        listing_soup = _fetch_soup(page_url, header, timeout)

        for card in listing_soup.find_all('div', class_='card style_1'):
            name_element = card.find('h2')
            release_date_element = card.find('p')
            if not (name_element and release_date_element):
                continue

            # Cards without a usable link to a detail page are skipped entirely
            wrapper = card.find('a', class_="image")
            href = wrapper.get('href') if wrapper else None
            if not href:
                continue

            detail_soup = _fetch_soup("https://www.themoviedb.org" + href, header, timeout)
            rating, duration, genres, director = _parse_movie_details(detail_soup)

            moviesdata.append({
                "Name": name_element.text,
                "Release Date": release_date_element.text,
                "Ratings": rating,
                "Duration": duration,
                "Genre": genres,
                "Director": director,
            })

    return moviesdata


def _fetch_soup(page_url, headers, timeout):
    """Download ``page_url`` and return its HTML parsed with the lxml parser."""
    html = requests.get(page_url, headers=headers, timeout=timeout).text
    return BeautifulSoup(html, 'lxml')


def _parse_movie_details(detail_soup):
    """Return ``(rating, duration, genres, director)`` from a parsed detail page.

    Every value is computed fresh for this page, so a missing element yields
    ``None`` (or ``[]`` for genres) instead of a value left over from the
    previously processed movie.
    """
    rating_element = detail_soup.find('div', class_="user_score_chart")
    duration_element = detail_soup.find('span', class_="runtime")
    facts_element = detail_soup.find('div', class_='facts')
    profile_element = detail_soup.find('li', class_='profile')

    rating = rating_element.get('data-percent') if rating_element else None

    duration = None
    if duration_element:
        duration = duration_element.text.replace("\n", "").strip()

    # NOTE(review): the original iterated over an undefined name `gen` here.
    # Taking every link inside the facts block is assumed to match the intent;
    # confirm against the live page markup.
    genres = [link.text for link in facts_element.find_all('a')] if facts_element else []

    # The profile item may exist without a link; guard before reading .text
    director_link = profile_element.find('a') if profile_element else None
    director = director_link.text if director_link else None

    return rating, duration, genres, director

# Extracting data from the URLs and storing it in a DataFrame
# Scrape every listing page, then tabulate the collected movie records
movie_records = extract_data(url)
df = pd.DataFrame(movie_records)

# Persist the table as an Excel workbook next to the notebook
df.to_excel("moviedetails.xlsx")
