### Try scrapping the data from The Numbers site

In [9]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

In [17]:
The_Numbers_url = "https://www.the-numbers.com/movie/budgets/all"

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

### Function to scrape the data

In [18]:
def scrape_The_Numbers():
    
    
    all_movies = []
    
    for start_index in range(1, 2002, 100): # Starf from 1 to ignore the headers, collect 2000 movies, 100 per page
        
        
        if start_index == 1:
            url = The_Numbers_url
        else:
            url = f"{The_Numbers_url}/{start_index}"  # this condition to handle the first page differently and go thorough multiple pages
            
        print(f"Scraping page starting at row {start_index}...")
        
        try:   # Exception handling to catch any errors during requests or parsing
            response = requests.get(url, headers=headers)
            if response.status_code != 200:
                print("Failed to load page.")
                continue

            page = BeautifulSoup(response.content, 'html.parser')
            rows = page.find_all('tr')
            
            current_page_count = 0
            
            for row in rows:
                cols = row.find_all('td')
                if len(cols) >= 6:
                    data = {
                        "Release Date": cols[1].get_text(strip=True),
                        "Title": cols[2].get_text(strip=True),
                        "Budget": cols[3].get_text(strip=True),
                        "Domestic Gross": cols[4].get_text(strip=True),
                        "Worldwide Gross": cols[5].get_text(strip=True)
                    }
                    all_movies.append(data)
                    current_page_count += 1
            
            print(f" -> Collected {current_page_count} movies.")
            
            time.sleep(2)  # to handle the robot detection by adding delay between requests
            
        except Exception as e:
            print(f"Error on page {start_index}: {e}")

    return all_movies

In [19]:
# --- RUN IT ---
data = scrape_The_Numbers()
df = pd.DataFrame(data)

df.head()


Scraping page starting at row 1...


 -> Collected 100 movies.
Scraping page starting at row 101...
 -> Collected 0 movies.
Scraping page starting at row 201...
 -> Collected 100 movies.
Scraping page starting at row 301...
 -> Collected 100 movies.
Scraping page starting at row 401...
 -> Collected 100 movies.
Scraping page starting at row 501...
 -> Collected 100 movies.
Scraping page starting at row 601...
 -> Collected 100 movies.
Scraping page starting at row 701...
 -> Collected 100 movies.
Scraping page starting at row 801...
 -> Collected 100 movies.
Scraping page starting at row 901...
 -> Collected 100 movies.
Scraping page starting at row 1001...
 -> Collected 100 movies.
Scraping page starting at row 1101...
 -> Collected 100 movies.
Scraping page starting at row 1201...
 -> Collected 100 movies.
Scraping page starting at row 1301...
 -> Collected 100 movies.
Scraping page starting at row 1401...
 -> Collected 100 movies.
Scraping page starting at row 1501...
 -> Collected 100 movies.
Scraping page starting at

Unnamed: 0,Release Date,Title,Budget,Domestic Gross,Worldwide Gross
0,"Dec 16, 2015",Star Wars Ep. VII: The Force Awakens,"$533,200,000","$936,662,225","$2,056,046,835"
1,"Apr 23, 2019",Avengers: Endgame,"$400,000,000","$858,373,000","$2,717,503,922"
2,"Dec 9, 2022",Avatar: The Way of Water,"$400,000,000","$688,809,501","$2,322,902,023"
3,"May 17, 2025",Mission: Impossible—The Final Reckoning,"$400,000,000","$197,413,515","$591,353,074"
4,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,"$379,000,000","$241,071,802","$1,045,713,802"


In [20]:
df.to_csv("data\movies_financial.csv", index=False)

  df.to_csv("data\movies_financial.csv", index=False)


In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Release Date     2000 non-null   object
 1   Title            2000 non-null   object
 2   Budget           2000 non-null   object
 3   Domestic Gross   2000 non-null   object
 4   Worldwide Gross  2000 non-null   object
dtypes: object(5)
memory usage: 78.3+ KB


In [None]:
imdb_url = "https://www.imdb.com/chart/top/"

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36",
    "Accept-Language": "en-US,en;q=0.9"
}







In [None]:
API_KEY = "YOUR_API_KEY_HERE" 

def scrape_top_2000_movies():
    all_movies = []
    print("Starting collection of 2,000 movies...")

    # TMDB has 20 movies per page. To get 2,000, we need 100 pages.
    # Loop from page 1 to 100
    for page_num in range(1, 101):
        
        # We use the 'Top Rated' endpoint to mimic the IMDb Top 250 style
        url = f"https://api.themoviedb.org/3/movie/top_rated?api_key={API_KEY}&language=en-US&page={page_num}"
        
        try:
            response = requests.get(url)
            response.raise_for_status()
            data = response.json()
            
            # Loop through the 20 results on this page
            for item in data['results']:
                all_movies.append({
                    "Title": item.get('title'),
                    "Year": item.get('release_date', 'N/A')[:4], # Extract just the year '2023-05-12' -> '2023'
                    "Rating": item.get('vote_average'),      # TMDB's version of IMDb Rating
                    "Vote Count": item.get('vote_count'),
                    "Language": item.get('original_language')
                })
            
            print(f"Page {page_num}/100 done. Total movies: {len(all_movies)}")
            
            # Be polite to the server
            time.sleep(0.2)
            
        except Exception as e:
            print(f"Error on page {page_num}: {e}")
            continue

    return all_movies

In [None]:
data = scrape_imdb_top_250()

In [None]:
if data:
    df_imdb = pd.DataFrame(data)
    print(df_imdb.head())
    
    # Save to CSV
    df_imdb.to_csv("data\imdb_data.csv", index=False)
else:
    print("None")