### Try scrapping the data from The Numbers site

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

In [2]:
The_Numbers_url = "https://www.the-numbers.com/movie/budgets/all"

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

### Function to scrape the data

In [None]:
def scrape_The_Numbers():
    
    
    all_movies = []
    
    for start_index in range(1, 5001, 100): # Starf from 1 to ignore the headers, collect 5000 movies, 100 per page
        
        
        if start_index == 1:
            url = The_Numbers_url
        else:
            url = f"{The_Numbers_url}/{start_index}"  # this condition to handle the first page differently and go thorough multiple pages
            
        print(f"Scraping page starting at row {start_index}...")
        
        try:   # Exception handling to catch any errors during requests or parsing
            response = requests.get(url, headers=headers)
            if response.status_code != 200:
                print("Failed to load page.")
                continue

            page = BeautifulSoup(response.content, 'html.parser')
            rows = page.find_all('tr')
            
            current_page_count = 0
            
            for row in rows:
                cols = row.find_all('td')
                if len(cols) >= 6:
                    data = {
                        "Release Date": cols[1].get_text(strip=True),
                        "Title": cols[2].get_text(strip=True),
                        "Budget": cols[3].get_text(strip=True),
                        "Domestic Gross": cols[4].get_text(strip=True),
                        "Worldwide Gross": cols[5].get_text(strip=True)
                    }
                    all_movies.append(data)
                    current_page_count += 1
            
            print(f" -> Collected {current_page_count} movies.")
            
            time.sleep(2)  # to handle the robot detection by adding delay between requests
            
        except Exception as e:
            print(f"Error on page {start_index}: {e}")

    return all_movies

In [None]:
# --- RUN IT ---
data = scrape_The_Numbers()
df = pd.DataFrame(data)

df.head()

Scraping page starting at row 1...
 -> Collected 100 movies.
Scraping page starting at row 101...
 -> Collected 100 movies.
Scraping page starting at row 201...
 -> Collected 100 movies.
Scraping page starting at row 301...
 -> Collected 100 movies.
Scraping page starting at row 401...
 -> Collected 100 movies.
Scraping page starting at row 501...
 -> Collected 100 movies.
Scraping page starting at row 601...
 -> Collected 100 movies.
Scraping page starting at row 701...
 -> Collected 100 movies.
Scraping page starting at row 801...
 -> Collected 100 movies.
Scraping page starting at row 901...
 -> Collected 100 movies.
Scraping page starting at row 1001...
 -> Collected 100 movies.
Scraping page starting at row 1101...
 -> Collected 100 movies.
Scraping page starting at row 1201...
 -> Collected 100 movies.
Scraping page starting at row 1301...
 -> Collected 100 movies.
Scraping page starting at row 1401...
 -> Collected 100 movies.
Scraping page starting at row 1501...
 -> Collected 

Unnamed: 0,Release Date,Title,Budget,Domestic Gross,Worldwide Gross
0,"Dec 16, 2015",Star Wars Ep. VII: The Force Awakens,"$533,200,000","$936,662,225","$2,056,046,835"
1,"Apr 23, 2019",Avengers: Endgame,"$400,000,000","$858,373,000","$2,717,503,922"
2,"Dec 9, 2022",Avatar: The Way of Water,"$400,000,000","$688,809,501","$2,322,902,023"
3,"May 17, 2025",Mission: Impossible—The Final Reckoning,"$400,000,000","$197,413,515","$591,353,074"
4,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,"$379,000,000","$241,071,802","$1,045,713,802"


In [16]:
df.to_csv("data\movies_financial.csv", index=False)

  df.to_csv("data\movies_financial.csv", index=False)


In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4800 entries, 0 to 4799
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Release Date     4800 non-null   object
 1   Title            4800 non-null   object
 2   Budget           4800 non-null   object
 3   Domestic Gross   4800 non-null   object
 4   Worldwide Gross  4800 non-null   object
dtypes: object(5)
memory usage: 187.6+ KB


### Try to scrapping the IMDB movies

In [10]:
API_KEY = "6b1c54551e0e2df55b6bdf448af1b9f1" 

def scrape_top_movies():
    all_movies = []
    print("Starting collection movies...")
    for page_num in range(1, 401):
        url = f"https://api.themoviedb.org/3/movie/top_rated?api_key={API_KEY}&language=en-US&page={page_num}"
        
        try:
            response = requests.get(url)
            response.raise_for_status()
            data = response.json()
            for item in data['results']:
                all_movies.append({
                    "Title": item.get('title'),
                    "Year": item.get('release_date', 'N/A')[:4],
                    "Rating": item.get('vote_average'),      
                    "Vote Count": item.get('vote_count'),
                    "Language": item.get('original_language')
                })
            print(f"Page {page_num}/400 done. Total movies: {len(all_movies)}")
            time.sleep(0.2)
            
        except Exception as e:
            print(f"Error on page {page_num}: {e}")
            continue

    return all_movies

In [11]:
data = scrape_top_movies()

Starting collection movies...
Page 1/400 done. Total movies: 20
Page 2/400 done. Total movies: 40
Page 3/400 done. Total movies: 60
Page 4/400 done. Total movies: 80
Page 5/400 done. Total movies: 100
Page 6/400 done. Total movies: 120
Page 7/400 done. Total movies: 140
Page 8/400 done. Total movies: 160
Page 9/400 done. Total movies: 180
Page 10/400 done. Total movies: 200
Page 11/400 done. Total movies: 220
Page 12/400 done. Total movies: 240
Page 13/400 done. Total movies: 260
Page 14/400 done. Total movies: 280
Page 15/400 done. Total movies: 300
Page 16/400 done. Total movies: 320
Page 17/400 done. Total movies: 340
Page 18/400 done. Total movies: 360
Page 19/400 done. Total movies: 380
Page 20/400 done. Total movies: 400
Page 21/400 done. Total movies: 420
Page 22/400 done. Total movies: 440
Page 23/400 done. Total movies: 460
Page 24/400 done. Total movies: 480
Page 25/400 done. Total movies: 500
Page 26/400 done. Total movies: 520
Page 27/400 done. Total movies: 540
Page 28/400

In [12]:
if data:
    df_imdb = pd.DataFrame(data)
    print(df_imdb.head())
else:
    print("None")

                      Title  Year  Rating  Vote Count Language
0  The Shawshank Redemption  1994   8.713       29336       en
1             The Godfather  1972   8.685       22140       en
2     The Godfather Part II  1974   8.571       13384       en
3          Schindler's List  1993   8.566       16896       en
4              12 Angry Men  1957   8.500        9579       en


In [13]:
df_imdb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8000 entries, 0 to 7999
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Title       8000 non-null   object 
 1   Year        8000 non-null   object 
 2   Rating      8000 non-null   float64
 3   Vote Count  8000 non-null   int64  
 4   Language    8000 non-null   object 
dtypes: float64(1), int64(1), object(3)
memory usage: 312.6+ KB


In [17]:
df_imdb.to_csv("data\imdb_data.csv", index=False)

  df_imdb.to_csv("data\imdb_data.csv", index=False)
