In [1]:
import requests
import pickle
import time
import os
import pandas as pd

In [2]:
# Base URL shared by every TMDB v3 endpoint requested in this notebook.
BASE_TMDB_URL = "https://api.themoviedb.org/3"
# The key is read from the environment so the credential never lives in the
# notebook (or in its version history / shared copies).
# NOTE(review): a literal key used to be committed on this line; treat that key
# as exposed and revoke/rotate it.
TMDB_API_KEY = os.environ.get("TMDB_API_KEY", "")
if not TMDB_API_KEY:
    # Fail loudly here: with an empty key the crawl would just come back empty.
    print("TMDB_API_KEY is not set; export it before running the crawl cells.")

In [3]:
def get_top_movies(api_key, page=1):
    """Fetch one page of TMDB's top-rated movie listing.

    Args:
        api_key: TMDB API key, sent as the ``api_key`` query parameter.
        page: Page number of the listing to request (defaults to 1).

    Returns:
        A list of ``(title, tmdb_id, release_date)`` tuples, one per entry of
        the response's ``results`` array. The list is empty when the response
        has no ``results`` key; ``release_date`` is ``''`` when an entry does
        not carry one.

    Raises:
        requests.exceptions.RequestException: if the connection fails or the
            request exceeds the timeout.
    """
    response = requests.get(
        f"{BASE_TMDB_URL}/movie/top_rated",
        # Let requests build and escape the query string.
        params={"api_key": api_key, "language": "en-US", "page": page},
        # Without a timeout a stalled connection blocks the crawl forever.
        timeout=10,
    )
    movies = response.json().get('results', [])
    return [
        (movie['title'], movie['id'], movie.get('release_date', ''))
        for movie in movies
    ]

def _simplify_certification(certification):
    """Map a raw certification onto 'R', 'PG-13', 'PG', 'G' or 'Not Rated'.

    The comparison is an exact match on the trimmed, upper-cased value. A
    substring test would misclassify values that merely contain a rating
    letter: both 'NR' and the 'Not Rated' placeholder contain an 'R'.
    """
    normalized = (certification or '').strip().upper()
    return normalized if normalized in ('R', 'PG-13', 'PG', 'G') else 'Not Rated'


def fetch_movie_details(tmdb_id, api_key):
    """Collect the attributes of one movie from three TMDB endpoints.

    Requests, in order, the movie record, its release dates (for the US
    certification) and its credits (for the cast list).

    Args:
        tmdb_id: TMDB identifier of the movie.
        api_key: TMDB API key, sent as the ``api_key`` query parameter.

    Returns:
        A dict with the keys ``movie_code``, ``Title``, ``Year``, ``Revenue``,
        ``Budget``, ``Runtime``, ``Actors`` (first ten cast names, comma
        separated), ``Rating`` (one of 'R', 'PG-13', 'PG', 'G', 'Not Rated'),
        ``Production_company``, ``Genre`` and ``IMDb_code``.

    Raises:
        requests.exceptions.RequestException: if a connection fails or a
            request exceeds the timeout.
    """
    timeout_seconds = 10  # a stalled connection must not block the crawl forever
    auth = {"api_key": api_key}

    # Core attributes of the movie.
    movie = requests.get(
        f"{BASE_TMDB_URL}/movie/{tmdb_id}",
        params={**auth, "language": "en-US"},
        timeout=timeout_seconds,
    ).json()

    # Release dates carry the per-country certification.
    release_dates = requests.get(
        f"{BASE_TMDB_URL}/movie/{tmdb_id}/release_dates",
        params=auth,
        timeout=timeout_seconds,
    ).json().get('results', [])

    us_release = next(
        (release for release in release_dates if release.get('iso_3166_1') == 'US'),
        None,
    )
    # The first non-empty certification among the US release entries wins.
    certification = next(
        (
            entry.get('certification')
            for entry in (us_release or {}).get('release_dates', [])
            if entry.get('certification')
        ),
        'Not Rated',
    )

    # Cast list, truncated to the first ten entries.
    cast = requests.get(
        f"{BASE_TMDB_URL}/movie/{tmdb_id}/credits",
        params=auth,
        timeout=timeout_seconds,
    ).json().get('cast', [])
    actors = ', '.join(actor['name'] for actor in cast[:10])

    return {
        'movie_code': movie.get('id'),
        'Title': movie.get('title'),
        # `or ''` also covers a release_date that is present but null.
        'Year': (movie.get('release_date') or '').split('-')[0],
        'Revenue': movie.get('revenue'),
        'Budget': movie.get('budget'),
        'Runtime': movie.get('runtime'),
        'Actors': actors,
        'Rating': _simplify_certification(certification),
        'Production_company': ', '.join(
            comp['name'] for comp in (movie.get('production_companies') or [])
        ),
        'Genre': ', '.join(genre['name'] for genre in (movie.get('genres') or [])),
        'IMDb_code': movie.get('imdb_id'),
    }

def fetch_tmdb_reviews(tmdb_id, api_key, imdb_code):
    """Fetch the reviews TMDB returns for one movie.

    No ``page`` parameter is sent, so only the page the API returns by default
    is read (presumably the first one; verify against the TMDB docs if the
    full review set is needed).

    Args:
        tmdb_id: TMDB identifier of the movie; copied into every record.
        api_key: TMDB API key, sent as the ``api_key`` query parameter.
        imdb_code: IMDb identifier of the movie; copied into every record.

    Returns:
        A list of dicts with the keys ``movie_code``, ``IMDb_code``,
        ``review_date``, ``rating_of_movie`` and ``actual_review``; empty when
        the response has no ``results``.

    Raises:
        requests.exceptions.RequestException: if the connection fails or the
            request exceeds the timeout.
    """
    response = requests.get(
        f"{BASE_TMDB_URL}/movie/{tmdb_id}/reviews",
        params={"api_key": api_key, "language": "en-US"},
        # Without a timeout a stalled connection blocks the crawl forever.
        timeout=10,
    )
    reviews = response.json().get('results', [])
    return [
        {
            'movie_code': tmdb_id,
            'IMDb_code': imdb_code,
            'review_date': review.get('created_at'),
            # `or {}` also covers an author_details that is present but null.
            'rating_of_movie': (review.get('author_details') or {}).get('rating'),
            'actual_review': review.get('content'),
        }
        for review in reviews
    ]

def _atomic_pickle_dump(obj, path):
    """Pickle ``obj`` to ``path`` without ever leaving ``path`` half-written."""
    tmp_path = f"{path}.tmp"
    try:
        with open(tmp_path, 'wb') as f:
            pickle.dump(obj, f)
        # Swap the finished file into place in one step, so an interrupted or
        # failed dump leaves the previous checkpoint untouched.
        os.replace(tmp_path, path)
    finally:
        # Only still present when the dump or the swap did not complete.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)


def save_progress(movie_data, reviews_data, filename='movies_data.pkl', reviews_filename='reviews_data.pkl'):
    """Checkpoint the collected movies and reviews as two pickle files.

    Each file is written to a temporary sibling and then moved over the target,
    so interrupting the kernel mid-save cannot corrupt an existing checkpoint.

    Args:
        movie_data: Object to pickle into ``filename``.
        reviews_data: Object to pickle into ``reviews_filename``.
        filename: Destination path of the movie checkpoint.
        reviews_filename: Destination path of the reviews checkpoint.

    Returns:
        None. Pickling and I/O errors are reported with ``print`` instead of
        being raised; a failure on the movie file skips the reviews file.
    """
    try:
        _atomic_pickle_dump(movie_data, filename)
        print(f"Saved movie data to {filename}")
        _atomic_pickle_dump(reviews_data, reviews_filename)
        print(f"Saved reviews data to {reviews_filename}")
    except (pickle.PickleError, IOError) as e:
        print(f"Error saving data: {e}")

def load_progress(filename='movies_data.pkl', reviews_filename='reviews_data.pkl'):
    """Read back the two pickle checkpoints, if they exist.

    Args:
        filename: Path of the movie checkpoint.
        reviews_filename: Path of the reviews checkpoint.

    Returns:
        A ``(movie_data, reviews_data)`` tuple. Each element is whatever was
        unpickled from its file, or an empty list when the file is missing or
        could not be loaded (the error is printed, not raised).
    """
    loaded = []
    for path in (filename, reviews_filename):
        contents = []
        try:
            if os.path.exists(path):
                # NOTE(review): pickle.load can execute arbitrary code; only
                # point this at checkpoint files this notebook wrote itself.
                with open(path, 'rb') as handle:
                    contents = pickle.load(handle)
        except Exception as err:
            print(f"Error loading {path}: {err}")
        loaded.append(contents)

    movie_data, reviews_data = loaded
    return movie_data, reviews_data


def _as_records(data):
    """Return checkpoint contents as a list of dicts, converting a DataFrame if needed."""
    if isinstance(data, pd.DataFrame):
        return data.to_dict('records')
    return list(data)


def _csv_path_for(pickle_path):
    """Derive a CSV export path that can never coincide with ``pickle_path``."""
    root, ext = os.path.splitext(pickle_path)
    return f"{pickle_path}.csv" if ext.lower() == '.csv' else f"{root}.csv"


def get_movies_and_reviews(movie_filename='movies_data.pkl', reviews_filename='reviews_data.pkl'):
    """Crawl top-rated movies and their reviews, checkpointing after every movie.

    Resumes from the pickle checkpoints when they exist, walks the top-rated
    listing page by page, and keeps only movies that have at least one review.
    The loop stops when a listing page comes back empty or the estimated data
    size reaches the limit.

    Args:
        movie_filename: Pickle checkpoint path for the movie records.
        reviews_filename: Pickle checkpoint path for the review records.

    Returns:
        ``(movie_df, reviews_df)``: DataFrames built from the collected
        records. Both are also exported as CSV next to the checkpoints, using
        the checkpoint path with its extension replaced by ``.csv``.
    """
    # A checkpoint may hold DataFrames (e.g. when a cell pickled this function's
    # return value); normalise so the record-based logic below always works.
    movie_data, reviews_data = (
        _as_records(part) for part in load_progress(movie_filename, reviews_filename)
    )
    approximate_entry_size_kb = 5
    data_size_limit_kb = 1 * 1024 * 1024  # 1GB in KB
    accumulated_size_kb = sum(len(str(movie)) / 1024 for movie in movie_data)
    # Listing pages are assumed to hold 20 entries; starting a little early is
    # harmless because already-saved movies are skipped below.
    page = (len(movie_data) // 20) + 1
    # Set lookup instead of rescanning every saved movie for each candidate.
    seen_ids = {movie['movie_code'] for movie in movie_data}

    while accumulated_size_kb < data_size_limit_kb:
        top_movies = get_top_movies(TMDB_API_KEY, page)
        if not top_movies:
            break

        for title, tmdb_id, release_date in top_movies:
            if tmdb_id in seen_ids:
                continue

            movie_info = fetch_movie_details(tmdb_id, TMDB_API_KEY)
            imdb_code = movie_info.get('IMDb_code')
            reviews = fetch_tmdb_reviews(tmdb_id, TMDB_API_KEY, imdb_code)

            if reviews:
                movie_info['release_date'] = release_date
                movie_data.append(movie_info)
                seen_ids.add(tmdb_id)
                reviews_data.extend(reviews)
                accumulated_size_kb += approximate_entry_size_kb
                print(f"Fetched data for: {title}")
                # Checkpoint to the files this call was asked to use, not the defaults.
                save_progress(movie_data, reviews_data, movie_filename, reviews_filename)
                time.sleep(1)

            if accumulated_size_kb >= data_size_limit_kb:
                print(f"Reached data size limit of approximately {data_size_limit_kb / (1024 * 1024)} GB")
                break
        page += 1

    save_progress(movie_data, reviews_data, movie_filename, reviews_filename)

    movie_df = pd.DataFrame(movie_data)
    reviews_df = pd.DataFrame(reviews_data)

    # Export beside the checkpoints; writing CSV text to the pickle paths would
    # destroy the checkpoints and make the next resume start from scratch.
    movie_csv = _csv_path_for(movie_filename)
    reviews_csv = _csv_path_for(reviews_filename)
    movie_df.to_csv(movie_csv, index=False)
    reviews_df.to_csv(reviews_csv, index=False)

    print(f"Data saved to {movie_csv} and {reviews_csv}")

    return movie_df, reviews_df

In [6]:
# Run (or resume) the crawl; the function returns the results as DataFrames.
movie_data, reviews_data = get_movies_and_reviews()
# The crawl reads its checkpoints back as lists of dicts when resuming, so
# persist plain records here rather than pickling the DataFrames themselves.
save_progress(movie_data.to_dict('records'), reviews_data.to_dict('records'))

Fetched data for: The Shawshank Redemption
Saved movie data to movies_data.pkl
Saved reviews data to reviews_data.pkl
Fetched data for: The Godfather
Saved movie data to movies_data.pkl
Saved reviews data to reviews_data.pkl
Fetched data for: The Godfather Part II
Saved movie data to movies_data.pkl
Saved reviews data to reviews_data.pkl
Fetched data for: Schindler's List
Saved movie data to movies_data.pkl
Saved reviews data to reviews_data.pkl
Fetched data for: 12 Angry Men
Saved movie data to movies_data.pkl
Saved reviews data to reviews_data.pkl
Fetched data for: Spirited Away
Saved movie data to movies_data.pkl
Saved reviews data to reviews_data.pkl
Fetched data for: Dilwale Dulhania Le Jayenge
Saved movie data to movies_data.pkl
Saved reviews data to reviews_data.pkl
Fetched data for: The Dark Knight
Saved movie data to movies_data.pkl
Saved reviews data to reviews_data.pkl
Fetched data for: Parasite
Saved movie data to movies_data.pkl
Saved reviews data to reviews_data.pkl
Fetch

KeyboardInterrupt: 

In [6]:
# Run (or resume) the crawl; the function returns the results as DataFrames.
movie_data, reviews_data = get_movies_and_reviews()
# The crawl reads its checkpoints back as lists of dicts when resuming, so
# persist plain records here rather than pickling the DataFrames themselves.
save_progress(movie_data.to_dict('records'), reviews_data.to_dict('records'))

Fetched data for: 12 Angry Men
Saved movie data to movies_data.pkl
Saved reviews data to reviews_data.pkl
Fetched data for: Spirited Away
Saved movie data to movies_data.pkl
Saved reviews data to reviews_data.pkl
Fetched data for: Dilwale Dulhania Le Jayenge
Saved movie data to movies_data.pkl
Saved reviews data to reviews_data.pkl


KeyboardInterrupt: 

In [7]:
# Read the checkpoints once and wrap each part in a DataFrame; calling
# load_progress() separately per frame would unpickle both files twice.
movies_df, reviews_df = (pd.DataFrame(records) for records in load_progress())

print("Movies DataFrame:")
movies_df.head(20)

Movies DataFrame:


Unnamed: 0,movie_code,Title,Year,Revenue,Budget,Runtime,Actors,Rating,Production_company,Genre,IMDb_code,release_date
0,278,The Shawshank Redemption,1994,28341469,25000000,142,"Tim Robbins, Morgan Freeman, Bob Gunton, Willi...",R,Castle Rock Entertainment,"Drama, Crime",tt0111161,1994-09-23
1,238,The Godfather,1972,245066411,6000000,175,"Marlon Brando, Al Pacino, James Caan, Robert D...",R,"Paramount Pictures, Alfran Productions, Americ...","Drama, Crime",tt0068646,1972-03-14
2,240,The Godfather Part II,1974,102600000,13000000,202,"Al Pacino, Robert Duvall, Diane Keaton, Robert...",R,"Paramount Pictures, The Coppola Company, Ameri...","Drama, Crime",tt0071562,1974-12-20
3,424,Schindler's List,1993,321365567,22000000,195,"Liam Neeson, Ben Kingsley, Ralph Fiennes, Caro...",R,Amblin Entertainment,"Drama, History, War",tt0108052,1993-12-15
4,389,12 Angry Men,1957,4360000,397751,97,"Martin Balsam, John Fiedler, Lee J. Cobb, E.G....",R,"United Artists, Orion-Nova Productions",Drama,tt0050083,1957-04-10
5,129,Spirited Away,2001,274925095,19000000,125,"Rumi Hiiragi, Miyu Irino, Mari Natsuki, Takash...",PG,Studio Ghibli,"Animation, Family, Fantasy",tt0245429,2001-04-18
6,19404,Dilwale Dulhania Le Jayenge,1995,100000000,13200000,190,"Kajol, Shah Rukh Khan, Amrish Puri, Farida Jal...",G,Yash Raj Films,"Comedy, Drama, Romance",tt0112870,1995-10-20
7,155,The Dark Knight,2008,1004558444,185000000,152,"Christian Bale, Heath Ledger, Michael Caine, G...",PG-13,"DC, Legendary Pictures, Syncopy, Isobel Griffi...","Drama, Action, Crime, Thriller",tt0468569,2008-07-16
8,496243,Parasite,2019,257591776,11363000,133,"Song Kang-ho, Lee Sun-kyun, Cho Yeo-jeong, Cho...",R,Barunson E&A,"Comedy, Thriller, Drama",tt6751668,2019-05-30
9,497,The Green Mile,1999,286801374,60000000,189,"Tom Hanks, Michael Clarke Duncan, David Morse,...",R,"Castle Rock Entertainment, Darkwoods Productions","Fantasy, Drama, Crime",tt0120689,1999-12-10


In [8]:
# Preview the first ten rows of reviews_df (built in the previous cell); the
# bare last expression is what the notebook renders as the cell output.
print("Reviews DataFrame:")
reviews_df.head(10)

Reviews DataFrame:


Unnamed: 0,movie_code,IMDb_code,review_date,rating_of_movie,actual_review
0,278,tt0111161,2016-04-29T18:08:41.892Z,9.0,very good movie 9.5/10 محمد الشعراوى
1,278,tt0111161,2016-07-10T00:16:50.561Z,10.0,Some birds aren't meant to be caged.\r\n\r\nTh...
2,278,tt0111161,2017-11-11T15:09:34.114Z,6.0,Make way for the best film ever made people. *...
3,278,tt0111161,2018-05-01T05:51:13.756Z,10.0,There is a reason why this movie is at the top...
4,278,tt0111161,2018-10-18T15:08:48.777Z,,It's still puzzling to me why this movie exact...
5,278,tt0111161,2019-07-30T08:25:48.402Z,,"I will not say that the film is predictable, b..."
6,278,tt0111161,2021-09-18T19:56:48.348Z,10.0,First time seeing this in probably close to 20...
7,278,tt0111161,2023-01-14T18:44:02.525Z,,No 1 movie for all the time
8,278,tt0111161,2023-04-03T15:38:39.540Z,3.0,This is much more predictable and Hollywood th...
9,278,tt0111161,2023-06-29T17:48:41.064Z,,This movie is great


In [None]:
#if __name__ == "__main__":
    #get_movies_and_reviews()