In [1]:
import requests
import pandas as pd

api_key = "ae60927b2621ffdf665f3b47791feda8"

def fetch_data(api_key, page_limit=10):
    all_movies = []

    for page in range(1, page_limit + 1):
        print(f"Fetching data from page {page}")
        url = f"https://api.themoviedb.org/3/movie/popular?api_key={api_key}&language=en-US&page={page}"
        response = requests.get(url)

        if response.status_code != 200:
            print(f"Failed to fetch page {page}: {response.status_code}")
            continue

        results = response.json().get("results", [])

        for movie in results:
            try:
                movie_id = movie['id']
                details_url = f"https://api.themoviedb.org/3/movie/{movie_id}?api_key={api_key}&append_to_response=credits"
                details_response = requests.get(details_url).json()

                title = details_response.get("title", "")
                genres = ", ".join([g['name'] for g in details_response.get("genres", [])])
                synopsis = details_response.get("overview", "")
                credits = details_response.get("credits", {})
                cast = credits.get("cast", [])
                crew = credits.get("crew", [])
                rating = details_response.get("vote_average", 0)

                actors = ", ".join([person['name'] for person in cast[:3]])
                directors = ", ".join([person['name'] for person in crew if person['job'] == "Director"])

                all_movies.append({
                    'title': title,
                    'genres': genres,
                    'actors': actors,
                    'directors': directors,
                    'synopsis': synopsis,
                    'rating': rating
                })
            except Exception as e:
                print(f"Error fetching movie ID {movie_id}: {e}")
                continue

    return pd.DataFrame(all_movies)

# Fetch and save data
movie_df = fetch_data(api_key, page_limit=10)
movie_df.to_csv('movies_dataset.csv', index=False)
print("Movie data has been saved to archive_tmdb_5000_credits.csv")


Fetching data from page 1


ConnectTimeout: HTTPSConnectionPool(host='api.themoviedb.org', port=443): Max retries exceeded with url: /3/movie/popular?api_key=ae60927b2621ffdf665f3b47791feda8&language=en-US&page=1 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000001A8B74D0C20>, 'Connection to api.themoviedb.org timed out. (connect timeout=None)'))

In [4]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor
from sentence_transformers import SentenceTransformer
import pickle

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
data=pd.read_csv("movies_dataset.csv")
data.fillna('',inplace=True)
data.head()

Unnamed: 0,title,genres,actors,directors,synopsis,rating
0,War of the Worlds,"Science Fiction, Thriller","Ice Cube, Eva Longoria, Clark Gregg",Rich Lee,Will Radford is a top analyst for Homeland Sec...,4.163
1,Superman,"Science Fiction, Adventure, Action","David Corenswet, Rachel Brosnahan, Nicholas Hoult",James Gunn,"Superman, a journalist in Metropolis, embarks ...",7.587
2,Mission: Impossible - The Final Reckoning,"Action, Adventure, Thriller","Tom Cruise, Hayley Atwell, Ving Rhames",Christopher McQuarrie,Ethan Hunt and team continue their search for ...,7.23
3,Jurassic World Rebirth,"Science Fiction, Adventure, Action","Scarlett Johansson, Mahershala Ali, Jonathan B...",Gareth Edwards,Five years after the events of Jurassic World ...,6.378
4,Striking Rescue,"Action, Crime, Thriller","Tony Jaa, Eason Hung, Xing Yu",Cheng Siyi,A veteran Muay Thai expert goes on a take-no-p...,7.696


In [6]:
avg_actor_rating=data.groupby('actors')['rating'].mean().to_dict()
avg_director_rating=data.groupby('directors')['rating'].mean().to_dict()

data['avg_actor_rating']=data['actors'].map(avg_actor_rating)
data['avg_director_rating']=data['directors'].map(avg_director_rating)

In [7]:
sentence_model=SentenceTransformer("all-MiniLM-L6-v2")
synopsis_embeddings=sentence_model.encode(data['synopsis'].tolist())

genre_ohe=pd.get_dummies(data['genres'])

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [8]:
X=np.hstack((synopsis_embeddings,genre_ohe.values,data[['avg_actor_rating','avg_director_rating']].values))
y=data['rating'].values

X_train, X_test, y_train, y_test=train_test_split(X,y,test_size=0.2, random_state=42)

In [9]:
xgb_model=XGBRegressor(
    n_estimators=100,
    learning_rate=0.3,
    max_depth=7,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

In [None]:
xgb_model.fit(X_train,y_train)
y_pred=xgb_model.predict(X_test)

In [None]:
r2=r2_score(y_test,y_pred)
mse=mean_squared_error(y_test,y_pred)

print(f"r2 Score: {r2:.4f}")
print(f"Mean Squared Error: {mse:.4f}")

In [None]:
pickle.dump(xgb_model,open('movie_rating_model.pkl','wb'))
pickle.dump(sentence_model,open('Sentence_model.pkl','wb'))
pickle.dump(genre_ohe.columns.to_list(),open("genre_columns.pkl",'wb'))
pickle.dump(avg_actor_rating,open("avg_actor_rating.pkl",'wb'))
pickle.dump(avg_director_rating,open("avg_director_rating.pkl",'wb'))