**DATA PREVIEW**

In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np

# Read the three source tables in one pass. Paths are relative to the
# notebook's working directory (adjust as necessary).
#   rating.csv -> userId, movieId, rating, timestamp
#   movie.csv  -> movieId, title, genres
#   link.csv   -> movieId, imdbId, tmdbId
ratings, movies, links = map(
    pd.read_csv,
    ('./data/rating.csv', './data/movie.csv', './data/link.csv'),
)

# Preview the ratings table to confirm it loaded correctly; the bare last
# expression is what the notebook renders as the cell output.
print("Ratings:")
ratings.head(n=5)

Ratings:


Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40


In [3]:
# Label the output, then let the notebook render the first five movie rows.
print("\nMovies:")
movies.head(n=5)


Movies:


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
# Label the output, then let the notebook render the first five link rows
# (movieId -> imdbId / tmdbId).
print("\nLinks:")
links.head(n=5)


Links:


Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [5]:
# Attach the external ids from `links` to every movie. A left join keeps
# movies that have no matching row in `links` (their tmdbId becomes NaN).
movies_links = movies.merge(links, on='movieId', how='left')

print("Movies with TMDB IDs:")
print(movies_links.head())

# Build the ratings table keyed by TMDB id in one chain:
#   1. bring tmdbId and title onto each rating via movieId,
#   2. discard ratings whose movie has no tmdbId,
#   3. cast tmdbId to int (possible only once the NaNs are gone).
data = (
    ratings
    .merge(movies_links[['movieId', 'tmdbId', 'title']], on='movieId', how='left')
    .dropna(subset=['tmdbId'])
    .assign(tmdbId=lambda frame: frame['tmdbId'].astype(int))
)

print("Combined Data:")
print(data.head())

Movies with TMDB IDs:
   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  imdbId   tmdbId  
0  Adventure|Animation|Children|Comedy|Fantasy  114709    862.0  
1                   Adventure|Children|Fantasy  113497   8844.0  
2                               Comedy|Romance  113228  15602.0  
3                         Comedy|Drama|Romance  114885  31357.0  
4                                       Comedy  113041  11862.0  
Combined Data:
   userId  movieId  rating            timestamp  tmdbId  \
0       1        2     3.5  2005-04-02 23:53:47    8844   
1       1       29     3.5  2005-04-02 23:31:16     902   
2       1       32     3.5  2005-04-02 23:33:39      63   
3       1       47     3.5  20

**DATA PREPROCESSING**

In [6]:
# NOTE(review): this cell repeats the merge steps of the preceding cell
# (In [5]) and additionally trims the result to the modelling columns.

# Attach the external ids from `links` to every movie. A left join keeps
# movies that have no matching row in `links` (their tmdbId becomes NaN).
movies_links = movies.merge(links, on='movieId', how='left')

print("Movies with TMDB IDs:")
print(movies_links.head())

# Ratings keyed by TMDB id, built in one chain:
#   1. bring tmdbId and title onto each rating via movieId,
#   2. discard ratings whose movie has no tmdbId,
#   3. cast tmdbId to int (possible only once the NaNs are gone),
#   4. keep only the columns used downstream (movieId and timestamp go away).
data = (
    ratings
    .merge(movies_links[['movieId', 'tmdbId', 'title']], on='movieId', how='left')
    .dropna(subset=['tmdbId'])
    .assign(tmdbId=lambda frame: frame['tmdbId'].astype(int))
    [['userId', 'tmdbId', 'rating', 'title']]
)

print("Combined Data:")
print(data.head())

Movies with TMDB IDs:
   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  imdbId   tmdbId  
0  Adventure|Animation|Children|Comedy|Fantasy  114709    862.0  
1                   Adventure|Children|Fantasy  113497   8844.0  
2                               Comedy|Romance  113228  15602.0  
3                         Comedy|Drama|Romance  114885  31357.0  
4                                       Comedy  113041  11862.0  
Combined Data:
   userId  tmdbId  rating                                              title
0       1    8844     3.5                                     Jumanji (1995)
1       1     902     3.5  City of Lost Children, The (Cité des enfants p...
2       1      63     3.5          

In [7]:
from surprise import Dataset, Reader

# Derive the rating scale from the observed ratings rather than hard-coding
# it (e.g. 0.5 to 5).
lowest_rating = data['rating'].min()
highest_rating = data['rating'].max()
reader = Reader(rating_scale=(lowest_rating, highest_rating))

# Hand Surprise the three columns it needs, in user / item / rating order.
interaction_columns = ['userId', 'tmdbId', 'rating']
surprise_data = Dataset.load_from_df(data[interaction_columns], reader)

# Simple progress marker; nothing about the dataset itself is inspected here.
print("Data prepared for the recommendation model.")

Data prepared for the recommendation model.


**AI Model Training**

In [8]:
from surprise.model_selection import train_test_split
from surprise import SVD, accuracy

# Hold out 20% of the ratings for evaluation. The split is seeded so the
# same rows land in the test set on every run.
trainset, testset = train_test_split(surprise_data, test_size=0.2, random_state=42)

# Seed the SVD as well: its factors are randomly initialised, so without a
# seed the reported RMSE drifts between runs even though the split is fixed.
model = SVD(random_state=42)
model.fit(trainset)

# Score the held-out ratings. accuracy.rmse prints its own "RMSE: ..." line
# and returns the value, which is printed again at full precision below.
predictions = model.test(testset)
rmse = accuracy.rmse(predictions)
print("Test RMSE:", rmse)

RMSE: 0.7867
Test RMSE: 0.7867056513104431


In [9]:
import pickle

# Persist the trained model next to the notebook so it can be reloaded
# without retraining. Binary mode is required for pickle.
with open('svd_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

In [10]:
# Reload the model written by the previous cell.
# CAUTION: unpickling executes arbitrary code embedded in the file, so only
# load pickles that this notebook (or another trusted source) produced.
with open('svd_model.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [11]:
def get_recommendations(user_id, model, ratings_df, movies_df, n=10):
    """Return the top-``n`` movies a user has not rated, by predicted rating.

    Args:
        user_id: Raw user id; matched against ``ratings_df['userId']`` and
            passed unchanged to ``model.predict``.
        model: Any object exposing ``predict(user_id, item_id)`` whose result
            has an ``est`` attribute (the predicted rating).
        ratings_df: DataFrame with ``userId`` and ``tmdbId`` columns listing
            the ratings already given.
        movies_df: DataFrame with a ``tmdbId`` column listing the candidate
            movies. Rows without a TMDB id (NaN) are ignored.
        n: Maximum number of recommendations to return.

    Returns:
        A list of ``(tmdbId, predicted_rating)`` tuples sorted by predicted
        rating, highest first. Ties keep the order of ``movies_df``.
    """
    # Candidates: every distinct TMDB id in the catalogue. NaN ids are dropped
    # because NaN never compares equal to anything: it would never be filtered
    # out as "already rated", would get scored, and could not be mapped back
    # to a title afterwards.
    all_movie_ids = movies_df['tmdbId'].dropna().unique()

    # Ids this user has already rated, held in a set so each membership test
    # below is a hash lookup instead of a scan over an array.
    user_rows = ratings_df['userId'] == user_id
    rated_movie_ids = set(ratings_df.loc[user_rows, 'tmdbId'].dropna().unique().tolist())

    # Movies not yet rated by the user
    unrated_movie_ids = [mid for mid in all_movie_ids if mid not in rated_movie_ids]

    # Predict a rating for every unrated movie
    predictions = [(mid, model.predict(user_id, mid).est) for mid in unrated_movie_ids]

    # Highest predicted rating first (list.sort is stable, so ties keep order)
    predictions.sort(key=lambda pair: pair[1], reverse=True)

    return predictions[:n]

# Demo: ask the trained model for user 1's ten best unrated movies.
top_recs = get_recommendations(
    user_id=1,
    model=model,
    ratings_df=data,
    movies_df=movies_links,
    n=10,
)

print("Top 10 Recommendations for User 1:")
for tmdbId, predicted_rating in top_recs:
    # Resolve the title through the merged movies/links table; the first
    # match is used if several rows share the same TMDB id.
    same_tmdb = movies_links['tmdbId'] == tmdbId
    title = movies_links.loc[same_tmdb, 'title'].iloc[0]
    print(f"TMDB ID: {tmdbId}, Title: {title}, Predicted Rating: {predicted_rating:.2f}")

Top 10 Recommendations for User 1:
TMDB ID: 119324.0, Title: Prime Suspect (1991), Predicted Rating: 4.49
TMDB ID: 241620.0, Title: Louis Theroux: Law & Disorder (2008), Predicted Rating: 4.49
TMDB ID: 13930.0, Title: For the Birds (2000), Predicted Rating: 4.46
TMDB ID: 11362.0, Title: The Count of Monte Cristo (2002), Predicted Rating: 4.45
TMDB ID: 16320.0, Title: Serenity (2005), Predicted Rating: 4.44
TMDB ID: 199.0, Title: Star Trek: First Contact (1996), Predicted Rating: 4.43
TMDB ID: 34576.0, Title: Most Dangerous Man in America: Daniel Ellsberg and the Pentagon Papers, The (2009), Predicted Rating: 4.41
TMDB ID: 13976.0, Title: Dylan Moran: Monster (2004), Predicted Rating: 4.39
TMDB ID: 129.0, Title: Spirited Away (Sen to Chihiro no kamikakushi) (2001), Predicted Rating: 4.39
TMDB ID: 55192.0, Title: Tokyo Twilight (Tôkyô boshoku) (1957), Predicted Rating: 4.38


**NOTE: Disregard everything above this line. The notebook starts over below (execution counts restart at In [4]).**

In [4]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Raw inputs: movie metadata, the movieId <-> imdbId/tmdbId link table, and
# the free-text tags.
movies_df = pd.read_csv("./data/movie.csv")
links_df = pd.read_csv("./data/link.csv")
tags_df = pd.read_csv("./data/tag.csv")

# Step 1 - one tag string per movie.
# Missing tags become "" and every tag is coerced to str so the join below
# never meets a non-string value.
tags_df["tag"] = tags_df["tag"].fillna("").astype(str)
tags_agg = (
    tags_df
    .groupby("movieId")["tag"]
    .apply(" ".join)
    .reset_index(name="all_tags")
)

# Steps 2 and 3 - attach the aggregated tags to each movie. Movies without
# any tag come out of the left join as NaN and are reset to "".
movies_df = movies_df.merge(tags_agg, on="movieId", how="left")
movies_df["all_tags"] = movies_df["all_tags"].fillna("")

# Steps 4 and 5 - assemble the text that describes each movie:
#   genres with the "|" separators turned into spaces,
#   the aggregated tags,
#   the lower-cased title with everything except a-z, 0-9 and spaces removed
#   (this drops punctuation such as the parentheses around the year).
genre_words = movies_df["genres"].str.replace("|", " ", regex=False)
tag_words = movies_df["all_tags"].fillna("")
title_words = movies_df["title"].str.lower().str.replace("[^a-z0-9 ]", "", regex=True)
movies_df["combined_text"] = genre_words + " " + tag_words + " " + title_words

# Step 6 - vectorize: one row of tfidf_matrix per movie (same order as
# movies_df), one column per token in the corpus.
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(movies_df["combined_text"])
# Each row in tfidf_matrix is a movie, each column is a token from the corpus

In [5]:
import numpy as np

# Stand-in for a user document fetched from MongoDB; only the tmdbId of each
# watched movie is used below.
user_doc = {
    "username": "bias72",
    "watchedMovies": [
        {"tmdbId": 862, "title": "Toy Story (1995)"},
        {"tmdbId": 240, "title": "The Godfather Part II"}
    ]
}

# 1) Translate the user's TMDB ids into movieIds through links_df
#    (columns: movieId, imdbId, tmdbId). Ids missing from the link table
#    are skipped silently.
tmdb_to_movie = dict(zip(links_df["tmdbId"], links_df["movieId"]))
watched_tmdbIds = [entry["tmdbId"] for entry in user_doc["watchedMovies"]]
watched_movieIds = [
    tmdb_to_movie[tmdb_id]
    for tmdb_id in watched_tmdbIds
    if tmdb_id in tmdb_to_movie
]

# 2) Index labels of the watched movies in movies_df. They are used as row
#    positions into tfidf_matrix below.
#    NOTE(review): this assumes movies_df still has its default 0..n-1 index - confirm.
is_watched = movies_df["movieId"].isin(watched_movieIds)
watched_rows = movies_df.index[is_watched].tolist()

# 3) User profile = average of the watched movies' TF-IDF vectors, or an
#    all-zero vector when none of the user's movies could be mapped.
if not watched_rows:
    user_profile = np.zeros((1, tfidf_matrix.shape[1]))
else:
    watched_vectors = tfidf_matrix[watched_rows]
    user_profile = watched_vectors.mean(axis=0)

In [7]:
from sklearn.metrics.pairwise import cosine_similarity

# Compare the user's profile with every movie vector.
# np.asarray + reshape give a plain 2-D array of shape (1, n_features)
# regardless of which branch of the previous cell produced user_profile.
user_profile = np.asarray(user_profile).reshape(1, -1)
sims = cosine_similarity(user_profile, tfidf_matrix)

# One similarity score per movie, in movies_df row order
similarities = sims.flatten()

# Row positions ordered from most to least similar
sorted_indices = np.argsort(-similarities)

# Map the ranked row positions to movieIds with a single positional take.
# The previous per-row `movies_df.iloc[idx]["movieId"]` loop built one row
# Series per movie, which is needlessly slow on a catalogue this size.
ranked_movie_ids = movies_df["movieId"].to_numpy()[sorted_indices]

# Drop what the user has already watched; a set makes each check a hash
# lookup instead of a list scan.
# (Despite its name, recommended_indices holds movieIds, not row positions.)
watched_lookup = set(watched_movieIds)
recommended_indices = [m for m in ranked_movie_ids if m not in watched_lookup]

# Take top 10
top_10 = recommended_indices[:10]

In [8]:
# Print id, title, genres and aggregated tags for each recommended movie.
for mid in top_10:
    matches = movies_df.loc[movies_df["movieId"] == mid]
    row = matches.iloc[0]  # first (expected only) row for this movieId
    print(mid, row["title"], row["genres"], row["all_tags"])

3114 Toy Story 2 (1999) Adventure|Animation|Children|Comedy|Fantasy animation humorous Pixar animation cute fanciful toys Pixar animation animation computer animation Disney friendship kids and family Pixar toys Oscar Nominee abandonment imdb top 250 Pixar sequel friendship cgi computer animation Disney Pixar RACE AGAINST TIME sequel Tom Hanks bright DARING RESCUES fanciful humorous light RACE AGAINST TIME TOYS COME TO LIFE whimsical sequel better than original Pixar Pixar animation Disney Pixar Tim Allen Tom Hanks original Pixar cgi Disney Pixar computer animation Pixar Tom Hanks Disney friendship funny Pixar childhood classic funny Pixar Tom Hanks Pixar cgi Pixar sequel animation Disney Pixar imdb top 250 animation Joan Cusack Pixar Tim Allen Tom Hanks original animation Disney Pixar Pixar original Disney pixar animation Disney Pixar Tom Hanks computer animation Pixar animation Pixar Pixar Pixar funny Oscar Nominee Pixar animation childish Disney Pixar sequel toys whimsical very good