<a href="https://colab.research.google.com/github/SerDavidE/CineMatch/blob/main/Movie_Recommender.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

# Load the datasets
# All four CSV files are read from the same folder; keeping that folder in a
# single constant means it only has to be edited once when the notebook is
# run from a different location.
DATA_DIR = '/content/drive/MyDrive/Colab Notebooks/Chapter_8'

links_df = pd.read_csv(f'{DATA_DIR}/links.csv')
movies_df = pd.read_csv(f'{DATA_DIR}/movies.csv')
ratings_df = pd.read_csv(f'{DATA_DIR}/ratings.csv')
tags_df = pd.read_csv(f'{DATA_DIR}/tags.csv')

# Display the first few rows of each dataset to understand their structure
datasets = {'Links': links_df, 'Movies': movies_df, 'Ratings': ratings_df, 'Tags': tags_df}
for name, df in datasets.items():
    print(f"First few rows of {name} dataset:")
    display(df.head())
    print("\n")

First few rows of Links dataset:


Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0




First few rows of Movies dataset:


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy




First few rows of Ratings dataset:


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931




First few rows of Tags dataset:


Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200






# Popularity-Based Recommender:


In [None]:
# Join every rating to its movie record through the shared 'movieId' key
merged_df = movies_df.merge(ratings_df, on='movieId')

# Per-movie aggregates: the average rating and how many ratings were given
agg_metrics = (
    merged_df
    .groupby('movieId')['rating']
    .agg(['mean', 'count'])
    .reset_index()
    .rename(columns={'mean': 'mean_rating', 'count': 'rating_count'})
)

# Attach the aggregates to the movie metadata
movies_with_metrics = movies_df.merge(agg_metrics, on='movieId')

# Preview the enriched table
movies_with_metrics.head()

Unnamed: 0,movieId,title,genres,mean_rating,rating_count
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.92093,215
1,2,Jumanji (1995),Adventure|Children|Fantasy,3.431818,110
2,3,Grumpier Old Men (1995),Comedy|Romance,3.259615,52
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,2.357143,7
4,5,Father of the Bride Part II (1995),Comedy,3.071429,49


In [None]:
import numpy as np

def recommend_top_n_movies(n):
    """Return the ``n`` movies with the highest popularity score.

    The score is ``mean_rating * log1p(rating_count)``, so a high average only
    ranks well when it is backed by many ratings.

    Reads the module-level ``movies_with_metrics`` frame and leaves it
    unmodified.

    Parameters
    ----------
    n : int
        Number of rows to return (passed to ``DataFrame.head``).

    Returns
    -------
    pandas.DataFrame
        Columns ``title``, ``mean_rating``, ``rating_count`` and
        ``popularity_score``, sorted by descending ``popularity_score``.
    """
    # Exclude 5-star movies with only one rating
    keep = (movies_with_metrics['mean_rating'] != 5) | (movies_with_metrics['rating_count'] > 1)

    # Calculate the Popularity Score.
    # .assign builds a new frame instead of writing a column into the filtered
    # slice, which is what triggered pandas' SettingWithCopyWarning.
    scored = movies_with_metrics[keep].assign(
        popularity_score=lambda df: df['mean_rating'] * np.log1p(df['rating_count'])
    )

    # Sort movies by Popularity Score and keep the top n
    top_n_movies = scored.sort_values(by='popularity_score', ascending=False).head(n)

    return top_n_movies[['title', 'mean_rating', 'rating_count', 'popularity_score']]

# Test the function with n=10
recommend_top_n_movies(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['popularity_score'] = filtered_df['mean_rating'] * np.log1p(filtered_df['rating_count'])


Unnamed: 0,title,mean_rating,rating_count,popularity_score
277,"Shawshank Redemption, The (1994)",4.429022,317,25.520253
314,Forrest Gump (1994),4.164134,329,24.148197
257,Pulp Fiction (1994),4.197068,307,24.049621
1938,"Matrix, The (1999)",4.192446,278,23.608552
510,"Silence of the Lambs, The (1991)",4.16129,279,23.447995
224,Star Wars: Episode IV - A New Hope (1977),4.231076,251,23.395433
2224,Fight Club (1999),4.272936,218,23.027157
461,Schindler's List (1993),4.225,220,22.807237
897,Star Wars: Episode V - The Empire Strikes Back...,4.21564,211,22.581438
659,"Godfather, The (1972)",4.289062,192,22.572007


# Content-Based Recommender:
This system takes a movie title and a number n, and returns the n movies most similar to the given title based on content features (here, each movie's genres).

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Preprocess the 'genres' column: Replace '|' with ' '
movies_df['genres'] = movies_df['genres'].str.replace('|', ' ', regex=False)

# Create a CountVectorizer object.
# Tokenise on whitespace only: the default token pattern also breaks on
# punctuation, so a hyphenated genre label would be counted as two separate
# features and weigh double in the similarity (MovieLens uses labels such as
# "Sci-Fi" -- TODO confirm against this copy of movies.csv).
count_vectorizer = CountVectorizer(token_pattern=r'\S+')

# Fit and transform the 'genres' column
count_matrix = count_vectorizer.fit_transform(movies_df['genres'])

# Compute the cosine similarity matrix (row/column i corresponds to the i-th
# row of movies_df)
cosine_sim = cosine_similarity(count_matrix)

# Create a mapping between movie titles and their index in the DataFrame.
# NOTE: these index labels are later used as positions into cosine_sim, which
# assumes movies_df still has its default 0..N-1 index. Titles that occur more
# than once keep only their last index in title_to_index.
title_to_index = pd.Series(movies_df.index, index=movies_df['title']).to_dict()
index_to_title = pd.Series(movies_df['title'], index=movies_df.index).to_dict()

# Function to get most similar movies
def get_similar_movies(title, n=5):
    """Return the titles of the ``n`` movies most similar to ``title``.

    Similarity is read from the module-level ``cosine_sim`` matrix, using
    ``title_to_index`` / ``index_to_title`` to translate between titles and
    matrix rows.

    Parameters
    ----------
    title : str
        Exact title as stored in ``title_to_index``.
    n : int, default 5
        Maximum number of titles to return.

    Returns
    -------
    list of str, or str
        Titles ordered by descending similarity. If ``title`` is unknown the
        message ``"Movie title not found in dataset."`` is returned instead.
    """
    try:
        # Get the index of the movie title
        idx = title_to_index[title]
    except KeyError:
        return "Movie title not found in dataset."

    # Pair every *other* movie with its similarity score. The queried movie is
    # removed by index rather than by dropping the first sorted entry: other
    # movies can tie with it at similarity 1.0, in which case it is not
    # guaranteed to sort first and would otherwise show up in its own results.
    sim_scores = [
        (i, score) for i, score in enumerate(cosine_sim[idx]) if i != idx
    ]

    # Sort the movies based on similarity scores (stable, so ties keep
    # their original row order)
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    # Translate the n best row indices back to titles
    return [index_to_title[i] for i, _ in sim_scores[:max(n, 0)]]

# Test the function
get_similar_movies('Toy Story (1995)', 5)

['Antz (1998)',
 'Toy Story 2 (1999)',
 'Adventures of Rocky and Bullwinkle, The (2000)',
 "Emperor's New Groove, The (2000)",
 'Monsters, Inc. (2001)']

# Collaborative Filtering Recommender using Surprise

In [None]:
%%capture
pip install scikit-surprise

In [None]:
from surprise import Reader, Dataset, KNNBasic, accuracy
from surprise.model_selection import train_test_split

## Building a Surprise recommender:

In [None]:
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split

# Surprise consumes (user, item, rating) triples, so select just those columns
rating_triples = ratings_df[['userId', 'movieId', 'rating']]

# Wrap the triples in a Surprise dataset, declaring the rating scale up front
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(rating_triples, reader)

# Hold out 25% of the ratings for testing; the fixed seed keeps the split stable
trainset, testset = train_test_split(data, test_size=0.25, random_state=42)

In [None]:
from surprise import SVD, Dataset, Reader, accuracy
from surprise.model_selection import train_test_split

# Prepare the data
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(ratings_df[['userId', 'movieId', 'rating']], reader)
# This cell rebinds trainset/testset, which later cells reuse. Seed the split
# (same seed as the previous cell) so every model in the notebook is trained
# and scored on the same, reproducible partition.
trainset, testset = train_test_split(data, test_size=0.25, random_state=42)

# Train the model (seeded, so the reported RMSE is repeatable)
model = SVD(random_state=42)
model.fit(trainset)

# Make predictions on the test set
predictions = model.test(testset)

# Evaluate the model
rmse = accuracy.rmse(predictions)

# Function to get top-N recommendations for each user
from collections import defaultdict

def get_top_n_recommendations(predictions, n=10):
    """Group predictions by user and keep each user's ``n`` best estimates.

    Parameters
    ----------
    predictions : iterable of 5-tuples
        Each item unpacks as ``(uid, iid, true_r, est, details)``.
    n : int, default 10
        Number of ``(iid, est)`` pairs to keep per user.

    Returns
    -------
    collections.defaultdict
        Maps each ``uid`` to a list of ``(iid, est)`` pairs sorted by
        descending ``est``.
    """
    per_user = defaultdict(list)
    for prediction in predictions:
        user_id, item_id, _true_rating, estimate, _details = prediction
        per_user[user_id].append((item_id, estimate))

    # Rank each user's candidates by estimated rating and truncate to n.
    for user_id in list(per_user):
        ranked = sorted(per_user[user_id], key=lambda pair: pair[1], reverse=True)
        per_user[user_id] = ranked[:n]

    return per_user

# Get top-10 recommendations for each user
top_n = get_top_n_recommendations(predictions, n=10)

RMSE: 0.8786


# Hyperparameter Optimization with Grid Search

In [None]:
!pip install --upgrade scikit-surprise



In [None]:
from surprise.model_selection import GridSearchCV

In [None]:
from sklearn.model_selection import learning_curve, GridSearchCV

In [None]:
# Import Surprise's grid search in this cell. An earlier cell imports a class
# with the same name from sklearn.model_selection; whichever import ran last
# owns the name, and the sklearn class does not accept the `measures` argument
# used below.
from surprise.model_selection import GridSearchCV

# Define parameter grid
param_grid = {
    'n_epochs': [5, 10],  # number of epochs
    'lr_all': [0.002, 0.005],  # learning rate
    'reg_all': [0.4, 0.6]  # regularization term
}

# Perform grid search (3-fold cross-validation, scored by RMSE)
gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3)
gs.fit(data)

# Get best parameters
best_params = gs.best_params['rmse']

# Create model with best parameters. The keys of best_params are exactly the
# SVD keyword arguments searched above. Note the model is only constructed
# here, not fitted.
model = SVD(**best_params)

In [None]:
best_params

{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4}

# Algorithm Selection



In [None]:
from surprise import KNNBasic

# Build a basic k-NN collaborative filter and fit it on the training split
# (fit returns the fitted algorithm, so construction and fitting can be chained)
knn_model = KNNBasic().fit(trainset)

# Score the held-out ratings with the fitted model
knn_predictions = knn_model.test(testset)

# Report RMSE for comparison against the other models
knn_rmse = accuracy.rmse(knn_predictions)

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9561


# Model Finalization:

In [None]:
# Refit the tuned SVD configuration on every available rating
full_trainset = data.build_full_trainset()
final_model = SVD(n_epochs=10, lr_all=0.005, reg_all=0.4)
final_model.fit(full_trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7e9bfb7d6cb0>

# Precision@k and Recall@k
For evaluating the quality of the top-k recommendations.



In [None]:
from collections import defaultdict

def precision_recall_at_k(predictions, k=10, threshold=3.5):
    """Compute per-user precision@k and recall@k.

    An item counts as *relevant* when its true rating is at least
    ``threshold`` and as *recommended* when it is among the user's ``k``
    highest estimates and its estimate is at least ``threshold``.

    Parameters
    ----------
    predictions : iterable of 5-tuples
        Each item unpacks as ``(uid, iid, true_r, est, details)``.
    k : int, default 10
        Number of top-estimated items considered per user.
    threshold : float, default 3.5
        Minimum rating for an item to count as relevant / recommended.

    Returns
    -------
    (dict, dict)
        ``precisions`` and ``recalls``, each mapping ``uid`` to a value in
        ``[0, 1]``. When a ratio is undefined (nothing recommended, or nothing
        relevant) it is reported as 0 rather than 1, so users the model could
        not serve do not inflate the averages.
    """
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions = {}
    recalls = {}
    for uid, user_ratings in user_est_true.items():
        # Highest estimated ratings first; the top k are the "recommendations".
        user_ratings.sort(key=lambda x: x[0], reverse=True)
        top_k = user_ratings[:k]

        n_rel = sum(true_r >= threshold for _, true_r in user_ratings)
        n_rec_k = sum(est >= threshold for est, _ in top_k)
        n_rel_and_rec_k = sum(
            true_r >= threshold and est >= threshold for est, true_r in top_k
        )

        # Undefined ratios (zero denominator) are scored 0, not 1.
        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k else 0
        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel else 0

    return precisions, recalls

# Evaluate the tuned configuration on held-out data.
# The model scored here is fitted on the training split only. A model fitted
# on the full dataset has already seen every rating in testset, so scoring it
# on testset would overstate precision and recall. final_model (fitted on the
# full dataset in the finalization cell) is left untouched.
eval_model = SVD(n_epochs=10, lr_all=0.005, reg_all=0.4)
eval_model.fit(trainset)

# Make predictions
final_predictions = eval_model.test(testset)

# Compute Precision@k and Recall@k
precisions, recalls = precision_recall_at_k(final_predictions, k=5, threshold=3.5)

# Compute average Precision@k and average Recall@k across users
avg_precision = sum(precisions.values()) / len(precisions)
avg_recall = sum(recalls.values()) / len(recalls)

In [None]:
avg_precision

0.8442896174863413

In [None]:
avg_recall

0.3551629817978822

# Creating a Simple Web-App with Streamlit



In [None]:
import streamlit as st
import pickle

# Load the model
# NOTE(security): pickle.load can execute arbitrary code from the file; only
# load a model file that you produced yourself.
filename = 'finalized_model.pkl'
with open(filename, 'rb') as model_file:  # close the handle once loaded
    loaded_model = pickle.load(model_file)

st.title('Movie Recommender System')

# Get user input
user_id = st.text_input("Enter your User ID:", "")
movie_id = st.text_input("Enter the Movie ID:", "")

if st.button('Recommend'):
    try:
        # text_input returns strings, while the ids the model was trained on
        # come from the ratings CSV and are presumably integers (verify against
        # the training data). Convert so the ids can match known users/movies.
        uid, iid = int(user_id), int(movie_id)
    except ValueError:
        st.error("Please enter numeric values for both User ID and Movie ID.")
    else:
        prediction = loaded_model.predict(uid, iid)
        st.write(f"Estimated rating is: {prediction.est}")

In [None]:
def popularity_recommender(n):
    """Return the ``n`` most popular movies.

    Delegates to ``recommend_top_n_movies`` defined earlier in this notebook.
    The previous placeholder body returned an undefined name and raised
    ``NameError`` when called.
    """
    return recommend_top_n_movies(n)

def content_recommender(movie_id, n=5):
    """Return titles of movies similar in content to the movie ``movie_id``.

    Looks the id up in the module-level ``movies_df`` and delegates to
    ``get_similar_movies`` (defined earlier in this notebook) with the
    matching title. The previous placeholder body returned an undefined name
    and raised ``NameError`` when called.

    Parameters
    ----------
    movie_id
        Value compared against ``movies_df['movieId']``.
    n : int, default 5
        Number of similar titles requested.

    Returns
    -------
    Whatever ``get_similar_movies`` returns for the matching title, or an
    empty list when ``movie_id`` is not present in ``movies_df``.
    """
    matching_titles = movies_df.loc[movies_df['movieId'] == movie_id, 'title']
    if matching_titles.empty:
        return []
    return get_similar_movies(matching_titles.iloc[0], n)

def collaborative_recommender(user_id, n=10):
    """Return the ``n`` unseen movie titles with the highest predicted rating.

    Uses the module-level ``final_model`` to estimate ``user_id``'s rating for
    every movie in ``movies_df`` that the user has no row for in
    ``ratings_df``. The previous placeholder body returned an undefined name
    and raised ``NameError`` when called.

    Parameters
    ----------
    user_id
        Value compared against ``ratings_df['userId']`` and passed to
        ``final_model.predict``.
    n : int, default 10
        Number of titles to return.

    Returns
    -------
    list of str
        Titles ordered by descending estimated rating.
    """
    # Movies the user has already rated are not worth recommending again.
    seen_ids = set(ratings_df.loc[ratings_df['userId'] == user_id, 'movieId'])
    candidates = movies_df.loc[~movies_df['movieId'].isin(seen_ids), ['movieId', 'title']]

    scored = [
        (title, final_model.predict(user_id, movie_id).est)
        for movie_id, title in zip(candidates['movieId'], candidates['title'])
    ]
    scored.sort(key=lambda pair: pair[1], reverse=True)

    return [title for title, _ in scored[:max(n, 0)]]

In [None]:
def hybrid_recommender(user_id, movie_id=None, n=10):
    """Pick a recommendation strategy from the inputs that were supplied.

    * falsy ``user_id``  -> ``popularity_recommender(n)``
    * truthy ``user_id`` and truthy ``movie_id`` -> ``content_recommender(movie_id)``
    * truthy ``user_id`` only -> ``collaborative_recommender(user_id)``

    Returns whatever the selected recommender returns.
    """
    if user_id:
        if movie_id:
            # A reference item was given: recommend by content similarity
            return content_recommender(movie_id)
        # Known user without a reference item: personalised recommendations
        return collaborative_recommender(user_id)

    # No user to personalise for: fall back to overall popularity
    return popularity_recommender(n)