In [2]:
pip install scikit-surprise

Collecting scikit-surprise
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m772.0/772.0 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp310-cp310-linux_x86_64.whl size=3163748 sha256=dc7bf12645afe3888d1b7a8222ee5985eb950f9b95c07f4e369e85b987b99b4e
  Stored in directory: /root/.cache/pip/wheels/a5/ca/a8/4e28def53797fdc4363ca4af740db15a9c2f1595ebc51fb445
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.3


In [3]:
import pandas as pd

# Load the dataset
movies_metadata = pd.read_csv('/content/sample_data/movies_metadata.csv')
ratings = pd.read_csv('/content/sample_data/ratings_small.csv')

# Dataset description
print(movies_metadata.info())
print(ratings.info())


  movies_metadata = pd.read_csv('/content/sample_data/movies_metadata.csv')


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  45466 non-null  object 
 1   belongs_to_collection  4494 non-null   object 
 2   budget                 45466 non-null  object 
 3   genres                 45466 non-null  object 
 4   homepage               7782 non-null   object 
 5   id                     45466 non-null  object 
 6   imdb_id                45449 non-null  object 
 7   original_language      45455 non-null  object 
 8   original_title         45466 non-null  object 
 9   overview               44512 non-null  object 
 10  popularity             45461 non-null  object 
 11  poster_path            45080 non-null  object 
 12  production_companies   45463 non-null  object 
 13  production_countries   45463 non-null  object 
 14  release_date           45379 non-null  object 
 15  re

In [4]:
ratings.movieId = pd.to_numeric(ratings.movieId, errors='coerce')
ratings.userId = pd.to_numeric(ratings.userId, errors='coerce')
ratings.rating = pd.to_numeric(ratings.rating, errors='coerce')

len(ratings)
df = ratings
print(df)

        userId  movieId  rating   timestamp
0            1       31     2.5  1260759144
1            1     1029     3.0  1260759179
2            1     1061     3.0  1260759182
3            1     1129     2.0  1260759185
4            1     1172     4.0  1260759205
...        ...      ...     ...         ...
99999      671     6268     2.5  1065579370
100000     671     6269     4.0  1065149201
100001     671     6365     4.0  1070940363
100002     671     6385     2.5  1070979663
100003     671     6565     3.5  1074784724

[100004 rows x 4 columns]


In [5]:
from surprise import Reader, Dataset, KNNBasic, SVD, NMF
from surprise.model_selection import GridSearchCV, cross_validate

reader = Reader(rating_scale=(0.5, 5.0))

data = Dataset.load_from_df( ratings[['userId', 'movieId', 'rating']], reader = reader )

In [6]:
#SVD

algo_svd = SVD()
cross_validate(algo=algo_svd, data=data, measures=['RMSE'], cv=5, verbose=True)

Evaluating RMSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8945  0.9002  0.9018  0.8978  0.8889  0.8966  0.0046  
Fit time          1.48    1.52    1.51    1.48    1.48    1.49    0.02    
Test time         0.29    0.14    0.23    0.12    0.27    0.21    0.07    


{'test_rmse': array([0.8944765 , 0.90019816, 0.90182399, 0.89783208, 0.8889074 ]),
 'fit_time': (1.4757559299468994,
  1.5237689018249512,
  1.5145845413208008,
  1.481130838394165,
  1.478412389755249),
 'test_time': (0.28756237030029297,
  0.14209246635437012,
  0.22837066650390625,
  0.122222900390625,
  0.26953792572021484)}

In [7]:
#Fine Tuning SVD using GridSearchCV

param_grid = {'n_factors' : [50, 75], 'lr_all' : [0.5, 0.05], 'reg_all' : [0.06, 0.04]}

gs = GridSearchCV(algo_class=SVD, measures=['RMSE'], param_grid=param_grid)
gs.fit(data)

# Best RMSE score
print('Best Score :', gs.best_score['rmse'])

# Combination of parameters that gave the best RMSE score
print('Best Parameters :', gs.best_params['rmse'])


Best Score : 0.8898638506294645
Best Parameters : {'n_factors': 75, 'lr_all': 0.05, 'reg_all': 0.06}


In [8]:
#SVD_train

from surprise.model_selection import train_test_split
import random
import numpy as np

random.seed(42)
np.random.seed(42)

# Split the data into train and test sets
trainset, testset = train_test_split(data, test_size=0.25)

# Instantiate the SVD algorithm with the best parameters
best_svd = SVD(n_factors=gs.best_params['rmse']['n_factors'],
               lr_all=gs.best_params['rmse']['lr_all'],
               reg_all=gs.best_params['rmse']['reg_all'])

# Train the model on the entire dataset
trainset = data.build_full_trainset()
best_svd.fit(trainset)

# Get a list of all movie IDs
all_movie_ids = set(ratings['movieId'].unique())

# Get a list of movie IDs rated by user 1
rated_movie_ids_by_user = set(ratings[ratings['userId'] == 1]['movieId'].unique())

# Movies that user 1 has not rated
unrated_movies = list(all_movie_ids - rated_movie_ids_by_user)

# Make predictions for unrated movies
predictions = [best_svd.predict(1, movie_id) for movie_id in unrated_movies]

# Sort predictions by estimated rating in descending order
top_recommendations = sorted(predictions, key=lambda x: x.est, reverse=True)

# Print the top 10 recommended movie titles with available information
top_recommendations_with_info = []

for prediction in top_recommendations:
    movie_id = prediction.iid
    movie_info = movies_metadata[movies_metadata['id'] == str(movie_id)]
    if not movie_info.empty:
        top_recommendations_with_info.append(prediction)
        if len(top_recommendations_with_info) == 10:
            break

for i, prediction in enumerate(top_recommendations_with_info, start=1):
    movie_id = prediction.iid
    movie_title = movies_metadata[movies_metadata['id'] == str(movie_id)]['title'].values[0]
    print(f"{i}. {movie_title} (MovieID: {movie_id}, Predicted Rating: {prediction.est:.2f})")

1. Urban Explorer (MovieID: 73290, Predicted Rating: 3.85)
2. A Christmas Story (MovieID: 850, Predicted Rating: 3.79)
3. More of Me (MovieID: 134881, Predicted Rating: 3.77)
4. Gentlemen Prefer Blondes (MovieID: 759, Predicted Rating: 3.75)
5. The In-Laws (MovieID: 5146, Predicted Rating: 3.75)
6. License to Wed (MovieID: 2959, Predicted Rating: 3.73)
7. Design of Death (MovieID: 114464, Predicted Rating: 3.72)
8. The Crazy Stranger (MovieID: 5071, Predicted Rating: 3.68)
9. The Night of the Hunter (MovieID: 3112, Predicted Rating: 3.67)
10. Anybody's Son Will Do (MovieID: 38061, Predicted Rating: 3.66)


In [9]:
#SVD_test

from surprise import KNNBasic

# Create a KNNBasic model with user-based collaborative filtering
knn_user = KNNBasic(sim_options={'user_based': True})

# Fit the model to the data
knn_user.fit(trainset)

# Find the k-nearest neighbors of user 1
user_neighbors = knn_user.get_neighbors(1, k=10)

# Print the top 10 similar users to user 1
print(f"Top 10 similar users to user 1: {user_neighbors}")

# Find movies rated by these similar users that user 1 has not rated
movies_rated_by_neighbors = []
for neighbor in user_neighbors:
    movies_rated_by_neighbor = [item[0] for item in trainset.ur[trainset.to_inner_uid(neighbor)]]
    movies_rated_by_neighbors.extend(movies_rated_by_neighbor)

unrated_movies = set(movies_rated_by_neighbors) - rated_movie_ids_by_user

# Make predictions for unrated movies
predictions = [best_svd.predict(1, movie_id) for movie_id in unrated_movies]

# Sort predictions by estimated rating in descending order
top_recommendations = sorted(predictions, key=lambda x: x.est, reverse=True)

# Print the top 10 recommended movie titles with available information
top_recommendations_with_info = []

for i, prediction in enumerate(top_recommendations, start=1):
    movie_id = prediction.iid
    movie_info = movies_metadata[movies_metadata['id'] == str(movie_id)]
    if not movie_info.empty:
        top_recommendations_with_info.append(prediction)
        if len(top_recommendations_with_info) == 10:
            break

for i, prediction in enumerate(top_recommendations_with_info, start=1):
    movie_id = prediction.iid
    movie_title = movies_metadata[movies_metadata['id'] == str(movie_id)]['title'].values[0]
    print(f"{i}. {movie_title} (MovieID: {movie_id}, Predicted Rating: {prediction.est:.2f})")


Computing the msd similarity matrix...
Done computing similarity matrix.
Top 10 similar users to user 1: [27, 34, 140, 195, 196, 201, 208, 232, 279, 285]
1. A Christmas Story (MovieID: 850, Predicted Rating: 3.79)
2. Gentlemen Prefer Blondes (MovieID: 759, Predicted Rating: 3.75)
3. Head-On (MovieID: 363, Predicted Rating: 3.63)
4. Human Nature (MovieID: 441, Predicted Rating: 3.62)
5. Notes on a Scandal (MovieID: 1259, Predicted Rating: 3.57)
6. A Streetcar Named Desire (MovieID: 702, Predicted Rating: 3.57)
7. Lassie Come Home (MovieID: 2202, Predicted Rating: 3.55)
8. Swept from the Sea (MovieID: 1959, Predicted Rating: 3.53)
9. Snakes on a Plane (MovieID: 326, Predicted Rating: 3.49)
10. The Bank Dick (MovieID: 911, Predicted Rating: 3.49)


In [10]:
from surprise import accuracy

# Calculate MSE for train set
train_predictions = best_svd.test(trainset.build_testset())
train_mse = accuracy.mse(train_predictions)
print(f"MSE for the train set(SVD): {train_mse}")

# Calculate MSE for test set
test_predictions = best_svd.test(testset)
test_mse = accuracy.mse(test_predictions)
print(f"MSE for the test set(SVD): {test_mse}")


MSE: 0.1814
MSE for the train set(SVD): 0.1814159494700053
MSE: 0.1823
MSE for the test set(SVD): 0.182318783341357


In [11]:
#Non-Negative Matrix Factorization (NMF)

algo_nmf = NMF()
cross_validate(data=data, algo=algo_nmf, measures=['RMSE'], cv=5, verbose=True)

Evaluating RMSE of algorithm NMF on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9454  0.9592  0.9394  0.9420  0.9504  0.9473  0.0070  
Fit time          2.60    2.60    2.66    3.80    2.58    2.85    0.48    
Test time         0.16    0.10    0.21    0.11    0.13    0.14    0.04    


{'test_rmse': array([0.94540763, 0.95917567, 0.93943878, 0.9420275 , 0.95035367]),
 'fit_time': (2.6045079231262207,
  2.5987815856933594,
  2.657496690750122,
  3.79813289642334,
  2.5752921104431152),
 'test_time': (0.15596652030944824,
  0.09773492813110352,
  0.21096134185791016,
  0.11241650581359863,
  0.12791895866394043)}

In [12]:
#NMF_train

import random
import numpy as np
from surprise.model_selection import train_test_split

random.seed(42)
np.random.seed(42)

# Split the data into train and test sets
trainset, testset = train_test_split(data, test_size=0.25)

# Train the NMF model on the entire dataset
trainset = data.build_full_trainset()
algo_nmf.fit(trainset)

# Get a list of all movie IDs
all_movie_ids = set(ratings['movieId'].unique())

# Get a list of movie IDs rated by user 1
rated_movie_ids_by_user = set(ratings[ratings['userId'] == 1]['movieId'].unique())

# Movies that user 1 has not rated
unrated_movies = list(all_movie_ids - rated_movie_ids_by_user)

# Make predictions for unrated movies
predictions = [algo_nmf.predict(1, movie_id) for movie_id in unrated_movies]

# Sort predictions by estimated rating in descending order
top_recommendations = sorted(predictions, key=lambda x: x.est, reverse=True)

# Print the top 10 recommended movie titles with available information
top_recommendations_with_info = []

for prediction in top_recommendations:
    movie_id = prediction.iid
    movie_info = movies_metadata[movies_metadata['id'] == str(movie_id)]
    if not movie_info.empty:
        top_recommendations_with_info.append(prediction)
        if len(top_recommendations_with_info) == 10:
            break

for i, prediction in enumerate(top_recommendations_with_info, start=1):
    movie_id = prediction.iid
    movie_title = movies_metadata[movies_metadata['id'] == str(movie_id)]['title'].values[0]
    print(f"{i}. {movie_title} (MovieID: {movie_id}, Predicted Rating: {prediction.est:.2f})")

1. End of the World (MovieID: 3030, Predicted Rating: 4.17)
2. The Vampire Lovers (MovieID: 31952, Predicted Rating: 4.09)
3. Gentlemen Prefer Blondes (MovieID: 759, Predicted Rating: 3.97)
4. Pulgasari (MovieID: 26974, Predicted Rating: 3.96)
5. Still Bill (MovieID: 55063, Predicted Rating: 3.94)
6. The Return of the King (MovieID: 1361, Predicted Rating: 3.93)
7. Veerana (MovieID: 98122, Predicted Rating: 3.91)
8. Murder in Three Acts (MovieID: 6107, Predicted Rating: 3.90)
9. Before Sunset (MovieID: 80, Predicted Rating: 3.85)
10. Strangers on a Train (MovieID: 845, Predicted Rating: 3.81)


In [13]:
#NMF_test

from surprise import KNNBasic

# Create a KNNBasic model with user-based collaborative filtering
knn_user = KNNBasic(sim_options={'user_based': True})

# Fit the model to the data
knn_user.fit(trainset)

# Find the k-nearest neighbors of user 1
user_neighbors = knn_user.get_neighbors(1, k=10)

# Print the top 10 similar users to user 1
print(f"Top 10 similar users to user 1: {user_neighbors}")

# Find movies rated by these similar users that user 1 has not rated
movies_rated_by_neighbors = []
for neighbor in user_neighbors:
    movies_rated_by_neighbor = [item[0] for item in trainset.ur[trainset.to_inner_uid(neighbor)]]
    movies_rated_by_neighbors.extend(movies_rated_by_neighbor)

unrated_movies = set(movies_rated_by_neighbors) - rated_movie_ids_by_user

# Make predictions for unrated movies
predictions = [algo_nmf.predict(1, movie_id) for movie_id in unrated_movies]

# Sort predictions by estimated rating in descending order
top_recommendations = sorted(predictions, key=lambda x: x.est, reverse=True)

# Print the top 10 recommended movie titles with available information
top_recommendations_with_info = []

for i, prediction in enumerate(top_recommendations, start=1):
    movie_id = prediction.iid
    movie_info = movies_metadata[movies_metadata['id'] == str(movie_id)]
    if not movie_info.empty:
        top_recommendations_with_info.append(prediction)
        if len(top_recommendations_with_info) == 10:
            break

for i, prediction in enumerate(top_recommendations_with_info, start=1):
    movie_id = prediction.iid
    movie_title = movies_metadata[movies_metadata['id'] == str(movie_id)]['title'].values[0]
    print(f"{i}. {movie_title} (MovieID: {movie_id}, Predicted Rating: {prediction.est:.2f})")


Computing the msd similarity matrix...
Done computing similarity matrix.
Top 10 similar users to user 1: [27, 34, 140, 195, 196, 201, 208, 232, 279, 285]
1. Gentlemen Prefer Blondes (MovieID: 759, Predicted Rating: 3.97)
2. Before Sunset (MovieID: 80, Predicted Rating: 3.85)
3. Wild at Heart (MovieID: 483, Predicted Rating: 3.72)
4. Saw (MovieID: 176, Predicted Rating: 3.66)
5. Lassie Come Home (MovieID: 2202, Predicted Rating: 3.64)
6. Within the Woods (MovieID: 2186, Predicted Rating: 3.56)
7. Walk on Water (MovieID: 26, Predicted Rating: 3.55)
8. Blondie Johnson (MovieID: 4112, Predicted Rating: 3.54)
9. Unforgiven (MovieID: 33, Predicted Rating: 3.54)
10. Paradise Now (MovieID: 67, Predicted Rating: 3.54)


In [14]:

# Calculate MSE for train set
train_predictions = algo_nmf.test(trainset.build_testset())
train_mse = accuracy.mse(train_predictions)
print(f"MSE for the train set(NMF): {train_mse}")

# Calculate MSE for test set
test_predictions = algo_nmf.test(testset)
test_mse = accuracy.mse(test_predictions)
print(f"MSE for the test set(NMF): {test_mse}")


MSE: 0.4291
MSE for the train set(NMF): 0.42912908405631817
MSE: 0.4320
MSE for the test set(NMF): 0.43203497034285226
