In [47]:
#importing necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity

In [48]:
#loading the dataset
df = pd.read_csv("Dataset_Movie/ratings_small.csv")

In [49]:
df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [50]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100004 entries, 0 to 100003
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100004 non-null  int64  
 1   movieId    100004 non-null  int64  
 2   rating     100004 non-null  float64
 3   timestamp  100004 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [51]:
#converting the dataset in correct format
df = df.pivot(index= "userId",columns="movieId",values="rating")

In [52]:
df.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,161084,161155,161594,161830,161918,161944,162376,162542,162672,163949
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,4.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,4.0,...,,,,,,,,,,
5,,,4.0,,,,,,,,...,,,,,,,,,,


Now we will calculate the centered cosine similartity for each user

In [54]:
# normalizing rating by substracting row mean 
normalized_df = df.sub(df.mean(axis=1), axis=0)

In [55]:
normalized_df.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,161084,161155,161594,161830,161918,161944,162376,162542,162672,163949
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,0.513158,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,-0.348039,...,,,,,,,,,,
5,,,0.09,,,,,,,,...,,,,,,,,,,


In [56]:
normalized_df.fillna(0,inplace=True)

In [57]:
# User-Based Cosine Similarity
# compute the user similarity matrix
user_similarity= cosine_similarity(normalized_df.values)

In [58]:
user_sim_df = pd.DataFrame(user_similarity, index=normalized_df.index, columns=normalized_df.index)
user_sim_df.head()

userId,1,2,3,4,5,6,7,8,9,10,...,662,663,664,665,666,667,668,669,670,671
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.0,0.0,0.00362,-0.002274,0.0,-0.070321,0.0,0.042632,0.0,...,0.0,0.0,0.018643,0.001031,0.0,0.0,0.0,0.044095,0.0,-0.013096
2,0.0,1.0,-0.001852,-0.004854,0.012639,0.0,0.042691,0.021066,0.011109,-0.007989,...,-0.018248,-0.021546,0.018902,-0.058952,0.028515,-0.106828,-0.007999,-0.041628,-0.090233,0.056258
3,0.0,-0.001852,1.0,0.018594,-0.025903,-0.0632,0.0549,0.026488,-0.036187,0.038021,...,0.044297,0.019581,0.070702,0.030669,0.143705,0.096713,0.027451,0.089297,-0.009815,0.062276
4,0.00362,-0.004854,0.018594,1.0,0.010801,0.019224,0.057519,0.05543,-0.010442,0.005126,...,0.011978,0.006569,0.027687,0.092092,0.021334,0.040833,0.018428,0.028642,0.019848,0.032749
5,-0.002274,0.012639,-0.025903,0.010801,1.0,-0.005843,-0.015075,-0.038886,0.013708,0.0305,...,0.046134,0.001903,0.00162,0.036819,-0.038269,-0.019537,-0.071721,0.00376,-0.029455,-0.036814


In [61]:
def recommend_movies_for_user(user_id, n =5):
    """Recommend top-N movies for a user based on similar users."""
    if user_id not in user_sim_df:
        return "User not found"
        
    #getting the top 5 similar users to user_id [excluding the user itself
    similar_users = user_sim_df[user_id].sort_values(ascending = False)[1:6].index

    # Get movies watched by similar users
    similar_users_ratings = user_sim_df.loc[similar_users].mean()

    # Get movies the target user has already rated
    user_rated_movies = user_sim_df.loc[user_id]
    
    # Filter out movies the user has already watched
    recommended_movies = similar_users_ratings[user_rated_movies == 0].sort_values(ascending=False).head(n)

    return recommended_movies.index.tolist()      
    

In [81]:
print(f'top 5 Moives that should be suggested to user with user_id 2 are : {recommend_movies_for_user(2)}')
print(f'Top 5 Moives that should be suggested to user with user_id 5 are : {recommend_movies_for_user(5)}')
print(f'Top 5 Moives that should be suggested to user with user_id 10 are : {recommend_movies_for_user(10)}')

top 5 Moives that should be suggested to user with user_id 2 are : [221, 6, 604, 252, 459]
Top 5 Moives that should be suggested to user with user_id 5 are : [661, 337, 40, 540, 576]
Top 5 Moives that should be suggested to user with user_id 10 are : [335, 207, 404, 223, 112]
