In [1]:
import numpy as np
import pandas as pd

In [2]:
ratings = pd.read_csv(r"final_dataset(2).csv")
ratings.head()

Unnamed: 0.1,Unnamed: 0,userId,movieId,rating,original_title
0,0,1,110,1.0,Trois couleurs : Rouge
1,66512,1,147,4.5,Les Quatre Cents Coups
2,71479,1,858,5.0,Sleepless in Seattle
3,128549,1,1246,5.0,Rocky Balboa
4,154301,1,1968,4.0,Fools Rush In


In [3]:
ratings = ratings.drop('Unnamed: 0',axis=1)

In [4]:
ratings.head()

Unnamed: 0,userId,movieId,rating,original_title
0,1,110,1.0,Trois couleurs : Rouge
1,1,147,4.5,Les Quatre Cents Coups
2,1,858,5.0,Sleepless in Seattle
3,1,1246,5.0,Rocky Balboa
4,1,1968,4.0,Fools Rush In


# Collaborative Filtering:



## -> Code wherein we are first finding users similar to a particular user by measuring the similarity between the user ratings(similarity measure used: cosine similarity) and then we are storing the top five similar users corresponding to every user Id which have been obtained by using KNN algorithm.

In [5]:
# Import necessary libraries: pandas for data manipulation, csr_matrix and cosine_similarity from scipy.sparse for 
# creating a sparse matrix and computing cosine similarity between vectors, and NearestNeighbors from scikit-learn for 
# finding nearest neighbors:

import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors

ratings_copy = ratings[ratings['userId']<=1219]

# Create a sparse matrix with userId as the rows and movieId as the columns.It is typically used to represent the user-item
# interaction data, where the rows represent the users and the columns represent the items (in this case, movies). 
# The non-zero entries in the matrix correspond to the user ratings for the movies. Since most users have not rated 
# most movies, the matrix is very sparse(contains large number of zero values).Sparse matrix saves memory and computation 
# time.
sparse_matrix = csr_matrix((ratings_copy['rating'], (ratings_copy['userId'], ratings_copy['movieId'])))

# Create a NearestNeighbors object using cosine similarity as the metric, and fit it with the sparse matrix.
knn = NearestNeighbors(n_neighbors=6, metric='cosine')
knn.fit(sparse_matrix)

similar_users_df = pd.DataFrame(columns=['userId', 'similar_user_1', 'similar_user_2', 'similar_user_3', 'similar_user_4', 'similar_user_5'])

for user_id in ratings_copy['userId']:
    
     # The sparse_matrix[user_id] is a row vector representing the given user's sparse matrix representation, 
     # while sparse_matrix is the full matrix of all users' sparse matrix representations.
     # The cosine_similarity function takes these two inputs and computes the cosine similarity between the two vectors.
    similarities = cosine_similarity(sparse_matrix[user_id], sparse_matrix)
    
    # Find the k nearest neighbors of the user
    distances, indices = knn.kneighbors(sparse_matrix[user_id])
    num_similar_users = len(indices[0])
    if num_similar_users >= 5:
        
        # Store the similar users in the new DataFrame
        row_data = {'userId': user_id,
                    'similar_user_1': indices[0,1],
                    'similar_user_2': indices[0,2],
                    'similar_user_3': indices[0,3],
                    'similar_user_4': indices[0,4],
                    'similar_user_5': indices[0,5]}
    else:
        # If there are fewer than k similar users, fill the remaining columns with NaN
        row_data = {'userId': user_id}
        for i in range(1, 6+1):
            if i <= num_similar_users:
                row_data[f'similar_user_{i}'] = indices[0,i]
            else:
                row_data[f'similar_user_{i}'] = float('nan')
                
    similar_users_df = similar_users_df.append(row_data, ignore_index=True)

In [6]:
# Let's see the top 5 similar users list corresponding to every user got via cosine similarity and KNN:

similar_users_df.drop_duplicates(subset=['userId'], inplace=True)
similar_users_df

Unnamed: 0,userId,similar_user_1,similar_user_2,similar_user_3,similar_user_4,similar_user_5
0,1,1207,719,1136,672,388
11,2,341,191,518,900,430
28,3,493,201,521,773,871
36,4,53,77,999,692,213
42,5,453,1209,401,959,734
...,...,...,...,...,...,...
4979,1213,1187,153,427,1047,853
4986,1214,846,33,1096,546,638
4993,1215,138,495,758,337,540
4997,1216,816,29,1194,982,768


# Analyzing how close the similar users have rated in the similar_users_df:

In [7]:
result = ratings[ratings['userId']==341]
print(result)

      userId  movieId  rating                   original_title
1345     341        5     3.0                       Four Rooms
1346     341       64     3.0                   Hable con ella
1347     341       79     4.0                               ??
1348     341      260     3.0                     The 39 Steps
1349     341      339     5.0                   Night on Earth
1350     341      605     4.0           The Matrix Revolutions
1351     341      648     4.0              La belle et la b?te
1352     341      762     2.0  Monty Python and the Holy Grail
1353     341      500     3.0                   Reservoir Dogs
1354     341     1092     4.0                    The Third Man


In [8]:
result = ratings[ratings['userId']==2]
print(result)

    userId  movieId  rating                              original_title
11       2        5     3.0                                  Four Rooms
12       2       25     3.0                                     Jarhead
13       2       58     3.0  Pirates of the Caribbean: Dead Man's Chest
14       2       64     4.0                              Hable con ella
15       2       79     4.0                                          ??
16       2      141     3.0                                Donnie Darko
17       2      260     4.0                                The 39 Steps
18       2      339     5.0                              Night on Earth
19       2      377     4.0                   A Nightmare on Elm Street
20       2      605     4.0                      The Matrix Revolutions
21       2      628     4.0                  Interview with the Vampire
22       2      648     4.0                         La belle et la b?te
23       2      762     3.0             Monty Python and the Hol

In [9]:
result = ratings[ratings['userId']==191]
print(result)

     userId  movieId  rating              original_title
728     191      110     5.0      Trois couleurs : Rouge
729     191     1246     4.0                Rocky Balboa
730     191        5     5.0                  Four Rooms
731     191       25     1.0                     Jarhead
732     191       64     3.0              Hable con ella
733     191      141     4.0                Donnie Darko
734     191      260     5.0                The 39 Steps
735     191      339     5.0              Night on Earth
736     191      377     4.0   A Nightmare on Elm Street
737     191      605     3.0      The Matrix Revolutions
738     191      628     4.0  Interview with the Vampire
739     191      648     3.0         La belle et la b?te
740     191      780     4.0  La passion de Jeanne d'Arc
741     191      786     3.0               Almost Famous
742     191      788     3.0              Mrs. Doubtfire
743     191      480     4.0             Monsoon Wedding
744     191      500     4.0   

In [10]:
file_path = r'C:\Users\KIIT\Desktop\MINOR_PROJECT\Datasets\similarity_without_considering_reviews.csv'
similar_users_df.to_csv(file_path,index=False)

In [11]:
similar_users_df[similar_users_df['userId']==991]

Unnamed: 0,userId,similar_user_1,similar_user_2,similar_user_3,similar_user_4,similar_user_5
3974,991,338,979,118,1158,646


In [None]:
cosine_similarity_df