Library imports

In [2]:
# Import Libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy.sparse import csr_matrix
from mpl_toolkits.axes_grid1 import make_axes_locatable
from sklearn.cluster import KMeans
from sklearn.metrics import mean_squared_error
import itertools
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.impute import SimpleImputer
from sklearn.cluster import KMeans
from scipy.sparse import csr_matrix
from scipy.spatial.distance import cosine
%matplotlib inline

Dataset imports

In [3]:
# Import the Movies dataset
movies = pd.read_csv('data/movie.csv')
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
#Import the Ratings dataset
ratings = pd.read_csv('data/rating.csv', usecols=['userId', 'movieId', 'rating'])
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,2,3.5
1,1,29,3.5
2,1,32,3.5
3,1,47,3.5
4,1,50,3.5


In [5]:
# Load the dataset (1/10 of the data due to huge size)
df = pd.read_csv('data/rating.csv', nrows=int(0.1 * sum(1 for line in open('data/rating.csv'))))

# Create a pivot table of users and their ratings for movies
pivot_table = df.pivot_table(index='movieId', columns='userId', values='rating').fillna(0)
pivot_table.head()

userId,1,2,3,4,5,6,7,8,9,10,...,13559,13560,13561,13562,13563,13564,13565,13566,13567,13568
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,4.0,0.0,0.0,5.0,0.0,4.0,0.0,4.0,...,4.0,0.0,2.0,4.0,0.0,5.0,4.5,0.0,0.0,0.0
2,3.5,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0
3,0.0,4.0,0.0,0.0,0.0,3.0,3.0,5.0,0.0,0.0,...,2.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:
# Function to normalize ratings
def normalize_ratings(pivot_table):
    return pivot_table / 5

In [29]:
pivot_table = normalize_ratings(pivot_table)
pivot_table.head()

userId,1,2,3,4,5,6,7,8,9,10,...,13559,13560,13561,13562,13563,13564,13565,13566,13567,13568
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.8,0.0,0.0,1.0,0.0,0.8,0.0,0.8,...,0.8,0.0,0.4,0.8,0.0,1.0,0.9,0.0,0.0,0.0
2,0.7,0.0,0.0,0.0,0.6,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.6,0.0,0.0,0.0
3,0.0,0.8,0.0,0.0,0.0,0.6,0.6,1.0,0.0,0.0,...,0.4,0.0,0.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [30]:
pivot_table.shape

(16715, 13568)

Utilities

In [31]:
def get_movie_info(movieId):
    movie_info = movies[movies['movieId'] == movieId]
    if len(movie_info) > 0:
        movie_info = movie_info.iloc[0]
        return f"Movie ID: {movie_info['movieId']}, Title: {movie_info['title']}"
    else:
        return "Movie not found"
    
# Function to find movies with the highest rating rated by a given userId
def get_top_rated_movies_for_user(userId):
    user_ratings = ratings[ratings['userId'] == userId]
    max_rating = user_ratings['rating'].max()
    top_movies = user_ratings[user_ratings['rating'] == max_rating]['movieId'].tolist()
    return top_movies, max_rating

KNN Algorithm

In [35]:
def cosine_similarity(u, v):
    return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))

sims = []
# Function to find k-nearest neighbors using cosine similarity
def find_nearest_neighbors(pivot_table, target_point, k=1):
    movieId = target_point[0]
    userId = target_point[1]
    similarities = []
    # Finding vector with given movieId that contains all users' ratings
    target_vec = pivot_table.loc[movieId].values
    
    for index, row in pivot_table.iterrows():
        if index == movieId:
            continue
        if pivot_table.loc[index, row.index.isin([userId])].any() != 0:
            continue
        # For given movie vector with all users' ratings we are looking for
        # the similar vector (based on other users' ratings) using cosine similarity
        point_vec = row.values
        similarity = cosine_similarity(target_vec, point_vec)
        print(target_vec)
        print(point_vec)
        sims.append(similarity)
        similarities.append((index, similarity))

    similarities.sort(key=lambda x: x[1], reverse=True)  # Sort similarities in descending order
    neighbors = similarities[:k]  # Get top k neighbors
    return neighbors


Sample usage

In [36]:
k = 3
userId = 55
top_movies, max_rating = get_top_rated_movies_for_user(userId)
print("Maximum rating for user", userId, ":", max_rating)
print("Movies with the maximum rating:", top_movies)

for movieId in top_movies:
    nearest_neighbors = find_nearest_neighbors(pivot_table, (movieId, userId), k)
    movie_info = get_movie_info(movieId)
    
    print(f"\nFor movie {movie_info}, the {k} nearest neighbors are:")
    for neighbor in nearest_neighbors:
        neighbor_info = get_movie_info(neighbor[0])
        print(f"- {neighbor_info} with similarity {neighbor[1]:.2f}")
        print("____________________________________________________________________")

Maximum rating for user 55 : 5.0
Movies with the maximum rating: [260, 858, 1196, 1198, 1221, 1287, 1387, 1721, 1954, 2028, 2947]
[0.8 1.  1.  ... 0.9 0.  0. ]
[0.  0.  0.8 ... 0.  0.  0. ]
[0.8 1.  1.  ... 0.9 0.  0. ]
[0.7 0.  0.  ... 0.  0.  0. ]
[0.8 1.  1.  ... 0.9 0.  0. ]
[0.  0.8 0.  ... 0.  0.  0. ]
[0.8 1.  1.  ... 0.9 0.  0. ]
[0. 0. 0. ... 0. 0. 0.]
[0.8 1.  1.  ... 0.9 0.  0. ]
[0. 0. 0. ... 0. 0. 0.]
[0.8 1.  1.  ... 0.9 0.  0. ]
[0. 0. 0. ... 0. 0. 0.]
[0.8 1.  1.  ... 0.9 0.  0. ]
[0. 0. 0. ... 0. 0. 0.]
[0.8 1.  1.  ... 0.9 0.  0. ]
[0. 0. 0. ... 0. 0. 0.]
[0.8 1.  1.  ... 0.9 0.  0. ]
[0. 0. 0. ... 0. 0. 0.]
[0.8 1.  1.  ... 0.9 0.  0. ]
[0.  0.  0.  ... 0.9 0.  0. ]
[0.8 1.  1.  ... 0.9 0.  0. ]
[0. 0. 0. ... 0. 0. 0.]
[0.8 1.  1.  ... 0.9 0.  0. ]
[0. 0. 0. ... 0. 0. 0.]
[0.8 1.  1.  ... 0.9 0.  0. ]
[0. 0. 0. ... 0. 0. 0.]
[0.8 1.  1.  ... 0.9 0.  0. ]
[0. 0. 0. ... 0. 0. 0.]
[0.8 1.  1.  ... 0.9 0.  0. ]
[0. 0. 0. ... 0. 0. 0.]
[0.8 1.  1.  ... 0.9 0.  0. ]
[0. 0.

KeyboardInterrupt: 