Import libraries

In [90]:
# Import Libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy.sparse import csr_matrix
from mpl_toolkits.axes_grid1 import make_axes_locatable
from sklearn.cluster import KMeans
from sklearn.metrics import mean_squared_error
import itertools
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.impute import SimpleImputer
from sklearn.cluster import KMeans
from scipy.sparse import csr_matrix
from scipy.spatial.distance import cosine
%matplotlib inline


Importing datasets

In [91]:
# Import the Movies dataset
movies = pd.read_csv('data/movie.csv')
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [92]:
#Import the Ratings dataset
ratings = pd.read_csv('data/rating.csv', usecols=['userId', 'movieId', 'rating'])
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,2,3.5
1,1,29,3.5
2,1,32,3.5
3,1,47,3.5
4,1,50,3.5


In [93]:
# Calculate the length of each smaller array
number_of_smaller_arrays = 10
array_length = len(ratings)
small_array_length = array_length // number_of_smaller_arrays  # Floor division

# Create smaller arrays
ratings_arrays = []
movies_users = []
for i in range(number_of_smaller_arrays):
    start_index = i * small_array_length
    end_index = (i + 1) * small_array_length
    ratings_arrays.append(ratings[start_index:end_index])

In [94]:
df = pd.read_csv('data/rating.csv', nrows=int(0.1 * sum(1 for line in open('data/rating.csv'))))
ratings_arrays[0] = df

In [95]:
movies_users.append(ratings_arrays[0].pivot(index='movieId', columns='userId', values='rating').fillna(0))
movies_users[0].head()

userId,1,2,3,4,5,6,7,8,9,10,...,13559,13560,13561,13562,13563,13564,13565,13566,13567,13568
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,4.0,0.0,0.0,5.0,0.0,4.0,0.0,4.0,...,4.0,0.0,2.0,4.0,0.0,5.0,4.5,0.0,0.0,0.0
2,3.5,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0
3,0.0,4.0,0.0,0.0,0.0,3.0,3.0,5.0,0.0,0.0,...,2.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [96]:
# Create sparse matrix
mat_movies = csr_matrix(movies_users[0].values)

In [97]:
from sklearn.neighbors import NearestNeighbors
model = NearestNeighbors(metric = 'cosine', algorithm = 'brute', n_neighbors = 20)
model.fit(mat_movies)

Importing library fuzzywuzzy for searching improvement

In [98]:
from fuzzywuzzy import process

In [99]:
def recommender(movie_name, data, n):
    idx = process.extractOne(movie_name, movies['title'])[2]
    print('Movie selected: ', movies['title'][idx], 'Index: ', idx)
    print('Searching for recommendation ...')
    distance, indices = model.kneighbors(data[idx], n_neighbors=n+1)
    for i in indices[0][1:]:
        print(movies['title'][i])

In [100]:
recommender('Lord of the Rings: The Fellowship of the Ring, The (2001)', mat_movies, 10)

Movie selected:  Lord of the Rings: The Fellowship of the Ring, The (2001) Index:  4897
Searching for recommendation ...
Ice Station Zebra (1968)
Hedgehog, The (Le hérisson) (2009)
Cooler, The (2003)
Ash Wednesday (2002)
Grapes of Death, The (Raisins de la mort, Les) (1978)
Tyler Perry's Meet the Browns (2008)
Gamera vs. Jiger (1970)
Bank Job, The (2008)
Fatty Drives the Bus (1999)
Katzelmacher (1969)


Recommend movies for a given user

In [101]:
def best_rated_movie_for_user(user_id, ratings_df, movies_df):
    
    # Merge ratings with movie titles
    movie_data = pd.merge(ratings_df, movies_df, on='movieId')
    
    # Filter ratings for the given user
    user_ratings = movie_data[movie_data['userId'] == user_id]
    
    # Find the movie with the highest rating
    best_movie = user_ratings.loc[user_ratings['rating'].idxmax()]
    
    return best_movie

In [102]:
best_movie_for_user = best_rated_movie_for_user(1, ratings, movies)
print(best_movie_for_user)
recommender(best_movie_for_user['title'], mat_movies, 10)

userId                                                     1
movieId                                                 4993
rating                                                   5.0
title      Lord of the Rings: The Fellowship of the Ring,...
genres                                     Adventure|Fantasy
Name: 131, dtype: object
Movie selected:  Lord of the Rings: The Fellowship of the Ring, The (2001) Index:  4897
Searching for recommendation ...
Ice Station Zebra (1968)
Hedgehog, The (Le hérisson) (2009)
Cooler, The (2003)
Ash Wednesday (2002)
Grapes of Death, The (Raisins de la mort, Les) (1978)
Tyler Perry's Meet the Browns (2008)
Gamera vs. Jiger (1970)
Bank Job, The (2008)
Fatty Drives the Bus (1999)
Katzelmacher (1969)


In [103]:
class KNNRecommender:
    def __init__(self, k=20):
        self.k = k
    
    def fit(self, data):
        self.data = data
    
    def recommend(self, movie_index, movies, n):
        print('Movie selected:', movies['title'].iloc[movie_index], 'Index:', movie_index)
        print('Searching for recommendation ...')
        
        # Calculate distances
        distances = []
        for i, row in enumerate(self.data):
            print(self.data[movie_index])
            dist = cosine(self.data[movie_index], row)
            distances.append((i, dist))
        
        # Sort distances
        distances.sort(key=lambda x: x[1])
        
        # Get top n recommendations
        recommendations = []
        for i in range(1, n + 1):
            movie_index = distances[i][0]
            recommendations.append(movies['title'].iloc[movie_index])
        
        return recommendations

In [104]:
best_movie_for_user = best_rated_movie_for_user(1, ratings, movies)
print(best_movie_for_user)

userId                                                     1
movieId                                                 4993
rating                                                   5.0
title      Lord of the Rings: The Fellowship of the Ring,...
genres                                     Adventure|Fantasy
Name: 131, dtype: object


In [105]:
movies_merged = movies.merge(ratings, left_on='movieId', right_on='movieId', how='left')
movies_merged.head()

Unnamed: 0,movieId,title,genres,userId,rating
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.0,4.0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,6.0,5.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,8.0,4.0
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,10.0,4.0
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,11.0,4.5


In [106]:
# Splitting genres and stacking them into a Series
genres_series = movies_merged['genres'].str.split('|', expand=True).stack()

# Getting unique genres
genreList = genres_series.unique()

print(genreList)

['Adventure' 'Animation' 'Children' 'Comedy' 'Fantasy' 'Romance' 'Drama'
 'Action' 'Crime' 'Thriller' 'Horror' 'Mystery' 'Sci-Fi' 'IMAX'
 'Documentary' 'War' 'Musical' 'Western' 'Film-Noir' '(no genres listed)']


In [107]:
# Create binary genre list
def binary(genre_list):
    binaryList = []
    
    for genre in genreList:
        if genre in genre_list:
            binaryList.append(1)
        else:
            binaryList.append(0)
    
    return binaryList

In [108]:
# Add new column with binary genres list
movies_merged['genres_bin'] = movies_merged['genres'].apply(lambda x: binary(x))

In [109]:
movies_merged['genres_bin'].tail()

20000792    [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
20000793    [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
20000794    [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
20000795    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
20000796    [1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...
Name: genres_bin, dtype: object

In [110]:
from scipy import spatial

def similarity(movie_rating_df, movieId1, movieId2):
    a = movie_rating_df.iloc[movieId1]
    b = movie_rating_df.iloc[movieId2]

    # ratingA = a['rating']
    # ratingB = b['rating']

    # ratingDistance = spatial.distance.cosine(ratingA, ratingB)

    genreA = a['genres_bin']
    genreB = b['genres_bin']
    genreDistance = spatial.distance.cosine(genreA, genreB)

    return genreDistance

In [111]:
# movies.head()
# dist = similarity(1, 2)
# print(dist)

abdist = similarity(movies_merged, 1, 2000000)
print(abdist)

0.7763932022500211


In [112]:
# Load the dataset
df = pd.read_csv('data/rating.csv', nrows=int(0.1 * sum(1 for line in open('data/rating.csv'))))

# Create a pivot table of users and their ratings for movies
pivot_table = df.pivot_table(index='movieId', columns='userId', values='rating').fillna(0)

In [113]:
pivot_table.head()

userId,1,2,3,4,5,6,7,8,9,10,...,13559,13560,13561,13562,13563,13564,13565,13566,13567,13568
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,4.0,0.0,0.0,5.0,0.0,4.0,0.0,4.0,...,4.0,0.0,2.0,4.0,0.0,5.0,4.5,0.0,0.0,0.0
2,3.5,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0
3,0.0,4.0,0.0,0.0,0.0,3.0,3.0,5.0,0.0,0.0,...,2.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [114]:
pivot_table.shape

(16715, 13568)

In [115]:
# Calculate cosine similarity between users
def cosine_similarity(u, v):
    return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))

# Function to find k nearest neighbors
def find_neighbors(movieId, k, pivot_table):
    similarities = []
    for idx, row in pivot_table.iterrows():
        if idx != movieId:
            similarity = cosine_similarity(pivot_table.loc[movieId], row)
            similarities.append((idx, similarity))
    similarities.sort(key=lambda x: x[1], reverse=True)
    return similarities[:k]

# Function to recommend movies
def recommend_movies(userId, k, pivot_table):
    top_user_movie_id = best_rated_movie_for_user(userId, ratings, movies)['movieId']
    print(top_user_movie_id)
    neighbors = find_neighbors(top_user_movie_id, k, pivot_table)
    print(neighbors)
    recommended_movies = []
    for neighbor, similarity in neighbors:
        movieId = df.loc[neighbor, 'movieId']
        rated_movie = {'movieId': movieId, 'title': movies[movies['movieId'] == movieId]['title'], 'similarity':  similarity}
        recommended_movies.append(rated_movie)
    return recommended_movies

In [116]:
userId = 1
recommended_movies = recommend_movies(userId, 10, pivot_table)
for movie in recommended_movies:
    print(movie)

4993
[(5952, 0.8462670846739063), (7153, 0.8214076840708104), (4306, 0.6680991947076371), (2571, 0.657668556891434), (6539, 0.6484930050964552), (5349, 0.6377449220325212), (2959, 0.6078656241206561), (5445, 0.6063999617245383), (4226, 0.594872772153359), (4963, 0.5929649312238658)]
{'movieId': 2944, 'title': 2858    Dirty Dozen, The (1967)
Name: title, dtype: object, 'similarity': 0.8462670846739063}
{'movieId': 2424, 'title': 2339    You've Got Mail (1998)
Name: title, dtype: object, 'similarity': 0.8214076840708104}
{'movieId': 6, 'title': 5    Heat (1995)
Name: title, dtype: object, 'similarity': 0.6680991947076371}
{'movieId': 231, 'title': 228    Dumb & Dumber (Dumb and Dumber) (1994)
Name: title, dtype: object, 'similarity': 0.657668556891434}
{'movieId': 1242, 'title': 1214    Glory (1989)
Name: title, dtype: object, 'similarity': 0.6484930050964552}
{'movieId': 1719, 'title': 1659    Sweet Hereafter, The (1997)
Name: title, dtype: object, 'similarity': 0.6377449220325212}
{'mo

In [117]:
from sklearn.neighbors import NearestNeighbors
model = NearestNeighbors(metric = 'cosine', algorithm = 'brute', n_neighbors = 20)
model.fit(pivot_table)
recommender('Lord of the Rings: The Fellowship of the Ring, The (2001)', pivot_table, 10)

Movie selected:  Lord of the Rings: The Fellowship of the Ring, The (2001) Index:  4897
Searching for recommendation ...


ValueError: Expected a 2-dimensional container but got <class 'pandas.core.series.Series'> instead. Pass a DataFrame containing a single row (i.e. single sample) or a single column (i.e. single feature) instead.

In [118]:
movieId = df.loc[5952, 'movieId']
movies[movies['movieId'] == movieId]['title']

2858    Dirty Dozen, The (1967)
Name: title, dtype: object

In [119]:
# Utilities
def get_movie_info(movieId):
    movie_info = movies[movies['movieId'] == movieId]
    if len(movie_info) > 0:
        movie_info = movie_info.iloc[0]
        return f"Movie ID: {movie_info['movieId']}, Title: {movie_info['title']}"
    else:
        return "Movie not found"
    
# Function to find movies with the highest rating rated by a given userId
def get_top_rated_movies_for_user(userId):
    user_ratings = ratings[ratings['userId'] == userId]
    max_rating = user_ratings['rating'].max()
    top_movies = user_ratings[user_ratings['rating'] == max_rating]['movieId'].tolist()
    return top_movies, max_rating



In [149]:
def cosine_sim(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

sims = []
# Function to find k-nearest neighbors using cosine similarity
def find_nearest_neighbors(pivot_table, target_point, k=1):
    movieId = target_point[0]
    userId = target_point[1]
    similarities = []
    target_vec = pivot_table.loc[target_point[0]].values  # Reshape to ensure 2D array
    
    for index, row in pivot_table.iterrows():
        if index == movieId:
            continue
        if pivot_table.loc[index, row.index.isin([userId])].any() != 0:
            continue
        point_vec = row.values  # Reshape to ensure 2D array
        similarity = cosine_sim(target_vec, point_vec)
        sims.append(similarity)
        similarities.append((index, similarity))

    similarities.sort(key=lambda x: x[1], reverse=True)  # Sort similarities in descending order
    neighbors = similarities[:k]  # Get top k neighbors
    return neighbors


In [150]:
userId = 1
movieId = 4993
target_point = (movieId, userId)  # Example movieId and userId
nearest_neighbors = find_nearest_neighbors(pivot_table, target_point)
print("Top 1 Nearest Neighbors:")
print(nearest_neighbors)

Top 1 Nearest Neighbors:
[(2571, 0.657668556891434)]


In [None]:
for i in nearest_neighbors:
    print(get_movie_info(i[0]))

Movie ID: 2571, Title: Matrix, The (1999)
Movie ID: 5349, Title: Spider-Man (2002)
Movie ID: 5445, Title: Minority Report (2002)
Movie ID: 4963, Title: Ocean's Eleven (2001)
Movie ID: 3793, Title: X-Men (2000)
Movie ID: 3578, Title: Gladiator (2000)
Movie ID: 6377, Title: Finding Nemo (2003)
Movie ID: 4886, Title: Monsters, Inc. (2001)
Movie ID: 6874, Title: Kill Bill: Vol. 1 (2003)
Movie ID: 5418, Title: Bourne Identity, The (2002)


In [None]:
userId = 1
top_movies, max_rating = get_top_rated_movies_for_user(userId)
print("Maximum rating for user", userId, ":", max_rating)
print("Movies with the maximum rating:", top_movies)

for movieId in top_movies:
    nearest_neighbors = find_nearest_neighbors(pivot_table, (movieId, userId), k)
    movie_info = get_movie_info(movieId)
    
    print(f"\nFor movie {movie_info}, the {k} nearest neighbors are:")
    for neighbor in nearest_neighbors:
        neighbor_info = get_movie_info(neighbor[0])
        print(f"- {neighbor_info} with similarity {neighbor[1]:.2f}")
        print("_____________________________________________________")

Maximum rating for user 1 : 5.0
Movies with the maximum rating: [4993, 5952, 7153, 8507]

For movie Movie ID: 4993, Title: Lord of the Rings: The Fellowship of the Ring, The (2001), the 1 nearest neighbors are:
- Movie ID: 2571, Title: Matrix, The (1999) with similarity 0.66
_____________________________________________________

For movie Movie ID: 5952, Title: Lord of the Rings: The Two Towers, The (2002), the 1 nearest neighbors are:
- Movie ID: 2571, Title: Matrix, The (1999) with similarity 0.63
_____________________________________________________

For movie Movie ID: 7153, Title: Lord of the Rings: The Return of the King, The (2003), the 1 nearest neighbors are:
- Movie ID: 2571, Title: Matrix, The (1999) with similarity 0.62
_____________________________________________________

For movie Movie ID: 8507, Title: Freaks (1932), the 1 nearest neighbors are:
- Movie ID: 6987, Title: Cabinet of Dr. Caligari, The (Cabinet des Dr. Caligari., Das) (1920) with similarity 0.39
___________

In [None]:
recommender('Matrix, The (1999)', mat_movies, 10)

Movie selected:  Lord of the Rings: The Fellowship of the Ring, The (2001) Index:  4897
Searching for recommendation ...
Ice Station Zebra (1968)
Hedgehog, The (Le hérisson) (2009)
Cooler, The (2003)
Ash Wednesday (2002)
Grapes of Death, The (Raisins de la mort, Les) (1978)
Tyler Perry's Meet the Browns (2008)
Gamera vs. Jiger (1970)
Bank Job, The (2008)
Fatty Drives the Bus (1999)
Katzelmacher (1969)
