In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse.linalg import svds

In [2]:
#Importing user-rating data from the movielens dataset https://grouplens.org/datasets/movielens/
ratings = pd.read_csv('ml-latest-small/ratings.csv')
movies = pd.read_csv('ml-latest-small/movies.csv')
ratings = pd.merge(movies,ratings).drop(['genres', 'timestamp'], axis=1)
ratings.head()

Unnamed: 0,movieId,title,userId,rating
0,1,Toy Story (1995),1,4.0
1,1,Toy Story (1995),5,4.0
2,1,Toy Story (1995),7,4.5
3,1,Toy Story (1995),15,2.5
4,1,Toy Story (1995),17,4.5


In [3]:
# Creating a matrix where the columns are movies and the rows are users, with ratings as each value
# Removing users with less than 10 ratings

user_ratings = ratings.pivot_table(index=['userId'], columns=['title'], values='rating').dropna(thresh=10, axis=1)

In [4]:
# Normalizing data algorithm -> This is based off research on decoupling models.
# For the full article visit this link: http://sifaka.cs.uiuc.edu/czhai/pub/cikm03-cf.ps
# This method for normalization takes into account the frequency that a user rates a movie a certain number.
# This allows for users that are generous with their rating and users that tend to give lower ratings to be compared.

for index, row in user_ratings.iterrows():
    ratingLessThan = pd.DataFrame()
    valueCounts = row.value_counts().sort_index()[1:]
    dictionary = {}
    total = 0
    actualTotal = sum(valueCounts)
    ratingEqualTo = valueCounts/actualTotal
    
    for j, value in valueCounts.items():
        total += value
        dictionary[j] = total/actualTotal
    
    
    ratingLessThan = ratingLessThan.append(dictionary, ignore_index=True)
    ratingFinal = ratingLessThan-ratingEqualTo/2
    
    #iterate through every row and replace rating with normalized
    for k, rating in row.items():
        if rating in ratingFinal:
            user_ratings.loc[[index], [k]] = ratingFinal[rating][0]

In [29]:
#Replacing NAN values with 0
user_ratings = user_ratings.fillna(0)
user_ratings

title,"'burbs, The (1989)",(500) Days of Summer (2009),10 Cloverfield Lane (2016),10 Things I Hate About You (1999),"10,000 BC (2008)",101 Dalmatians (1996),101 Dalmatians (One Hundred and One Dalmatians) (1961),12 Angry Men (1957),12 Years a Slave (2013),127 Hours (2010),...,Zack and Miri Make a Porno (2008),Zero Dark Thirty (2012),Zero Effect (1998),Zodiac (2007),Zombieland (2009),Zoolander (2001),Zootopia (2016),eXistenZ (1999),xXx (2002),¡Three Amigos! (1986)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,...,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.293839
2,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,...,0.000000,0.0,0.0,0.000000,0.115385,0.000000,0.000000,0.000000,0.000000,0.000000
3,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,...,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.843575,0.0,0.000000,...,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
5,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,...,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,...,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
607,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,...,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
608,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,...,0.000000,0.0,0.0,0.000000,0.000000,0.378472,0.000000,0.907639,0.565278,0.000000
609,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,...,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [30]:
# Matrix Factorization (SVD)

U, sigma, Vt = svds(user_ratings, k = 100)
sigma = np.diag(sigma)
predicted_ratings = np.dot(np.dot(U, sigma), Vt)

In [31]:
# Creating similiarty matrix between movie items

item_sim = cosine_similarity(predicted_ratings.T)
item_sim = pd.DataFrame(item_sim, index=user_ratings.columns, columns=user_ratings.columns)

In [45]:
# Method for recommending movies
def get_recommendation(movie_name, user_rating):
    score = (item_sim[movie_name]*(user_rating - 2.5)).sort_values(ascending=False)
    return score

In [52]:
# Testing Program
user1 = [("Thor: Ragnarok (2017)", 5)]

similar_movies = pd.DataFrame()
for movie, rating in user1:
    similar_movies = similar_movies.append(get_recommendation(movie, rating), ignore_index=True)
    
similar_movies.sum().sort_values(ascending=False)

Thor: Ragnarok (2017)                                    2.500000
Untitled Spider-Man Reboot (2017)                        2.391000
Avengers: Infinity War - Part I (2018)                   2.256643
Guardians of the Galaxy 2 (2017)                         2.235251
Logan (2017)                                             2.182665
                                                           ...   
Alexander (2004)                                        -0.087294
Night at the Museum: Battle of the Smithsonian (2009)   -0.093623
Fantasia 2000 (1999)                                    -0.102011
Inside Job (2010)                                       -0.114328
Evita (1996)                                            -0.145845
Length: 2269, dtype: float64