In [11]:
# Import everything needed
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import csv
import re
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from pprint import pprint

In [12]:
# Importing dataset needed and prepreprocessing data
dataset = pd.read_csv("./ml-latest-small/movies.csv")
dataset2 = pd.read_csv("./ml-latest-small/ratings.csv")

In [13]:
#Looking into dataset
dataset.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [14]:
#Looking into dataset
dataset2.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [15]:
# Summary statistic of movie.csv
dataset.describe()

Unnamed: 0,movieId
count,9742.0
mean,42200.353623
std,52160.494854
min,1.0
25%,3248.25
50%,7300.0
75%,76232.0
max,193609.0


In [16]:
# Summary statistic of ratings.csv
dataset2.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,100836.0,100836.0,100836.0,100836.0
mean,326.127564,19435.295718,3.501557,1205946000.0
std,182.618491,35530.987199,1.042529,216261000.0
min,1.0,1.0,0.5,828124600.0
25%,177.0,1199.0,3.0,1019124000.0
50%,325.0,2991.0,3.5,1186087000.0
75%,477.0,8122.0,4.0,1435994000.0
max,610.0,193609.0,5.0,1537799000.0


In [17]:
#Check for any null values in dataset
dataset.isnull().sum()

movieId    0
title      0
genres     0
dtype: int64

In [18]:
#Check for any null values in dataset2
dataset2.isnull().sum()

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

In [19]:
#Check for duplicated movie titles
dataset.duplicated(subset=("title")).sum()

5

In [20]:
#Drop duplicate movie titles
dataset = dataset.drop_duplicates(subset="title")
dataset.duplicated(subset=("title")).sum()

0

In [21]:
#Drop genres column from dataset
datasetN = dataset.drop("genres", axis="columns")
datasetN.head()

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


In [22]:
#Drop timestamp column from dataset2
datasetN2 = dataset2.drop("timestamp", axis="columns")
datasetN2.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [23]:
#Merging Dataset 1 and Dataset 2
df = pd.merge(datasetN2, datasetN, how="inner", on="movieId")
df.head()

Unnamed: 0,userId,movieId,rating,title
0,1,1,4.0,Toy Story (1995)
1,5,1,4.0,Toy Story (1995)
2,7,1,4.5,Toy Story (1995)
3,15,1,2.5,Toy Story (1995)
4,17,1,4.5,Toy Story (1995)


In [24]:
#Checking the counts of each rating, to check for small value removals
ratingCountDf = pd.DataFrame(df.groupby(['rating']).size(), columns=['count'])
ratingCountDf

Unnamed: 0_level_0,count
rating,Unnamed: 1_level_1
0.5,1370
1.0,2811
1.5,1791
2.0,7551
2.5,5548
3.0,20047
3.5,13134
4.0,26816
4.5,8551
5.0,13211


In [27]:
#Checking count of movie
moviesCountDf = pd.DataFrame(df.groupby('movieId').size(), columns=['count'])
moviesCountDf.head()

Unnamed: 0_level_0,count
movieId,Unnamed: 1_level_1
1,215
2,110
3,52
4,7
5,49


Personalised Recommender System

In [28]:
#Create pivot table from merged dataset
pivotDf = df.pivot(index='userId', columns='title', values='rating')
pivotDf

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,4.0,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,,,,,,,,,,,...,,,,,,,,,,
607,,,,,,,,,,,...,,,,,,,,,,
608,,,,,,,,,,,...,,,,,,4.5,3.5,,,
609,,,,,,,,,,,...,,,,,,,,,,


In [29]:
#Fill pivot table NaN values with 0
newPivotDf = pivotDf.fillna(0)
newPivotDf.head()

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
#Pivot table into sparse matrix/table
sparsePivotDf = csr_matrix(newPivotDf.values)
sparsePivotDf 

<610x9719 sparse matrix of type '<class 'numpy.float64'>'
	with 100830 stored elements in Compressed Sparse Row format>

In [32]:
#Creating our KNN model and fitting to our sparse atrix
knnModel = NearestNeighbors(metric='cosine', algorithm='brute')
knnModel.fit(sparsePivotDf)

NearestNeighbors(algorithm='brute', metric='cosine')

In [33]:
#function to create list of similar user and the distance
def similarUser(user, recAmount):
    knn = np.asarray([newPivotDf.values[user-1]])
    distance, index = knnModel.kneighbors(knn, n_neighbors=recAmount+1)
    return index.flatten()[1:] + 1, distance.flatten()[1:]

def recommendMovies(n, items_to_ignore=[], topn=10, verbose=False):
  n = min(len(meanRating),n)
  return list(movieListId[np.argsort(meanRating)[::-1][:n]])

In [34]:
userInput = int(input("Input your user Id(0-610): "))
userInput2 = int(input("Input the amount of users you want to recommend against(5-610)"))
similarUserList, distanceList = similarUser(userInput, userInput2)

Input your user Id(0-610): 12
Input the amount of users you want to recommend against(5-610)20


In [35]:
#Weighted list
tempVal = np.sum(distanceList)
weightingList = distanceList/tempVal

# Similar Movies
movieSimilar = newPivotDf.values[similarUserList]
movieListId = newPivotDf.columns

weightingList = weightingList[:,np.newaxis] + np.zeros(len(movieListId))

ratingMatrix = weightingList*movieSimilar
meanRating = ratingMatrix.sum(axis=0)

In [36]:
recAm = int(input("Enter the amount of movies you want recommended: "))
pprint(recommendMovies(recAm))

Enter the amount of movies you want recommended: 7
['Silence of the Lambs, The (1991)',
 'Matrix, The (1999)',
 'American Beauty (1999)',
 'Pulp Fiction (1994)',
 'Forrest Gump (1994)',
 'Terminator 2: Judgment Day (1991)',
 'Fight Club (1999)']


Non-Personalised Recommender System

In [37]:
def createMovieRank(data, N):
    trainDataGrouped = data.groupby(['title']).agg({'userId': 'count'}).reset_index()
    trainDataGrouped.rename(columns = {'userId': 'score'}, inplace=True)
    
    trainDataSort = trainDataGrouped.sort_values(['score', 'title'], ascending = [0,1])
    trainDataSort['Rank'] = trainDataSort['score'].rank(ascending = False, method = 'first')
    
    topNMovies = trainDataSort.head(N)
    
    return topNMovies

def recommend(data):
    topNMovies = data
    cols = topNMovies.columns.tolist()
    cols = cols[-1:] + cols[:-1]
    topNMovies = topNMovies[cols]

    #Nice output
    topMoviesReturn = topNMovies.title.to_string(index=False)

    return print(topMoviesReturn)

In [38]:
nPuserInput = int(input("Input your user Id(0-610): "))
nPrecAm = int(input("How many movies would you like Recommended(3-20): "))
topNMovies = createMovieRank(df, nPrecAm)
recommend(topNMovies)

Input your user Id(0-610): 12
How many movies would you like Recommended(3-20): 7
                      Forrest Gump (1994)
         Shawshank Redemption, The (1994)
                      Pulp Fiction (1994)
         Silence of the Lambs, The (1991)
                       Matrix, The (1999)
Star Wars: Episode IV - A New Hope (1977)
                     Jurassic Park (1993)
