# Predicting the rating of a movie from the 5 movies that are "nearest" to it in terms of genre and popularity (number of ratings)
## Loading the MovieLens 100k dataset into a pandas DataFrame

In [6]:
import pandas as pd

# The ratings file is tab-separated and has no header row, so the column
# names are supplied here; only the first three columns are loaded.
cols_names = ['user_id', 'movie_id', 'rating']
ratings = pd.read_csv(
    'ml-100k/u.data',
    sep='\t',
    names=cols_names,
    usecols=[0, 1, 2],
)
ratings.head()

Unnamed: 0,user_id,movie_id,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1


### Now group by movie ID and calculate the number of ratings (`size`) and the average rating (`mean`) for each movie.

In [10]:
import numpy as np

# One row per movie with two aggregates of its ratings: how many there are
# ('size') and their average ('mean'). The aggregations are named by string
# rather than passed as numpy callables: recent pandas versions emit a
# FutureWarning for np.size / np.mean here, and the string forms produce the
# same ('rating', 'size') and ('rating', 'mean') columns.
moviedata = ratings.groupby('movie_id').agg({'rating': ['size', 'mean']})
moviedata.head()

Unnamed: 0_level_0,rating,rating
Unnamed: 0_level_1,size,mean
movie_id,Unnamed: 1_level_2,Unnamed: 2_level_2
1,452,3.878319
2,131,3.206107
3,90,3.033333
4,209,3.550239
5,86,3.302326


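Normalize the number of ratings per movie to the range 0 to 1 (min-max scaling), so that it can be used as a popularity measure: 0 is the least-rated movie and 1 is the most-rated one.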

In [11]:
# Keep only the per-movie rating count, as a one-column DataFrame named 'size'.
movieNumRatings = moviedata['rating']['size'].to_frame()

# Min-max scale the counts: the least-rated movie maps to 0, the most-rated to 1.
lowestCount = movieNumRatings.min()
highestCount = movieNumRatings.max()
movieNormalizedNumRatings = (movieNumRatings - lowestCount) / (highestCount - lowestCount)
movieNormalizedNumRatings.head()

Unnamed: 0_level_0,size
movie_id,Unnamed: 1_level_1
1,0.774914
2,0.223368
3,0.152921
4,0.357388
5,0.146048


In [13]:
# Build a lookup table keyed by movie ID:
#   movieDict[id] = (title, genre flag array, normalized rating count, mean rating)
movieDict = {}
# u.item is pipe-delimited and encoded as ISO-8859-1 (Latin-1); some titles
# contain accented characters, so relying on the platform default encoding
# (UTF-8 on most systems) raises UnicodeDecodeError. State the encoding explicitly.
with open(r'ml-100k/u.item', encoding='ISO-8859-1') as f:
    for line in f:
        fields = line.rstrip('\n').split('|')
        movieID = int(fields[0])
        name = fields[1]
        # Columns from index 5 onwards are the 0/1 genre flags; the slice end
        # may exceed the row length, in which case it takes every remaining flag.
        genres = np.array([int(flag) for flag in fields[5:25]])
        movieDict[movieID] = (
            name,
            genres,
            movieNormalizedNumRatings.loc[movieID].get('size'),
            moviedata.loc[movieID].rating.get('mean'),
        )

In [20]:
# Inspect the entry for movie ID 1: (title, genre flags, normalized rating count, mean rating).
movieDict[1]

('Toy Story (1995)',
 array([0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 0.7749140893470791,
 3.8783185840707963)

In [15]:
from scipy import spatial
import math

def ComputeDistance(a, b):
    """Return the combined genre/popularity distance between two movies.

    Both arguments are indexable records where index 1 holds the genre
    vector and index 2 holds the popularity score (the entries stored in
    ``movieDict`` have this layout).

    The genre component is the cosine distance between the two genre
    vectors, the popularity component is the absolute difference of the two
    popularity scores, and the result is the Euclidean norm of that pair.

    NOTE(review): cosine distance is undefined for an all-zero genre vector;
    this assumes every movie has at least one genre flag set - TODO confirm.
    """
    genre_gap = spatial.distance.cosine(a[1], b[1])
    popularity_gap = abs(a[2] - b[2])
    return math.sqrt(genre_gap ** 2 + popularity_gap ** 2)

In [16]:
# Sanity check: distance between movies 2 and 4 (0 would mean the same genre profile and equal popularity).
ComputeDistance(movieDict[2], movieDict[4])

0.6800043901643321

In [17]:
import operator

def getNeighbors(movieID, K):
    """Return the IDs of the K movies closest to ``movieID``.

    Every other entry of the module-level ``movieDict`` is scored with
    ``ComputeDistance`` against the query movie, and the results are sorted
    by ascending distance (ties keep ``movieDict`` iteration order).

    Parameters
    ----------
    movieID : key of ``movieDict``
        The movie whose neighbours are wanted; it is excluded from the result.
    K : int
        Maximum number of neighbours to return.

    Returns
    -------
    list
        Movie IDs, nearest first. If fewer than K other movies exist, all of
        them are returned instead of raising IndexError; a K of zero or less
        yields an empty list.

    Raises
    ------
    KeyError
        If ``movieID`` is not present in ``movieDict``.
    """
    # Look the query movie up once rather than on every loop iteration.
    target = movieDict[movieID]
    distances = [
        (movie, ComputeDistance(target, candidate))
        for movie, candidate in movieDict.items()
        if movie != movieID
    ]
    distances.sort(key=operator.itemgetter(1))
    # Slicing (unlike indexing) tolerates K larger than the candidate count;
    # max() keeps a negative K from slicing from the end of the list.
    return [movie for movie, _ in distances[:max(K, 0)]]

In [18]:
K = 5

# Header followed by a spacer line (a single space).
print(str(K) + " Nearest Neigbors: ", " ", sep="\n")

# Movie ID 1 is the query: fetch its K closest movies and their mean ratings.
neighbors = getNeighbors(1, K)
neighborRatings = [movieDict[neighbor][3] for neighbor in neighbors]
for neighbor, neighborRating in zip(neighbors, neighborRatings):
    print(movieDict[neighbor][0] + " " + str(neighborRating))

# The predicted rating is the plain average of the neighbours' mean ratings.
sumRatings = sum(neighborRatings)
avgRating = sumRatings / K
print(" ", "Average Rating of the " + str(K) + " Nearest Neigbors: " + str(avgRating), sep="\n")

5 Nearest Neigbors: 
 
Willy Wonka and the Chocolate Factory (1971) 3.6319018404907975
Aladdin (1992) 3.8127853881278537
Liar Liar (1997) 3.156701030927835
Monty Python and the Holy Grail (1974) 4.0664556962025316
Full Monty, The (1997) 3.926984126984127
 
Average Rating of the 5 Nearest Neigbors: 3.7189656165466287


In [19]:
# For comparison with the prediction: the title and actual mean rating of movie 1.
print(" ".join([movieDict[1][0], str(movieDict[1][3])]))

Toy Story (1995) 3.8783185840707963
