## Movie Recommendations
## Purnendu Ghosh
## MDS201811

In [1]:
import numpy as np
import pandas as pd
from math import sqrt
df = pd.read_csv("http://files.grouplens.org/datasets/movielens/ml-100k/u.data", delimiter="\t", header = None)
df.columns = ['user_id','item_id','rating','timestamp']
del df['timestamp']
df.head(7)

Unnamed: 0,user_id,item_id,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1
5,298,474,4
6,115,265,2


## Creates a dictionary

In [2]:
critics= {}
for index,row in df.iterrows():
    try:
        critics[row['user_id']][row['item_id']]=row['rating']
    except:
        critics[row['user_id']]={row['item_id']:row['rating']}

## similarity between two users as measured through distance

In [3]:
def sim_distance(critics, person1, person2):
    sum_of_squares = 0
    common_count = 0
    for movie in critics[person1]:
        if movie in critics[person2]:
            sum_of_squares = sum_of_squares + pow(critics[person1][movie] - critics[person2][movie], 2)
            common_count = common_count + 1

    if common_count == 0:
        return 0
    else:
        return 1/(1+sum_of_squares)

## Function for measuring similarity between two users using jaccard distance

In [4]:
def sim_jaccard(critics, p1, p2):
    numerator = 0
    difference = 0
    for movie in critics[p1]:
        if movie in critics[p2]:
            numerator = numerator + 1
        else:
            difference = difference + 1
    denominator = len(critics[p1].keys()) + difference        
    return numerator/denominator 


## Measuring similarity between two persons using pearson correlation coefficient

In [5]:
def sim_pearson(critics, p1, p2):
    sum1 = sum(critics[p1].values())
    sum2 = sum(critics[p2].values())
    # Get the list of mutually rated items
    si={}
    for item in critics[p1]:
        if item in critics[p2]: si[item]=1
    # Find the number of elements
    n=len(si)
    # if they are no ratings in common, return 0
    if n==0: return 0
    # Add up all the preferences
    sum1=sum([critics[p1][it] for it in si])
    sum2=sum([critics[p2][it] for it in si])
    # Sum up the squares
    sum1Sq=sum([pow(critics[p1][it],2) for it in si])
    sum2Sq=sum([pow(critics[p2][it],2) for it in si])
    # Sum up the products
    pSum=sum([critics[p1][it]*critics[p2][it] for it in si])
    # Calculate Pearson score
    num=pSum-(sum1*sum2/n)
    den=sqrt((sum1Sq-pow(sum1,2)/n)*(sum2Sq-pow(sum2,2)/n))
    if den==0: return 0
    r=num/den
    return r


## Returns the best matches for a user from the dictionary

In [6]:
def topMatches(critics, person, n=5, similarity=sim_distance):
     scores = [(similarity(critics, person, other), other) for other in critics]

     scores.sort(reverse=True)
     return scores[:n]

## recommendations for a person from critics dictionary

In [7]:
def getRecommendations(critics,person,similarity=sim_jaccard):
    totals={}
    simSums={}
    for other in critics:
        # don't compare me to myself
        if other==person: continue
        sim=similarity(critics,person,other)
        # ignore scores of zero or lower
        if sim<=0: continue
        for item in critics[other]:
            # only score movies I haven't seen yet
            if item not in critics[person] or critics[person][item]==0:
                # Similarity * Score
                totals.setdefault(item,0)
                totals[item]+=critics[other][item]*sim
                # Sum of similarities
                simSums.setdefault(item,0)
                simSums[item]+=sim
    # Create the normalized list
    rankings=[(total/simSums[item],item) for item,total in totals.items( )]
    # Return the sorted list
    rankings.sort( )
    rankings.reverse( )
    return rankings


In [8]:
getRecommendations(critics,186,sim_distance)

[(5.000000000000001, 1500),
 (5.0, 1653),
 (5.0, 1599),
 (5.0, 1536),
 (5.0, 1467),
 (5.0, 1201),
 (5.0, 1189),
 (5.0, 1122),
 (5.0, 814),
 (4.999999999999999, 1293),
 (4.882988261382381, 1558),
 (4.869281045751634, 1642),
 (4.841269841269841, 1191),
 (4.827922804127619, 1367),
 (4.815899581589958, 1306),
 (4.773959967114827, 1388),
 (4.723270440251572, 1450),
 (4.698764337276046, 1524),
 (4.691744415913542, 119),
 (4.6871165644171775, 1398),
 (4.652868314267706, 1449),
 (4.635284913368677, 850),
 (4.609700991267887, 1628),
 (4.576395407448048, 408),
 (4.557496073972955, 513),
 (4.5437255171884265, 1194),
 (4.512031689726405, 923),
 (4.507434944237918, 1537),
 (4.505070612299916, 604),
 (4.49738137651202, 589),
 (4.4959633602634765, 253),
 (4.4935622317596575, 1125),
 (4.488065870692279, 114),
 (4.474242277629629, 1375),
 (4.473225786193122, 656),
 (4.458103182783963, 169),
 (4.456649713321133, 170),
 (4.455539413866815, 483),
 (4.4462057083599404, 251),
 (4.435934602041148, 707),
 (4.

In [9]:
topMatches(critics, 186, n=10, similarity=sim_distance)

[(1.0, 861),
 (1.0, 469),
 (1.0, 186),
 (1.0, 172),
 (0.5, 912),
 (0.5, 909),
 (0.5, 765),
 (0.5, 736),
 (0.5, 692),
 (0.5, 321)]