# Importing the necessary Libraries

In [147]:
import pandas as pd
import numpy as np
from math import sqrt

# Function to calculate the pearson correlation between 2 vectors


In [148]:
'''
This function is the core of the calculating pearson correlation between 2 users based on the movies they have rated.
It uses the pearson correlation formula as 

pearson coeff = sum((x-x_avg)*(y-y_avg))/sqrt((sum(square(x-x_avg)))*(sum(square(y-y_avg))))


'''
def pearson_def(x, y):
    #print(x)
    #print(y)
    assert len(x) == len(y)
    n = len(x)
    if(n==0):
        return 0
    avg_x = np.average(x)
    avg_y = np.average(y)
    diffprod = 0
    xdiff2 = 0
    ydiff2 = 0
    for idx in range(n):
        xdiff = x[idx] - avg_x
        ydiff = y[idx] - avg_y
        diffprod += xdiff * ydiff
        xdiff2 += xdiff * xdiff
        ydiff2 += ydiff * ydiff
    if(xdiff2 == np.float64(0)  or ydiff2 == np.float64(0)):
        return 0
    return diffprod / sqrt(xdiff2 * ydiff2)

# Data Preparation to calculate the pearson correlation

In [149]:

def pearson_correlation(p1,p2,df,usercolname, itemcolname, ratingColName):

    # To get both rated items
    both_rated = {}
    dataset1 = df[(df[usercolname] == p1)]
    dataset2 = df[(df[usercolname] == p2)]
    #print(dataset1)
    #print(dataset2)
    
    a = [item for item in dataset1[itemcolname]]
    b = [item for item in dataset2[itemcolname]]
    bothrated = [item for item in a if item in b]
    #print(bothrated)
    
    number_of_ratings = len(bothrated)
    #print(number_of_ratings)
    if number_of_ratings == 0:
        return 0
    
    #print(dataset1[(dataset1[itemcolname] == 405)].iloc[0]['rating'])
    p1_arr = [dataset1[(dataset1[itemcolname] == item)].iloc[0][ratingColName] for item in bothrated]
    p2_arr = [dataset2[(dataset2[itemcolname] ==item)].iloc[0][ratingColName] for item in bothrated]
    #print(p1_arr)
    #print(p2_arr)
    retval = pearson_def(p1_arr,p2_arr)
    return retval

In [150]:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv('u.data', sep='\t', names=r_cols,encoding='latin-1')

In [151]:
ratings.head()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


# Testing the pearson calculations method

In [153]:
pearson_correlation(84,122,ratings,'user_id','movie_id', 'rating')

0

# Using the pearson correlation find n_neighbor number of similar users

In [198]:
def most_similar_users(person,n_neighbors,df,usercolname, itemcolname, ratingColName):
    persondata = df[usercolname].unique()
    #print(persondata)
	# returns the number_of_users (similar persons) for a given specific person.
    scores = [(pearson_correlation(person,other_person,df,usercolname, itemcolname, ratingColName),other_person) for\
              other_person in persondata if  other_person != person ]
	
	# Sort the similar persons so that highest scores person will appear at the first
    scores.sort()
    scores.reverse()
    return scores[0:n_neighbors]



In [172]:
print(most_similar_users(84,1,ratings,'user_id','movie_id', 'rating'))

[(1.0, 720)]


# After finding similar users find recommendation of the movies

In [192]:
def user_reommendations(person,k, df,usercolname, itemcolname, ratingColName):

	# Gets recommendations for a person by using a weighted average of every other user's rankings
    similar_users = most_similar_users(person,k,df,usercolname, itemcolname, ratingColName)
    #get users who are similar in the list
    similar_users1 = [user for user in similar_users if  user[0] > 0 ]
    
    totals = {}
    simSums = {}
    dataset = df[(df[usercolname] == person)]
    #print(similar_users1)
    for val in similar_users1:
        sim = val[0]
        user = val[1]
        #print(sim)
        #print(user)
        dataset1 = df[(df[usercolname] == user)]
        for index, row in dataset1.iterrows():
            # only score movies i haven't seen yet
            item = row[itemcolname]
            #print(item)
            if item not in dataset[itemcolname] :
                # Similrity * score
                totals.setdefault(item,0)
                #multiply the users and the item by similarity score and add it to total for that item
                rating = row[ratingColName]
                #print("%s :%d"%(item,rating* sim))                
                totals[item] += rating* sim
                # sum of similarities
                simSums.setdefault(item,0)
                #add all similarities for demominator 
                simSums[item]+= sim
    

    # Create the normalized list  
    #print(simSums)
    #print(totals)
    rankings = [(total/simSums[item],item) for item,total in totals.items()]
    rankings.sort()
    rankings.reverse()
    # returns the recommended items
    #recommendataions_list = [recommend_item for score,recommend_item in rankings[:10]]
    return rankings[:20]
    #return recommendataions_list
#user_reommendations('Toby',6)

# Find movie recommendations for user id 86

In [194]:
print(user_reommendations(86,3,ratings,'user_id','movie_id', 'rating'))

[(5.0, 1277), (5.0, 1017), (5.0, 1012), (5.0, 949), (5.0, 919), (5.0, 741), (5.0, 740), (5.0, 713), (5.0, 696), (5.0, 475), (5.0, 432), (5.0, 427), (5.0, 340), (5.0, 334), (5.0, 333), (5.0, 295), (5.0, 273), (5.0, 265), (5.0, 202), (5.0, 181)]


# Find users to which movie 86 can be recommended
Since the methods above are generic just by swapping the users column with the movie column we can perform collaborative filtering for the movies

In [199]:
print(user_reommendations(86,3,ratings,'movie_id','user_id', 'rating'))

[(5.0, 889), (5.0, 519), (5.0, 440), (4.0, 655), (4.0, 445), (3.0, 883), (3.0, 782), (3.0, 486), (1.0, 724), (1.0, 405)]
