In [2]:
import pandas as pd
import numpy as np
from math import sqrt
import codecs
from IPython.display import Image
from collections import defaultdict

In [3]:
#getting the movie database
doc = codecs.open("F:/ML/Git ML/RecommendationEngine/u.item",'rU','latin-1') #open for reading with "universal" type set
moviedb = pd.read_csv(doc, sep='|',names= range(0,24))
moviedb.drop((moviedb.iloc[:,2:]),inplace=True,axis=1) #dropping columns that will not be used for this excercise 
moviedb.columns= ["movieid","movie_title"] 
moviedb

Unnamed: 0,movieid,movie_title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)
5,6,Shanghai Triad (Yao a yao yao dao waipo qiao) ...
6,7,Twelve Monkeys (1995)
7,8,Babe (1995)
8,9,Dead Man Walking (1995)
9,10,Richard III (1995)


In [4]:
#reading the dataset
df = pd.read_table("F:/ML/Git ML/RecommendationEngine/u.data",names=("userid","itemid","rating","timestamp"))
df

Unnamed: 0,userid,itemid,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596
5,298,474,4,884182806
6,115,265,2,881171488
7,253,465,5,891628467
8,305,451,3,886324817
9,6,86,3,883603013


In [5]:
#preprocessing
df.sort_values(by=["userid"],ascending=True,inplace=True)
df.drop(columns=["timestamp"],inplace=True) #dropping unwanted columns
df

Unnamed: 0,userid,itemid,rating
66567,1,55,5
62820,1,203,4
10207,1,183,5
9971,1,150,5
22496,1,68,4
9811,1,201,3
9722,1,157,4
9692,1,184,4
9566,1,210,4
9382,1,163,4


In [6]:
#function to convert dataframe to nested dictionary
def recur_dictify(frame):
    if len(frame.columns) == 1:
        if frame.values.size == 1: return frame.values[0][0]
        return frame.values.squeeze()
    grouped = frame.groupby(frame.columns[0])
    d = {k: recur_dictify(g.iloc[:,1:]) for k,g in grouped}
    return d

In [7]:
df2=recur_dictify(df)
print (df2)

{1: {1: 5, 2: 3, 3: 4, 4: 3, 5: 3, 6: 5, 7: 4, 8: 1, 9: 5, 10: 3, 11: 2, 12: 5, 13: 5, 14: 5, 15: 5, 16: 5, 17: 3, 18: 4, 19: 5, 20: 4, 21: 1, 22: 4, 23: 4, 24: 3, 25: 4, 26: 3, 27: 2, 28: 4, 29: 1, 30: 3, 31: 3, 32: 5, 33: 4, 34: 2, 35: 1, 36: 2, 37: 2, 38: 3, 39: 4, 40: 3, 41: 2, 42: 5, 43: 4, 44: 5, 45: 5, 46: 4, 47: 4, 48: 5, 49: 3, 50: 5, 51: 4, 52: 4, 53: 3, 54: 3, 55: 5, 56: 4, 57: 5, 58: 4, 59: 5, 60: 5, 61: 4, 62: 3, 63: 2, 64: 5, 65: 4, 66: 4, 67: 3, 68: 4, 69: 3, 70: 3, 71: 3, 72: 4, 73: 3, 74: 1, 75: 4, 76: 4, 77: 4, 78: 1, 79: 4, 80: 4, 81: 5, 82: 5, 83: 3, 84: 4, 85: 3, 86: 5, 87: 5, 88: 4, 89: 5, 90: 4, 91: 5, 92: 3, 93: 5, 94: 2, 95: 4, 96: 5, 97: 3, 98: 4, 99: 3, 100: 5, 101: 2, 102: 2, 103: 1, 104: 1, 105: 2, 106: 4, 107: 4, 108: 5, 109: 5, 110: 1, 111: 5, 112: 1, 113: 5, 114: 5, 115: 5, 116: 3, 117: 3, 118: 3, 119: 5, 120: 1, 121: 4, 122: 3, 123: 4, 124: 5, 125: 3, 126: 2, 127: 5, 128: 4, 129: 5, 130: 3, 131: 1, 132: 4, 133: 4, 134: 4, 135: 4, 136: 3, 137: 5, 138: 1,

Pearson Correlation Score
Implementation for the Pearson correlation score first finds the items rated by both users. It then calculates the sums and the sum of the squares of the ratings for the both users and calculates the sum of the products of their ratings. Finally, it uses these results to calculate the Pearson correlation coefficient.Unlike the distance metric, this formula is not intuitive, but it does tell you how much the variables change together divided by the product of how much they alter individually.

Generally, this pearson_correlation function returns a value between -1 to 1 . A value 1 means both users are having the same taste in all most all cases.

![Image](https://i0.wp.com/dataaspirant.com/wp-content/uploads/2015/05/correlation2.png?w=322)

In [8]:
#defining pearson similiarity to get simillarity between 2 users
def pearson_correlation(person1,person2):
    # To get both rated items
    both_rated = {}
    for item in df2[person1]:
        if item in df2[person2]:
            both_rated[item] = 1 # gives the list of movies both have rated 
 
    number_of_ratings = len(both_rated)#gives the total no of the above 
    
    # Checking for number of ratings in common
    if number_of_ratings == 0:
        return 0
    
     # Add up all the preferences of each user
    person1_preferences_sum = sum([df2[person1][item] for item in both_rated])
    person2_preferences_sum = sum([df2[person2][item] for item in both_rated])
    
    # Sum up the squares of preferences of each user
    person1_square_preferences_sum = sum([pow(df2[person1][item],2) for item in both_rated])
    person2_square_preferences_sum = sum([pow(df2[person2][item],2) for item in both_rated])
    
    # Sum up the product value of both preferences for each item
    product_sum_of_both_users = sum([df2[person1][item] * df2[person2][item] for item in both_rated])
    
    # Calculate the pearson score
    numerator_value = product_sum_of_both_users - (person1_preferences_sum*person2_preferences_sum/number_of_ratings)
    denominator_value = sqrt((person1_square_preferences_sum - pow(person1_preferences_sum,2)/number_of_ratings) * (person2_square_preferences_sum -pow(person2_preferences_sum,2)/number_of_ratings))
    if denominator_value == 0:
        return (0)
    else:
        r = numerator_value/denominator_value
        return (r) 
    #returns a value between -1 to 1 . A value 1 means both users are having the same taste in all most all cases.



![Image](https://i0.wp.com/dataaspirant.com/wp-content/uploads/2015/05/recommendataion_for_toby.png?w=640)

This image shows the correlation scores for each person and the ratings they gave for three movies The Night Listener, Lady in the Water, and Just My Luck that Toby hasn’t rated. The Columns beginning with S.x give the similarity multiplied by the rating,so a person who is similar to Toby will contribute more to the overall score than a person who is different from Toby. The Total row shows the sum of all these numbers.

We could just use the totals to calculate the rankings, but then a movie reviewed by more people would have a big advantage. To correct for this you need to divide by the sum of all the similarities for persons that reviewed that movie (the Sim.Sum row in the table) because The Night Listener was reviewed by everyone, it’s total is divided by the average of similarities. Lady in the water ,however , was not reviewed by Puig, The last row shows the results of this division

In [9]:
def user_recommendations(person,no_of_movies):
 
    # Gets recommendations for a person by using a weighted average of every other user's rankings
    totals = {}
    simSums = {}
    rankings_list =[]
    for other in df2:
        # don't compare me to myself
        if other == person:
            continue
        sim = pearson_correlation(person,other)
 
        # ignore scores of zero or lower
        if sim == 0: 
            continue
        for item in df2[other]:
 
            # only score movies i haven't seen yet
            if item not in df2[person] or df2[person][item] == 0:
 
            # Similrity * score
                totals.setdefault(item,0)
                totals[item] += df2[other][item]* sim
                # sum of similarities
                simSums.setdefault(item,0)
                simSums[item]+= sim
 
        # Create the normalized list
 
    rankings = [(total/simSums[item],item) for item,total in totals.items()]
    rankings.sort()
    rankings.reverse()
    # returns the recommended items
    recommendataions_list = [recommend_item for score,recommend_item in rankings]
    best_movies=recommendataions_list[0:no_of_movies]
    print("\n")
    for movie in best_movies:
        likes= (moviedb.loc[moviedb['movieid'] == movie,'movie_title'].tolist())
        
        print ( ' '.join(likes)) #removing the quotes and brackets
    print("\n")
    return("\n These are the movies user %s would like..." %person)    


In [10]:
user = input ("Please enter the user id: ")
noofmovies = input("How many movies do you want to recommend? ")
print (user_recommendations(int (user),int (noofmovies)))

Please enter the user id: 7
How many movies do you want to recommend? 5


Incognito (1997)
Truman Show, The (1998)
Legal Deceit (1997)
Apostle, The (1997)
Hugo Pool (1997)



 These are the movies user 7 would like...


In [50]:
#Recommending movies for a new user 
def userlist():
    df2=recur_dictify(df)
    allratings = {} #empty dict for appending all the rating of that user
    name = input("Please enter your name: ")
    noofmovies = input ("How many movies do you want to recommend? ")
    for item in moviedb['movieid']:
        #getting the movie for movie id for the user to see
        moviename= moviedb.loc[moviedb['movieid'] == item,'movie_title'].tolist()
        rating = input("Please enter your rating 0-5 for %s: \n to skip this rating press s \n to exit press e " % moviename)
        if rating == "e":
            break
        elif rating == "s":
            continue
        else :
            allratings.update({item:int(float(rating))})
    #appending the ratings to that user
    df2.update({name:allratings})
    return (df2,name,noofmovies)

In [51]:
df2,name2,nof =userlist()
print (user_recommendations(name2 ,int(nof)))


Please enter your name: rosh
How many movies do you want to recommend? 4
Please enter your rating 0-5 for ['Toy Story (1995)']: 
 to skip this rating press s 
 to exit press e 5
Please enter your rating 0-5 for ['GoldenEye (1995)']: 
 to skip this rating press s 
 to exit press e 4
Please enter your rating 0-5 for ['Four Rooms (1995)']: 
 to skip this rating press s 
 to exit press e s
Please enter your rating 0-5 for ['Get Shorty (1995)']: 
 to skip this rating press s 
 to exit press e s
Please enter your rating 0-5 for ['Copycat (1995)']: 
 to skip this rating press s 
 to exit press e s
Please enter your rating 0-5 for ['Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)']: 
 to skip this rating press s 
 to exit press e s
Please enter your rating 0-5 for ['Twelve Monkeys (1995)']: 
 to skip this rating press s 
 to exit press e 5
Please enter your rating 0-5 for ['Babe (1995)']: 
 to skip this rating press s 
 to exit press e s
Please enter your rating 0-5 for ['Dead Man Walking

Please enter your rating 0-5 for ['Brother Minister: The Assassination of Malcolm X (1994)']: 
 to skip this rating press s 
 to exit press e s
Please enter your rating 0-5 for ["Carlito's Way (1993)"]: 
 to skip this rating press s 
 to exit press e s
Please enter your rating 0-5 for ['Firm, The (1993)']: 
 to skip this rating press s 
 to exit press e s
Please enter your rating 0-5 for ['Free Willy (1993)']: 
 to skip this rating press s 
 to exit press e 3
Please enter your rating 0-5 for ['Fugitive, The (1993)']: 
 to skip this rating press s 
 to exit press e e


Rosewood (1997)
Assignment, The (1997)
Shadow Conspiracy (1997)
Show, The (1995)



 These are the movies user rosh would like...
