In [1]:
import pandas as pd
import numpy as np

Load dataset and display some data


In [2]:
names = ['userid', 'itemid', 'rating', 'timestamp']
data = pd.read_csv('./ml-100k/u.data', sep='\t', names=names)

print('Count of ratings', len(data))
print('First ten rows')
print(data[0:10])

Count of ratings 100000
First ten rows
   userid  itemid  rating  timestamp
0     196     242       3  881250949
1     186     302       3  891717742
2      22     377       1  878887116
3     244      51       2  880606923
4     166     346       1  886397596
5     298     474       4  884182806
6     115     265       2  881171488
7     253     465       5  891628467
8     305     451       3  886324817
9       6      86       3  883603013


In [3]:
# a, b = userid, data = whole dataset
def pearson_correlation(a, b, data):
    P_a = data.loc[data['userid'] == a]
    P_b = data.loc[data['userid'] == b]
    
    # means for a and b ratings
    r_a = P_a['rating'].mean()
    r_b = P_b['rating'].mean()

    P = pd.merge(P_a, P_b, how='inner', on=['itemid'])
    P = P.rename(columns={'rating_x': 'r_ap', 'rating_y': 'r_bp'})
    
    # If no common movies, return similarity 0
    if len(P) == 0:
        return 0
    
    # r_ap and r_bp normalized values
    r_a_norm = P['r_ap'] - r_a
    r_b_norm = P['r_bp'] - r_b
    
    # denominator and nominator
    n = np.sum(np.multiply(r_a_norm, r_b_norm))
    d = np.sqrt(np.sum(np.multiply(r_a_norm, r_a_norm))) * np.sqrt(np.sum(np.multiply(r_b_norm, r_b_norm)))
    
    # handle divide zero error case (n and d are both zeros, it seems)
    if n == 0:
        return 0
    
    return n/d

In [4]:
# a = userid, p = itemid, data = whole dataset, n = neighborhood size, neighbours = sorted dataframe of user similarities
def pearson_predict(a, p, data, n=99999, neighbours=pd.DataFrame()):
    P_a = data.loc[data['userid'] == a]
    
    # mean of ratings for user a
    r_a = P_a['rating'].mean()
    # all rows where itemid is p
    N = data.loc[data['itemid'] == p]

    # if similar users are given, use them
    if not neighbours.empty:
        rows = []
        
        i = 0 # found similar users that have seen movie p
        j = 0 # iterations, break if it goes over the number of people that have seen movie p
        while i < n:
            user = neighbours.iloc[j]['userid']
            row = N.loc[N['userid'] == user]
            if not row.empty:
                rows.append(row)
                i += 1
            if j > len(N):
                break
            j += 1
        if i > 0:
            N = pd.concat(rows)  
        
    # check if given neighbourhood size n is larger than N
    if len(N) < n:
        n = len(N)
    
    similarities = []
    r_b_norms = []

    for index, b in N.iterrows():
        if b['userid'] != a:
            sim = pearson_correlation(a, b['userid'], data)
            
            # ignore negative similarity
            if sim < 0:
                continue
                
            similarities.append(sim)

            # All ratings for user b and mean of them
            P_b = data.loc[data['userid'] == b['userid']]
            r_b = P_b['rating'].mean()
            
            # normalized ratings for user b
            r_b_norms.append(b['rating'] - r_b)
    
    
    n = np.sum(np.multiply(similarities, r_b_norms))
    d = np.sum(similarities)
    
    if n == 0 or d == 0:
        return r_a
    
    return r_a + np.divide(n,d)    

In [5]:
def n_most_similar_users(a, n, data):
    all_users = data['userid'].unique()
    similarities = []
    similar_users = []
    
    for u in all_users:
        if a != u:
            similar_users.append(u)
            similarities.append(pearson_correlation(a, u, data))
    
    sim = pd.DataFrame(list(zip(similar_users, similarities)), columns=['userid', 'sim'])
    return sim.sort_values(by=['sim'], ascending=False)[0:n]

# Find 10 most similar users for this user
user = 10
sim = n_most_similar_users(user, 10, data)
print('Ten most similar users for user', user)
print(sim)

Ten most similar users for user 10
     userid       sim
337     341  1.000000
715     718  1.000000
170      61  0.952912
160     101  0.884358
682     689  0.842348
382     386  0.834023
49      260  0.812070
134      36  0.792138
551     550  0.790019
356     356  0.789812


In [6]:
# 20 most relevant movies for this user
user = 10

# movies the user has seen
user_movies = data.loc[data['userid'] == user]

# all itemids
all_movies = data['itemid'].unique()

# similariteis between user and all other users
sim = n_most_similar_users(user, 942, data)

predictions = []
itemids = []

for itemid in all_movies:
    # check that user has not seen the movie
    if user_movies.loc[user_movies['itemid'] == itemid].empty:
        predictions.append(pearson_predict(user, itemid, data, 10, sim))
        itemids.append(itemid)

        
relevant_movies = pd.DataFrame(list(zip(itemids, predictions)), columns=['itemid', 'pred'])
relevant_movies = relevant_movies.sort_values(by=['pred'], ascending=False)[0:20]

print('User', user, '20 most relevant movies.')
print(relevant_movies)

User 10 20 most relevant movies.
      itemid      pred
970      814  6.109037
1024     952  6.008605
1115    1536  5.911148
413      761  5.909224
951     1463  5.739144
1435    1467  5.698352
31       201  5.688505
404      251  5.671678
518     1142  5.637087
1378    1599  5.624773
919      902  5.606522
142      408  5.605599
106      466  5.535554
1242    1080  5.522801
554      855  5.492236
455      616  5.492236
1252    1293  5.476847
950     1500  5.420807
1418    1662  5.417729
676      883  5.406522


In [298]:
# testing the predictions, delete before submission
sim = n_most_similar_users(15, 942, data)
prediction = pearson_predict(15, 1355, data)
print(prediction)

6
0.08127607612801803
0.07311039558598809
0.49442379313504436
0.10748891293114388
0.2810733488177051
0.09187341912163241
1.0373725265978995
2.963563574575215
