In [1]:
import numpy as np
import pandas as pd
from scipy import stats

Read file and display some data

In [2]:
names = ['userid', 'itemid', 'rating', 'timestamp']
data = pd.read_csv('./ml-100k/u.data', sep='\t', names=names)

print('Count of ratings', len(data))
print('First ten rows')
print(data[0:10])

Count of ratings 100000
First ten rows
   userid  itemid  rating  timestamp
0     196     242       3  881250949
1     186     302       3  891717742
2      22     377       1  878887116
3     244      51       2  880606923
4     166     346       1  886397596
5     298     474       4  884182806
6     115     265       2  881171488
7     253     465       5  891628467
8     305     451       3  886324817
9       6      86       3  883603013


User-based collaborativve filtering approach using Pearson correlation function

In [3]:
# a, b = userids, data = whole data set
def pearson_correlation(a, b, data):
    # rows from data set containing given userids
    data_a = data.loc[data['userid'] == a]
    data_b = data.loc[data['userid'] == b]

    # dicts with itemid and ratings
    dict_a = dict(zip(data_a.itemid, data_a.rating))
    dict_b = dict(zip(data_b.itemid, data_b.rating))

    # intersections of common itemids
    P = list(set(dict_a).intersection(set(dict_b)))

    # keep only common itemids
    dict_a = {id: dict_a[id] for id in P}
    dict_b = {id: dict_b[id] for id in P}

    mean_a = np.mean(list(dict_a.values()))
    mean_b = np.mean(list(dict_b.values()))
    n = 0
    d1 = 0
    d2 = 0
    
    # calculate sums
    for item in P:
        n += ((dict_a[item] - mean_a) * (dict_b[item] - mean_b))
        d1 += ((dict_a[item] - mean_a) ** 2)
        d2 += ((dict_b[item] - mean_b) ** 2)
    
    # handle cases where n == 0 and d might be zero as well
    if n == 0:
        return 0

    sim = n / (np.sqrt(d1) * np.sqrt(d2))
    
    # compare to scipys result, with some values the 16th decimal can be different -> round to 10 decimal places
    #scipy_sim, p = stats.pearsonr(list(dict_a.values()), list(dict_b.values()))
    #if np.round(sim, 10) == np.round(scipy_sim, 10):
    #    print(sim, scipy_sim)
    #    print('all ok')
    
    return sim

Prediction function for predicting movie scores

In [18]:
# a = userid, p = itemid, data = whole data set, nh = userids and similarities of users in the neighbourhood
# if nh = None, neighbourhood is no limited
def pearson_predict(a, p, data, nh=None):
    # mean of ratings given by user a
    mean_a = np.mean(list(data.loc[data['userid'] == a].rating))
    

    # dict containing all userids and ratings for itemid p
    N = dict(zip(data.loc[data['itemid'] == p].userid, data.loc[data['itemid'] == p].rating))
    n = 0
    d = 0

    if nh != None:
        new_N = {}
        # Neighbour hood given, limit userids and ratings to that
        for id in nh:
            if id in N:
                new_N[id] = nh[id]
        
        # check if there are enough ratings in the neighbourhood
        if len(new_N) > 0:
            N = new_N

            for b in N:
                mean_b = np.mean(list(data.loc[data['userid'] == b].rating))
                n += nh[b] * (N[b] - mean_b)
                d +=  nh[b]

            if n == 0:
                return mean_a

            return mean_a + n/d
            
    for b in N:
        mean_b = np.mean(list(data.loc[data['userid'] == b].rating))
        n += (pearson_correlation(a, b, data) * (N[b] - mean_b))
        d += pearson_correlation(a, b, data)
    
    if n == 0:
        return mean_a

    return mean_a + n/d

Show n most similar users for any given user

In [19]:
def n_most_similar_users(a, n, data):
    # all unique user ids
    users = list(data['userid'].unique())

    similarities = {}
    for u in users:
        if a != u:
            sim = pearson_correlation(a, u, data)
            similarities[u] = sim

    # sort similarities based on dict values and return n highest values
    similarities = dict(sorted(similarities.items(), key=lambda x: x[1], reverse=True))
    return dict(list(similarities.items())[:n])

Ten most similar users to user 10

In [20]:
USER = 10
sim = n_most_similar_users(USER, 10, data)
df = pd.DataFrame(list(zip(list(sim.keys()), list(sim.values()))), columns=['userid', 'similarity'])
print(df)

   userid  similarity
0      61    1.000000
1     400    1.000000
2     772    1.000000
3     101    1.000000
4     477    1.000000
5     636    1.000000
6     238    1.000000
7     502    1.000000
8     729    1.000000
9     260    0.944911


Recommended 20 movies for the same user

In [21]:
# all itemids
movies = list(data['itemid'].unique())

# movies the user has seen
movies_user = list(data.loc[data['userid'] == USER].itemid)

predictions = {}
sim = n_most_similar_users(USER, 942, data)
for movie in movies:
    if movie not in movies_user:
        pred = pearson_predict(USER, movie, data, sim)
        predictions[movie] = pred

most_relevant = dict(sorted(predictions.items(), key=lambda x: x[1], reverse=True))
df = pd.DataFrame(list(zip(list(most_relevant.keys()), list(most_relevant.values()))), columns=['itemid', 'rating pred'])[:20]
print(df)

    itemid  rating pred
0     1360    14.129367
1      247    12.233215
2      138    12.170861
3     1480     9.675820
4     1628     9.357563
5      360     6.513314
6     1672     4.893793
7     1671     4.206522
8     1539     4.155533
9     1432     4.034306
10    1227     4.028426
11    1332     3.808683
12    1519     3.732777
13     920     3.361101
14    1066     3.312249
15    1430     3.193389
16     893     3.179575
17    1146     3.162948
18    1327     3.107773
19    1250     3.004603
