In [None]:
import numpy as np
from numpy.linalg import norm
import pandas as pd
from scipy import stats
from scipy import spatial
import matplotlib.pyplot as plt
import seaborn as sb

## Load the ratings data and display basic information

In [None]:
# Read the tab-separated ratings file; it has no header row, so column names
# are supplied explicitly.
names = ['userid', 'itemid', 'rating', 'timestamp']
raw_data = pd.read_csv('./ml-100k/u.data', sep='\t', names=names)

print('Count of ratings', len(raw_data))
print('First ten rows')
print(raw_data[0:10])

# Sorted unique ids; their counts define the shape of the rating matrix.
userids = sorted(list(raw_data['userid'].unique()))
itemids = sorted(list(raw_data['itemid'].unique()))

# Item x user rating matrix: one ROW per item, one COLUMN per user.
# Built as a list of lists first, with None wherever a user has not rated an item.
n_items = len(itemids)
n_users = len(userids)
data = [[None] * n_users for _ in range(n_items)]

# Fill the matrix in a single pass over the ratings instead of re-filtering
# the whole frame once per item.
# Ids are used directly as 1-based positions (itemid k -> row k-1,
# userid k -> column k-1), so this assumes ids run contiguously from 1.
# Ratings whose ids fall outside that range are skipped.
# If a (user, item) pair occurs more than once, the last occurrence wins.
for itemid, userid, rating in zip(raw_data['itemid'], raw_data['userid'], raw_data['rating']):
    if 1 <= itemid <= n_items and 1 <= userid <= n_users:
        data[itemid - 1][userid - 1] = rating

# Mixed None / int content makes this an object-dtype array of shape
# (number of items, number of users).
data = np.array(data)
print(data.shape)

In [None]:
def cosine_similarity(a, b, data):
    """Return the mean-centred cosine similarity between two items.

    Only users who rated both items are taken into account. Each item's
    ratings are centred on that item's mean over those co-rating users before
    the cosine is computed, so the value equals the Pearson correlation of the
    two items' co-ratings (not the plain cosine of the raw rating vectors).

    Parameters
    ----------
    a, b : int
        Item ids, 1-based: the ratings of item ``a`` are read from
        ``data[a - 1]``.
    data : sequence of sequences
        One row per item, one entry per user; ``None`` marks a missing rating.

    Returns
    -------
    float or int
        The similarity, or ``0`` when it is zero or undefined: no user rated
        both items, or the co-ratings of either item are all identical
        (which includes the case of a single common user).
    """
    ratings_a = data[a - 1]  # item ids start at one
    ratings_b = data[b - 1]

    # Rating pairs from users who rated both items.
    common = [
        (rating_a, rating_b)
        for rating_a, rating_b in zip(ratings_a, ratings_b)
        if rating_a is not None and rating_b is not None
    ]
    if not common:
        # Nothing to compare; returning here also avoids taking the mean of
        # an empty list, which yields NaN and a RuntimeWarning.
        return 0

    common_a, common_b = (np.array(column, dtype=float) for column in zip(*common))

    # Centre each item's ratings on its mean over the common users.
    centred_a = common_a - common_a.mean()
    centred_b = common_b - common_b.mean()

    numerator = np.sum(centred_a * centred_b)
    denominator = np.sqrt(np.sum(centred_a ** 2)) * np.sqrt(np.sum(centred_b ** 2))

    # A zero denominator means at least one item has no variance among the
    # common users; the similarity is undefined there and reported as 0.
    if numerator == 0 or denominator == 0:
        return 0

    return numerator / denominator
# Spot check: similarity between the items with ids 567 and 123.
cosine_similarity(567, 123, data)

### Similarity matrix of all item pairs (precomputed once so later calculations are faster)

In [None]:
# Full item-item similarity matrix as a list of lists.
# The diagonal stays at 1; each off-diagonal pair is computed once and
# mirrored, since the similarity is symmetric.
n_items = len(itemids)
sim_matrix = [[1 for _ in range(n_items)] for _ in range(n_items)]
for i in range(n_items):
    for j in range(i + 1, n_items):
        # list positions are 0-based, item ids are 1-based
        similarity = cosine_similarity(i + 1, j + 1, data)
        sim_matrix[i][j] = similarity
        sim_matrix[j][i] = similarity

In [None]:
# Preview of the similarity matrix restricted to the first len(itemids) // 23
# items (presumably to keep this cell fast and the heatmap readable).
n_preview = len(itemids) // 23

# The diagonal stays at 1; each off-diagonal pair is computed once and mirrored.
test_matrix = [[1] * n_preview for _ in range(n_preview)]
for i in range(n_preview):
    for j in range(i + 1, n_preview):
        # list positions are 0-based, item ids are 1-based
        test_matrix[i][j] = test_matrix[j][i] = cosine_similarity(i + 1, j + 1, data)

test_matrix = np.array(test_matrix)

# Heatmap of the preview similarity matrix. The matrix is over items, and the
# title is derived from its actual size so it cannot drift from the data.
sb.heatmap(test_matrix)
plt.title(f'Similarity matrix of items 1 to {n_preview}')
plt.show()

In [None]:
# u = userid, p = itemid, data = whole data set, sim = item similarity vector, n = number of neighbours
def cosine_predict(u, p, data, sim, n):
    movies_seen = data[u-1]