In [1]:
import numpy as np
import pandas as pd
import os
import math
import copy
from matplotlib import pyplot as plt
from model.track_collection import TrackCollection
from utils.collection_splitter import splitter

In [2]:
matrix = pd.DataFrame(np.array([
    [1, 0, 3, 0, 0, 5, 0, 0, 5, 0, 4, 0],
    [0, 0, 5, 4, 0, 0, 4, 0, 0, 2, 1, 3],
    [2, 4, 0, 1, 2, 0, 3, 0, 4, 3, 5, 0],
    [0, 2, 4, 0, 5, 0, 0, 4, 0, 0, 2, 0],
    [0, 0, 4, 3, 4, 2, 0, 0, 0, 0, 2, 5],
    [1, 0, 3, 0, 3, 0, 0, 2, 0, 0, 4, 0],
    
]), columns=['user1', 'user2', 'user3', 'user4', 'user5', 'user6',
             'user7', 'user8', 'user9', 'user10', 'user11', 'user12'], dtype='float')
matrix

Unnamed: 0,user1,user2,user3,user4,user5,user6,user7,user8,user9,user10,user11,user12
0,1.0,0.0,3.0,0.0,0.0,5.0,0.0,0.0,5.0,0.0,4.0,0.0
1,0.0,0.0,5.0,4.0,0.0,0.0,4.0,0.0,0.0,2.0,1.0,3.0
2,2.0,4.0,0.0,1.0,2.0,0.0,3.0,0.0,4.0,3.0,5.0,0.0
3,0.0,2.0,4.0,0.0,5.0,0.0,0.0,4.0,0.0,0.0,2.0,0.0
4,0.0,0.0,4.0,3.0,4.0,2.0,0.0,0.0,0.0,0.0,2.0,5.0
5,1.0,0.0,3.0,0.0,3.0,0.0,0.0,2.0,0.0,0.0,4.0,0.0


In [3]:
def compute_normalize_matrix(df):
    df = copy.copy(df)
    for index, line in df.iterrows():
        sum = 0
        n = 0
        for el in line:
            if el > 0:
                sum += el
                n += 1
        if n != 0:
            line_mean = sum / n
        else:
            line_mean = 0
        print("line mean", line_mean)
        for el_idx, el in enumerate(line):
            if el > 0:
                line[el_idx] = el - line_mean
#             else:
#                 line[el_idx] = math.nan
        df.loc[index] = line
    return df
        


def cos_measure(df, index_x, index_y):
    """Takes a dataframe representing the matrix and the line indexes of the elements to compare"""
    rx = df.iloc[index_x]
    ry = df.iloc[index_y]
    sc = np.dot(rx, ry)
    return sc / (np.linalg.norm(rx) * np.linalg.norm(ry))


def rating_mean(df, track_index):
    track = df.iloc[track_index]
    if np.count_nonzero(track) == 0:
        return 0
    return np.sum(track) / np.count_nonzero(track)
    

def pearson_measure(df, index_x, index_y):
   # comuting the center of each value
    mean_x = rating_mean(df, index_x)
    mean_y  = rating_mean(df, index_y)
    
    # we get the two elemnts we are interested in
    df_x = df.iloc[index_x:index_x + 1]
    df_y = df.iloc[index_y:index_y + 1]
    filtered_df = pd.concat([df_x, df_y])
    
    # we remove the columns containing empty measures (cannot be compared)
    filtered_df = filtered_df.loc[:, (~filtered_df.isin([0])).all(axis=0)]

    upper_sum = 0
    for index, col in enumerate(filtered_df):
        upper_sum += (filtered_df.iat[0, index] - mean_x) * (filtered_df.iat[1, index] - mean_y)
    
    lower_sum_x = 0
    lower_sum_y = 0
    for index, col in enumerate(df):
        if df.iat[index_x, index] != 0:
            lower_sum_x += (df.iat[index_x, index] - mean_x) ** 2
        if df.iat[index_y, index] != 0:
            lower_sum_y += (df.iat[index_y, index] - mean_y) ** 2

#     print("LOW", lower_sum_x, lower_sum_y)
    if lower_sum_x == 0 or lower_sum_y == 0:
        return - math.inf
    return upper_sum / (math.sqrt(lower_sum_x) * math.sqrt(lower_sum_y))

In [4]:
n_matrix = compute_normalize_matrix(matrix)
n_matrix

line mean 3.6
line mean 3.1666666666666665
line mean 3.0
line mean 3.4
line mean 3.3333333333333335
line mean 2.6


Unnamed: 0,user1,user2,user3,user4,user5,user6,user7,user8,user9,user10,user11,user12
0,-2.6,0.0,-0.6,0.0,0.0,1.4,0.0,0.0,1.4,0.0,0.4,0.0
1,0.0,0.0,1.833333,0.833333,0.0,0.0,0.833333,0.0,0.0,-1.166667,-2.166667,-0.166667
2,-1.0,1.0,0.0,-2.0,-1.0,0.0,0.0,0.0,1.0,0.0,2.0,0.0
3,0.0,-1.4,0.6,0.0,1.6,0.0,0.0,0.6,0.0,0.0,-1.4,0.0
4,0.0,0.0,0.666667,-0.333333,0.666667,-1.333333,0.0,0.0,0.0,0.0,-1.333333,1.666667
5,-1.6,0.0,0.4,0.0,0.4,0.0,0.0,-0.6,0.0,0.0,1.4,0.0


In [5]:
height = matrix.shape[0]
sim_matrix = np.empty((height, height))
columns = matrix.index.values
for i in range(height): # for each track
    for j in range(height):
        sim_matrix[i][j] = pearson_measure(matrix, i, j)
sim_matrix = pd.DataFrame(sim_matrix, columns=matrix.index.values)
sim_matrix

Unnamed: 0,0,1,2,3,4,5
0,1.0,-0.178542,0.414039,-0.10245,-0.308957,0.58704
1,-0.178542,1.0,-0.526235,0.468008,0.398911,-0.30644
2,0.414039,-0.526235,1.0,-0.623981,-0.284268,0.50637
3,-0.10245,0.468008,-0.623981,1.0,0.458735,-0.235339
4,-0.308957,0.398911,-0.284268,0.458735,1.0,-0.215917
5,0.58704,-0.30644,0.50637,-0.235339,-0.215917,1.0


In [6]:
height = n_matrix.shape[0]
n_sim_matrix = np.empty((height, height))
columns = n_matrix.index.values
for i in range(height): # for each track
    for j in range(height):
        n_sim_matrix[i][j] = cos_measure(n_matrix, i, j)
n_sim_matrix = pd.DataFrame(n_sim_matrix, columns=n_matrix.index.values)
n_sim_matrix

Unnamed: 0,0,1,2,3,4,5
0,1.0,-0.178542,0.414039,-0.10245,-0.308957,0.58704
1,-0.178542,1.0,-0.526235,0.468008,0.398911,-0.30644
2,0.414039,-0.526235,1.0,-0.623981,-0.284268,0.50637
3,-0.10245,0.468008,-0.623981,1.0,0.458735,-0.235339
4,-0.308957,0.398911,-0.284268,0.458735,1.0,-0.215917
5,0.58704,-0.30644,0.50637,-0.235339,-0.215917,1.0


In [11]:
# Predict the weighted average
def predict(df, sim_matrix, element_index, user_index):
    user_col = df.iloc[:, user_index]
    # compute predicted rating
    upper_sum = 0
    lower_sum = 0
    for index, el in enumerate(sim_matrix.iloc[:, element_index]):
        if el >= 0 and el <= 0.99:
            upper_sum += el * user_col[index]
            lower_sum += el
    if lower_sum == 0:
        return - math.inf
    return upper_sum / lower_sum

# below should be equal to 2.6
# print(predict(matrix, sim_matrix, 0, 4))

# we compute the predictions
print("PEARSON")
for i in range(12):
    for j in range(6):
        if predict(matrix, sim_matrix, j, i) > rating_mean(matrix, j):
            print("user_{} should listen music {}".format(i, j))

print("COS")
for i in range(12):
    for j in range(6):
        if predict(matrix, n_sim_matrix, j, i) > rating_mean(matrix, j):
            print("user_{} should listen music {}".format(i, j))

PEARSON
user_2 should listen music 1
user_2 should listen music 3
user_2 should listen music 4
user_3 should listen music 3
user_4 should listen music 1
user_5 should listen music 5
user_8 should listen music 5
user_10 should listen music 0
user_10 should listen music 2
user_10 should listen music 5
user_11 should listen music 3
COS
user_2 should listen music 1
user_2 should listen music 3
user_2 should listen music 4
user_3 should listen music 3
user_4 should listen music 1
user_5 should listen music 5
user_8 should listen music 5
user_10 should listen music 0
user_10 should listen music 2
user_10 should listen music 5
user_11 should listen music 3
