In [26]:
import csv
import os

import numpy as np
from scipy.stats import pearsonr
from sklearn.metrics.pairwise import pairwise_distances

In [21]:
def load_csv(filename, delimeter='\t', encoding='UTF-8', data_dir=None):
    """
    Read a delimited text file and return its rows as lists of strings.

    :param filename: path of the file relative to the data folder
    :param delimeter: field separator handed to ``csv.reader`` (tab by default);
        the misspelled name is kept so existing keyword callers keep working
    :param encoding: text encoding used to open the file
    :param data_dir: folder that ``filename`` is resolved against. ``None``
        (the default) falls back to the original hard-coded ``../../data``
        folder next to the notebooks directory, so old calls behave as before.
        Because the path is built with ``os.path.join``, an absolute
        ``filename`` is used as-is.
    :return: list of rows, each row being a list of ``str`` fields
    """
    if data_dir is None:
        # Legacy default: an absolute path on the original author's machine.
        # Pass data_dir explicitly to run the notebook anywhere else.
        dirname = '/home/damian/Pulpit/Warsztat/CoML/recomm_project/customized/notebooks'
        data_dir = os.path.join(dirname, '../../data')
    path = os.path.join(data_dir, filename)

    # The csv module expects files opened with newline='' so that it can deal
    # with line endings itself; with newline='\n' a file using bare '\r'
    # line endings is rejected with a "new-line character seen" error.
    with open(path, newline='', encoding=encoding) as csvfile:
        return list(csv.reader(csvfile, delimiter=delimeter))


def cast_to_int(data):
    """
    Convert every cell of a nested, mutable table to ``int``.

    The conversion happens in place: each row of ``data`` is overwritten
    element by element, and the very same ``data`` object is handed back.

    :param data: sequence of mutable rows whose items are accepted by ``int()``
    :return: ``data`` itself, now holding integers
    """
    for row in data:
        for position, value in enumerate(row):
            row[position] = int(value)

    return data


def transform_to_user_item_mat(data, user_idx=0, item_idx=1, rating_idx=2, verbose=False):
    """
    Transform a table of (user, item, rating, ...) rows into a dense
    user x item matrix of ratings.

    :param data: 2-D array-like of integers, one rating event per row; ids
        must be zero-based because they are used directly as matrix indices
    :param user_idx: column of ``data`` holding the user id
    :param item_idx: column of ``data`` holding the item id
    :param rating_idx: column of ``data`` holding the rating
    :param verbose: when true, print the inferred number of users and items
    :return: float matrix of shape ``(max user id + 1, max item id + 1)``;
        cells without a rating stay 0, and when a (user, item) pair occurs
        more than once the last row wins. Empty input yields a (0, 0) matrix.
    :raises ValueError: if a user or item id is negative
    """
    # Accept plain nested lists as well as ndarrays.
    data = np.asarray(data)

    if data.size == 0:
        users = items = ratings = data.reshape(0)
        user_num = item_num = 0
    else:
        users = data[:, user_idx]
        items = data[:, item_idx]
        ratings = data[:, rating_idx]

        # A negative id would silently wrap around to the end of the matrix
        # (numpy negative indexing) and corrupt another user's/item's cell.
        if users.min() < 0 or items.min() < 0:
            raise ValueError('user and item ids must be non-negative (zero-based)')

        # max() gives the same value as sorting and taking the last element,
        # without the O(n log n) sort.
        user_num = users.max() + 1
        item_num = items.max() + 1

    if verbose:
        print('User number: {}, item number: {}'.format(user_num, item_num))
    data_item = np.zeros(shape=(user_num, item_num))

    # Explicit row-by-row loop keeps the "last occurrence wins" rule for
    # duplicated (user, item) pairs; numpy gives no ordering guarantee for
    # repeated indices in a single fancy-index assignment.
    for user_id, item_id, rating in zip(users, items, ratings):
        data_item[user_id, item_id] = rating

    return data_item


def preprocess(data):
    """
    Turn raw CSV rows into the dense user x item rating matrix.

    Steps: cast every field to ``int`` (in place, via ``cast_to_int``), build
    an array, subtract 1 from the first two columns so that they become
    zero-based indices, then pivot with ``transform_to_user_item_mat``.
    The fixed four-element offset means rows are expected to be four wide
    (or broadcastable against a length-4 vector).

    :param data: nested list of rows as returned by ``load_csv``
    :return: user x item matrix; the user/item counts are printed on the way
    """
    as_array = np.array(cast_to_int(data))
    # Shift only the first two columns; the last two are left untouched.
    column_offsets = np.array([1, 1, 0, 0])
    zero_based = as_array - column_offsets

    return transform_to_user_item_mat(zero_based, verbose=True)

In [29]:
# Path of the ratings file, relative to the data folder used by load_csv.
ml100k_filename = 'ml-100k/ua.base'
# Read the file (tab-separated by load_csv's default) and pivot it into the
# dense user x item rating matrix; preprocess() also prints the detected
# user and item counts.
user_item = preprocess(load_csv(ml100k_filename))

# Preview the first ten user rows only; 0.0 is the fill value for cells that
# never received a rating.
print(user_item[:10])

User number: 943, item number: 1682
[[5. 3. 4. ... 0. 0. 0.]
 [4. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [4. 0. 0. ... 0. 0. 0.]]


In [32]:
# Pairwise cosine *distance* between user rating rows.
# NOTE(review): despite the variable name this is a distance matrix, not a
# similarity matrix -- the recorded output has zeros on the diagonal. Use
# 1 - user_similarity if an actual cosine similarity is needed.
user_similarity = pairwise_distances(user_item, metric='cosine')
print(user_similarity.shape) 
print(user_similarity[:10])

(943, 943)
[[0.         0.85324924 0.9493235  ... 0.96129522 0.8272823  0.61960392]
 [0.85324924 0.         0.87419215 ... 0.82629308 0.82681535 0.91905667]
 [0.9493235  0.87419215 0.         ... 0.97201154 0.87518372 0.97030738]
 ...
 [0.70450641 0.91392067 0.92654776 ... 0.94089532 0.84199768 0.74139694]
 [0.91753596 0.90405936 1.         ... 0.92209476 0.95380009 0.89226146]
 [0.63803389 0.87729665 0.94653205 ... 0.94291632 0.7917169  0.78725564]]


In [33]:
# Sanity check with SciPy: Pearson correlation between the rating rows of the
# first two users. pearsonr yields the correlation coefficient together with
# its p-value, which is what gets printed.
p = pearsonr(user_item[0], user_item[1])
print(p)

(0.09131021547798493, 0.00017691643724502694)


In [35]:
def pearson(a, b):
    """
    Pearson correlation coefficient of two equally long 1-D sequences.

    Thin adapter around ``scipy.stats.pearsonr`` that drops the p-value so the
    function can be used where a single number per pair is expected.

    :param a: first sequence of numbers
    :param b: second sequence of numbers
    :return: the correlation coefficient only
    """
    coefficient, _p_value = pearsonr(a, b)
    return coefficient

# Passing the pearson() callable as the metric makes pairwise_distances fill
# the matrix with whatever that callable returns for each pair of user rows.
# Because pearson() returns the correlation coefficient itself, this matrix
# holds correlations (a similarity), not distances -- the recorded output has
# ones on the diagonal. This re-binds user_similarity from the cosine cell.
user_similarity = pairwise_distances(user_item, metric=pearson)
print(user_similarity.shape) 
print(user_similarity[:10])


(943, 943)
[[ 1.          0.09131022 -0.00581008 ...  0.00910694  0.10831478
   0.30758436]
 [ 0.09131022  1.          0.10288466 ...  0.1627796   0.1442557
   0.03401889]
 [-0.00581008  0.10288466  1.         ...  0.01597579  0.09786267
  -0.01432321]
 ...
 [ 0.25635099  0.0600081   0.05011905 ...  0.04659944  0.12976455
   0.22401851]
 [ 0.05622861  0.0835279  -0.01249907 ...  0.07172473  0.03059939
   0.08822838]
 [ 0.27661141  0.073382    0.00586339 ...  0.03297069  0.15581961
   0.13343531]]


In [36]:
# Built-in "correlation" metric: the result is a correlation *distance*,
# i.e. 1 - Pearson r (recorded output: 0.90868978 for users 0 and 1, matching
# 1 - 0.09131022 from the pearsonr check).
# NOTE(review): user_similarity is re-bound once more and ends up holding a
# distance matrix; later cells relying on this name should be checked.
user_similarity = pairwise_distances(user_item, metric="correlation")
print(user_similarity.shape) 
print(user_similarity[:10])

(943, 943)
[[0.         0.90868978 1.00581008 ... 0.99089306 0.89168522 0.69241564]
 [0.90868978 0.         0.89711534 ... 0.8372204  0.8557443  0.96598111]
 [1.00581008 0.89711534 0.         ... 0.98402421 0.90213733 1.01432321]
 ...
 [0.74364901 0.9399919  0.94988095 ... 0.95340056 0.87023545 0.77598149]
 [0.94377139 0.9164721  1.01249907 ... 0.92827527 0.96940061 0.91177162]
 [0.72338859 0.926618   0.99413661 ... 0.96702931 0.84418039 0.86656469]]
