In [1]:
import os
import pickle
import warnings

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_distances
from sklearn.model_selection import train_test_split

from BiDict import BiDict


In [2]:
# Number of items recommended to each user.
top_n = 3
# Number of nearest neighbours (most similar users) consulted for each user.
nn_count = 5
# Average-score threshold: an item whose average score among the similar users
# does not exceed this value is ignored.
score_threshold = 4.5
# If the similar users cannot provide enough recommendations, we fall back to the
# (1 - default_frequency_percentile) most frequently rated items and pick the best rated of them.
default_frequency_percentile = 0.95


### We will need a bidirectional dict, because we use our own ids for users and items and have to translate them in both directions.

In [3]:
from collections import Counter


class BiDict(dict):
    """A dict that also maintains the reverse (value -> key) mapping in ``self.inverse``.

    Values must be unique and hashable, because every value is also a key of
    ``self.inverse``.

    NOTE: only item assignment (``d[k] = v``), item deletion (``del d[k]``) and
    construction keep ``inverse`` in sync. Other inherited ``dict`` mutators
    (``update``, ``pop``, ``setdefault``, ``clear``, ...) are not overridden here
    and should not be used on a BiDict.
    """

    def __init__(self, *args, **kwargs):
        """Build the mapping like ``dict(*args, **kwargs)``.

        Raises:
            KeyError: if two keys share the same value.
        """
        super().__init__(*args, **kwargs)
        self.check_integrity()
        self.inverse = {v: k for k, v in self.items()}

    def __setitem__(self, key, value):
        """Set ``self[key] = value`` and update ``inverse`` accordingly.

        Raises:
            KeyError: if ``value`` is already mapped from a different key. In that
                case neither the dict nor ``inverse`` is modified.
        """
        # Validate BEFORE touching any state, so a rejected assignment cannot
        # leave `inverse` without the entry of the key's current value.
        if value in self.inverse and self.inverse[value] != key:
            raise KeyError(f'Values must be unique. Current value has already key={self.inverse[value]}')
        if key in self:
            # The key is being re-pointed: forget its previous value.
            del self.inverse[self[key]]
        super().__setitem__(key, value)
        self.inverse[value] = key

    def __delitem__(self, key):
        """Delete ``key`` together with its entry in ``inverse``.

        Raises:
            KeyError: if ``key`` is not present.
        """
        del self.inverse[self[key]]
        super().__delitem__(key)

    def check_integrity(self):
        """Raise ``KeyError`` if any value is mapped from more than one key."""
        repeated_values = {value: count for value, count in Counter(self.values()).items() if count > 1}
        if repeated_values:
            raise KeyError(f'Values must be unique. Current repeated values: {repeated_values}')

In [4]:
# Load the reviews in chronological order and keep only the latest review
# of each (user, product) pair.
df = (
    pd.read_csv('Reviews.csv')
    .sort_values('Time')
    .drop_duplicates(subset=['UserId', 'ProductId'], keep='last')
    .reset_index()
)
# Chronological 50/50 split (no shuffling); the test part is then cut to its first 2000 rows.
df_train, df_test = train_test_split(df, test_size=0.5, shuffle=False)
df_test = df_test.iloc[:2000]

# Our own integer id -> original id, numbered in sorted order of the training ids.
users = BiDict(enumerate(sorted(df_train['UserId'].unique())))
items = BiDict(enumerate(sorted(df_train['ProductId'].unique())))
users_test = sorted(df_test['UserId'].unique())
mean_score = df_train['Score'].mean()

## We can't do straightforward pivoting, because the resulting table would be too big. Because of that we use a sparse matrix. Constructing it takes about 5 minutes, so we can build it once and pickle it.

In [5]:
# Build the user x item matrix of mean-centred scores once and cache it on disk.
# Delete 'pivot.pickle' to force a rebuild (e.g. after the train/test split changes).
if not os.path.exists('pivot.pickle'):
    # Translate the original ids into our own row / column numbers.
    row_ids = df_train['UserId'].map(users.inverse).to_numpy()
    col_ids = df_train['ProductId'].map(items.inverse).to_numpy()
    # Keep the centred scores as floats: an integer matrix would truncate e.g.
    # (5 - mean_score) to 0, and a 0 entry is treated as "no rating" further down.
    centered_scores = (df_train['Score'] - mean_score).to_numpy(dtype=float)
    # One-shot construction from (value, (row, col)) triplets; (user, product)
    # pairs are unique after drop_duplicates, so no entries get summed together.
    pivot_train = csr_matrix((centered_scores, (row_ids, col_ids)), shape=(len(users), len(items)))
    with open('pivot.pickle', 'wb') as f:
        pickle.dump(pivot_train, f)

## In this table rows are users and columns are items. We use our own ids, which are equal to the corresponding row/column number. "users" and "items" are the corresponding bidirectional mappings.

In [6]:
# Load the cached user x item matrix.
# NOTE: pickle.load can execute arbitrary code - only load a 'pivot.pickle' you built yourself.
with open('pivot.pickle', 'rb') as f:
    pivot_train = pickle.load(f)
# A cache built from a different train split would not line up with `users` / `items`;
# fail here with a clear message instead of deep inside the prediction loop.
assert pivot_train.shape == (len(users), len(items)), \
    'pivot.pickle does not match the current users/items mappings - delete it and rebuild'
pivot_train[:10, :15].todense()

matrix([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int32)

### best_common_item_ids - the highest-rated popular items. We will recommend them if collaborative filtering can't help us (i.e. when we are "cold starting").

In [7]:
# Number of non-zero entries (i.e. ratings) per item column.
item_counts = pd.Series(np.asarray((pivot_train != 0).sum(axis=0)).ravel())
# "Popular" items: rated more often than the `default_frequency_percentile` quantile of all items.
most_common_item_ids = item_counts[item_counts > item_counts.quantile(default_frequency_percentile)].index
# Keep the popular columns sparse: the mean over the rated entries of a column is
# simply (column sum) / (number of non-zero entries), so there is no need to
# materialise a dense users x items array filled with NaN.
most_common_pivot = pivot_train[:, most_common_item_ids]
rating_sums = np.asarray(most_common_pivot.sum(axis=0)).ravel().astype(float)
# Every popular item has count > quantile >= 0, so the division is safe.
rating_counts = item_counts.loc[most_common_item_ids].to_numpy()
best_common_scores = pd.Series(rating_sums / rating_counts, index=most_common_item_ids).sort_values(ascending=False)
# Only the ordering matters downstream: popular item ids, best rated first.
best_common_item_ids = pd.Series(best_common_scores.index)
best_common_item_ids

0        7604
1       32503
2        7622
3       34334
4       23415
        ...  
1902    23044
1903    39277
1904    43300
1905     6707
1906    35600
Length: 1907, dtype: int64

### This is how we make predictions for the users that appear in the first 2000 test reviews. We are not using DataFrames but scipy sparse matrices, so the code is quite complicated.

In [8]:
predictions = {}

# Row / column labels of pivot_train; they do not change between users, so build them once.
user_ids = sorted(users.keys())
item_ids = sorted(items.keys())

for user in users_test:
    # Users that never occur in the training data have no row in pivot_train and
    # therefore get no entry in `predictions`.
    if user not in users.inverse:
        continue
    user_id = users.inverse[user]
    # Columns this user has already rated (non-zero entries of the user's row).
    already_used_item_ids = set(pivot_train[user_id].nonzero()[1].tolist())
    # Cosine distance from this user to every training user, closest first.
    distances = pd.Series(cosine_distances(pivot_train[user_id], pivot_train)[0], index=user_ids).sort_values()
    # Drop the user itself and keep the nn_count nearest neighbours.
    distances = distances[distances.index != user_id][:nn_count]
    nearest_scores = pivot_train[distances.index].todense().astype(float)
    # A zero entry means "not rated": exclude it from the per-item average.
    nearest_scores[nearest_scores == 0] = np.nan
    with warnings.catch_warnings():
        # Items nobody among the neighbours rated give an all-NaN column ("Mean of empty slice").
        warnings.simplefilter("ignore", category=RuntimeWarning)
        best_items = pd.Series(np.array(np.nanmean(nearest_scores, axis=0))[0, :], index=item_ids).sort_values(ascending=False)
    # pivot_train is expected to hold mean-centred scores (Score - mean_score), while
    # score_threshold is on the raw rating scale, so shift back before comparing.
    best_items = best_items[best_items.notnull() & (best_items + mean_score > score_threshold)]
    # Neighbour-based candidates first, then the popular items as a fallback.
    # (pd.concat replaces Series.append, which was removed in pandas 2.0.)
    candidate_item_ids = pd.concat([pd.Series(best_items.index), best_common_item_ids]).drop_duplicates().to_list()
    # Remove already used items, keep the top_n, and replace item ids by item names.
    predictions[user] = [items[item_id] for item_id in candidate_item_ids if item_id not in already_used_item_ids][:top_n]
predictions

{'A108XP24UESKSV': ['B000ER1CVK', 'B002ESSASK', 'B000ER3EKM'],
 'A11SWG9T60IQH8': ['B000ER1CVK', 'B002ESSASK', 'B000ER3EKM'],
 'A123CCAYS2BXP8': ['B000ER1CVK', 'B002ESSASK', 'B000ER3EKM'],
 'A12DQZKRKTNF5E': ['B000ER1CVK', 'B002ESSASK', 'B000ER3EKM'],
 'A12IOCD2A7OC7K': ['B000ER1CVK', 'B002ESSASK', 'B000ER3EKM'],
 'A12Z43CZ1O15D5': ['B000ER1CVK', 'B002ESSASK', 'B000ER3EKM'],
 'A130VGG4P4PW5J': ['B000ER1CVK', 'B002ESSASK', 'B000ER3EKM'],
 'A147FUNITGB21I': ['B000ER1CVK', 'B002ESSASK', 'B000ER3EKM'],
 'A147MFU6M0DATT': ['B000ER1CVK', 'B002ESSASK', 'B000ER3EKM'],
 'A14AZ5HGGKQNYE': ['B000ER1CVK', 'B002ESSASK', 'B000ER3EKM'],
 'A14LI3UQKUO3AU': ['B000ER1CVK', 'B002ESSASK', 'B000ER3EKM'],
 'A14R4APY0QEI2D': ['B000ER1CVK', 'B002ESSASK', 'B000ER3EKM'],
 'A14X244VGHWPSX': ['B000ER1CVK', 'B002ESSASK', 'B000ER3EKM'],
 'A14Y1TCC5HJZ13': ['B000ER1CVK', 'B002ESSASK', 'B000ER3EKM'],
 'A158I4XBGHVQ7W': ['B000ER1CVK', 'B002ESSASK', 'B000ER3EKM'],
 'A15MUBAQPWYUR4': ['B000ER1CVK', 'B002ESSASK', 'B000ER

### We would like to know how good our predictions are, but unfortunately only 1 of them appears in the test dataset. The score of this single recommended item is 5.

In [9]:
# Catalogue of training products, built once: every recommendation must come from it.
# (Previously the whole column was converted to a list for every recommended item.)
train_product_ids = set(df_train['ProductId'])

# user -> scores the user actually gave (in the test data) to items we recommended.
scores = {}
for user, recommended_items in predictions.items():
    # Narrow the test reviews to this user once instead of once per item.
    user_reviews = df_test[df_test['UserId'] == user]
    for item in recommended_items:
        assert item in train_product_ids
        cur_table = user_reviews[user_reviews['ProductId'] == item]
        if cur_table.shape[0] > 0:
            scores.setdefault(user, []).append(cur_table['Score'].iloc[0])
scores

{'A2UJGR6IOIUOJA': [5]}