# 0. Imports

In [9]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from metrics import euclidean_similarity, pearson_similarity
import numpy.ma as ma

import scipy.sparse as scp

from metrics import mapk

from elementary import ElementaryRecommendation

# 1. Models description

## Baseline

### **Elementary Recommendation**

The simplest way to predict ratings is to predict the average rating over all items, but this works poorly. For a baseline we can therefore also take user and item biases into account. For example, some users rate movies only in the range 1-4 out of 10, so a rating of 5 from them means something phenomenal; other users give only high ratings.

Let $\mu$ be the average rating of all items, $r_{ui}$ - rating of item $i$ by user $u$, $I_u$ - items that were rated by user $u$. Then we can calculate user bias $b_u$ as:

$$
\begin{align*}
b_u=&\frac{1}{|I_u|+\alpha}\sum_{i\in{}I_u}(r_{ui} - \mu)
\end{align*}, 
$$

where $\alpha$ is a smoothing coefficient.

We can define the item bias similarly, but now taking the already computed user biases into account. Let $U_i$ be the users that rated item $i$. Then the item bias $b_i$ is:

$$
\begin{align*}
b_i=&\frac{1}{|U_i|+\alpha}\sum_{u\in{}U_i}(r_{ui} - b_u - \mu)
\end{align*}
$$

**Final prediction will be:**

$$
\begin{align*}
r_{ui}=&\mu+b_u+b_i
\end{align*}
$$

> ##### See implementation of this approach in **elementary.py** file.

## Similarity based

# 2. Data preparing

We will use a subset of **The Movies Dataset** from **Kaggle** to evaluate our models.

In [10]:
# Load the ratings table from the CSV file that sits next to the notebook.
ratings_csv = 'ratings_small.csv'
data = pd.read_csv(ratings_csv, index_col=False)

# Show (n_rows, n_columns) as the cell output.
data.shape

(100004, 4)

In [11]:
# Preview the first five rows (five is also the default of DataFrame.head).
data.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [12]:
# For users, then movies: print (smallest id, largest id, number of distinct ids).
for column in ('userId', 'movieId'):
    ids = data[column]
    print((ids.min(), ids.max(), len(ids.unique())))

(1, 671, 671)
(1, 163949, 9066)


Many movie ids are missing (they run up to 163949, yet only 9066 distinct movies occur), so it is more convenient to renumber both users and movies as consecutive zero-based indices.

In [13]:
# Build id -> dense index tables. Indices are zero-based and assigned in
# order of first appearance in `data`.
user_to_idx = {user_id: idx for idx, user_id in enumerate(data['userId'].unique())}
movie_to_idx = {movie_id: idx for idx, movie_id in enumerate(data['movieId'].unique())}

# Series.map performs one dictionary lookup per row, which is the idiomatic
# way to apply an id -> index table (DataFrame.replace with thousands of
# dictionary keys is much slower). Every id is a key of its table by
# construction, so no value can become NaN and the integer dtype is kept.
data['userId'] = data['userId'].map(user_to_idx)
data['movieId'] = data['movieId'].map(movie_to_idx)

In [14]:
# Re-check the id ranges after renumbering:
# (smallest index, largest index, number of distinct indices) per column.
for column in ('userId', 'movieId'):
    ids = data[column]
    print((ids.min(), ids.max(), len(ids.unique())))

(0, 670, 671)
(0, 9065, 9066)


Let's split our data into train and test sets this way: for each of the 10 most active users, we remove their 5 highest-rated movies from the training data and put them in the test data.

In [15]:
# Hold out the N_TEST_PER_USER highest-rated movies of each of the
# N_ACTIVE_USERS users with the most ratings; everything else stays in train.
N_ACTIVE_USERS = 10
N_TEST_PER_USER = 5

train_data = data.copy()

# value_counts() sorts by frequency (descending), so the head holds the most active users.
active_users = train_data['userId'].value_counts().head(N_ACTIVE_USERS).index

held_out = []
for user_id in active_users:
    is_user = train_data['userId'] == user_id
    test = train_data[is_user].sort_values(by='rating', ascending=False).head(N_TEST_PER_USER)
    held_out.append(test)
    # Drop every train row of the held-out (user, movie) pairs so that no
    # test rating remains visible during training.
    train_data = train_data[~(is_user & train_data['movieId'].isin(test['movieId'].values))]

# DataFrame.append was removed in pandas 2.0. Concatenating the collected
# slices once also avoids re-copying the growing frame on every iteration
# and keeps the original column dtypes.
if held_out:
    test_data = pd.concat(held_out, ignore_index=True)
else:
    # No active users (e.g. empty input): empty frame with the same columns.
    test_data = train_data.iloc[:0].copy()

train_data.shape, test_data.shape

((99954, 4), (50, 4))

It would be better to store all ratings in a [sparse matrix](https://en.wikipedia.org/wiki/Sparse_matrix), but here we use a dense one to simplify some operations.

In [16]:
# Assemble the (n_users, n_movies) ratings matrix from (row, column, value)
# triplets and densify it; positions without a known rating hold 0.
n_users, n_movies = len(user_to_idx), len(movie_to_idx)
known_ratings = train_data['rating']
user_rows, movie_cols = train_data['userId'], train_data['movieId']

sparse_ratings = scp.coo_matrix(
    (known_ratings, (user_rows, movie_cols)),
    shape=(n_users, n_movies),
)
X = sparse_ratings.tocsr().toarray()

In [17]:
X

array([[2.5, 3. , 3. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       ...,
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ]])

# 3. Quantifying the quality of recommendations.

In [67]:
# Fit the baseline recommender on the dense training matrix.
er = ElementaryRecommendation()
er.fit(X)

# One recommendation list (100 items) per held-out user, in the order the
# users first appear in test_data. The comprehension variable is local, so
# the builtin `id` is no longer shadowed in the notebook namespace.
predicted = [
    er.make_recommendation(user_id=uid, n_recommendations=100)
    for uid in test_data['userId'].unique()
]

In [68]:
# Held-out movie indices per test user, in the order the users first appear
# in test_data. The comprehension variable is local, so the builtin `id` is
# no longer shadowed in the notebook namespace.
actual = [
    test_data.loc[test_data['userId'] == uid, 'movieId'].astype(int).values
    for uid in test_data['userId'].unique()
]

In [69]:
# Report MAP@k for each cutoff, one value per line.
for k in (5, 10, 100):
    print(mapk(actual=actual, predicted=predicted, k=k))

0.035
0.02
0.0024773869699382953
