dataset:

original: 
- all - http://files.grouplens.org/datasets/movielens/ml-25m.zip
- small - http://files.grouplens.org/datasets/movielens/ml-latest-small.zip

1. Exploratory analysis.

    Perform EDA:
    - number of users
    - number of items
    - number of tags
    - number of ratings
    - summary statistics of number of ratings per user (mean, median, quantiles)
    - rating sparsity
    - summary statistics of number of tags per user

2. Evaluation metrics report 

    Implement evaluation metrics to assess recommender performance
    - MAP score
    - Precision@k
    - Mean percentage ranking

3. Train-test split
    - Build a function for splitting data into two datasets: train and test
    - Gather stats for train vs. test: number of users and items, plus cold-start users and new items in the test set
    
4. Base transformation for data
    - aggregation
    - implicit feedback

5. Collaborative filtering based recommenders 

    5.1. Most popular items (MP) recommender
    Build Most popular items (MP) recommender, evaluate performance of this baseline

    5.2. Build User2User and Item2Item CF approach


In [17]:
import pandas as pd
import numpy as np
from pathlib import Path

In [18]:
# Folder with the extracted "ml-latest-small" CSV files, relative to the working directory.
DATA_FOLDER = Path('./data/ml-latest-small/')

### Exploratory analysis

In [19]:
# List the CSV files found in the data folder.
list(DATA_FOLDER.glob('*.csv'))

[WindowsPath('data/ml-latest-small/links.csv'),
 WindowsPath('data/ml-latest-small/movies.csv'),
 WindowsPath('data/ml-latest-small/ratings.csv'),
 WindowsPath('data/ml-latest-small/tags.csv')]

In [20]:
# Load the four MovieLens tables, one DataFrame per CSV file.
links, movies, ratings, tags = (
    pd.read_csv(DATA_FOLDER / f'{table}.csv')
    for table in ('links', 'movies', 'ratings', 'tags')
)

In [21]:
# Preview the first rows of the links table.
links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [22]:
# Preview the first rows of the movies table.
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [23]:
# Preview the first rows of the ratings table (timestamp is still the raw integer here).
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [24]:
# Preview the first rows of the tags table.
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [29]:
# Number of distinct users that appear in the ratings table.
print('count unique users', ratings['userId'].nunique())

count unique users 610


In [30]:
# Number of distinct movies that received at least one rating.
print('count unique item', ratings['movieId'].nunique())

count unique item 9724


In [31]:
# Number of distinct tag strings (nunique), not the number of tag applications.
print('count tags', tags['tag'].nunique())

count tags 1589


In [32]:
# Total number of rating rows.
print('count ratings', len(ratings))

count ratings 100836


In [34]:
# Share of user-item pairs that have no rating:
# 1 - (observed ratings) / (distinct items * distinct users).
n_filled = ratings.shape[0]
n_cells = ratings.movieId.nunique() * ratings.userId.nunique()
print('sparcity of rating matrix', 1 - n_filled / n_cells)

sparcity of rating matrix 0.9830003169443864


In [35]:
# Per-user summary of the rating values (mean, median, max, min, std) plus the
# number of ratings each user gave (column `len`).
# Aggregations are passed by name: handing numpy callables / builtins such as
# np.mean or max to groupby.agg is deprecated in recent pandas releases.
stats = (
    ratings.groupby('userId')['rating']
    .agg(['mean', 'median', 'max', 'min', 'size', 'std'])
    .rename(columns={'size': 'len'})  # later cells refer to this column as `len`
)

In [36]:
# Number of tag rows per user; users without any tag are absent from this Series.
stats_tags = tags.groupby('userId')['tag'].size()

In [37]:
# Attach the per-user tag counts (left join on the userId index); users who never
# tagged get NaN in `tag`.
# Dropping an existing `tag` column first keeps the cell re-runnable: a plain
# re-join would fail because `tag` would already be present in `stats`.
stats = stats.drop(columns='tag', errors='ignore').join(stats_tags)

In [45]:
# Preview the per-user statistics for the first 10 users.
# NOTE(review): the saved output includes `is_null_tag`, which is only created in a
# later cell, so this output came from out-of-order execution.
stats.head(10)

Unnamed: 0_level_0,mean,median,max,min,len,std,tag,is_null_tag
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,4.366379,5.0,5.0,1.0,232.0,0.800048,,True
2,3.948276,4.0,5.0,2.0,29.0,0.805615,9.0,False
3,2.435897,0.5,5.0,0.5,39.0,2.090642,,True
4,3.555556,4.0,5.0,1.0,216.0,1.314204,,True
5,3.636364,4.0,5.0,1.0,44.0,0.990441,,True
6,3.493631,3.0,5.0,1.0,314.0,0.850648,,True
7,3.230263,3.5,5.0,0.5,152.0,1.329594,1.0,False
8,3.574468,3.0,5.0,1.0,47.0,0.972328,,True
9,3.26087,3.0,5.0,1.0,46.0,1.272526,,True
10,3.278571,3.5,5.0,0.5,140.0,1.1757,,True


In [50]:
# Per-user distribution of rating values, including the 5th and 95th percentiles.
(
    ratings
    .groupby('userId')['rating']
    .describe(percentiles=[0.05, 0.25, 0.75, 0.95])
    .head(10)
)

Unnamed: 0_level_0,count,mean,std,min,5%,25%,50%,75%,95%,max
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,232.0,4.366379,0.800048,1.0,3.0,4.0,5.0,5.0,5.0,5.0
2,29.0,3.948276,0.805615,2.0,2.7,3.5,4.0,4.5,5.0,5.0
3,39.0,2.435897,2.090642,0.5,0.5,0.5,0.5,4.75,5.0,5.0
4,216.0,3.555556,1.314204,1.0,1.0,3.0,4.0,5.0,5.0,5.0
5,44.0,3.636364,0.990441,1.0,2.0,3.0,4.0,4.0,5.0,5.0
6,314.0,3.493631,0.850648,1.0,2.0,3.0,3.0,4.0,5.0,5.0
7,152.0,3.230263,1.329594,0.5,1.0,2.0,3.5,4.5,5.0,5.0
8,47.0,3.574468,0.972328,1.0,2.0,3.0,3.0,4.0,5.0,5.0
9,46.0,3.26087,1.272526,1.0,1.0,2.25,3.0,4.0,5.0,5.0
10,140.0,3.278571,1.1757,0.5,0.5,3.0,3.5,4.0,5.0,5.0


In [51]:
# Flag users that have ratings but no tag rows (NaN after the join with tag counts).
stats['is_null_tag'] = stats['tag'].isna()

In [52]:
# Number of users flagged as having no tags.
stats['is_null_tag'].sum()

552

In [53]:
import plotly.express as px

In [54]:
# Histogram over the per-user mean rating, coloured by whether the user has tags.
# With `y="len"` the bars aggregate the per-user rating counts per bin
# (plotly sums `y` by default - see the px.histogram docs).
fig = px.histogram(
    stats,
    x="mean",
    y="len",
    color='is_null_tag',
    marginal="rug",
    title='Mean rating distribution',
)
fig.show()

In [55]:
# Same view as the mean-rating histogram, but binned by the per-user median rating.
fig = px.histogram(
    stats,
    x="median",
    y="len",
    color='is_null_tag',
    marginal="rug",
    title='Median rating distribution',
)
fig.show()

In [56]:
# Histogram over the number of tags per user, with `len` (ratings per user) as the bar value.
fig = px.histogram(
    stats,
    x="tag",
    y="len",
    marginal="rug",
    title='Count tags distribution',
)
fig.show()

In [57]:
# Convert the rating timestamp from epoch seconds to pandas datetimes (overwrites the column).
ratings['timestamp'] = pd.to_datetime(ratings['timestamp'], unit='s')

In [58]:
# Check that `timestamp` now holds datetimes.
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,2000-07-30 18:45:03
1,1,3,4.0,2000-07-30 18:20:47
2,1,6,4.0,2000-07-30 18:37:04
3,1,47,5.0,2000-07-30 19:03:35
4,1,50,5.0,2000-07-30 18:48:51


In [59]:
# Number of ratings per calendar month.
rating_month = ratings.timestamp.dt.to_period('M')
count_time_stats = ratings.groupby(rating_month)['timestamp'].count()

In [60]:
# Name the counts `cnt` and turn the month index into a regular `timestamp` column.
count_time_stats = count_time_stats.rename('cnt').reset_index()

In [61]:
# Preview the monthly rating counts.
count_time_stats.head()

Unnamed: 0,timestamp,cnt
0,1996-03,58
1,1996-04,165
2,1996-05,832
3,1996-06,883
4,1996-07,489


In [62]:
# Cast the monthly periods to plain strings before plotting.
count_time_stats['timestamp'] = count_time_stats['timestamp'].astype(str)

In [63]:
# Bar chart of the number of ratings per month.
fig = px.bar(
    count_time_stats,
    x="timestamp",
    y="cnt",
    title='Count ratings stats',
)
fig.show()

### Train-Test Split

#### timestamp

One option is to split by time, e.g. to predict ratings for a later period (season).

In [64]:
# Cut-off date for the time-based split: ratings at or before it go to train, later ones to test.
TRESHOLD_DATE = '2018-01-01'

In [65]:
# Time-based split: ratings up to and including the cut-off are train, the rest is test.
train = ratings.loc[ratings['timestamp'] <= TRESHOLD_DATE]
test = ratings.loc[ratings['timestamp'] > TRESHOLD_DATE]

In [66]:
# Size of each split, distinct users per split, and test users never seen in train.
print('dataset train ', len(train), ' test', len(test))
print('unique user in train', train['userId'].nunique(), 'test ', test['userId'].nunique())
seen_test_users = test.loc[test['userId'].isin(train['userId']), 'userId']
print('cold-user in the test ', test['userId'].nunique() - seen_test_users.nunique())

dataset train  94418  test 6418
unique user in train 581 test  49
cold-user in the test  29


In [67]:
# Distinct movies per split, and test movies that never occur in train (cold items).
# The first label previously said "unique user" although movieId counts are printed.
print('unique items in train', train.movieId.nunique(), 'test ', test.movieId.nunique())
print('cold-item in the test ', test.movieId.nunique() - test[test.movieId.isin(train.movieId)].movieId.nunique())

unique user in train 8830 test  3349
cold-item in the test  894


#### by users

Alternatively, split per user while still predicting the future: each user's most recent ratings go to test.

In [80]:
# Share of each user's ratings (oldest first) that goes to train in the per-user split.
TRESHOLD = 0.94
# Minimum number of ratings a user must have to be kept for the per-user split.
MIN_RATE = 10

In [81]:
# Keep only users with at least MIN_RATE ratings.
# A vectorised per-user size replaces groupby.filter with a Python lambda, which is
# called once per user and gets slow on the full ml-25m dataset.
ratings_per_user = ratings.groupby('userId')['userId'].transform('size')
ratings_filtered = ratings[ratings_per_user >= MIN_RATE]

In [82]:
# Row count left after dropping users with too few ratings.
print('Rating table after min filtering', len(ratings_filtered))

Rating table after min filtering 100836


In [83]:
def split_by_user(user_ratings, train_fraction):
    """Split ratings chronologically within each user into train and test parts.

    For every user the ratings are ordered by ``timestamp``; the earliest
    ``round(n * train_fraction)`` rows go to train and the remaining, most
    recent rows go to test.

    Parameters
    ----------
    user_ratings : pd.DataFrame
        Ratings with at least ``userId`` and ``timestamp`` columns.
    train_fraction : float
        Share of each user's ratings assigned to train.

    Returns
    -------
    tuple of (pd.DataFrame, pd.DataFrame)
        Train and test frames; users appear in order of first occurrence.
    """
    train_parts, test_parts = [], []
    # sort=False keeps users in order of first appearance, like iterating over unique()
    for _, one_user in user_ratings.groupby('userId', sort=False):
        one_user = one_user.sort_values('timestamp')  # sort once, slice twice
        size_train = int(round(len(one_user) * train_fraction))
        train_parts.append(one_user.iloc[:size_train])
        test_parts.append(one_user.iloc[size_train:])
    if not train_parts:
        # pd.concat raises on an empty list; return empty frames with the same columns
        empty = user_ratings.iloc[0:0]
        return empty, empty.copy()
    # One concat at the end instead of DataFrame.append inside the loop:
    # append is removed in pandas 2.0 and copies the accumulated frame on every call.
    return pd.concat(train_parts), pd.concat(test_parts)


train, test = split_by_user(ratings_filtered, TRESHOLD)

In [84]:
# Size of each split, distinct users per split, and test users never seen in train.
print('dataset train ', len(train), ' test', len(test))
print('unique user in train', train['userId'].nunique(), 'test ', test['userId'].nunique())
seen_test_users = test.loc[test['userId'].isin(train['userId']), 'userId']
print('cold-user in the test ', test['userId'].nunique() - seen_test_users.nunique())

dataset train  94797  test 6039
unique user in train 610 test  610
cold-user in the test  0


In [85]:
# Distinct movies per split, and test movies that never occur in train (cold items).
# The first label previously said "unique user" although movieId counts are printed.
print('unique items in train', train.movieId.nunique(), 'test ', test.movieId.nunique())
print('cold-item in the test ', test.movieId.nunique() - test[test.movieId.isin(train.movieId)].movieId.nunique())

unique user in train 9241 test  3359
cold-item in the test  483
