## Prepare packages

### Basic data science libraries

In [12]:
import numpy as np
import pandas as pd
import pickle
import glob

### Learn-to-rank libraries

In [13]:
# make sure to run this only once
# !conda install -y -c maciejkula -c pytorch spotlight=0.1.5

In [14]:
from spotlight.cross_validation import random_train_test_split
from spotlight.evaluation import mrr_score
from spotlight.factorization.implicit import ImplicitFactorizationModel
from spotlight.evaluation import rmse_score
from spotlight.factorization.explicit import ExplicitFactorizationModel
from spotlight.interactions import Interactions

In [15]:
from spotlight.evaluation import sequence_mrr_score
from spotlight.evaluation import precision_recall_score

### Deep learning libraries

In [16]:
import torch

### Other

In [17]:
import pandas_profiling

In [18]:
from scipy import stats

## Prepare data

In [19]:
import os
print(os.listdir("../input"))

['articles_embeddings.pickle', 'clicks', 'articles_metadata.csv', 'clicks_sample.csv']


### Read articles metadata

In [20]:
articles_meta = pd.read_csv('../input/articles_metadata.csv')
f'There are {len(articles_meta)} rows in the {articles_meta.ndim}-dimensional data frame'

'There are 364047 rows in the 2-dimensional data frame'

### Read articles embeddings

In [21]:
articles_emb = pickle.load(open('../input/articles_embeddings.pickle', 'rb'))
f'There are {len(articles_emb)} rows in the {articles_emb.ndim}-dimensional data frame'

'There are 364047 rows in the 2-dimensional data frame'

In [22]:
articles_emb_df = pd.DataFrame(articles_emb, columns=[f'V{i}' for i in range(0,len(articles_emb[0]))])  # assign colnames basing on the embedding index
# articles_emb_df

### Read clicks data

In [23]:
all_files = glob.glob("../input/clicks/clicks/*")
f"We're going to read {len(all_files)} files"

"We're going to read 385 files"

There are many seperate CSV file holding information on all click events made by registered users. Data is basicly a set of pairs (user_id, article_id) together with metadata like: timestamp of the click, country, etc. The 'click' dataset holds only "positive" labels, meaning that the data don't hold information on the opposite event of 'no click'. 

In [24]:
clicks = pd.concat([pd.read_csv(_) for _ in all_files], ignore_index=True)
f'Successfully read {len(clicks)} rows of the click-data'

'Successfully read 2988181 rows of the click-data'

In [25]:
f'There are {len(set(clicks.user_id.values))} distinct users'

'There are 322897 distinct users'

Join articles (metadata) together with their embeddings to make just one pandas dataframe holding information on all articles. 

In [26]:
articles = pd.concat([articles_meta, articles_emb_df], axis=1)

In [27]:
articles.columns

Index(['article_id', 'category_id', 'created_at_ts', 'publisher_id',
       'words_count', 'V0', 'V1', 'V2', 'V3', 'V4',
       ...
       'V240', 'V241', 'V242', 'V243', 'V244', 'V245', 'V246', 'V247', 'V248',
       'V249'],
      dtype='object', length=255)

## Exploratory analysis

### First glance at the data

#### Article data

In [28]:
articles.head(10)

Unnamed: 0,article_id,category_id,created_at_ts,publisher_id,words_count,V0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,V29,V30,V31,V32,V33,V34,...,V210,V211,V212,V213,V214,V215,V216,V217,V218,V219,V220,V221,V222,V223,V224,V225,V226,V227,V228,V229,V230,V231,V232,V233,V234,V235,V236,V237,V238,V239,V240,V241,V242,V243,V244,V245,V246,V247,V248,V249
0,0,0,1513144419000,0,168,-0.161183,-0.957233,-0.137944,0.050855,0.830055,0.901365,-0.335148,-0.559561,-0.500603,0.165183,0.428434,0.355056,0.874437,-0.528883,0.625487,0.26892,-0.822835,-0.703853,-0.625845,-0.152855,-0.666241,0.043295,0.178638,0.04689,0.594531,-0.183348,0.195107,-0.46764,-0.304807,0.353175,0.278188,0.538623,-0.371209,0.489898,-0.103833,...,0.764381,0.275644,-0.692138,-0.39324,-0.325693,0.163377,-0.154952,-0.701639,0.711825,-0.839063,-0.459145,0.919913,0.621834,-0.640471,0.389934,0.764178,0.164851,-0.747023,0.411034,0.750901,0.860643,0.614755,0.54966,0.334437,-0.388059,-0.70373,-0.56742,0.006478,-0.20659,-0.385272,0.321248,0.313999,0.636412,0.169179,0.540524,-0.813182,0.28687,-0.231686,0.597416,0.409623
1,1,1,1405341936000,0,189,-0.523216,-0.974058,0.738608,0.155234,0.626294,0.485297,-0.715657,-0.897996,-0.359747,0.398246,0.67284,-0.01118,-0.347506,-0.541346,0.458492,0.482316,0.713665,-0.47405,0.405787,0.671431,-0.480856,0.434779,0.468499,0.376424,0.368048,0.192545,0.095572,-0.542234,0.065547,0.363267,-0.162804,0.319617,0.469473,0.578466,-0.803637,...,0.344321,-0.53405,0.728764,0.86011,-0.028166,0.491114,0.531556,0.54535,-0.241544,-0.091119,0.001625,0.950923,-0.367805,-0.215411,-0.131891,0.709081,0.612876,0.814885,0.27868,0.00369,-0.286773,-0.100936,-0.382605,-0.191886,-0.41642,-0.856593,0.537442,0.281693,-0.681829,0.666095,-0.487843,0.823124,0.412688,-0.338654,0.320787,0.588643,-0.594137,0.182828,0.39709,-0.834364
2,2,1,1408667706000,0,250,-0.619619,-0.97296,-0.20736,-0.128861,0.044748,-0.387535,-0.730477,-0.066126,-0.754899,-0.242004,0.670484,-0.280388,-0.557285,-0.084145,0.027782,0.294074,0.362697,-0.368549,0.14796,-0.011751,0.030209,0.106317,0.628013,0.388849,0.615911,-0.445113,0.106028,0.137109,-0.095536,0.342532,0.592646,-0.261791,0.342123,0.704539,-0.433067,...,-0.60084,-0.539323,0.418196,0.028715,0.135008,-0.313483,0.744361,-0.145113,-0.485484,-0.591555,0.030112,0.952736,-0.569376,-0.35968,0.561676,0.381671,-0.1893,0.42095,0.022337,-0.329559,0.232765,0.424368,-0.484598,0.394343,-0.329527,-0.852293,0.634992,-0.532673,-0.469396,0.714991,0.454756,0.473184,0.377866,-0.863887,-0.383365,0.137721,-0.810877,-0.44758,0.805932,-0.285284
3,3,1,1408468313000,0,230,-0.740843,-0.975749,0.391698,0.641738,-0.268645,0.191745,-0.825593,-0.710591,-0.040099,-0.110514,0.59164,-0.245205,-0.788418,-0.345293,-0.756407,0.141115,0.283783,-0.369959,0.374733,0.922633,0.172219,0.253582,0.552108,-0.454131,0.38516,-0.151101,0.598559,-0.61419,-0.258765,0.591907,0.618749,0.414598,0.698184,-0.114862,-0.069056,...,-0.137121,-0.24549,0.170559,0.568089,-0.252295,-0.455726,0.756765,0.626615,-0.005399,-0.472355,0.324625,0.954345,-0.840553,-0.800332,0.111637,-0.501806,0.115986,0.80584,-0.041481,0.32063,-0.450113,-0.3787,0.509616,0.142787,0.14967,-0.896181,0.234389,-0.189831,-0.597612,0.263807,0.271535,0.03604,0.480029,-0.763173,0.022627,0.565165,-0.910286,-0.537838,0.243541,-0.885329
4,4,1,1407071171000,0,162,-0.279052,-0.972315,0.685374,0.113056,0.238315,0.271913,-0.568816,0.341194,-0.600554,-0.125644,0.698668,0.467503,-0.740664,-0.258062,-0.09411,0.256941,0.582071,-0.025041,0.339164,-0.348473,0.413076,0.159945,0.511137,0.608007,0.626225,-0.54069,0.62364,-0.019697,0.027607,-0.23637,0.311671,-0.559831,0.766381,0.412553,-0.663338,...,0.234426,-0.689234,0.751153,0.647424,-0.033631,-0.578617,0.820915,0.370377,-0.528718,-0.536308,-0.088349,0.944439,-0.887679,-0.617357,0.25459,0.084341,0.286366,0.360714,0.369629,-0.452543,-0.292962,-0.530023,-0.237123,0.233008,0.002609,-0.835145,0.494137,0.029535,-0.619638,0.813445,0.238286,0.809268,0.427521,-0.615932,-0.503697,0.61445,-0.91776,-0.424061,0.185484,-0.580292
5,5,1,1407413929000,0,196,-0.724885,-0.97461,0.114743,0.135984,-0.059124,0.139286,-0.720786,-0.311302,-0.543599,0.613687,0.762005,0.709754,-0.736228,-0.403499,0.171004,-0.297515,0.874423,-0.30524,0.400254,-0.439359,0.066276,-0.190274,0.553277,0.585319,0.835951,-0.420721,0.657003,0.147884,-0.681257,0.414024,0.354221,-0.535813,0.709872,0.040972,-0.653258,...,0.176661,-0.518598,0.231858,0.640447,-0.333733,-0.704026,0.851587,0.67902,-0.746762,-0.405075,0.412475,0.952301,-0.801871,-0.077079,0.200865,0.581785,0.639167,0.256305,0.550526,-0.735179,-0.202093,-0.526555,-0.04689,-0.150501,-0.431276,-0.884736,0.702977,0.261991,-0.37443,0.65662,0.430414,0.49201,0.650828,-0.627942,0.562439,0.792561,-0.87757,-0.002212,0.426587,-0.065801
6,6,1,1409896802000,0,203,-0.161707,-0.967413,0.637266,0.249817,0.471958,0.391854,-0.815684,-0.296389,-0.804284,-0.513054,0.785833,0.026876,-0.6718,-0.22047,-0.352501,0.053875,-0.236351,-0.292241,0.004806,0.818565,-0.002179,0.361274,0.459593,-0.423408,0.440065,-0.073019,0.313988,-0.592909,-0.427994,0.354607,0.314822,-0.0755,0.415698,0.494282,-0.371775,...,-0.618818,-0.472592,0.268117,0.331901,0.6543,-0.292657,0.602945,0.71873,-0.137349,-0.75719,0.008487,0.934082,-0.514351,-0.800751,-0.031917,-0.061942,-0.106038,0.502453,0.010406,0.622789,-0.449857,-0.066503,-0.040897,0.599051,0.731874,-0.811536,-0.144244,-0.277359,-0.710827,0.097672,0.02042,0.509749,0.559321,-0.794207,-0.366995,0.616669,-0.626267,-0.452933,0.166825,-0.72833
7,7,1,1412559620000,0,154,-0.104534,-0.965899,0.800449,0.351384,-0.389743,0.421198,-0.600862,-0.021568,-0.265164,-0.061443,0.529295,0.554996,-0.736896,-0.629256,0.031549,0.264446,0.738236,-0.063248,0.446135,0.5285,-0.071187,0.568788,0.498562,0.408146,0.751686,-0.416993,0.675601,-0.035497,-0.437063,0.485897,-0.200215,-0.278555,0.63977,0.596259,-0.792836,...,0.189375,-0.577892,0.690582,0.736201,0.099963,-0.592421,0.604038,0.770763,-0.456483,-0.336677,0.005851,0.936196,-0.838248,-0.438001,0.00368,0.075793,0.397864,0.683909,0.035378,-0.355743,-0.586051,-0.512331,-0.404841,0.361335,0.232215,-0.787002,0.648803,0.145114,-0.554226,0.875462,0.124667,0.545793,0.372644,-0.510156,-0.2171,0.676449,-0.85248,0.198584,0.243181,-0.596663
8,8,1,1414351550000,0,209,-0.417274,-0.977311,0.054967,0.351192,0.049049,-0.460898,-0.443555,0.154505,0.018687,0.378063,-0.186111,0.05094,-0.676748,-0.287247,-0.641594,0.192993,0.426948,-0.640258,0.306624,0.000994,0.670012,0.262158,0.422815,0.481606,0.341325,-0.43405,-0.394699,-0.255908,-0.378905,0.15467,0.33581,-0.389351,0.686418,0.031918,-0.858204,...,-0.521964,-0.670205,0.17434,0.668877,0.041758,0.115617,0.699685,0.698377,-0.677529,-0.491539,-0.313109,0.955452,-0.812667,-0.127691,0.25013,-0.515665,0.683172,0.861159,-0.052384,-0.030429,-0.628825,-0.399839,0.191211,-0.46856,-0.48572,-0.851513,0.421991,-0.236139,-0.7355,0.554666,0.601953,0.756593,-0.257438,-0.130827,-0.0612,0.438138,-0.719562,-0.744506,0.163397,-0.499539
9,9,1,1412526792000,0,181,-0.348442,-0.966953,0.682574,-0.400164,0.646688,0.085186,-0.636333,-0.351318,-0.542529,0.198339,0.63916,0.116398,-0.565867,-0.466943,-0.387493,0.13452,-0.105635,-0.274094,0.390694,0.692602,-0.530828,0.642726,0.566932,-0.244221,0.261095,0.066121,0.552338,-0.474602,-0.259136,-0.007225,0.364449,0.205759,0.553315,0.76093,-0.586314,...,-0.443223,-0.466292,0.335697,0.460732,0.15553,-0.452083,0.774534,0.686354,-0.088361,-0.689979,0.587864,0.93713,-0.32254,-0.807357,0.133773,-0.040301,0.060819,0.49066,-0.193324,0.430582,-0.490337,0.438179,0.085634,0.292708,0.159592,-0.800538,0.346945,-0.288825,-0.289413,0.562637,0.220249,0.025085,0.510007,-0.832078,-0.555277,0.317832,-0.576926,-0.67172,0.304353,-0.721567


In [29]:
articles_meta.describe()

Unnamed: 0,article_id,category_id,created_at_ts,publisher_id,words_count
count,364047.0,364047.0,364047.0,364047.0,364047.0
mean,182023.0,283.108239,1474070000000.0,0.0,190.897727
std,105091.461061,136.72347,42930380000.0,0.0,59.502766
min,0.0,0.0,1159356000000.0,0.0,0.0
25%,91011.5,199.0,1444925000000.0,0.0,159.0
50%,182023.0,301.0,1489422000000.0,0.0,186.0
75%,273034.5,399.0,1509891000000.0,0.0,218.0
max,364046.0,460.0,1520943000000.0,0.0,6690.0


In [30]:
articles_emp_profile = pandas_profiling.ProfileReport(articles_emb_df.assign(category_id=articles.category_id).sample(n=3*1000, 
                                                                                                                      weights='category_id', 
                                                                                                                      random_state=42))  # stratified sample per category

In [31]:
articles_emp_profile.get_rejected_variables(threshold=0.9)

['V127', 'V134', 'V150', 'V198', 'V221', 'V235', 'V43', 'category_id']

#### Clean the article data

In [32]:
articles.drop(['publisher_id', 'V127', 'V134', 'V150', 'V198', 'V221', 'V235', 'V43'], axis=1, inplace=True)

In [33]:
articles.columns

Index(['article_id', 'category_id', 'created_at_ts', 'words_count', 'V0', 'V1',
       'V2', 'V3', 'V4', 'V5',
       ...
       'V240', 'V241', 'V242', 'V243', 'V244', 'V245', 'V246', 'V247', 'V248',
       'V249'],
      dtype='object', length=247)

#### Click data

In [34]:
clicks.head(10)

Unnamed: 0,user_id,session_id,session_start,session_size,click_article_id,click_timestamp,click_environment,click_deviceGroup,click_os,click_country,click_region,click_referrer_type
0,137066,1508106983217640,1508106983000,2,218028,1508107123103,4,1,17,1,25,2
1,137066,1508106983217640,1508106983000,2,214753,1508107153103,4,1,17,1,25,2
2,312478,1508106983626641,1508106983000,2,74719,1508107495538,4,1,17,1,25,4
3,312478,1508106983626641,1508106983000,2,74722,1508107525538,4,1,17,1,25,4
4,266161,1508106984212642,1508106984000,3,331116,1508107023602,4,3,2,1,20,2
5,266161,1508106984212642,1508106984000,3,199474,1508107335633,4,3,2,1,20,2
6,266161,1508106984212642,1508106984000,3,277492,1508107365633,4,3,2,1,20,2
7,33723,1508106984307643,1508106984000,2,284802,1508107248277,4,1,17,1,21,2
8,33723,1508106984307643,1508106984000,2,215613,1508107278277,4,1,17,1,21,2
9,115570,1508106985263644,1508106985000,2,202355,1508107182256,4,4,20,1,26,2


In [35]:
clicks = clicks.infer_objects()

In [36]:
clicks.dtypes

user_id                int64
session_id             int64
session_start          int64
session_size           int64
click_article_id       int64
click_timestamp        int64
click_environment      int64
click_deviceGroup      int64
click_os               int64
click_country          int64
click_region           int64
click_referrer_type    int64
dtype: object

In [37]:
pandas_profiling.ProfileReport(clicks)

0,1
Number of variables,12
Number of observations,2988181
Total Missing (%),0.0%
Total size in memory,273.6 MiB
Average record size in memory,96.0 B

0,1
Numeric,10
Categorical,0
Boolean,0
Date,0
Text (Unique),0
Rejected,2
Unsupported,0

0,1
Distinct count,46033
Unique (%),1.5%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,194920
Minimum,3
Maximum,364046
Zeros (%),0.0%

0,1
Minimum,3
5-th percentile,42223
Q1,124230
Median,202380
Q3,277070
95-th percentile,336250
Maximum,364046
Range,364043
Interquartile range,152840

0,1
Standard deviation,90768
Coef of variation,0.46566
Kurtosis,-0.94305
Mean,194920
MAD,76806
Skewness,-0.12344
Sum,582464155343
Variance,8238900000
Memory size,22.8 MiB

Value,Count,Frequency (%),Unnamed: 3
160974,37213,1.2%,
272143,28943,1.0%,
336221,23851,0.8%,
234698,23499,0.8%,
123909,23122,0.8%,
336223,21855,0.7%,
96210,21577,0.7%,
162655,21062,0.7%,
183176,20303,0.7%,
168623,19526,0.7%,

Value,Count,Frequency (%),Unnamed: 3
3,1,0.0%,
27,1,0.0%,
69,1,0.0%,
81,2,0.0%,
84,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
364017,22,0.0%,
364022,1,0.0%,
364028,1,0.0%,
364043,8,0.0%,
364046,2,0.0%,

0,1
Distinct count,11
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,1.3577
Minimum,1
Maximum,11
Zeros (%),0.0%

0,1
Minimum,1
5-th percentile,1
Q1,1
Median,1
Q3,1
95-th percentile,1
Maximum,11
Range,10
Interquartile range,0

0,1
Standard deviation,1.7259
Coef of variation,1.2712
Kurtosis,21.553
Mean,1.3577
MAD,0.68281
Skewness,4.8023
Sum,4056922
Variance,2.9786
Memory size,22.8 MiB

Value,Count,Frequency (%),Unnamed: 3
1,2852406,95.5%,
10,61377,2.1%,
11,29999,1.0%,
8,9556,0.3%,
6,7256,0.2%,
9,6746,0.2%,
2,6101,0.2%,
3,4540,0.2%,
5,3498,0.1%,
4,3389,0.1%,

Value,Count,Frequency (%),Unnamed: 3
1,2852406,95.5%,
2,6101,0.2%,
3,4540,0.2%,
4,3389,0.1%,
5,3498,0.1%,

Value,Count,Frequency (%),Unnamed: 3
7,3313,0.1%,
8,9556,0.3%,
9,6746,0.2%,
10,61377,2.1%,
11,29999,1.0%,

0,1
Distinct count,5
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,1.8193
Minimum,1
Maximum,5
Zeros (%),0.0%

0,1
Minimum,1
5-th percentile,1
Q1,1
Median,1
Q3,3
95-th percentile,3
Maximum,5
Range,4
Interquartile range,2

0,1
Standard deviation,1.0422
Coef of variation,0.57286
Kurtosis,-1.427
Mean,1.8193
MAD,0.99976
Skewness,0.57639
Sum,5436415
Variance,1.0862
Memory size,22.8 MiB

Value,Count,Frequency (%),Unnamed: 3
1,1823162,61.0%,
3,1047086,35.0%,
4,117640,3.9%,
5,283,0.0%,
2,10,0.0%,

Value,Count,Frequency (%),Unnamed: 3
1,1823162,61.0%,
2,10,0.0%,
3,1047086,35.0%,
4,117640,3.9%,
5,283,0.0%,

Value,Count,Frequency (%),Unnamed: 3
1,1823162,61.0%,
2,10,0.0%,
3,1047086,35.0%,
4,117640,3.9%,
5,283,0.0%,

0,1
Distinct count,3
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,3.9427
Minimum,1
Maximum,4
Zeros (%),0.0%

0,1
Minimum,1
5-th percentile,4
Q1,4
Median,4
Q3,4
95-th percentile,4
Maximum,4
Range,3
Interquartile range,0

0,1
Standard deviation,0.33968
Coef of variation,0.086155
Kurtosis,33.013
Mean,3.9427
MAD,0.11148
Skewness,-5.8487
Sum,11781358
Variance,0.11538
Memory size,22.8 MiB

Value,Count,Frequency (%),Unnamed: 3
4,2904478,97.2%,
2,79743,2.7%,
1,3960,0.1%,

Value,Count,Frequency (%),Unnamed: 3
1,3960,0.1%,
2,79743,2.7%,
4,2904478,97.2%,

Value,Count,Frequency (%),Unnamed: 3
1,3960,0.1%,
2,79743,2.7%,
4,2904478,97.2%,

0,1
Distinct count,8
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,13.278
Minimum,2
Maximum,20
Zeros (%),0.0%

0,1
Minimum,2
5-th percentile,2
Q1,2
Median,17
Q3,17
95-th percentile,20
Maximum,20
Range,18
Interquartile range,15

0,1
Standard deviation,6.8817
Coef of variation,0.5183
Kurtosis,-0.93175
Mean,13.278
MAD,6.0178
Skewness,-0.95412
Sum,39675882
Variance,47.358
Memory size,22.8 MiB

Value,Count,Frequency (%),Unnamed: 3
17,1738138,58.2%,
2,788699,26.4%,
20,369586,12.4%,
12,60096,2.0%,
13,23711,0.8%,
19,6384,0.2%,
5,1513,0.1%,
3,54,0.0%,

Value,Count,Frequency (%),Unnamed: 3
2,788699,26.4%,
3,54,0.0%,
5,1513,0.1%,
12,60096,2.0%,
13,23711,0.8%,

Value,Count,Frequency (%),Unnamed: 3
12,60096,2.0%,
13,23711,0.8%,
17,1738138,58.2%,
19,6384,0.2%,
20,369586,12.4%,

0,1
Distinct count,7
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,1.839
Minimum,1
Maximum,7
Zeros (%),0.0%

0,1
Minimum,1
5-th percentile,1
Q1,1
Median,2
Q3,2
95-th percentile,5
Maximum,7
Range,6
Interquartile range,1

0,1
Standard deviation,1.1564
Coef of variation,0.6288
Kurtosis,9.1175
Mean,1.839
MAD,0.67065
Skewness,2.84
Sum,5495209
Variance,1.3372
Memory size,22.8 MiB

Value,Count,Frequency (%),Unnamed: 3
2,1602601,53.6%,
1,1194321,40.0%,
5,80766,2.7%,
7,69798,2.3%,
6,20455,0.7%,
4,19820,0.7%,
3,420,0.0%,

Value,Count,Frequency (%),Unnamed: 3
1,1194321,40.0%,
2,1602601,53.6%,
3,420,0.0%,
4,19820,0.7%,
5,80766,2.7%,

Value,Count,Frequency (%),Unnamed: 3
3,420,0.0%,
4,19820,0.7%,
5,80766,2.7%,
6,20455,0.7%,
7,69798,2.3%,

0,1
Distinct count,28
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,18.313
Minimum,1
Maximum,28
Zeros (%),0.0%

0,1
Minimum,1
5-th percentile,5
Q1,13
Median,21
Q3,25
95-th percentile,27
Maximum,28
Range,27
Interquartile range,12

0,1
Standard deviation,7.064
Coef of variation,0.38573
Kurtosis,-0.97551
Mean,18.313
MAD,6.2052
Skewness,-0.54588
Sum,54723498
Variance,49.9
Memory size,22.8 MiB

Value,Count,Frequency (%),Unnamed: 3
25,804985,26.9%,
21,464230,15.5%,
13,320957,10.7%,
8,179339,6.0%,
16,164884,5.5%,
28,135793,4.5%,
24,130537,4.4%,
20,120884,4.0%,
5,96979,3.2%,
9,84693,2.8%,

Value,Count,Frequency (%),Unnamed: 3
1,7110,0.2%,
2,16728,0.6%,
3,3997,0.1%,
4,30265,1.0%,
5,96979,3.2%,

Value,Count,Frequency (%),Unnamed: 3
24,130537,4.4%,
25,804985,26.9%,
26,18893,0.6%,
27,18711,0.6%,
28,135793,4.5%,

0,1
Correlation,0.99901

0,1
Distinct count,1048594
Unique (%),35.1%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,1507500000000000
Minimum,1506825423271737
Maximum,1508211379189330
Zeros (%),0.0%

0,1
Minimum,1506825423271737
5-th percentile,1506900000000000
Q1,1507100000000000
Median,1507500000000000
Q3,1507700000000000
95-th percentile,1508200000000000
Maximum,1508211379189330
Range,1385955917593
Interquartile range,625260000000

0,1
Standard deviation,385520000000
Coef of variation,0.00025574
Kurtosis,-1.1114
Mean,1507500000000000
MAD,335290000000
Skewness,0.18076
Sum,3594316781523237933
Variance,1.4863e+23
Memory size,22.8 MiB

Value,Count,Frequency (%),Unnamed: 3
1507563657895091,124,0.0%,
1507896573228093,107,0.0%,
1507133567968022,106,0.0%,
1507309773225261,98,0.0%,
1508112331270612,94,0.0%,
1507647366292530,92,0.0%,
1507475403662486,86,0.0%,
1506959499272114,82,0.0%,
1508154737228813,79,0.0%,
1506999909218419,75,0.0%,

Value,Count,Frequency (%),Unnamed: 3
1506825423271737,2,0.0%,
1506825426267738,2,0.0%,
1506825435299739,2,0.0%,
1506825442704740,2,0.0%,
1506825528135741,2,0.0%,

Value,Count,Frequency (%),Unnamed: 3
1508211367250326,2,0.0%,
1508211369104327,7,0.0%,
1508211372158328,2,0.0%,
1508211376302329,2,0.0%,
1508211379189330,2,0.0%,

0,1
Distinct count,72
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,3.9019
Minimum,2
Maximum,124
Zeros (%),0.0%

0,1
Minimum,2
5-th percentile,2
Q1,2
Median,3
Q3,4
95-th percentile,9
Maximum,124
Range,122
Interquartile range,2

0,1
Standard deviation,3.9299
Coef of variation,1.0072
Kurtosis,158.46
Mean,3.9019
MAD,2.0089
Skewness,9.0901
Sum,11659539
Variance,15.444
Memory size,22.8 MiB

Value,Count,Frequency (%),Unnamed: 3
2,1260372,42.2%,
3,670185,22.4%,
4,374240,12.5%,
5,220105,7.4%,
6,135762,4.5%,
7,88354,3.0%,
8,58544,2.0%,
9,40878,1.4%,
10,29530,1.0%,
11,21714,0.7%,

Value,Count,Frequency (%),Unnamed: 3
2,1260372,42.2%,
3,670185,22.4%,
4,374240,12.5%,
5,220105,7.4%,
6,135762,4.5%,

Value,Count,Frequency (%),Unnamed: 3
94,94,0.0%,
98,98,0.0%,
106,106,0.0%,
107,107,0.0%,
124,124,0.0%,

0,1
Correlation,1

0,1
Distinct count,322897
Unique (%),10.8%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,107950
Minimum,0
Maximum,322896
Zeros (%),0.0%

0,1
Minimum,0
5-th percentile,6370
Q1,40341
Median,86229
Q3,163260
95-th percentile,274160
Maximum,322896
Range,322896
Interquartile range,122920

0,1
Standard deviation,83648
Coef of variation,0.7749
Kurtosis,-0.46867
Mean,107950
MAD,69871
Skewness,0.72312
Sum,322567642028
Variance,6997000000
Memory size,22.8 MiB

Value,Count,Frequency (%),Unnamed: 3
5890,1232,0.0%,
73574,939,0.0%,
15867,900,0.0%,
80350,783,0.0%,
15275,746,0.0%,
2151,722,0.0%,
4568,529,0.0%,
12897,513,0.0%,
11521,502,0.0%,
34541,501,0.0%,

Value,Count,Frequency (%),Unnamed: 3
0,8,0.0%,
1,12,0.0%,
2,4,0.0%,
3,17,0.0%,
4,7,0.0%,

Value,Count,Frequency (%),Unnamed: 3
322892,2,0.0%,
322893,2,0.0%,
322894,2,0.0%,
322895,2,0.0%,
322896,2,0.0%,

Unnamed: 0,user_id,session_id,session_start,session_size,click_article_id,click_timestamp,click_environment,click_deviceGroup,click_os,click_country,click_region,click_referrer_type
0,137066,1508106983217640,1508106983000,2,218028,1508107123103,4,1,17,1,25,2
1,137066,1508106983217640,1508106983000,2,214753,1508107153103,4,1,17,1,25,2
2,312478,1508106983626641,1508106983000,2,74719,1508107495538,4,1,17,1,25,4
3,312478,1508106983626641,1508106983000,2,74722,1508107525538,4,1,17,1,25,4
4,266161,1508106984212642,1508106984000,3,331116,1508107023602,4,3,2,1,20,2


## Recommender model

In [38]:
small_data = clicks.sample(n=200*1000, weights="click_region", random_state=42)

### Prepare train-test data

In [39]:
dataset = Interactions(user_ids=small_data.user_id.values, 
                       item_ids=small_data.click_article_id.values, 
                       timestamps=small_data.click_timestamp.values)

In [40]:
train, test = random_train_test_split(dataset, test_percentage=0.1)

In [41]:
train, test

(<Interactions dataset (322895 users x 364044 items x 180000 interactions)>,
 <Interactions dataset (322895 users x 364044 items x 20000 interactions)>)

In [42]:
model = ImplicitFactorizationModel(n_iter=4, 
                                   loss='bpr', 
                                   batch_size=32,
                                   use_cuda=torch.cuda.is_available(), 
                                   random_state=np.random.RandomState(42))

In [43]:
model.fit(train, verbose=True)

  return torch._C._nn.nll_loss(input, target, weight, size_average, ignore_index, reduce)


Epoch 0: loss 0.144397094260984
Epoch 1: loss 0.013176932133899795
Epoch 2: loss 0.0005321995268265406
Epoch 3: loss 0.00023682927323712244


In [44]:
mrr = mrr_score(model, test)

In [45]:
f'Mean reciprocal rank score is {np.mean(mrr)}'

'Mean reciprocal rank score is 0.029870174154422592'

In [46]:
stats.describe(mrr)

DescribeResult(nobs=18031, minmax=(2.747418800037365e-06, 1.0), mean=0.029870174154422592, variance=0.011798552605954624, skewness=7.174885597789388, kurtosis=57.29601146956058)