In [2]:
import pandas as pd
from IPython.display import Image
import math
from datetime import datetime
from sklearn.model_selection import train_test_split
import numpy as np

###  These data were created by 610 users between March 29, 1996 and September 24, 2018. This dataset was generated on September 26, 2018.

In [3]:
# Load the ratings table (columns used below: userId, movieId, rating, timestamp).
df = pd.read_csv("../data/ml-latest-small/ratings.csv")

# `timestamp` holds seconds since the Unix epoch. Convert it vectorised and in UTC:
# datetime.fromtimestamp() applies the machine's local time zone, so ratings made
# close to midnight on 31 December could fall into a different year bin depending
# on where the notebook is run.
df['time'] = pd.to_datetime(df['timestamp'], unit='s')

# Chronologically sorted copy whose `time` column is an ISO 'YYYY-MM-DD' string.
# ISO dates order lexicographically, so string comparisons against cut-off dates work.
df_time = df.sort_values('time', ascending=True)
df_time['time'] = df_time['time'].dt.strftime('%Y-%m-%d')

In [53]:
# Number of distinct movies that appear in the ratings data
df_time.movieId.nunique()

9724

### Bin the dataset by year: 1996-2003, 2004-2011, 2012-2018

In [4]:
# Three consecutive time bins spanning 8, 8 and 7 calendar years.
# `time` is an ISO 'YYYY-MM-DD' string, so plain string comparison orders the dates.
bin1 = df_time.loc[df_time['time'].le('2003-12-31')]
bin2 = df_time.loc[df_time['time'].gt('2003-12-31') & df_time['time'].le('2011-12-31')]
bin3 = df_time.loc[df_time['time'].gt('2011-12-31')]

In [5]:
def _split_bin(ratings_bin, test_size=0.2, random_state=42):
    """Split one time bin into train and test parts.

    Parameters
    ----------
    ratings_bin : DataFrame with at least userId, movieId, time and rating columns.
    test_size : fraction of rows placed in the test part.
    random_state : seed passed to train_test_split so the split is repeatable.

    Returns
    -------
    (train, test, r_train, r_test) where train/test carry the userId, movieId,
    time and rating columns and r_train/r_test are the matching rating Series.
    """
    features = ratings_bin[['userId', 'movieId', 'time']]
    x_train, x_test, r_train, r_test = train_test_split(
        features, ratings_bin['rating'], test_size=test_size, random_state=random_state
    )
    # assign() builds new frames (the ratings align on the row index) instead of
    # writing a column into the objects returned by train_test_split, which can
    # trigger pandas' SettingWithCopyWarning.
    return x_train.assign(rating=r_train), x_test.assign(rating=r_test), r_train, r_test


bin1_train, bin1_test, r1_train, r1_test = _split_bin(bin1)
bin2_train, bin2_test, r2_train, r2_test = _split_bin(bin2)
bin3_train, bin3_test, r3_train, r3_test = _split_bin(bin3)

In [6]:
# function to remove unseen movieId and userId and add them back to train dataset
def move_unseen(train, test):
    """Move test rows with an unseen movie or user into the train set.

    First, test rows whose movieId does not occur in ``train`` are taken out.
    Then, among the remaining test rows, those whose userId does not occur in
    ``train`` are taken out as well. Both checks use the ``train`` frame as it
    was passed in. The removed rows are appended to ``train``.

    Returns
    -------
    (train, test) : the enlarged train frame and the reduced test frame.
    """
    movie_known = test['movieId'].isin(train['movieId'])
    unseen_movie_rows = test.loc[~movie_known]
    kept = test.loc[movie_known]

    user_known = kept['userId'].isin(train['userId'])
    unseen_user_rows = kept.loc[~user_known]
    kept = kept.loc[user_known]

    enlarged_train = pd.concat([train, unseen_movie_rows, unseen_user_rows])
    return enlarged_train, kept

In [7]:
# Run move_unseen on each bin so the test rows only reference users and movies
# that also occur in that bin's training rows.
(bin1_train, bin1_test), (bin2_train, bin2_test), (bin3_train, bin3_test) = [
    move_unseen(train_part, test_part)
    for train_part, test_part in (
        (bin1_train, bin1_test),
        (bin2_train, bin2_test),
        (bin3_train, bin3_test),
    )
]

In [8]:
# Share of bin-3 rows that are in the training part
len(bin3_train) / (len(bin3_train) + len(bin3_test))

0.8212304589006556

In [9]:
# Reshape each split into a user x movie rating matrix
# (rows: userId, columns: movieId, cells: rating, NaN where no rating exists).
R1_train = pd.pivot_table(bin1_train, values='rating', index='userId', columns='movieId')
R1_test = pd.pivot_table(bin1_test, values='rating', index='userId', columns='movieId')

R2_train = pd.pivot_table(bin2_train, values='rating', index='userId', columns='movieId')
R2_test = pd.pivot_table(bin2_test, values='rating', index='userId', columns='movieId')

R3_train = pd.pivot_table(bin3_train, values='rating', index='userId', columns='movieId')
R3_test = pd.pivot_table(bin3_test, values='rating', index='userId', columns='movieId')

### Calculate parameters

In [64]:
# Pool the training rows of all bins and build the overall user x movie matrix.
all_train = pd.concat([bin1_train, bin2_train, bin3_train])
R = pd.pivot_table(all_train, values='rating', index='userId', columns='movieId')

# Global mean rating over every training row.
mu = all_train['rating'].mean()

# bu: per-user bias = the user's mean rating (NaN ignored) minus the global mean.
# Column 0 holds the bias, `userId` the matching user.
bu = pd.DataFrame(np.nanmean(R, axis=1) - mu).assign(userId=R.index)

# Restrict the user biases to the users present in each bin's training matrix.
bu1 = bu.loc[bu['userId'].isin(R1_train.index)]
bu2 = bu.loc[bu['userId'].isin(R2_train.index)]
bu3 = bu.loc[bu['userId'].isin(R3_train.index)]

# bi: per-movie bias = the movie's mean rating (NaN ignored) minus the global mean.
bi = pd.DataFrame(np.nanmean(R, axis=0) - mu).assign(movieId=R.columns)

# Restrict the movie biases to the movies present in each bin's training matrix.
bi1 = bi.loc[bi['movieId'].isin(R1_train.columns)]
bi2 = bi.loc[bi['movieId'].isin(R2_train.columns)]
bi3 = bi.loc[bi['movieId'].isin(R3_train.columns)]

# bit<k>: per-movie bias inside bin k = the movie's mean rating within the bin
# minus that bin's own mean training rating (mu<k>).
mu1 = bin1_train['rating'].mean()
bit1 = pd.DataFrame(np.nanmean(R1_train, axis=0) - mu1).assign(movieId=R1_train.columns)

mu2 = bin2_train['rating'].mean()
bit2 = pd.DataFrame(np.nanmean(R2_train, axis=0) - mu2).assign(movieId=R2_train.columns)

mu3 = bin3_train['rating'].mean()
bit3 = pd.DataFrame(np.nanmean(R3_train, axis=0) - mu3).assign(movieId=R3_train.columns)

 ### what we have in the end:
- train datasets: bin1_train,  bin2_train,  bin3_train
- test datasets: bin1_test, bin2_test, bin3_test

- train rating matrix: R1_train, R2_train, R3_train

- temporal dynamics bias: bu1, bu2, bu3, bi1, bi2, bi3, bit1, bit2, bit3

In [73]:
# Display the bin-1 training matrix (rows: userId, columns: movieId; NaN = no rating in bin1_train)
R1_train

movieId,1,2,3,4,5,6,7,8,9,10,...,7082,7085,7093,7102,7107,7121,7143,7149,7151,7153
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,4.0,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,
6,,4.0,,3.0,5.0,4.0,,3.0,,,...,,,,,,,,,,
8,,4.0,,,,,,,,2.0,...,,,,,,,,,,
9,,,,,,,,,,,...,,,,,,,,,,
11,,,,,,5.0,,,,3.0,...,,,,,,,,,,
13,,,,,,,,,,,...,,,,,,,,,,
14,,,,3.0,,,3.0,,,,...,,,,,,,,,,
19,4.0,3.0,3.0,,,,2.0,,,,...,,,,,,,,,,


In [75]:
# Rating given by userId 1 to movieId 1 in the bin-1 training matrix (NaN when absent)
R1_train.loc[1, 1]

nan

0       0.437853
1      -0.030378
2      -0.294289
3      -1.396670
4      -0.309170
5       0.478638
6      -0.290149
7      -0.353813
8      -0.496670
9       0.008380
10      0.299248
11     -0.934170
12     -0.163337
13      0.412421
14     -0.718893
15      0.416028
16      0.238624
17      0.281107
18     -0.804890
19     -1.121670
20     -0.050004
21     -0.169084
22     -0.425242
23     -0.444039
24      0.045702
25      0.003330
26     -0.163337
27      0.614441
28      0.567846
29      0.503330
          ...   
4664   -0.096670
4680   -1.246670
4683   -0.530004
4687   -0.163337
4689   -0.496670
4690    0.503330
4691   -0.330004
4694    0.128330
4695   -0.746670
4706   -0.371670
4708   -0.353813
4717   -0.121670
4718   -0.496670
4719    0.253330
4720    0.003330
4721   -0.496670
4722    0.503330
4723    0.003330
4726    0.003330
4728    0.253330
4747    0.253330
4750   -0.246670
4758    0.253330
4762   -0.996670
4765   -0.746670
4773    1.253330
4786    0.442105
4788    0.1366