In [22]:
# libraries
import pandas as pd
import numpy as np
import time
import torch
from tqdm import tqdm
from preprocessing import extract_data_mooc, extractFeatures,extractItemUserId,extractNextStateItem,extractNextUserState,UserNextInteraction, delta, t_batch_update,train_test_split
from model import RODIE,dynamic_embedding
from train import train_rodie

In [2]:
## Téléchargement des données
!wget https://snap.stanford.edu/data/act-mooc.tar.gz
!tar -xzf  act-mooc.tar.gz
!mkdir data

--2022-03-17 09:37:34--  https://snap.stanford.edu/data/act-mooc.tar.gz
Resolving snap.stanford.edu (snap.stanford.edu)... 171.64.75.80
Connecting to snap.stanford.edu (snap.stanford.edu)|171.64.75.80|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5378133 (5.1M) [application/x-gzip]
Saving to: ‘act-mooc.tar.gz’


2022-03-17 09:37:35 (5.23 MB/s) - ‘act-mooc.tar.gz’ saved [5378133/5378133]



In [23]:
# Read the three raw tab-separated MOOC tables (action features, action
# labels, and the user/target/timestamp actions) into DataFrames.
# NOTE(review): none of the other cells visible here reference these three
# frames — confirm they are still needed.
features, labels, users = [
    pd.read_csv(tsv_path, sep="\t")
    for tsv_path in (
        "act-mooc/mooc_action_features.tsv",
        "act-mooc/mooc_action_labels.tsv",
        "act-mooc/mooc_actions.tsv",
    )
]

#### Load & Preprocess Data

In [24]:
# Load the MOOC interaction table through the project helper.
mooc_data = extract_data_mooc()

# Derive the extra per-interaction columns. Every helper is handed its own
# copy of the table, and all of them run before any derived column is
# attached, so each one sees the table exactly as it was loaded.
delta_u = delta(mooc_data.copy(), "user_id")
delta_i = delta(mooc_data.copy(), "item_id")
nextItemInteraction = UserNextInteraction(mooc_data.copy())
next_state_user = extractNextUserState(mooc_data.copy())

mooc_data = mooc_data.assign(
    delta_u=delta_u,
    delta_i=delta_i,
    nextItemInteraction=nextItemInteraction,
    next_state_user=next_state_user,
)

# Keep the rows where at least one of the two "next" columns differs from -1,
# restricted to the columns used downstream (in this order).
# NOTE(review): with `|` a row is dropped only when BOTH columns equal -1 —
# confirm this is intended rather than dropping rows where EITHER is -1.
data = mooc_data.loc[
    (mooc_data.nextItemInteraction != -1) | (mooc_data.next_state_user != -1),
    [
        'user_id', 'item_id', 'timestamp', 'state_label',
        'delta_u', 'delta_i', 'nextItemInteraction', 'next_state_user',
        'f1', 'f2', 'f3', 'f4',
    ],
].copy()
data.head()

features columns Index(['ACTIONID', 'FEATURE0', 'FEATURE1', 'FEATURE2', 'FEATURE3'], dtype='object')

labels columns Index(['ACTIONID', 'LABEL'], dtype='object')
users columns Index(['ACTIONID', 'USERID', 'TARGETID', 'TIMESTAMP'], dtype='object')
delta user_id
delta item_id


Unnamed: 0_level_0,user_id,item_id,timestamp,state_label,delta_u,delta_i,nextItemInteraction,next_state_user,f1,f2,f3,f4
ACTIONID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,0,0,0.0,0,0,0,1,0,-0.319991,-0.435701,0.106784,-0.067309
1,0,1,6.0,0,6,0,2,0,-0.319991,-0.435701,0.106784,-0.067309
2,0,2,41.0,0,35,0,1,0,-0.319991,-0.435701,0.106784,-0.067309
3,0,1,49.0,0,8,43,2,0,-0.319991,-0.435701,0.106784,-0.067309
4,0,2,51.0,0,2,10,3,0,-0.319991,-0.435701,0.106784,-0.067309


### T-batches

#### Train / Test SPLIT

In [26]:
# Three successive splits: each call re-splits the test part returned by the
# previous call, yielding three training parts and one final test set.
# NOTE(review): `train_test_stratified_split` is not among the names imported
# in the first cell (which imports `train_test_split`) — confirm where it is
# defined, otherwise this cell fails on a fresh kernel.
df_train1,df_test1 = train_test_stratified_split(data)
df_train2,df_test2 = train_test_stratified_split(df_test1)
df_train3,df_test = train_test_stratified_split(df_test2)

StratifiedKFold(n_splits=2, random_state=None, shuffle=False)
StratifiedKFold(n_splits=2, random_state=None, shuffle=False)
StratifiedKFold(n_splits=2, random_state=None, shuffle=False)


In [27]:
# Stack the three training parts row-wise into the final training frame, then
# show the (rows, columns) shape of the training and test frames.
df_train = pd.concat((df_train1, df_train2, df_train3), axis="index")
(df_train.shape, df_test.shape)

((354115, 12), (50587, 12))

In [47]:
# Report, for each split, the sum of `next_state_user` as a percentage of the
# number of rows.
print(
    "Proportion of change state in :\n Train data = {:.1f}%\n Test Data= {:.1f}%".format(
        *(
            100 * np.sum(split['next_state_user']) / split.shape[0]
            for split in (df_train, df_test)
        )
    )
)

Proportion of change state in :
 Train data = 1.0%
 Test Data= 1.0%


In [30]:
# Build the t-batches for the training frame first, then for the test frame.
t_batches_train, t_batches_test = map(t_batch_update, (df_train, df_test))

T-Batch start...
Number of interaction = 354115
T-Batch ends !
T-Batch start...
Number of interaction = 50587
T-Batch ends !


##### Initialize Device

In [31]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print('Using device:', device)

Using device: cuda


##### Initialize the model

In [36]:
embedding_dim = 8

# Training interactions converted to a float32 tensor.
data_torch = torch.from_numpy(df_train.to_numpy().astype(np.float32))

# Initial dynamic embeddings for users and items, moved onto the compute device.
U_dynamic, I_dynamic = (
    embedding.to(device)
    for embedding in dynamic_embedding(data_torch, embedding_dim)
)

# Build the model and place it on the same device as the embeddings.
model = RODIE(embedding_dim, data_torch, device=device).to(device)

Initialisation of dynamic embedding... Done !
Dynamic Embedding shape : Users [6882, 8], 	 Items [87, 8]
Number of users of 6882 
 Number of items 87 

Dataset size [354115, 12]
Initialisation of static embedding... Done !
Static Embedding shape : Users [6882, 6882], 	 Items [87, 87]
Initialisation of rnn's with relu activation function... Done !
Initialisation of MLP... Done !


In [37]:
# The dataset is unbalanced, so the loss is given a weight vector
# [1, ratio], where ratio = (number of training rows) / (1 + sum of
# `next_state_user`); the +1 keeps the denominator non-zero.
dropout_ratio = df_train.shape[0] / (np.sum(df_train['next_state_user']) + 1)
weight_ratio = torch.Tensor([1, dropout_ratio]).to(device)
print(weight_ratio)

tensor([ 1.0000, 99.4985], device='cuda:0')


##### Train Loop

In [38]:
# Try the model on a small subset of the data
#import itertools
#t_batches_ = dict(itertools.islice(t_batches.items(), 3000))

In [None]:
# Training settings.
# NOTE(review): lambda_u / lambda_i are presumably the user / item
# regularisation weights used inside train_rodie — confirm against train.py.
n_epochs = 10
lambda_u = lambda_i = 1e-3
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-5)

# Run training; the call returns three values, kept here as `model_`, `U`
# and `I`.
model_, U, I = train_rodie(
    t_batches_train, data_torch,
    U_dynamic, I_dynamic,
    weight_ratio,
    model, optimizer,
    n_epochs, lambda_u, lambda_i,
    device,
)

## TSNE

In [None]:
# Project the user embeddings `U` to 2-D with t-SNE and colour the users that
# have at least one selected interaction with next_state_user == 1.
import matplotlib.pyplot as plt  # `plt` was used below without being imported
from sklearn.manifold import TSNE

# Flatten the t-batches into one list (linear time, unlike `sum(l, [])`).
l = list(t_batches_train.values())
dd = [position for batch in l for position in batch]

# NOTE(review): the batches were built from `df_train`, but their entries are
# used here as row positions into `data` — confirm both frames share the same
# row order, otherwise `df_train.iloc` is what is wanted.
ff = data.iloc[dd, :].copy()
list_of_change = ff[ff['next_state_user'] == 1]['user_id'].values

data_ = U.detach().cpu().clone().numpy()

# One label per embedding row: 1.0 when the row index is one of the user ids
# collected above, else 0.0. The column is sized from the embedding itself
# rather than a hard-coded user count, and filled in a single vectorised pass.
df = pd.DataFrame(data_)
df['label'] = df.index.isin(list_of_change).astype(float)

# Fixed seed so the projection is reproducible from run to run.
tsne = TSNE(n_components=2, random_state=0)
data_tsne = tsne.fit_transform(data_)

fig, ax = plt.subplots()
ax.scatter(data_tsne[:, 0], data_tsne[:, 1], c=df['label'])
ax.set(
    title="t-SNE of user embeddings (coloured by state change)",
    xlabel="t-SNE 1",
    ylabel="t-SNE 2",
);