In [16]:
# libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
import math
import torch
from tqdm import tqdm
from preprocessing import *
from model import RODIE
from sklearn.manifold import TSNE
from train import dynamic_embedding ,train_rodie
from  torch import nn
from torch.nn import RNNCell
from torch.nn.functional import one_hot
from torch.nn import functional as F
from torch.nn import MSELoss, HuberLoss,L1Loss,CrossEntropyLoss
from tqdm import tqdm
## These settings are meant to speed up training on GPU.
# Turn on cuDNN benchmark mode.
torch.backends.cudnn.benchmark = True
# Make sure autograd anomaly detection is switched off.
torch.autograd.set_detect_anomaly(False)
# NOTE(review): the two calls below only construct profiler objects; they are never
# entered with `with`, so they presumably have no effect on training — confirm.
torch.autograd.profiler.profile(False)
# As the last expression of the cell, this object's repr is echoed as the cell output.
torch.autograd.profiler.emit_nvtx(False)

<torch.autograd.profiler.emit_nvtx at 0x7fea6e644650>

In [None]:
## Download data
!wget https://snap.stanford.edu/data/act-mooc.tar.gz
!tar -xzf  act-mooc.tar.gz
!mkdir data

In [17]:
# Raw tab-separated tables shipped in the act-mooc archive
# (read_table uses a tab delimiter by default).
features = pd.read_table("act-mooc/mooc_action_features.tsv")
labels = pd.read_table("act-mooc/mooc_action_labels.tsv")
users = pd.read_table("act-mooc/mooc_actions.tsv")

#### Load & Preprocess Data

In [18]:
# Build the interaction table that the feature-engineering cells below extend.
# NOTE(review): extract_data_mooc is not defined in this notebook; it presumably comes
# from `from preprocessing import *` and reads the act-mooc files — confirm.
mooc_data = extract_data_mooc()

In [None]:
## Time differences between interactions, computed once per user and once per item.
## Both are derived from copies of the table taken before any column is written back.
delta_u, delta_i = (
    delta(mooc_data.copy(), key_column) for key_column in ("user_id", "item_id")
)
## Scale each column by its standard deviation (no mean-centering is applied)
mooc_data['delta_u'] = delta_u / np.std(delta_u)
mooc_data['delta_i'] = delta_i / np.std(delta_i)

In [None]:
## Names of the two derived columns
interaction_column_name = "PreviousItemInteraction"
pastState_column_name = "previous_state_label"
## Previous item interaction and past user state, both computed from copies of the
## table and cast to int before being stored
PreviousItemInteraction_ = PreviousItemInteraction(mooc_data.copy()).astype(int)
next_state_user = PastUserState(mooc_data.copy()).astype(int)
mooc_data[interaction_column_name] = PreviousItemInteraction_
mooc_data[pastState_column_name] = next_state_user
## Rows whose past-state value is -1 are reset to 0
list_moins_un = list(
    mooc_data.index[(mooc_data[pastState_column_name] == -1).to_numpy()].values
)
mooc_data.loc[list_moins_un, pastState_column_name] = 0

In [None]:
# Column order is fixed here; the table is later converted wholesale to a tensor,
# so downstream code sees the columns in exactly this order.
model_columns = [
    'user_id', 'item_id', 'timestamp', 'state_label',
    'delta_u', 'delta_i',
    interaction_column_name, pastState_column_name,
    'f1', 'f2', 'f3', 'f4',
]
# Selecting columns already yields a new frame; rows are then ordered by timestamp.
data = mooc_data[model_columns].sort_values(by=['timestamp'])
data.head()

### T-batches

#### Train / Test Split

In [21]:
# Run the T-batch construction over the timestamp-sorted interactions in `data`.
# NOTE(review): `t_batches` is not referenced by any later cell shown in this notebook
# (training uses the output of t_batch_train_test instead) — confirm this call is still needed.
t_batches = t_batch_update(data)


T-Batch start...
Number of interaction = 411749
T-Batch ends !


##### Initialize Device

In [22]:
# Prefer CUDA when PyTorch can see a GPU; otherwise fall back to the CPU.
# (To force CPU while debugging, assign "cpu" to `device` after this block.)
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print('Using device:', device)

Using device: cpu


##### Initialize the model

In [24]:
# Split `data` into the training T-batches and the test portion, together with the
# interaction indices belonging to each side.
# NOTE(review): the second argument (1000) looks like the number of T-batches kept for
# training, judging by the log line this cell prints — confirm against t_batch_train_test.
t_batch_train, test, train_interactions, test_interactions = t_batch_train_test(data, 1000)

Train : Number of interactions in 1000 batches is equal to 3967
Test : Number of interactions is equal to 407782


In [25]:
# The labels are heavily imbalanced, so the loss gets a per-class weight:
# 1.0 for label 0 and (number of train interactions / (1 + number of dropouts)) for label 1.
train_dropouts = np.sum(data.loc[train_interactions, 'state_label'])
dropout_ratio_train = len(train_interactions) / (1.0 + train_dropouts)
weight_ratio_train = torch.Tensor([1.0, dropout_ratio_train]).to(device)
print("Number of dropout in the train data is {}".format(train_dropouts))

# Same ratio on the test side, computed from the index of `test`.
int_test = test.index.values.tolist()
dropout_ratio_test = len(int_test) / (1.0 + np.sum(data.loc[int_test, 'state_label']))
# The printed count is taken over `test_interactions`.
test_dropouts = np.sum(data.loc[test_interactions, 'state_label'])
print("Number of dropout in the test data is {}".format(test_dropouts))


Number of dropout in the train data is 21
Number of dropout in the test data is 4045


##### Train Loop

In [26]:
embedding_dim = 32
# The whole table as a single float32 tensor; columns keep the order they have in `data`.
data_torch = torch.from_numpy(data.to_numpy().astype(np.float32))
# Build the model and move it to the selected device.
model = RODIE(
    embedding_dim,
    data_torch,
    device=device,
    activation_rnn="relu",
).to(device)

Number of users of 7047 
 Number of items 97 

Dataset size [411749, 12]
Initialisation of static embedding... Done !
Static Embedding shape : Users [7047, 7047], 	 Items [98, 98]
Initialisation of rnn's with relu activation function... Done !
Initialisation of MLP... Done !


In [27]:
# Optimisation hyper-parameters
learning_rate = 1e-4
n_epochs = 1
lambda_u = 1
lambda_i = 1

# Adam with a small weight decay; the same learning rate is also handed to train_rodie.
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=learning_rate,
    weight_decay=1e-5,
)

# All arguments are positional, so their order must match train_rodie's signature.
model_, U, I = train_rodie(
    t_batch_train, data_torch, weight_ratio_train,
    model, optimizer, learning_rate,
    n_epochs, lambda_u, lambda_i,
    device,
)

Initialisation of dynamic embedding... Done !
Dynamic Embedding shape : Users [7047, 32], 	 Items [690606, 32]
Training...


100%|█████████▉| 999/1000 [00:15<00:00, 64.27it/s]

tensor([[0.0686, 0.1067, 0.1597,  ..., 0.0224, 0.1768, 0.2082],
        [0.0686, 0.1067, 0.1597,  ..., 0.0224, 0.1768, 0.2082],
        [0.0686, 0.1067, 0.1597,  ..., 0.0224, 0.1768, 0.2082],
        ...,
        [0.0686, 0.1067, 0.1597,  ..., 0.0224, 0.1768, 0.2082],
        [0.0686, 0.1067, 0.1597,  ..., 0.0224, 0.1768, 0.2082],
        [0.0686, 0.1067, 0.1597,  ..., 0.0224, 0.1768, 0.2082]])
Epoch 0 Loss 333.73341443389654





## t-SNE visualisation of the embeddings

In [None]:
# Produce one t-SNE plot keyed by user and one keyed by item.
# NOTE(review): `embedding` and `interactions` are not assigned in any cell of this
# notebook, so this cell raises NameError on a fresh kernel unless the star-import
# happens to provide them. Presumably the intended inputs are the trained embeddings
# returned by train_rodie (U / I) and train_interactions — confirm before running.
tsne_embedding_users = plot_tsne(data,embedding,interactions,"user_id")
tsne_embedding_items = plot_tsne(data,embedding,interactions,"item_id")

# Testing

In [None]:
# Evaluate the trained model on the test portion, reusing the embeddings (U, I)
# returned by train_rodie.
# NOTE(review): test_rodie is not among the names imported explicitly at the top
# (`from train import dynamic_embedding, train_rodie`); it must come from the
# `preprocessing` star-import or be imported separately — confirm.
# NOTE(review): judging by the names, the result is (true labels, predictions, AUC) — verify.
y, pred,auc = test_rodie(test, U, I, data_torch, model_, device)