In [2]:
# libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
import torch
from tqdm import tqdm
from preprocessing import extract_data_mooc, extractFeatures,extractItemUserId,extractNextStateItem,extractNextUserState,UserNextInteraction, delta, t_batch_update,train_test_split,train_test_stratified_split
from model import RODIE,dynamic_embedding
from sklearn.manifold import TSNE
from train import train_rodie

In [3]:
## Téléchargement des données
!wget https://snap.stanford.edu/data/act-mooc.tar.gz
!tar -xzf  act-mooc.tar.gz
!mkdir data

--2022-03-17 10:41:00--  https://snap.stanford.edu/data/act-mooc.tar.gz
Resolving snap.stanford.edu (snap.stanford.edu)... 171.64.75.80
Connecting to snap.stanford.edu (snap.stanford.edu)|171.64.75.80|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5378133 (5.1M) [application/x-gzip]
Saving to: ‘act-mooc.tar.gz’


2022-03-17 10:41:01 (4.89 MB/s) - ‘act-mooc.tar.gz’ saved [5378133/5378133]



In [4]:
# Load the three tab-separated tables found under act-mooc/, in the same order as before.
features, labels, users = (
    pd.read_csv(tsv_path, sep="\t")
    for tsv_path in (
        "act-mooc/mooc_action_features.tsv",
        "act-mooc/mooc_action_labels.tsv",
        "act-mooc/mooc_actions.tsv",
    )
)

#### Load & Preprocess Data

In [5]:
# Interaction table built by the project's preprocessing helper.
mooc_data = extract_data_mooc()

# Each helper is handed its own copy of the table.
delta_u = delta(mooc_data.copy(), "user_id")
delta_i = delta(mooc_data.copy(), "item_id")
nextItemInteraction = UserNextInteraction(mooc_data.copy())
next_state_user = extractNextUserState(mooc_data.copy())

# Attach the derived columns to the main table (same order as they were computed).
mooc_data['delta_u'], mooc_data['delta_i'] = delta_u, delta_i
mooc_data['nextItemInteraction'], mooc_data['next_state_user'] = nextItemInteraction, next_state_user

# Work on a copy: keep rows where at least one of the two "next" columns
# differs from -1, then fix the column order used by the rest of the notebook.
data = mooc_data.copy()
data = data.loc[(data['nextItemInteraction'] != -1) | (data['next_state_user'] != -1)]
data = data.loc[:, [
    'user_id', 'item_id', 'timestamp', 'state_label',
    'delta_u', 'delta_i', 'nextItemInteraction', 'next_state_user',
    'f1', 'f2', 'f3', 'f4',
]]
data.head()

delta user_id
delta item_id


Unnamed: 0_level_0,user_id,item_id,timestamp,state_label,delta_u,delta_i,nextItemInteraction,next_state_user,f1,f2,f3,f4
ACTIONID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,0,0,0.0,0,0,0,1,0,-0.319991,-0.435701,0.106784,-0.067309
1,0,1,6.0,0,6,0,2,0,-0.319991,-0.435701,0.106784,-0.067309
2,0,2,41.0,0,35,0,1,0,-0.319991,-0.435701,0.106784,-0.067309
3,0,1,49.0,0,8,43,2,0,-0.319991,-0.435701,0.106784,-0.067309
4,0,2,51.0,0,2,10,3,0,-0.319991,-0.435701,0.106784,-0.067309


### T-batches

#### Train / Test SPLIT

In [39]:
# Apply the stratified split three times in a row: each call re-splits the
# test part returned by the previous call, and the last test part becomes `df_test`.
# NOTE(review): the printed output suggests every call uses StratifiedKFold(n_splits=2),
# i.e. a 50/50 split each time — confirm in preprocessing.train_test_stratified_split.
df_train1,df_test1 = train_test_stratified_split(data)
df_train2,df_test2 = train_test_stratified_split(df_test1)
df_train3,df_test = train_test_stratified_split(df_test2)

StratifiedKFold(n_splits=2, random_state=None, shuffle=False)
StratifiedKFold(n_splits=2, random_state=None, shuffle=False)
StratifiedKFold(n_splits=2, random_state=None, shuffle=False)


In [40]:
# The training frame is built from the first split only.
# NOTE(review): df_train2 and df_train3 are not part of this concat — confirm that
# leaving them out of training is intended.
df_train = pd.concat([df_train1],axis=0)
# Display the (rows, columns) shape of the train and test frames.
df_train.shape,df_test.shape

((202351, 12), (50587, 12))

In [41]:
# Share of rows whose `next_state_user` is set, as a percentage of each split.
print(
    "Proportion of dropout user in :\n Train data = {:.1f}%\n Test Data= {:.1f}%".format(
        *(
            100 * np.sum(split['next_state_user']) / split.shape[0]
            for split in (df_train, df_test)
        )
    )
)

Proportion of dropout user in :
 Train data = 1.0%
 Test Data= 1.0%


In [42]:
# Build the t-batches separately for the train and the test frame.
# NOTE(review): the training loop iterates the result with `.items()`, so it is
# presumably a dict of batch id -> row indices — confirm in preprocessing.t_batch_update.
t_batches_train = t_batch_update(df_train)
t_batches_test = t_batch_update(df_test)

T-Batch start...
Number of interaction = 202351
T-Batch ends !
T-Batch start...
Number of interaction = 50587
T-Batch ends !


##### Initialize Device

In [43]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print('Using device:', device)

Using device: cuda


##### Initialize the model

In [56]:
embedding_dim = 32

# Seed PyTorch before anything random is created, so the initial embeddings and
# the layer weights are the same after every Restart & Run All.
torch.manual_seed(0)

# The whole (filtered) interaction table as a float32 tensor.
data_torch = torch.from_numpy(data.values.astype(np.float32))

# Initial dynamic embeddings, moved to the selected device.
U_dynamic, I_dynamic = dynamic_embedding(data_torch, embedding_dim)
U_dynamic = U_dynamic.to(device)
I_dynamic = I_dynamic.to(device)

model = RODIE(embedding_dim, data_torch, device=device, activation_rnn="tanh").to(device)

Initialisation of dynamic embedding... Done !
Dynamic Embedding shape : Users [7047, 32], 	 Items [97, 32]
Number of users of 7047 
 Number of items 97 

Dataset size [404702, 12]
Initialisation of static embedding... Done !
Static Embedding shape : Users [7047, 7047], 	 Items [97, 97]
Initialisation of rnn's with tanh activation function... Done !
Initialisation of MLP... Done !


In [57]:
# The two classes are very unbalanced, so the classification loss needs a
# per-class weight: class 0 gets 1 and class 1 gets
# (number of training rows) / (1 + sum of `next_state_user`).
dropout_ratio = df_train['next_state_user'].shape[0] / (np.sum(df_train['next_state_user']) + 1)
weight_ratio = torch.Tensor([1, dropout_ratio]).to(device)
print(weight_ratio)

tensor([ 1.0000, 99.4843], device='cuda:0')


##### Train Loop

In [58]:
# Test the model on a small subset of the data
#import itertools
#t_batches_ = dict(itertools.islice(t_batches.items(), 3000))

In [59]:
import torch
from  torch import nn
from torch.nn import RNNCell
from torch.nn.functional import one_hot
import math
from torch.nn import functional as F


## Linear layer whose weights (and bias, when present) are drawn from a normal distribution ##

class NormalLinear(nn.Linear):
    """``nn.Linear`` variant initialised from N(0, 1/sqrt(in_features))."""

    def reset_parameters(self):
        """Re-draw weight and bias in place from a zero-mean normal distribution."""
        # Standard deviation shrinks with the fan-in (second dimension of the weight).
        std = 1. / math.sqrt(self.in_features)
        with torch.no_grad():
            self.weight.normal_(0, std)
            if self.bias is not None:
                self.bias.normal_(0, std)

## Creates the initial dynamic embedding of every user and every item ##
def dynamic_embedding(data,embedding_dim):
    """Draw random, row-normalised initial embeddings for users and items.

    Args:
        data: 2-D tensor; column 0 is treated as the user id and column 1 as
            the item id.
        embedding_dim: number of components of each embedding vector.

    Returns:
        Tuple ``(users, items)`` of tensors shaped
        ``(n_distinct_users, embedding_dim)`` and
        ``(n_distinct_items, embedding_dim)``.

    Note:
        The tables are sized by the number of *distinct* ids, so indexing them
        by id presumably relies on ids being contiguous from 0 — verify
        against the preprocessing.
    """
    user_count = torch.unique(data[:, 0]).numel()
    item_count = torch.unique(data[:, 1]).numel()

    # Users are drawn before items so a fixed seed yields the same tables.
    users = F.normalize(torch.randn(user_count, embedding_dim))
    items = F.normalize(torch.randn(item_count, embedding_dim))

    print("Initialisation of dynamic embedding... Done !")
    print("Dynamic Embedding shape : Users {}, \t Items {}".format(list(users.size()), list(items.size())))

    return users, items
        

class RODIE(torch.nn.Module):
    """Recurrent model that co-evolves user and item dynamic embeddings.

    Two ``RNNCell`` modules update the dynamic embedding of a user and of an
    item after an interaction, a bias-free linear layer projects the user
    embedding forward in time, one linear layer predicts the embedding of the
    next item and a small MLP outputs two logits for the next user state.
    Dynamic embeddings are not stored on the module: callers pass them to
    ``forward``.
    """

    def __init__(self, embedding_dim, data, device, activation_rnn="relu", MLP_h_dim=50, option="user_state"):
        """Create the static one-hot embeddings and all trainable layers.

        Args:
            embedding_dim: size of the dynamic embeddings.
            data: 2-D tensor of interactions; column 0 is treated as the user
                id, column 1 as the item id, columns 8 onward as features.
            device: accepted for interface compatibility; not used here.
            activation_rnn: ``nonlinearity`` forwarded to both ``RNNCell``.
            MLP_h_dim: hidden width of the MLP that classifies the user state.
            option: stored on the instance; not read anywhere in this class.
        """
        super().__init__()
        self.option = option
        self.embedding_dim = embedding_dim
        self.activation_rnn = activation_rnn
        self.data = data
        self.MLP_h_dim = MLP_h_dim

        # Interaction features live in columns 8 and beyond.
        self.features = self.data[:, 8:]
        self.dim_features = self.features.shape[1]

        # Distinct ids found in the user / item columns.
        n_users = torch.unique(data[:, 0]).numel()
        n_items = torch.unique(data[:, 1]).numel()
        print("Number of users of {} \n Number of items {} \n".format(n_users, n_items))
        print("Dataset size {}".format(list(self.data.size())))

        # Static embeddings: one one-hot row per id (plain tensors, not
        # parameters or buffers, so ``.to(device)`` on the module leaves them put).
        self.static_users_embedding = one_hot(torch.arange(n_users))
        self.static_items_embedding = one_hot(torch.arange(n_items))
        user_static_dim = self.static_users_embedding.shape[1]
        item_static_dim = self.static_items_embedding.shape[1]
        print("Initialisation of static embedding... Done !")
        print("Static Embedding shape : Users {}, \t Items {}".format(
            list(self.static_users_embedding.size()),
            list(self.static_items_embedding.size()),
        ))

        # Both cells read: the other entity's embedding + the features + one time delta.
        rnn_input_dim = self.embedding_dim + self.dim_features + 1
        # Layers are created in a fixed order (item cell first) so that seeded
        # initialisation stays the same.
        self.item_rnn = RNNCell(rnn_input_dim, self.embedding_dim, nonlinearity=self.activation_rnn)
        self.user_rnn = RNNCell(rnn_input_dim, self.embedding_dim, nonlinearity=self.activation_rnn)
        print("Initialisation of rnn's with {} activation function... Done !".format(self.activation_rnn))

        # Maps a scalar time delta to one scaling factor per embedding component.
        self.projection_layer = nn.Linear(1, self.embedding_dim, bias=False)

        # Input: both dynamic embeddings and both static embeddings;
        # output: static + dynamic embedding of the predicted item.
        self.predictItem_layer = nn.Linear(
            item_static_dim + user_static_dim + 2 * self.embedding_dim,
            item_static_dim + self.embedding_dim,
            bias=True,
        )

        # Two-logit classifier for the next user state.
        self.predictStateUser_MLP = torch.nn.Sequential(
            nn.Linear(self.embedding_dim, self.MLP_h_dim),
            torch.nn.ReLU(),
            nn.Linear(self.MLP_h_dim, 2),
        )
        print("Initialisation of MLP... Done !")

    ######## Updating the item embedding ########
    def update_item_rnn(self,
                        dynamic_item_embedding,  # at t-1
                        dynamic_user_embedding,  # at t-1
                        features,
                        delta_i):
        """Return the row-normalised item embedding after the interaction.

        The cell input is ``[user embedding, delta_i, features]`` normalised
        per row; the normalised item embedding is the hidden state.
        """
        rnn_input = F.normalize(
            torch.cat([dynamic_user_embedding, delta_i.reshape(-1, 1), features], dim=1)
        )
        hidden = F.normalize(dynamic_item_embedding)
        return F.normalize(self.item_rnn(rnn_input, hidden))

    ######## Updating the user embedding ########
    def update_user_rnn(self,
                        dynamic_user_embedding,  # at t-1
                        dynamic_item_embedding,  # at t-1
                        features,
                        delta_u):
        """Return the row-normalised user embedding after the interaction.

        The cell input is ``[item embedding, delta_u, features]`` normalised
        per row; the normalised user embedding is the hidden state.
        """
        rnn_input = F.normalize(
            torch.cat([dynamic_item_embedding, delta_u.reshape(-1, 1), features], dim=1)
        )
        hidden = F.normalize(dynamic_user_embedding)
        return F.normalize(self.user_rnn(rnn_input, hidden))

    ######## Projecting the user embedding to a future time ########
    def projection_operation(self,
                             dynamic_user_embedding,
                             delta_u):
        """Scale the user embedding by ``1 + projection_layer(delta_u)``."""
        time_scale = 1 + self.projection_layer(delta_u.reshape(-1, 1))
        return dynamic_user_embedding * time_scale

    ######## Predicting the next item the user will interact with ########
    def predict_item_embedding(self,
                               u_projection,
                               u_static,
                               i_dynamic,
                               i_static):
        """Predict the next item embedding from the concatenated inputs.

        The concatenation order is
        ``[u_projection, i_dynamic, i_static, u_static]``.
        """
        layer_input = torch.cat([u_projection, i_dynamic, i_static, u_static], dim=1)
        return self.predictItem_layer(layer_input)

    ######## Predicting the next user state ########
    def predict_user_state(self, dynamic_user_embedding):
        """Return the two state logits produced by the MLP."""
        return self.predictStateUser_MLP(dynamic_user_embedding)

    def forward(self,
                actual_item_embedding,
                actual_user_embedding,
                u_static,
                i_static,
                f,
                delta_u,
                delta_i,
                next_state_label,
                next_item_dynamic_embedding,
                next_item_static_embedding):
        """Run one interaction step for a batch.

        ``next_state_label`` is accepted but not used inside this method.

        Returns:
            ``(future_user_embedding, future_item_embedding, U_pred_state,
            j_tilde, j_true)`` where ``j_tilde`` is the predicted next-item
            embedding and ``j_true`` the detached target built from the
            dynamic and static embeddings of the actual next item.
        """
        # Both updates start from the embeddings as they were before the interaction.
        future_user_embedding = self.update_user_rnn(
            actual_user_embedding, actual_item_embedding, f, delta_u
        )
        future_item_embedding = self.update_item_rnn(
            actual_item_embedding, actual_user_embedding, f, delta_i
        )

        # Project the updated user embedding, then predict the next item from it.
        projected_user_embedding = self.projection_operation(future_user_embedding, delta_u)
        j_tilde = self.predict_item_embedding(
            projected_user_embedding,
            u_static,
            future_item_embedding,
            i_static,
        )

        # Target: dynamic embedding followed by static embedding of the real next item.
        j_true = torch.cat(
            [next_item_dynamic_embedding, next_item_static_embedding], dim=1
        ).detach()

        # State logits come from the updated (not projected) user embedding.
        U_pred_state = self.predict_user_state(future_user_embedding)

        return future_user_embedding, future_item_embedding, U_pred_state, j_tilde, j_true


In [60]:

from torch.nn import MSELoss, HuberLoss,L1Loss,CrossEntropyLoss
from preprocessing import *
import torch
from  torch import nn
from torch.nn import RNNCell
from torch.nn.functional import one_hot
import math
from tqdm import tqdm
from torch.nn import MSELoss, HuberLoss,L1Loss,CrossEntropyLoss
from torch.nn import functional as F

def regularizer(actual_user_embedding, future_user_embedding, lambda_u,
                actual_item_embedding, future_item_embedding, lambda_i):
    """Weighted sum of the MSE between current and updated embeddings.

    Returns ``lambda_u * MSE(user) + lambda_i * MSE(item)``, which penalises
    large changes of the embeddings between two consecutive updates.
    """
    mse = MSELoss()
    user_term = mse(actual_user_embedding, future_user_embedding)
    item_term = mse(actual_item_embedding, future_item_embedding)
    return lambda_u * user_term + lambda_i * item_term


def train_rodie(t_batches,
          data,
          U,
          I,
          weight_ratio,
          model,
          optimizer,
          n_epochs,
          lambda_u,
          lambda_i,
          device,

          ):
    """Train ``model`` batch by batch and keep the dynamic embeddings up to date.

    Args:
        t_batches: mapping whose values are the rows of one batch (presumably
            row indices into ``data`` — verify against ``t_batch_update``).
        data: interaction tensor handed to the ``extract*`` helpers.
        U: dynamic user embeddings, indexed by user id. Updated IN PLACE.
        I: dynamic item embeddings, indexed by item id. Updated IN PLACE.
        weight_ratio: per-class weights passed to ``CrossEntropyLoss``.
        model: module whose forward pass returns
            ``(future_user, future_item, state_logits, j_tilde, j_true)``.
        optimizer: optimizer stepping ``model``'s parameters.
        n_epochs: number of passes over ``t_batches``.
        lambda_u: weight of the user term of ``regularizer``.
        lambda_i: weight of the item term of ``regularizer``.
        device: device every batch tensor is moved to.

    Returns:
        ``(model, U, I)`` — the same objects that were passed in. Because
        ``U`` and ``I`` are modified in place and are not reset between
        epochs, calling this function again continues from the current
        embeddings.
    """
    print("Training...")

    # Loss modules hold no per-batch state: build them once, not once per batch.
    item_criterion = MSELoss()
    state_criterion = CrossEntropyLoss(weight_ratio)

    for e in range(n_epochs):
        epoch_loss = 0

        # tqdm wraps the iterable itself so the bar also counts the last batch
        # (zipping the dict with a tqdm range stopped one update short).
        for rows in tqdm(t_batches.values(), total=len(t_batches), position=0, leave=True):
            optimizer.zero_grad()

            users_idx, items_idx = extractItemUserId(data, rows)
            state_label, delta_u, delta_i, f = extractFeatures(data, rows)
            next_state, next_item = extractNextStateItem(data, rows)

            # Build the index list of the next items once; it is used for both lookups.
            next_item_idx = [int(x) for x in next_item]

            u_static = model.static_users_embedding[users_idx]
            i_static = model.static_items_embedding[items_idx]
            user_embedding, item_embedding = U[users_idx], I[items_idx]
            next_item_static_embedding = model.static_items_embedding[next_item_idx]
            next_item_dynamic_embedding = I[next_item_idx]

            # Move every batch tensor to the training device.
            item_embedding = item_embedding.to(device)
            user_embedding = user_embedding.to(device)
            u_static = u_static.to(device)
            i_static = i_static.to(device)
            f = f.to(device)
            delta_u = delta_u.to(device)
            delta_i = delta_i.to(device)
            next_state = next_state.type(torch.LongTensor).to(device)
            next_item_dynamic_embedding = next_item_dynamic_embedding.to(device)
            next_item_static_embedding = next_item_static_embedding.to(device)

            # Forward pass: updated dynamic embeddings (user + item), predicted
            # user state and predicted next-item embedding.
            future_user_embedding, future_item_embedding, U_pred_state, j_tilde, j_true = model(
                item_embedding,
                user_embedding,
                u_static,
                i_static,
                f,
                delta_u,
                delta_i,
                next_state,
                next_item_dynamic_embedding,
                next_item_static_embedding,
            )

            # Store the new embeddings (detached, so no graph is kept across batches).
            U[users_idx] = future_user_embedding.detach().clone()
            I[items_idx] = future_item_embedding.detach().clone()

            # Total loss = next-item prediction + embedding-drift regulariser
            # + class-weighted user-state classification.
            loss = item_criterion(j_tilde, j_true)
            loss += regularizer(user_embedding, future_user_embedding, lambda_u,
                                item_embedding, future_item_embedding, lambda_i)
            loss += state_criterion(U_pred_state, next_state)

            loss.backward()
            epoch_loss += loss.item()
            # Clip after backward and before the step so the clipped gradients are the ones applied.
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.)
            optimizer.step()

        # Debug output: embedding of item 0 at the end of the epoch.
        print(I[0])
        print("Epoch {} Loss {}".format(e, epoch_loss))
    return model, U, I


In [None]:
# Optimisation settings.
n_epochs = 10
lambda_u = 1e-3
lambda_i = 1e-3
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-5)

# Train on the training t-batches, starting from the initial dynamic embeddings.
model_, U, I = train_rodie(
    t_batches=t_batches_train,
    data=data_torch,
    U=U_dynamic,
    I=I_dynamic,
    weight_ratio=weight_ratio,
    model=model,
    optimizer=optimizer,
    n_epochs=n_epochs,
    lambda_u=lambda_u,
    lambda_i=lambda_i,
    device=device,
)

Training...


100%|█████████▉| 33650/33651 [05:25<00:00, 103.22it/s]


tensor([ 9.8860e-01,  5.5337e-02, -4.7108e-02, -4.5311e-05,  1.0243e-02,
         5.6318e-02,  1.2318e-02,  1.0192e-03,  3.8686e-03, -5.5764e-03,
        -1.5759e-02, -8.0964e-03,  7.6300e-03,  2.7499e-03, -3.6310e-02,
         2.4301e-02, -8.0339e-03,  5.3727e-02, -2.8305e-02, -3.1376e-03,
        -2.9317e-02, -2.1991e-02, -2.2163e-03,  1.9257e-02,  2.7269e-02,
         6.5157e-02,  1.6053e-02, -2.8265e-02,  1.2832e-03,  3.3686e-03,
         8.3686e-03, -4.3995e-03], device='cuda:0')
Epoch 0 Loss 4401450.301294162


100%|█████████▉| 33650/33651 [05:27<00:00, 102.81it/s]


tensor([ 9.8644e-01, -1.4825e-03, -7.4219e-03, -2.6905e-02, -4.7341e-03,
        -2.3056e-02,  5.5169e-02,  3.7053e-02,  1.8710e-02,  7.3854e-03,
         5.4933e-02, -1.4141e-02, -4.0840e-02,  1.3960e-02,  1.9729e-02,
         5.4289e-02,  4.9004e-02,  3.2589e-03, -4.5193e-02, -4.5946e-02,
         3.1026e-02, -2.6330e-02, -2.0435e-02,  8.2604e-03,  1.5846e-02,
         1.6439e-03,  8.8992e-04,  4.3858e-02,  1.7390e-02,  2.3897e-02,
         1.8052e-03, -2.4710e-02], device='cuda:0')
Epoch 1 Loss 11066.51004626276


100%|█████████▉| 33650/33651 [05:20<00:00, 104.90it/s]


tensor([ 0.9910, -0.0111,  0.0186,  0.0188, -0.0117, -0.0020,  0.0277, -0.0107,
        -0.0534,  0.0280,  0.0097,  0.0122,  0.0030, -0.0149,  0.0066, -0.0190,
         0.0482, -0.0160, -0.0398, -0.0512,  0.0057, -0.0153,  0.0131,  0.0141,
        -0.0268, -0.0016,  0.0246,  0.0469, -0.0098,  0.0104,  0.0112,  0.0159],
       device='cuda:0')
Epoch 2 Loss 10766.419765417464


 30%|██▉       | 10023/33651 [01:37<03:52, 101.71it/s]

## TSNE

In [None]:
# Flatten the training t-batches into a single list of row positions
# (one pass, instead of the quadratic `sum(list_of_lists, [])`).
dd = [row for rows in t_batches_train.values() for row in rows]
ff = data.iloc[dd, :].copy()

# Ids of the users having at least one selected row with next_state_user == 1.
list_of_change = ff[ff['next_state_user'] == 1]['user_id'].values

# Dynamic user embeddings as a NumPy array (one row per user).
data_ = (U.detach().cpu().clone()).numpy()

# Label each embedding row: 1.0 when its row number is one of the ids above,
# 0.0 otherwise. Vectorised, and sized from the embeddings rather than from a
# hard-coded user count.
df = pd.DataFrame(data_)
df['label'] = df.index.isin(list_of_change).astype(float)

# Fixed random_state so the 2-D layout is reproducible between runs.
tsne = TSNE(n_components=2, random_state=0)
data_tsne = tsne.fit_transform(data_)

plt.scatter(data_tsne[:, 0], data_tsne[:, 1], c=df['label'])
plt.title("t-SNE of the dynamic user embeddings (colour = label)")
plt.xlabel("t-SNE component 1")
plt.ylabel("t-SNE component 2")
plt.show()