### Thesis notebook 4.3. - NOVA IMS

#### LSTM - Temporal data representation

In this notebook, we will finally start our application of temporal representation using LSTMs.
The argument for the usage of Deep Learning stems from the fact that sequences themselves encode information that can be extracted using Recurrent Neural Networks and, more specifically, Long Short Term Memory Units.

#### First Step: Setup a PyTorch environment that enables the use of GPU for training. 

The following cell wll confirm that the GPU will be the default device to use.

In [1]:
import torch
import pycuda.driver as cuda

cuda.init()
## Get Id of default device
torch.cuda.current_device()
# 0
cuda.Device(0).name() # '0' is the id of your GPU

#set all tensors to gpu
torch.set_default_tensor_type('torch.cuda.FloatTensor')

#### Second Step: Import the relevant packages and declare global variables

In [2]:
#import necessary modules/libraries
import numpy as np
import scipy
import pandas as pd
import datetime as dt
import warnings
import time

#tqdm to monitor progress
from tqdm.notebook import tqdm, trange
tqdm.pandas(desc="Progress")

#time related features
from datetime import timedelta
from copy import copy, deepcopy

#vizualization
import matplotlib.pyplot as plt
import seaborn as sns

#imblearn, scalers, kfold and metrics 
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, QuantileTransformer,PowerTransformer
from sklearn.model_selection import train_test_split, RepeatedKFold, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, recall_score, classification_report, average_precision_score, precision_recall_curve

#import torch related
import torch.nn as nn
from torch.nn import functional as F
from torch.autograd import Variable 
from torch.utils.data import TensorDataset, DataLoader
from torch.utils.data.sampler import SubsetRandomSampler


#and optimizer of learning rate
from torch.optim.lr_scheduler import ReduceLROnPlateau

#import pytorch modules
warnings.filterwarnings('ignore')

In [3]:
#global variables that may come in handy
#course threshold sets the % duration that will be considered (1 = 100%)
duration_threshold = [0.1, 0.25, 0.33, 0.5, 1]

#colors for vizualizations
nova_ims_colors = ['#BFD72F', '#5C666C']

#standard color for student aggregates
student_color = '#474838'

#standard color for course aggragates
course_color = '#1B3D2F'

#standard continuous colormap
standard_cmap = 'viridis_r'

#Function designed to deal with multiindex and flatten it
def flattenHierarchicalCol(col,sep = '_'):
    '''converts multiindex columns into single index columns while retaining the hierarchical components'''
    if not type(col) is tuple:
        return col
    else:
        new_col = ''
        for leveli,level in enumerate(col):
            if not level == '':
                if not leveli == 0:
                    new_col += sep
                new_col += level
        return new_col
    
#number of replicas - number of repeats of stratified k fold - in this case 10
replicas = 1

#names to display on result figures
date_names = {
             'Date_threshold_10': '10% of Course Duration',   
             'Date_threshold_25': '25% of Course Duration', 
             'Date_threshold_33': '33% of Course Duration', 
             'Date_threshold_50': '50% of Course Duration', 
             'Date_threshold_100':'100% of Course Duration', 
            }

target_names = {
                'exam_fail' : 'At risk - Exam Grade',
                'final_fail' : 'At risk - Final Grade', 
                'exam_gifted' : 'High performer - Exam Grade', 
                'final_gifted': 'High performer - Final Grade'
                }

#targets
targets = ['exam_fail', 'exam_gifted']

#set the indexes to use for later
index = ["course_encoding", "cd_curso", "semestre", "courseid", "userid", 'exam_gifted', 'exam_fail']

#categories of objecctables
objects = ["course", "resource", "forum", "url", "folder", "quiz", "grade_grades", 
           "assignments", "groups", "user", "turnitintooltwo", "page", "choice", "other"]          

#### Step 3: Import data and take a preliminary look at it 

In [4]:
#imports dataframes
course_programs = pd.read_excel("../Data/Modeling Stage/Nova_IMS_Temporal_Datasets_daily_clicks.xlsx", 
                                dtype = {
                                    'course_encoding' : int,
                                    'userid' : int},
                               sheet_name = None)

#save tables 
student_list = pd.read_csv('../Data/Modeling Stage/Nova_IMS_Filtered_targets.csv', 
                         dtype = {
                                   'course_encoding': int,
                                   'userid' : int,
                                   })

#drop unnamed 0 column
for i in course_programs:
        
    #merge with the targets we calculated on the other 
    course_programs[i] = course_programs[i].merge(student_list, on = ['course_encoding', 'userid'], how = 'inner')
    course_programs[i].drop(['Unnamed: 0', 'exam_mark', 'final_mark'], axis = 1, inplace = True)
    
    #convert results to object and need to convert column names to string
    course_programs[i]['course_encoding'], course_programs[i]['userid'] = course_programs[i]['course_encoding'].astype(object), course_programs[i]['userid'].astype(object)
    course_programs[i].columns = course_programs[i].columns.astype(str)

In [5]:
course_programs['Date_threshold_100'].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 63171 entries, 0 to 63170
Columns: 149 entries, course_encoding to exam_gifted
dtypes: float64(95), int64(50), object(4)
memory usage: 72.3+ MB


In [6]:
course_programs['Date_threshold_100'].describe(include = 'all')

Unnamed: 0,course_encoding,cd_curso,semestre,courseid,userid,objecttable,1,2,3,4,...,134,135,136,137,138,139,140,141,exam_fail,exam_gifted
count,63171.0,63171.0,63171,63171.0,63171.0,63171,63171.0,63171.0,63171.0,63171.0,...,41654.0,39423.0,37318.0,34876.0,32366.0,27005.0,19821.0,1054.0,63171.0,63171.0
unique,138.0,,6,,1590.0,14,,,,,...,,,,,,,,,,
top,150.0,,S1,,6826.0,course,,,,,...,,,,,,,,,,
freq,1821.0,,28407,,93.0,9295,,,,,...,,,,,,,,,,
mean,,7906.809375,,185361.92229,,,0.019803,0.028716,0.045638,0.052113,...,0.522903,0.293027,0.235329,0.867387,0.250695,0.844844,0.33969,0.001898,0.190784,0.287331
std,,1986.226115,,80819.428212,,,0.276235,0.308394,0.458564,0.550856,...,4.502103,2.288393,1.885921,9.354308,1.631869,8.256497,3.054003,0.04354,0.392922,0.452521
min,,859.0,,100001.0,,,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,,7512.0,,100091.0,,,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,,9155.0,,200165.0,,,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,,9434.0,,200193.0,,,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In our first attempt, we will use the absolute number of clicks made by each student - scaled using standard scaler. 
Therefore, we can start by immediately placing our course encoding/userid pairings into the index.

In [7]:
def normalize(train, test, scaler):
    
    if scaler == 'MinMax':
        pt = MinMaxScaler()
    elif scaler == 'Standard':
        pt = StandardScaler()
    elif scaler == 'Robust':
        pt = RobustScaler()
    elif scaler == 'Quantile':
        pt = QuantileTransformer()
    else:
        pt = PowerTransformer(method='yeo-johnson')
    
    data_train = pt.fit_transform(train)
    data_test = pt.transform(test)
    # convert the array back to a dataframe
    normalized_train = pd.DataFrame(data_train,columns=train.columns)
    normalized_test = pd.DataFrame(data_test,columns=test.columns)
        
    return normalized_train, normalized_test 

In [8]:
course_programs['Date_threshold_100'].objecttable.value_counts()

course             9295
resource           8757
forum              7005
url                6769
folder             5758
quiz               4870
grade_grades       4670
assignments        3858
groups             3525
user               2344
turnitintooltwo    1998
page               1728
choice             1373
other              1221
Name: objecttable, dtype: int64

In [9]:
test = course_programs['Date_threshold_100'].copy()

#The first 6 columns are index - column 141 is fully empty - no other df has more columns than df_100
columns = test.drop(targets, axis = 1).columns[6:146]

In [10]:
#create first pivot
placeholder_pivot = pd.pivot_table(test, index = index, values = columns, columns = "objecttable",
                  aggfunc = 'first')


#applies the function that removes multiindex
placeholder_pivot.columns = placeholder_pivot.columns.map(flattenHierarchicalCol)

#also saving index for reindexing of the remaining stuff
save_index = placeholder_pivot.index.copy()

#we will need to create the tensors multidimensional tensors
placeholder_dict = {}

#create dataset for targets
df_targets = placeholder_pivot.reset_index().copy()[index]
df_targets.set_index(["course_encoding", "cd_curso", "semestre", "courseid", "userid"], inplace = True)

#initialize empty 3d array
nd_array_100 = np.zeros((
                               len(objects), #nbr of dimensions
                               len(placeholder_pivot), #nbr of rows
                               len(columns), #nbr of columns 
                              ))

#likely inefficient, but should do the trick
counter = 0

#create multiple dataframes based on regex - this will create ndarray for the 100 duration
for i in objects:
    #create the objects
    placeholder_dict[f'{i}'] = placeholder_pivot.filter(regex=f'_{i}')
    
    #remove text, convert column name back to numbers and sort numbers to ensure sequence
    placeholder_dict[f'{i}'].columns = placeholder_dict[f'{i}'].columns.str.replace(r"\D+", "", regex=True) 
    placeholder_dict[f'{i}'].columns = placeholder_dict[f'{i}'].columns.astype(int)
    placeholder_dict[f'{i}'] = placeholder_dict[f'{i}'][sorted(placeholder_dict[f'{i}'].columns)].fillna(0)
    
    #converting df to nd array
    nd_array_100[counter] = placeholder_dict[f'{i}'].values
    counter += 1

    #reshape to samples, rows, columns

#switching to rows, columns, features
nd_array_100 = nd_array_100.transpose(1,2,0)

In [11]:
nd_array_100.shape

(9296, 140, 14)

#### Implementing Cross-Validation with Deep Learning Model

**1. Create the Deep Learning Model**

In this instance, we will follow-up with on the approach used in Chen & Cui - CrossEntropyLoss with applied over a softmax layer.

In [12]:
class LSTM_Uni(nn.Module):
    def __init__(self, num_classes, input_size, hidden_size, num_layers, seq_length):
        super(LSTM_Uni, self).__init__()
        self.num_classes = num_classes #number of classes
        self.num_layers = num_layers #number of layers
        self.input_size = input_size #input size
        self.hidden_size = hidden_size #hidden state
        self.seq_length = seq_length #sequence length

        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size,
                          num_layers=num_layers, batch_first = True) #lstm
        
        self.dropout = nn.Dropout(p = 0.5)
    
        self.fc = nn.Linear(self.hidden_size, num_classes) #fully connected last layer

    def forward(self,x):
        h_0 = Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size)) #hidden state
        c_0 = Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size)) #internal state
        
        #Xavier_init for both H_0 and C_0
        torch.nn.init.xavier_normal_(h_0)
        torch.nn.init.xavier_normal_(c_0)
        
        # Propagate input through LSTM
        lstm_out, (hn, cn) = self.lstm(x, (h_0, c_0)) #lstm with input, hidden, and internal state
        last_output = hn.view(-1, self.hidden_size) #reshaping the data for Dense layer next
        
        #we are interested in only keeping the last output
        drop_out = self.dropout(last_output)
        pre_softmax = self.fc(drop_out) #Final Output - dense
        return pre_softmax

**2. Define the train and validation Functions**

In [13]:
def train_epoch(model,dataloader,loss_fn,optimizer):
    
    train_loss,train_correct=0.0,0 
    model.train()
    for X, labels in dataloader:

        optimizer.zero_grad()
        output = model(X)
        loss = loss_fn(output,labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item() * X.size(0)
        scores, predictions = torch.max(F.log_softmax(output.data), 1)
        train_correct += (predictions == labels).sum().item()
        
    return train_loss,train_correct
  
def valid_epoch(model,dataloader,loss_fn):
    valid_loss, val_correct = 0.0, 0
    targets = []
    y_pred = []
    probability_1 = []
    
    model.eval()
    for X, labels in dataloader:

        output = model(X)
        loss=loss_fn(output,labels)
        valid_loss+=loss.item()*X.size(0)
        probability_1.append(F.softmax(output.data)[:,1])
        predictions = torch.argmax(output, dim=1)
        val_correct+=(predictions == labels).sum().item()
        targets.append(labels)
        y_pred.append(predictions)
    
    #concat all results
    targets = torch.cat(targets).data.cpu().numpy()
    y_pred = torch.cat(y_pred).data.cpu().numpy()
    probability_1 = torch.cat(probability_1).data.cpu().numpy()
    
    #calculate precision, recall and AUC score
    
    precision = precision_score(targets, y_pred)
    recall = recall_score(targets, y_pred)
    f1 = f1_score(targets, y_pred, average = 'micro')
    auroc = roc_auc_score(targets, probability_1)
    
    #return all
    return valid_loss,val_correct, precision, recall, auroc, f1

**3. Define main hyperparameters of the model, including splits**

In [14]:
#Model
num_epochs = 200 #200 epochs
learning_rate = 0.001 #0.01 lr
input_size = 14 #number of features
hidden_size = 128 #number of features in hidden state
num_layers = 1 #number of stacked lstm layers

#Shape of Output as required for SoftMax Classifier
num_classes = 2 #output shape

batch_size = 128

k=5
splits= StratifiedKFold(n_splits=k, random_state=15, shuffle = True) #kfold of 10 with 30 replicas
criterion = nn.CrossEntropyLoss()    # cross-entropy for classification

In [15]:
#test

In [17]:
for k in tqdm(targets):
    print(k)
    
    y = df_targets[k].values

    #create a list containing one value per row
    all_indices = list(range(len(df_targets)))
    
    #using train test split to later apply the rule accordingly
    train_ind, test_ind = train_test_split(all_indices, test_size=0.2, 
                                           random_state = 5, stratify = y)
    
    #applied train_test_split rules accordingly
    X_train_val = nd_array_100[train_ind,:,:]
    y_train_val = y[train_ind]
    
    X_test = nd_array_100[test_ind, :, :]
    y_test = y[test_ind]    
        
    #create dict to store fold performance
    foldperf={}
        
    #reset "best accuracy for treshold i and target k"
    best_accuracy = 0
        
    for repeat in range(replicas):
        print('Replica {}'.format(repeat + 1))
        
        #make train_val split
        for fold, (train_idx,val_idx) in tqdm(enumerate(splits.split(X_train_val, y_train_val))):
            print('Split {}'.format(fold + 1))
            
            #make split between train and Val
            X_train, y_train = X_train_val[train_idx], y_train_val[train_idx]
            X_val, y_val = X_train_val[val_idx], y_train_val[val_idx]
            
            #scaling requires one scaler per channel (feature)
            scalers = {}
            for feature in range(X_train.shape[2]):
                           
                scalers[feature] = StandardScaler()
                X_train[:, :, feature] = scalers[feature].fit_transform(X_train[:, :, feature]) 

            for col in range(X_val.shape[2]):
                X_val[:, :, feature] = scalers[feature].transform(X_val[:, :, feature]) 
            
            #second, convert everything to pytorch tensor - we will convert to tensor dataset and 
            X_train_tensors = torch.from_numpy(X_train)
            X_val_tensors = torch.from_numpy(X_val)
            
            #convert X tensors to format FloatTensor
            X_train_tensors = X_train_tensors.type(torch.cuda.FloatTensor)
            X_val_tensors = X_val_tensors.type(torch.cuda.FloatTensor)
            
            #create y_tensor
            y_train_tensors = torch.from_numpy(y_train)
            y_val_tensors = torch.from_numpy(y_val)
        
            #convert y tensors to format longtensor
            y_train_tensors = y_train_tensors.type(torch.cuda.LongTensor)
            y_val_tensors = y_val_tensors.type(torch.cuda.LongTensor)
            
            #create Tensor Datasets and dataloaders for both Train and Val
            train_dataset = TensorDataset(X_train_tensors, y_train_tensors)
            val_dataset = TensorDataset(X_val_tensors, y_val_tensors)
            train_loader = DataLoader(train_dataset, batch_size=batch_size)
            val_loader = DataLoader(val_dataset, batch_size=batch_size)
    
            #creates new model for each 
            model = LSTM_Uni(num_classes, input_size, hidden_size, num_layers, X_train_tensors.shape[1]).to('cuda') #our lstm class
            optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) 
            #scheduler = ReduceLROnPlateau(optimizer, 
            #                      'min', 
            #                      patience = 10,
            #                      cooldown = 20,
            #                     verbose = True)
    
            history = {'train_loss': [], 'val_loss': [],'train_acc':[],'val_acc':[], 'precision': [],
                      'recall' : [], 'auroc': [], 'f1_score' : []}

            for epoch in tqdm(range(num_epochs)):
                train_loss, train_correct=train_epoch(model,train_loader,criterion,optimizer)
                val_loss, val_correct, precision, recall, auroc, f1 = valid_epoch(model,val_loader,criterion)

                train_loss = train_loss / len(train_loader.sampler)
                train_acc = train_correct / len(train_loader.sampler) * 100
                val_loss = val_loss / len(val_loader.sampler)
                val_acc = val_correct / len(val_loader.sampler) * 100
        
        
                if (epoch+1) % 10 == 0: 
                 print("Epoch:{}/{} AVG Training Loss:{:.3f} AVG Validation Loss:{:.3f} AVG Training Acc {:.2f} % AVG Validation Acc {:.2f} %".format(epoch + 1,
                                                                                                             num_epochs,
                                                                                                             train_loss,
                                                                                                             val_loss,
                                                                                                             train_acc,
                                                                                                             val_acc))
                history['train_loss'].append(train_loss)
                history['val_loss'].append(val_loss)
                history['train_acc'].append(train_acc)
                history['val_acc'].append(val_acc)
                history['precision'].append(precision)
                history['recall'].append(recall)
                history['auroc'].append(auroc)
                history['f1_score'].append(f1)
                #scheduler.step(val_loss)
    
                if val_acc > best_accuracy:
            
                #replace best accuracy and save best model
                    print(f'New Best Accuracy found: {val_acc:.2f}%\nEpoch: {epoch + 1}')
                    best_accuracy = val_acc
                    best = deepcopy(model)
                    curr_epoch = epoch + 1
                    
            #store fold performance
            foldperf['fold{}'.format(fold+1)] = history
        
#     #saves fold performance for target 
#     threshold_dict[k] = pd.DataFrame.from_dict(foldperf, orient='index') # convert dict to dataframe
        
#      #explode to get eacxh epoch as a row
#     threshold_dict[k] = threshold_dict[k].explode(list(threshold_dict[k].columns))
#     torch.save(best,f"../Models/{i}/Nova_IMS_best_{k}_{curr_epoch}_epochs.h")
        
#     # from pandas.io.parsers import ExcelWriter
#     with pd.ExcelWriter(f"../Data/Modeling Stage/Results/IMS/Clicks per day/daily_clicks_{i}_{replicas}_replicas.xlsx") as writer:  
#         for sheet in targets:
#             threshold_dict[sheet].to_excel(writer, sheet_name=str(sheet))

  0%|          | 0/2 [00:00<?, ?it/s]

exam_fail
Replica 1


0it [00:00, ?it/s]

Split 1


  0%|          | 0/200 [00:00<?, ?it/s]

New Best Accuracy found: 79.84%
Epoch: 1
New Best Accuracy found: 79.91%
Epoch: 6
New Best Accuracy found: 80.11%
Epoch: 9
Epoch:10/200 AVG Training Loss:0.480 AVG Validation Loss:0.511 AVG Training Acc 80.28 % AVG Validation Acc 79.97 %
Epoch:20/200 AVG Training Loss:0.444 AVG Validation Loss:0.552 AVG Training Acc 81.62 % AVG Validation Acc 79.64 %
Epoch:30/200 AVG Training Loss:0.397 AVG Validation Loss:0.561 AVG Training Acc 83.49 % AVG Validation Acc 79.57 %
Epoch:40/200 AVG Training Loss:0.362 AVG Validation Loss:0.536 AVG Training Acc 84.85 % AVG Validation Acc 79.70 %
Epoch:50/200 AVG Training Loss:0.325 AVG Validation Loss:0.502 AVG Training Acc 86.08 % AVG Validation Acc 80.11 %
Epoch:60/200 AVG Training Loss:0.286 AVG Validation Loss:0.569 AVG Training Acc 88.16 % AVG Validation Acc 79.23 %
Epoch:70/200 AVG Training Loss:0.273 AVG Validation Loss:0.525 AVG Training Acc 88.21 % AVG Validation Acc 79.91 %
Epoch:80/200 AVG Training Loss:0.230 AVG Validation Loss:0.645 AVG Train

  0%|          | 0/200 [00:00<?, ?it/s]

Epoch:10/200 AVG Training Loss:0.480 AVG Validation Loss:0.507 AVG Training Acc 80.52 % AVG Validation Acc 79.83 %
Epoch:20/200 AVG Training Loss:0.439 AVG Validation Loss:0.514 AVG Training Acc 81.95 % AVG Validation Acc 79.56 %
Epoch:30/200 AVG Training Loss:0.382 AVG Validation Loss:0.508 AVG Training Acc 83.90 % AVG Validation Acc 79.69 %
Epoch:40/200 AVG Training Loss:0.340 AVG Validation Loss:0.505 AVG Training Acc 85.41 % AVG Validation Acc 79.62 %
Epoch:50/200 AVG Training Loss:0.312 AVG Validation Loss:0.511 AVG Training Acc 86.12 % AVG Validation Acc 79.62 %
Epoch:60/200 AVG Training Loss:0.307 AVG Validation Loss:0.517 AVG Training Acc 86.89 % AVG Validation Acc 79.42 %
Epoch:70/200 AVG Training Loss:0.242 AVG Validation Loss:0.762 AVG Training Acc 89.12 % AVG Validation Acc 25.22 %
Epoch:80/200 AVG Training Loss:0.206 AVG Validation Loss:1.380 AVG Training Acc 90.54 % AVG Validation Acc 20.31 %
Epoch:90/200 AVG Training Loss:0.197 AVG Validation Loss:0.610 AVG Training Acc 

  0%|          | 0/200 [00:00<?, ?it/s]

Epoch:10/200 AVG Training Loss:0.482 AVG Validation Loss:0.632 AVG Training Acc 80.28 % AVG Validation Acc 79.29 %
Epoch:20/200 AVG Training Loss:0.459 AVG Validation Loss:0.724 AVG Training Acc 81.36 % AVG Validation Acc 27.17 %
Epoch:30/200 AVG Training Loss:0.408 AVG Validation Loss:0.842 AVG Training Acc 83.24 % AVG Validation Acc 20.51 %
Epoch:40/200 AVG Training Loss:0.357 AVG Validation Loss:0.757 AVG Training Acc 84.72 % AVG Validation Acc 25.62 %
Epoch:50/200 AVG Training Loss:0.308 AVG Validation Loss:1.255 AVG Training Acc 86.49 % AVG Validation Acc 20.24 %
Epoch:60/200 AVG Training Loss:0.311 AVG Validation Loss:2.085 AVG Training Acc 86.33 % AVG Validation Acc 20.11 %
Epoch:70/200 AVG Training Loss:0.243 AVG Validation Loss:0.773 AVG Training Acc 89.31 % AVG Validation Acc 28.18 %
Epoch:80/200 AVG Training Loss:0.227 AVG Validation Loss:0.832 AVG Training Acc 89.61 % AVG Validation Acc 20.58 %
Epoch:90/200 AVG Training Loss:0.194 AVG Validation Loss:0.512 AVG Training Acc 

  0%|          | 0/200 [00:00<?, ?it/s]

Epoch:10/200 AVG Training Loss:0.478 AVG Validation Loss:0.751 AVG Training Acc 80.47 % AVG Validation Acc 29.99 %
Epoch:20/200 AVG Training Loss:0.427 AVG Validation Loss:1.257 AVG Training Acc 81.98 % AVG Validation Acc 20.65 %
Epoch:30/200 AVG Training Loss:0.385 AVG Validation Loss:1.894 AVG Training Acc 83.34 % AVG Validation Acc 20.11 %
Epoch:40/200 AVG Training Loss:0.331 AVG Validation Loss:3.266 AVG Training Acc 86.01 % AVG Validation Acc 20.11 %
Epoch:50/200 AVG Training Loss:0.297 AVG Validation Loss:3.175 AVG Training Acc 87.04 % AVG Validation Acc 20.11 %
Epoch:60/200 AVG Training Loss:0.281 AVG Validation Loss:1.171 AVG Training Acc 87.49 % AVG Validation Acc 29.12 %
Epoch:70/200 AVG Training Loss:0.207 AVG Validation Loss:1.863 AVG Training Acc 90.64 % AVG Validation Acc 20.31 %
Epoch:80/200 AVG Training Loss:0.208 AVG Validation Loss:0.563 AVG Training Acc 90.55 % AVG Validation Acc 77.67 %
Epoch:90/200 AVG Training Loss:0.180 AVG Validation Loss:1.099 AVG Training Acc 

  0%|          | 0/200 [00:00<?, ?it/s]

Epoch:10/200 AVG Training Loss:0.485 AVG Validation Loss:0.511 AVG Training Acc 80.43 % AVG Validation Acc 79.76 %
Epoch:20/200 AVG Training Loss:0.441 AVG Validation Loss:0.619 AVG Training Acc 81.64 % AVG Validation Acc 79.42 %
Epoch:30/200 AVG Training Loss:0.378 AVG Validation Loss:0.558 AVG Training Acc 83.54 % AVG Validation Acc 79.62 %
Epoch:40/200 AVG Training Loss:0.328 AVG Validation Loss:0.562 AVG Training Acc 85.09 % AVG Validation Acc 79.62 %
Epoch:50/200 AVG Training Loss:0.363 AVG Validation Loss:0.565 AVG Training Acc 83.39 % AVG Validation Acc 79.29 %
Epoch:60/200 AVG Training Loss:0.276 AVG Validation Loss:0.531 AVG Training Acc 86.15 % AVG Validation Acc 79.69 %
Epoch:70/200 AVG Training Loss:0.245 AVG Validation Loss:0.517 AVG Training Acc 87.54 % AVG Validation Acc 79.56 %
Epoch:80/200 AVG Training Loss:0.213 AVG Validation Loss:0.921 AVG Training Acc 89.23 % AVG Validation Acc 23.54 %
Epoch:90/200 AVG Training Loss:0.215 AVG Validation Loss:0.652 AVG Training Acc 

0it [00:00, ?it/s]

Split 1


  0%|          | 0/200 [00:00<?, ?it/s]

New Best Accuracy found: 72.31%
Epoch: 1
Epoch:10/200 AVG Training Loss:0.578 AVG Validation Loss:0.656 AVG Training Acc 72.65 % AVG Validation Acc 71.71 %
Epoch:20/200 AVG Training Loss:0.535 AVG Validation Loss:0.714 AVG Training Acc 74.53 % AVG Validation Acc 65.52 %
Epoch:30/200 AVG Training Loss:0.501 AVG Validation Loss:0.752 AVG Training Acc 75.94 % AVG Validation Acc 65.99 %
Epoch:40/200 AVG Training Loss:0.451 AVG Validation Loss:0.705 AVG Training Acc 78.60 % AVG Validation Acc 70.16 %
Epoch:50/200 AVG Training Loss:0.439 AVG Validation Loss:0.630 AVG Training Acc 79.34 % AVG Validation Acc 71.91 %
Epoch:60/200 AVG Training Loss:0.403 AVG Validation Loss:0.775 AVG Training Acc 80.87 % AVG Validation Acc 64.52 %
Epoch:70/200 AVG Training Loss:0.378 AVG Validation Loss:0.760 AVG Training Acc 82.21 % AVG Validation Acc 39.05 %
Epoch:80/200 AVG Training Loss:0.391 AVG Validation Loss:0.741 AVG Training Acc 81.67 % AVG Validation Acc 45.83 %
Epoch:90/200 AVG Training Loss:0.389 AV

  0%|          | 0/200 [00:00<?, ?it/s]

New Best Accuracy found: 72.36%
Epoch: 1
Epoch:10/200 AVG Training Loss:0.580 AVG Validation Loss:0.747 AVG Training Acc 72.73 % AVG Validation Acc 27.91 %
Epoch:20/200 AVG Training Loss:0.550 AVG Validation Loss:0.634 AVG Training Acc 73.76 % AVG Validation Acc 72.23 %
Epoch:30/200 AVG Training Loss:0.508 AVG Validation Loss:1.139 AVG Training Acc 75.74 % AVG Validation Acc 27.64 %
Epoch:40/200 AVG Training Loss:0.480 AVG Validation Loss:0.698 AVG Training Acc 77.12 % AVG Validation Acc 37.86 %
New Best Accuracy found: 72.43%
Epoch: 45
Epoch:50/200 AVG Training Loss:0.455 AVG Validation Loss:0.629 AVG Training Acc 78.25 % AVG Validation Acc 70.95 %
Epoch:60/200 AVG Training Loss:0.435 AVG Validation Loss:0.604 AVG Training Acc 78.97 % AVG Validation Acc 72.43 %
New Best Accuracy found: 72.63%
Epoch: 67
Epoch:70/200 AVG Training Loss:0.440 AVG Validation Loss:0.709 AVG Training Acc 79.00 % AVG Validation Acc 72.36 %
Epoch:80/200 AVG Training Loss:0.381 AVG Validation Loss:0.601 AVG Tra

  0%|          | 0/200 [00:00<?, ?it/s]

Epoch:10/200 AVG Training Loss:0.573 AVG Validation Loss:0.727 AVG Training Acc 72.77 % AVG Validation Acc 33.02 %
Epoch:20/200 AVG Training Loss:0.532 AVG Validation Loss:0.702 AVG Training Acc 74.57 % AVG Validation Acc 36.18 %
Epoch:30/200 AVG Training Loss:0.497 AVG Validation Loss:0.597 AVG Training Acc 76.06 % AVG Validation Acc 72.23 %
Epoch:40/200 AVG Training Loss:0.515 AVG Validation Loss:0.663 AVG Training Acc 75.12 % AVG Validation Acc 72.29 %
Epoch:50/200 AVG Training Loss:0.445 AVG Validation Loss:0.631 AVG Training Acc 78.55 % AVG Validation Acc 72.16 %
Epoch:60/200 AVG Training Loss:0.506 AVG Validation Loss:0.601 AVG Training Acc 76.16 % AVG Validation Acc 72.36 %
Epoch:70/200 AVG Training Loss:0.466 AVG Validation Loss:0.669 AVG Training Acc 77.73 % AVG Validation Acc 71.89 %
Epoch:80/200 AVG Training Loss:0.445 AVG Validation Loss:0.767 AVG Training Acc 78.89 % AVG Validation Acc 33.69 %
Epoch:90/200 AVG Training Loss:0.432 AVG Validation Loss:0.885 AVG Training Acc 

  0%|          | 0/200 [00:00<?, ?it/s]

Epoch:10/200 AVG Training Loss:0.578 AVG Validation Loss:0.599 AVG Training Acc 72.70 % AVG Validation Acc 72.36 %
Epoch:20/200 AVG Training Loss:0.550 AVG Validation Loss:0.605 AVG Training Acc 73.71 % AVG Validation Acc 72.29 %
Epoch:30/200 AVG Training Loss:0.515 AVG Validation Loss:1.299 AVG Training Acc 75.02 % AVG Validation Acc 27.71 %
Epoch:40/200 AVG Training Loss:0.466 AVG Validation Loss:0.925 AVG Training Acc 77.61 % AVG Validation Acc 28.45 %
Epoch:50/200 AVG Training Loss:0.434 AVG Validation Loss:0.950 AVG Training Acc 79.21 % AVG Validation Acc 27.91 %
Epoch:60/200 AVG Training Loss:0.407 AVG Validation Loss:1.228 AVG Training Acc 81.32 % AVG Validation Acc 28.04 %
Epoch:70/200 AVG Training Loss:0.472 AVG Validation Loss:1.165 AVG Training Acc 77.26 % AVG Validation Acc 27.98 %
Epoch:80/200 AVG Training Loss:0.376 AVG Validation Loss:2.010 AVG Training Acc 82.59 % AVG Validation Acc 27.91 %
Epoch:90/200 AVG Training Loss:0.345 AVG Validation Loss:0.930 AVG Training Acc 

  0%|          | 0/200 [00:00<?, ?it/s]

Epoch:10/200 AVG Training Loss:0.574 AVG Validation Loss:0.682 AVG Training Acc 73.00 % AVG Validation Acc 71.15 %
Epoch:20/200 AVG Training Loss:0.535 AVG Validation Loss:0.779 AVG Training Acc 74.84 % AVG Validation Acc 29.59 %
Epoch:30/200 AVG Training Loss:0.497 AVG Validation Loss:0.689 AVG Training Acc 76.43 % AVG Validation Acc 70.48 %
Epoch:40/200 AVG Training Loss:0.470 AVG Validation Loss:0.888 AVG Training Acc 78.01 % AVG Validation Acc 70.48 %
Epoch:50/200 AVG Training Loss:0.427 AVG Validation Loss:0.724 AVG Training Acc 79.83 % AVG Validation Acc 70.28 %
Epoch:60/200 AVG Training Loss:0.413 AVG Validation Loss:1.456 AVG Training Acc 80.45 % AVG Validation Acc 72.16 %
Epoch:70/200 AVG Training Loss:0.377 AVG Validation Loss:1.318 AVG Training Acc 82.18 % AVG Validation Acc 72.16 %
Epoch:80/200 AVG Training Loss:0.402 AVG Validation Loss:1.136 AVG Training Acc 81.06 % AVG Validation Acc 71.76 %
Epoch:90/200 AVG Training Loss:0.339 AVG Validation Loss:1.410 AVG Training Acc 

In [None]:
X_train_tensors.shape

**4. Make the splits and Start Training**

In [None]:
for i in tqdm(list(course_programs.keys())[1:]):
    
    print(i)
    threshold_dict = {} #dict to store information in for each threshold
    data = deepcopy(course_programs[i])
    
    data.set_index(['course_encoding', 'userid'], drop = True, inplace = True)
    data.fillna(0, inplace = True)
    
    #set X and Y columns
    X = data[data.columns[:25]] #different timesteps
    y = data[data.columns[-4:]] #the 4 different putative targets
    
    for k in tqdm(targets):
        print(k)
        
        #Start with train test split
        X_train_val, X_test, y_train_val, y_test, = train_test_split(
                                   X,
                                   y[k], #replace when going for multi-target 
                                   test_size = 0.20,
                                   random_state = 15,
                                   shuffle=True,
                                   stratify = y[k] #replace when going for multi-target
                                    )
        
        #create dict to store fold performance
        foldperf={}
        
        #reset "best accuracy for treshold i and target k"
        best_accuracy = 0

        #make train_val split
        for fold, (train_idx,val_idx) in tqdm(enumerate(splits.split(X_train_val, y_train_val))):

            print('Split {}'.format(fold + 1))
            
            #make split between train and Val
            X_train, y_train = X_train_val.iloc[train_idx], y_train_val.iloc[train_idx]
            X_val, y_val = X_train_val.iloc[val_idx], y_train_val.iloc[val_idx]
            
            #apply scaling after 
            X_train, X_val = normalize(X_train, X_val, 'MinMax')
            
            #second, convert everything to pytorch tensor - we will convert to tensor dataset and 
            X_train_tensors = Variable(torch.Tensor(X_train.values))
            X_val_tensors = Variable(torch.Tensor(X_val.values))

            y_train_tensors = Variable(torch.Tensor(y_train.values))
            y_val_tensors = Variable(torch.Tensor(y_val.values)) 

            #reshaping to rows, timestamps, features 
            X_train_tensors = torch.reshape(X_train_tensors,   (X_train_tensors.shape[0], X_train_tensors.shape[1], 1))
            X_val_tensors = torch.reshape(X_val_tensors,  (X_val_tensors.shape[0], X_val_tensors.shape[1], 1))
        
            #convert y tensors to format longtensor
            y_train_tensors = y_train_tensors.type(torch.cuda.LongTensor)
            y_val_tensors = y_val_tensors.type(torch.cuda.LongTensor)
            
            #create Tensor Datasets and dataloaders for both Train and Val
            train_dataset = TensorDataset(X_train_tensors, y_train_tensors)
            val_dataset = TensorDataset(X_val_tensors, y_val_tensors)
            train_loader = DataLoader(train_dataset, batch_size=batch_size)
            val_loader = DataLoader(val_dataset, batch_size=batch_size)
    
            #creates new model for each 
            model = LSTM_Uni(num_classes, input_size, hidden_size, num_layers, X_train_tensors.shape[1]).to('cuda') #our lstm class
            optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) 
            scheduler = ReduceLROnPlateau(optimizer, 
                                  'min', 
                                  patience = 10,
                                  cooldown = 20,
                                 verbose = True)
    
            history = {'train_loss': [], 'val_loss': [],'train_acc':[],'val_acc':[], 'precision': [],
                      'recall' : [], 'auroc': []}

            for epoch in tqdm(range(num_epochs)):
                train_loss, train_correct=train_epoch(model,train_loader,criterion,optimizer)
                val_loss, val_correct, precision, recall, auroc = valid_epoch(model,val_loader,criterion)

                train_loss = train_loss / len(train_loader.sampler)
                train_acc = train_correct / len(train_loader.sampler) * 100
                val_loss = val_loss / len(val_loader.sampler)
                val_acc = val_correct / len(val_loader.sampler) * 100
        
        
                if (epoch+1) % 10 == 0: 
                    print("Epoch:{}/{} AVG Training Loss:{:.3f} AVG Validation Loss:{:.3f} AVG Training Acc {:.2f} % AVG Validation Acc {:.2f} %".format(epoch + 1,
                                                                                                             num_epochs,
                                                                                                             train_loss,
                                                                                                             val_loss,
                                                                                                             train_acc,
                                                                                                             val_acc))
                history['train_loss'].append(train_loss)
                history['val_loss'].append(val_loss)
                history['train_acc'].append(train_acc)
                history['val_acc'].append(val_acc)
                history['precision'].append(precision)
                history['recall'].append(recall)
                history['auroc'].append(auroc)
                scheduler.step(val_loss)
    
                if val_acc > best_accuracy:
            
                #replace best accuracy and save best model
                    print(f'New Best Accuracy found: {val_acc:.2f}%\nEpoch: {epoch + 1}')
                    best_accuracy = val_acc
                    best = deepcopy(model)
                    curr_epoch = epoch + 1
                    
            #store fold performance
            foldperf['fold{}'.format(fold+1)] = history
        
        #saves fold performance for target 
        threshold_dict[k] = pd.DataFrame.from_dict(foldperf, orient='index') # convert dict to dataframe
        
        #explode to get eacxh epoch as a row
        threshold_dict[k] = threshold_dict[k].explode(list(threshold_dict[k].columns))
        torch.save(best,f"../Models/{i}/Nova_IMS_best_{k}_{curr_epoch}_epochs.h")
        
    # from pandas.io.parsers import ExcelWriter
    with pd.ExcelWriter(f"../Data/Modeling Stage/Results/IMS/Clicks per day/daily_clicks_{i}_{replicas}_replicas.xlsx") as writer:  
        for sheet in targets:
                threshold_dict[sheet].to_excel(writer, sheet_name=str(sheet))