In [2]:
#import needed libraries 
from urllib.request import urlretrieve
import zipfile, os
import time, sys, copy
import pandas as pd
import scipy.sparse as sps
import numpy as np
from collections import defaultdict
import math

In [3]:
#clone repositeries
!git clone https://github.com/shalini1194/RKT
#!git clone https://github.com/lyf-1/PEBG.git
!git clone https://github.com/jhljx/GKT.git    
#copy repositories in working directory
!cp -r ./GKT/* ./
#!cp -r ./PEBG/assist09/* ./
!cp -r ./RKT/* ./
!cp -r ../input/skillbuilder-data-2009-2010/2012-2013-data-with-predictions-4-final.csv ./
!cp -r ../input/assesments-12-13-precessed-data/* ./

Cloning into 'RKT'...
remote: Enumerating objects: 53, done.[K
remote: Counting objects: 100% (53/53), done.[K
remote: Compressing objects: 100% (42/42), done.[K
remote: Total 53 (delta 9), reused 44 (delta 7), pack-reused 0[K
Unpacking objects: 100% (53/53), done.
Cloning into 'GKT'...
remote: Enumerating objects: 357, done.[K
remote: Counting objects: 100% (357/357), done.[K
remote: Compressing objects: 100% (247/247), done.[K
remote: Total 357 (delta 216), reused 236 (delta 107), pack-reused 0[K
Receiving objects: 100% (357/357), 17.02 MiB | 11.31 MiB/s, done.
Resolving deltas: 100% (216/216), done.


In [4]:
#import from github cloned repositories
from RKT import utils

import copy
import math
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F


def future_mask(seq_length):
    future_mask = np.triu(np.ones((1, seq_length, seq_length)), k=1).astype('bool')
    return torch.from_numpy(future_mask)


def clone(module, num):
    return nn.ModuleList([copy.deepcopy(module) for _ in range(num)])


def attention(query, key, value, rel, l1, l2, timestamp, mask=None, dropout=None):
    """Compute scaled dot product attention.
    """
    rel = rel * mask.to(torch.float) # future masking of correlation matrix.
    rel_attn = rel.masked_fill(rel == 0, -10000)
    rel_attn = nn.Softmax(dim=-1)(rel_attn)
    scores = torch.matmul(query, key.transpose(-2, -1))
    scores = scores / math.sqrt(query.size(-1))
    if mask is not None:
        scores = scores.masked_fill(mask, -1e9)

        time_stamp= torch.exp(-torch.abs(timestamp.float()))
        #
        time_stamp=time_stamp.masked_fill(mask,-np.inf)


    prob_attn = F.softmax(scores, dim=-1)
    time_attn = F.softmax(time_stamp,dim=-1)
    prob_attn = (1-l2)*prob_attn+l2*time_attn
    # prob_attn = F.softmax(prob_attn + rel_attn, dim=-1)

    prob_attn = (1-l1)*prob_attn + (l1)*rel_attn
    if dropout is not None:
        prob_attn = dropout(prob_attn)
    return torch.matmul(prob_attn, value), prob_attn


def relative_attention(query, key, value, rel, l1, l2, pos_key_embeds, pos_value_embeds, mask=None, dropout=None):
    """Compute scaled dot product attention with relative position embeddings.
    (https://arxiv.org/pdf/1803.02155.pdf)
    """
    assert pos_key_embeds.num_embeddings == pos_value_embeds.num_embeddings

    scores = torch.matmul(query, key.transpose(-2, -1))

    idxs = torch.arange(scores.size(-1))
    if query.is_cuda:
        idxs = idxs.cuda()
    idxs = idxs.view(-1, 1) - idxs.view(1, -1)
    idxs = torch.clamp(idxs, 0, pos_key_embeds.num_embeddings - 1)

    pos_key = pos_key_embeds(idxs).transpose(-2, -1)
    pos_scores = torch.matmul(query.unsqueeze(-2), pos_key)
    scores = scores.unsqueeze(-2) + pos_scores
    scores = scores / math.sqrt(query.size(-1))

    pos_value = pos_value_embeds(idxs)
    value = value.unsqueeze(-3) + pos_value

    if mask is not None:
        scores = scores.masked_fill(mask.unsqueeze(-2), -1e9)
    prob_attn = F.softmax(scores, dim=-1)
    if dropout is not None:
        prob_attn = dropout(prob_attn)

    output = torch.matmul(prob_attn, value).unsqueeze(-2)
    prob_attn = prob_attn.unsqueeze(-2)
    return output, prob_attn


class MultiHeadedAttention(nn.Module):
    def __init__(self, total_size, num_heads, drop_prob):
        super(MultiHeadedAttention, self).__init__()
        assert total_size % num_heads == 0
        self.total_size = total_size
        self.head_size = total_size // num_heads
        self.num_heads = num_heads
        self.linear_layers = clone(nn.Linear(total_size, total_size), 3)
        self.dropout = nn.Dropout(p=drop_prob)

    def forward(self, query, key, value, rel, l1, l2, timestamp, encode_pos, pos_key_embeds, pos_value_embeds, mask=None):
        batch_size, seq_length = query.shape[:2]

        # Apply mask to all heads
        if mask is not None:
            mask = mask.unsqueeze(1)

        # Project inputs
        rel = rel.unsqueeze(1).repeat(1,self.num_heads,1,1)
        timestamp = timestamp.unsqueeze(1).repeat(1,self.num_heads,1,1)
        query, key, value = [l(x).view(batch_size, seq_length, self.num_heads, self.head_size).transpose(1, 2)
                             for l, x in zip(self.linear_layers, (query, key, value))]

        # Apply attention
        if encode_pos:
            out, self.prob_attn = relative_attention(
                query, key, value, rel, l1, l2, timestamp, pos_key_embeds, pos_value_embeds,  mask, self.dropout)
        else:
            out, self.prob_attn = attention(query, key, value, rel, l1, l2, timestamp, mask, self.dropout)

        out = out.transpose(1, 2).contiguous().view(batch_size, seq_length, self.total_size)
        return out, self.prob_attn


class RKT(nn.Module):
    def __init__(self, num_items,  embed_size, num_attn_layers, num_heads,
                 encode_pos, max_pos, drop_prob, l1, l2):
        """Self-attentive knowledge tracing.
        Arguments:
            num_items (int): number of items
            num_skills (int): number of skills
            embed_size (int): input embedding and attention dot-product dimension
            num_attn_layers (int): number of attention layers
            num_heads (int): number of parallel attention heads
            encode_pos (bool): if True, use relative position embeddings
            max_pos (int): number of position embeddings to use
            drop_prob (float): dropout probability
        """
        super(RKT, self).__init__()
        self.embed_size = embed_size
        self.encode_pos = encode_pos

        self.item_embeds = nn.Embedding(num_items + 1, embed_size , padding_idx=0)
        # self.skill_embeds = nn.Embedding(num_skills + 1, embed_size // 2, padding_idx=0)

        self.pos_key_embeds = nn.Embedding(max_pos, embed_size // num_heads)
        self.pos_value_embeds = nn.Embedding(max_pos, embed_size // num_heads)

        self.lin_in = nn.Linear(2*embed_size, embed_size)
        self.attn_layers = clone(MultiHeadedAttention(embed_size, num_heads, drop_prob), num_attn_layers)
        self.dropout = nn.Dropout(p=drop_prob)
        self.lin_out = nn.Linear(embed_size, 1)
        self.l1 = nn.Parameter(torch.tensor(l1))
        self.l2 = nn.Parameter(torch.tensor(l2))

    def get_inputs(self, item_inputs, label_inputs):
        item_inputs = self.item_embeds(item_inputs)
        # skill_inputs = self.skill_embeds(skill_inputs)
        label_inputs = label_inputs.unsqueeze(-1).float()

        inputs = torch.cat([item_inputs, item_inputs], dim=-1)
        inputs[..., :self.embed_size] *= label_inputs
        inputs[..., self.embed_size:] *= 1 - label_inputs
        return inputs

    def get_query(self, item_ids):
        item_ids = self.item_embeds(item_ids)
        # skill_ids = self.skill_embeds(skill_ids)
        query = torch.cat([item_ids], dim=-1)
        return query

    def forward(self, item_inputs, label_inputs, item_ids, rel, timestamp):
        inputs = self.get_inputs(item_inputs, label_inputs)

        inputs = F.relu(self.lin_in(inputs))

        query = self.get_query(item_ids)

        mask = future_mask(inputs.size(-2))
        if inputs.is_cuda:
            mask = mask.cuda()
        outputs, attn  = self.attn_layers[0](query, inputs, inputs, rel, self.l1, self.l2, timestamp, self.encode_pos,
                                                   self.pos_key_embeds, self.pos_value_embeds, mask)
        outputs = self.dropout(outputs)
        for l in self.attn_layers[1:]:
            residual, attn = l(query, outputs, outputs, rel, self.l1, self.l2, self.encode_pos, timestamp, self.pos_key_embeds,
                         self.pos_value_embeds, mask)
            outputs = self.dropout(outputs + F.relu(residual))

        return self.lin_out(outputs), attn




In [5]:
import torch

class Dataset(torch.utils.data.Dataset):
    'Characterizes a dataset for PyTorch'
    def __init__(self, data, labels):
        'Initialization'
        self.labels = labels
        self.data = data


    def __len__(self):
        'Denotes the total number of samples'
        return len(self.data)

    def __getitem__(self, index):
        'Generates one sample of data'
        # Select sample
        X = self.data[index]

        # Load data and get label
        y = self.labels[index]

        return X, y
    
    


In [None]:
#Passages needed to recompute pro_pro_skills
#Initially I will use file with the ones already computed by RKT authors
#In the future I will modify the code and try to compute better ones.

import os 
import pandas as pd
import numpy as np
from scipy import sparse


class DataProcess():
    def __init__(self, data_folder='assist09', file_name='skill_builder_data_corrected_collapsed.csv', min_inter_num=3):
        print("Process Dataset %s" % data_folder)
        self.min_inter_num = min_inter_num
        self.data_folder = data_folder
        self.file_name = file_name

    def process_csv(self):
        #pre-process original csv file for assist dataset

        # read csv file
        data_path = os.path.join(self.data_folder, self.file_name)
        df = pd.read_csv(data_path, low_memory=False, encoding="ISO-8859-1")
        print('original records number %d' % len(df))

        # delete empty skill_id
        df = df.dropna(subset=['skill_id'])
        df = df[~df['skill_id'].isin(['noskill'])]
        print('After removing empty skill_id, records number %d' % len(df))

        # delete scaffolding problems
        df = df[df['original'].isin([1])]
        print('After removing scaffolding problems, records number %d' % len(df))

        #delete the users whose interaction number is less than min_inter_num
        users = df.groupby(['user_id'], as_index=True)
        delete_users = []
        for u in users:
            if len(u[1]) < self.min_inter_num:
                delete_users.append(u[0])
        print('deleted user number based min-inters %d' % len(delete_users))
        df = df[~df['user_id'].isin(delete_users)]
        print('After deleting some users, records number %d' % len(df))
        # print('features: ', df['assistment_id'].unique(), df['answer_type'].unique())

        df.to_csv(os.path.join(self.data_folder, '%s_processed.csv'%self.file_name))


    def pro_skill_graph(self):
        df = pd.read_csv(os.path.join(self.data_folder, '%s_processed.csv'%self.file_name),low_memory=False, encoding="ISO-8859-1")
        problems = df['problem_id'].unique()
        pro_id_dict = dict(zip(problems, range(len(problems))))
        print('problem number %d' % len(problems))

        pro_type = df['problem_type'].unique()
        pro_type_dict = dict(zip(pro_type, range(len(pro_type))))
        print('problem type: ', pro_type_dict)

        pro_feat = []
        pro_skill_adj = []
        skill_id_dict, skill_cnt = {}, 0
        for pro_id in range(len(problems)):            
            tmp_df = df[df['problem_id']==problems[pro_id]]
            tmp_df_0 = tmp_df.iloc[0]

            # pro_feature: [ms_of_response, answer_type, mean_correct_num]
            ms = tmp_df['ms_first_response'].abs().mean()
            p = tmp_df['correct'].mean()
            pro_type_id = pro_type_dict[tmp_df_0['problem_type']] 
            tmp_pro_feat = [0.] * (len(pro_type_dict)+2)
            tmp_pro_feat[0] = ms
            tmp_pro_feat[pro_type_id+1] = 1.
            tmp_pro_feat[-1] = p
            pro_feat.append(tmp_pro_feat)

            # build problem-skill bipartite
            s = tmp_df_0['skill_id']
            skill_id_dict[s] = skill_cnt
            skill_cnt += 1
            pro_skill_adj.append([pro_id, skill_id_dict[s], 1])

        pro_skill_adj = np.array(pro_skill_adj).astype(np.int32)
        pro_feat = np.array(pro_feat).astype(np.float32)
        pro_feat[:, 0] = (pro_feat[:, 0] - np.min(pro_feat[:, 0])) / (np.max(pro_feat[:, 0])-np.min(pro_feat[:, 0]))
        pro_num = np.max(pro_skill_adj[:, 0]) + 1
        skill_num = np.max(pro_skill_adj[:, 1]) + 1
        print('problem number %d, skill number %d' % (pro_num, skill_num))

        # save pro-skill-graph in sparse matrix form
        pro_skill_sparse = sparse.coo_matrix((pro_skill_adj[:, 2].astype(np.float32), (pro_skill_adj[:, 0], pro_skill_adj[:, 1])), shape=(pro_num, skill_num))
        sparse.save_npz(os.path.join(self.data_folder, 'pro_skill_sparse.npz'), pro_skill_sparse)

        # save pro-id-dict, skill-id-dict
        self.save_dict(pro_id_dict, os.path.join(self.data_folder, 'pro_id_dict.txt'))
        self.save_dict(skill_id_dict, os.path.join(self.data_folder, 'skill_id_dict.txt'))

        # save pro_feat_arr
        np.savez(os.path.join(self.data_folder, 'pro_feat.npz'), pro_feat=pro_feat)

    def generate_user_sequence(self, seq_file):
        # generate user interaction sequence
        # and write to data.txt

        df = pd.read_csv(os.path.join(self.data_folder, '%s_processed.csv'%self.file_name), low_memory=False, encoding="ISO-8859-1")
        ui_df = df.groupby(['user_id'], as_index=True)   
        print('user number %d' % len(ui_df))

        user_inters = []
        cnt = 0
        for ui in ui_df:
            tmp_user, tmp_inter = ui[0], ui[1]
            tmp_problems = list(tmp_inter['problem_id'])
            tmp_skills = list(tmp_inter['skill_id'])
            tmp_ans = list(tmp_inter['correct'])
            tmp_end_time = list(tmp_inter['end_time'])
            user_inters.append([[len(tmp_inter)], tmp_skills, tmp_problems, tmp_ans, tmp_end_time])
        
        write_file = os.path.join(self.data_folder, seq_file)
        self.write_txt(write_file, user_inters)


    def save_dict(self, dict_name, file_name):
        f = open(file_name, 'w')
        f.write(str(dict_name))
        f.close


    def write_txt(self, file, data):
        with open(file, 'w') as f:
            for dd in data:
                for d in dd:
                    f.write(str(d)+'\n')


    def read_user_sequence(self, filename, max_len=200, min_len=3, shuffle_flag=True):
        with open(filename, 'r') as f:
            lines = f.readlines()
        with open(os.path.join(self.data_folder, 'skill_id_dict.txt'), 'r') as f:
            skill_id_dict = eval(f.read()) 
        with open(os.path.join(self.data_folder, 'pro_id_dict.txt'), 'r') as f:
            pro_id_dict = eval(f.read())
        

        y, skill, problem, real_len, timestamp = [], [], [], [], []
        skill_num, pro_num = len(skill_id_dict), len(pro_id_dict)
        print('skill num, pro num, ', skill_num, pro_num)

        index = 0
        while index < len(lines):
            num = eval(lines[index])[0]
            tmp_skills = eval(lines[index+1])[:max_len]
            # tmp_skills = [skill_id_dict[ele]+1 for ele in tmp_skills]     # for assist09
            tmp_skills = [ele+1 for ele in tmp_skills]                      # for assist12 
            tmp_pro = eval(lines[index+2])[:max_len]
            tmp_pro = [pro_id_dict[ele]+1 for ele in tmp_pro]
            tmp_ans = eval(lines[index+3])[:max_len]
            tmp_time = eval(lines[index+4])[:max_len]

            if num>=min_len:
                tmp_real_len = len(tmp_skills)
                # Completion sequence
                tmp_ans += [-1]*(max_len-tmp_real_len)
                tmp_skills += [0]*(max_len-tmp_real_len)
                tmp_pro += [0]*(max_len-tmp_real_len)
                tmp_time += [-1]*(max_len-tmp_real_len)

                y.append(tmp_ans)
                skill.append(tmp_skills)
                problem.append(tmp_pro)
                real_len.append(tmp_real_len)
                timestamp.append(tmp_time)

            index += 5
        
        y = np.array(y).astype(np.float32)
        skill = np.array(skill).astype(np.int32)
        problem = np.array(problem).astype(np.int32)
        real_len = np.array(real_len).astype(np.int32)
        timestamp = np.array(timestamp).astype(np.datetime64)

        print(skill.shape, problem.shape, y.shape, real_len.shape)      
        print(np.max(y), np.min(y))
        print(np.max(real_len), np.min(real_len))  
        print(np.max(skill), np.min(skill))
        print(np.max(problem), np.min(problem))

        np.savez(os.path.join(self.data_folder, "%s.npz"%self.file_name), problem=problem, y=y, skill=skill, time = timestamp, real_len=real_len, skill_num=skill_num, problem_num=pro_num)



data_folder = './'
min_inter_num = 3
file_name='2012-2013-data-with-predictions-4-final.csv'
DP = DataProcess(data_folder, file_name, min_inter_num)

DP.process_csv()
DP.pro_skill_graph()
DP.generate_user_sequence('data.txt')
DP.read_user_sequence(os.path.join(data_folder, 'data.txt')) 



In [6]:
import os 
import pandas as pd
import numpy as np
from scipy import sparse
data_folder = './'
pro_skill_coo = sparse.load_npz(os.path.join(data_folder, 'pro_skill_sparse.npz'))
[pro_num, skill_num] = pro_skill_coo.toarray().shape
print('problem number %d, skill number %d' % (pro_num, skill_num))
pro_skill_csc = pro_skill_coo.tocsc()
pro_skill_csr = pro_skill_coo.tocsr()


def extract_pro_pro_sim():
    # extract pro-pro similarity sparse matrix
    pro_pro_adj = []
    for p in range(pro_num):
        tmp_skills = pro_skill_csr.getrow(p).indices
        similar_pros = pro_skill_csc[:, tmp_skills].indices
        zipped = zip([p] * similar_pros.shape[0], similar_pros)
        pro_pro_adj += list(zipped)

    pro_pro_adj = list(set(pro_pro_adj))
    pro_pro_adj = np.array(pro_pro_adj).astype(np.int32)
    data = np.ones(pro_pro_adj.shape[0]).astype(np.float32)
    pro_pro_sparse = sparse.coo_matrix((data, (pro_pro_adj[:, 0], pro_pro_adj[:, 1])), shape=(pro_num, pro_num))
    sparse.save_npz(os.path.join(data_folder, 'pro_pro_sparse.npz'), pro_pro_sparse)


def extract_skill_skill_sim():
    # extract skill-skill similarity sparse matrix
    skill_skill_adj = []
    for s in range(skill_num):
        tmp_pros = pro_skill_csc.getcol(s).indices
        similar_skills = pro_skill_csr[tmp_pros, :].indices
        zipped = zip([s] * similar_skills.shape[0], similar_skills)
        skill_skill_adj += list(zipped)

    skill_skill_adj = list(set(skill_skill_adj))
    skill_skill_adj = np.array(skill_skill_adj).astype(np.int32)
    data = np.ones(skill_skill_adj.shape[0]).astype(np.float32)
    skill_skill_sparse = sparse.coo_matrix((data, (skill_skill_adj[:, 0], skill_skill_adj[:, 1])), shape=(skill_num, skill_num))
    sparse.save_npz(os.path.join(data_folder, 'skill_skill_sparse.npz'), skill_skill_sparse)


extract_pro_pro_sim()
extract_skill_skill_sim()

problem number 47104, skill number 47104


In [20]:
import argparse
import psutil
import gc
import pandas as pd
from random import shuffle
from sklearn.metrics import roc_auc_score, accuracy_score
from scipy import sparse
import torch.nn as nn
from torch.optim import Adam
from torch.nn.utils import clip_grad_norm_
from torch.nn.utils.rnn import pad_sequence
from collections import  defaultdict
from sys import getsizeof
import tensorflow.compat.v2 as tf
from datetime import datetime

from RKT.utils import *

start = torch.cuda.Event(enable_timing=True)
end = torch.cuda.Event(enable_timing=True)

print(torch.cuda.is_available())
dt = datetime.utcnow()
    
def compute_corr(prob_seq, next_seq, corr_dic):
    corr= np.zeros((prob_seq.shape[0],prob_seq.shape[1], prob_seq.shape[1]))
    for i in range(0,prob_seq.shape[0]):
        for  j in range(0,next_seq.shape[1] ):
            for k in range(j+1):
                corr[i][j][k]=corr_dic[next_seq[i][j]][prob_seq[i][k]]
    return corr

def get_corr_data_assistments(pro_num):
    pro_pro_sparse = sparse.load_npz('./pro_pro_sparse.npz')
    pro_pro_coo = pro_pro_sparse.tocoo()
    # print(pro_skill_csr)
    pro_pro_dense = pro_pro_coo.toarray()
    return pro_pro_dense

def get_corr_data(pro_num):
    pro_pro_dense = np.zeros((pro_num, pro_num))
    pro_pro_ = open('../input/ednet-dataset/ednet_corr.csv')
    for i in pro_pro_:
        j = i.strip().split(',')
        pro_pro_dense[int(j[0])][int(j[1])] += int(float(j[2]))
    return pro_pro_dense

def get_data_assistments(batch_size=64):
    """Extract sequences from dataframe.
    Arguments:
        df (pandas Dataframe): output by prepare_data.py
        max_length (int): maximum length of a sequence chunk
        train_split (float): proportion of data to use for training
    """
    
    params = {'batch_size': batch_size,
          'shuffle': True}
    process = psutil.Process(os.getpid())
    gc.enable()
    data = np.load('../input/assesments-12-13-precessed-data/2012-2013-data-with-predictions-4-final.csv.npz')
    y, skill, problem, timestamps, real_len = data['y'], data['skill'], data['problem'], data['time'] , data['real_len']
    skill_num, pro_num = data['skill_num'], data['problem_num']
    print(timestamps)
    item_ids = [torch.tensor(i).type(torch.cuda.LongTensor) for i in problem]
    timestamp = [torch.tensor( [(t - np.datetime64('1970-01-01T00:00:00Z')) / np.timedelta64(1, 's') for t in timestamp] ).type(torch.cuda.LongTensor) for timestamp in timestamps]  
    labels = [torch.tensor(i).type(torch.cuda.LongTensor) for i in y]
    item_inputs = [torch.cat((torch.zeros(1, dtype=torch.long).cuda(), i))[:-1] for i in item_ids]
    # skill_inputs = [torch.cat((torch.zeros(1, dtype=torch.long), s))[:-1] for s in skill_ids]
    label_inputs = [torch.cat((torch.zeros(1, dtype=torch.long).cuda(), l))[:-1] for l in labels]

    batches = list(zip(item_inputs, label_inputs, item_ids, timestamp, labels))   
    seq_lists = list(zip(*batches))
    inputs_and_ids = [pad_sequence(seqs, batch_first=True, padding_value=0)
                      for seqs in seq_lists[0:4]]
    labels = pad_sequence(seq_lists[-1], batch_first=True, padding_value=-1)  # Pad labels with -1
    train_data, test_data, training_labels, test_labels = train_test_split(data=list(zip(*inputs_and_ids)), labels= labels, split=0.8)
    print("Corr_data computation")
    corr_data = get_corr_data_assistments(pro_num) 
    training_set = Dataset(train_data, training_labels)
    #training_generator = torch.utils.data.DataLoader(training_set, **params)
    test_set = Dataset(test_data, test_labels)
    #test_generator = torch.utils.data.DataLoader(test_set, **params)
    #validation_set = Dataset(val_data, val_labels)
    #validation_generator = torch.utils.data.DataLoader(validation_set, **params)
    
    return (training_set, test_set, corr_data, pro_num, timestamps)


def get_data_Ednet(batch_size=64):
    """Extract sequences from dataframe.
    Arguments:
        df (pandas Dataframe): output by prepare_data.py
        max_length (int): maximum length of a sequence chunk
        train_split (float): proportion of data to use for training
    """
    
    process = psutil.Process(os.getpid())
    gc.enable()
    data = np.load('../input/ednet-dataset/ednet.npz')

    y, skill, problem, timestamps, real_len = data['y'], data['skill'], data['problem'], data['time'] , data['real_len']
    skill_num, pro_num = data['skill_num'], data['problem_num']
    
    item_ids = [torch.tensor(i).type(torch.cuda.LongTensor) for i in problem]
    timestamp = [torch.from_numpy(np.array(timestamp)).cuda() for timestamp in timestamps]  
    labels = [torch.tensor(i).type(torch.cuda.LongTensor) for i in y]
    item_inputs = [torch.cat((torch.zeros(1, dtype=torch.long).cuda(), i))[:-1] for i in item_ids]
    # skill_inputs = [torch.cat((torch.zeros(1, dtype=torch.long), s))[:-1] for s in skill_ids]
    label_inputs = [torch.cat((torch.zeros(1, dtype=torch.long).cuda(), l))[:-1] for l in labels]

    batches = list(zip(item_inputs, label_inputs, item_ids, timestamp, labels))   
    seq_lists = list(zip(*batches))
    inputs_and_ids = [pad_sequence(seqs, batch_first=True, padding_value=0)
                      for seqs in seq_lists[0:4]]
    labels = pad_sequence(seq_lists[-1], batch_first=True, padding_value=-1)  # Pad labels with -1
    train_data, test_data, training_labels, test_labels = train_test_split(data=list(zip(*inputs_and_ids)), labels= labels, split=0.8)
    train_data, val_data, training_labels, val_labels = train_test_split(data=train_data, labels=training_labels, split=0.8)
    print("Corr_data computation")
    corr_data = get_corr_data(pro_num) 
    training_set = Dataset(train_data, training_labels)
    training_generator = torch.utils.data.DataLoader(training_set, **params)
    test_set = Dataset(test_data, test_labels)
    test_generator = torch.utils.data.DataLoader(test_set, **params)
    validation_set = Dataset(val_data, val_labels)
    validation_generator = torch.utils.data.DataLoader(validation_set, **params)
    
    return (training_generator, validation_generator, test_generator, corr_data, pro_num, timestamps)

#def load_data_and_prepare_batches();
    




def train_test_split(data, labels, split=0.8):
    n_samples = len(data)
    # x is your dataset
    training_data, test_data = data[:int(n_samples*split)], data[int(n_samples*split):]
    training_labels, test_labels = labels[:int(n_samples*split)], labels[int(n_samples*split):]
    return training_data, test_data, training_labels, test_labels


def compute_auc(preds, labels):
    preds = preds[labels >= 0].flatten()
    labels = labels[labels >= 0].float()
    if len(torch.unique(labels)) == 1:  # Only one class
        auc = accuracy_score(labels, preds.round())
        acc = auc
    else:
        auc = roc_auc_score(labels, preds)
        acc = accuracy_score(labels, preds.round())
    return auc, acc


def compute_loss(preds, labels, criterion):
    preds = preds[labels >= 0].flatten()
    labels = labels[labels >= 0].float()
    return criterion(preds, labels)
def computeRePos(time_seq, time_span):
    batch_size = time_seq.shape[0]
    size = time_seq.shape[1]

    time_matrix= (torch.abs(torch.unsqueeze(time_seq, axis=1).repeat(1,size,1).reshape((batch_size, size*size,1)) - \
                 torch.unsqueeze(time_seq,axis=-1).repeat(1, 1, size,).reshape((batch_size, size*size,1))))

    # time_matrix[time_matrix>time_span] = time_span
    time_matrix = time_matrix.reshape((batch_size,size,size))


    return (time_matrix)



True


In [None]:
#Code from RKT train with few changes for performance




def train(train_data, val_data, pro_num, corr_data, timestamp, timespan,  model, optimizer, logger, saver, num_epochs, batch_size, grad_clip):
    """Train SAKT model.
    Arguments:
        train_data (list of tuples of torch Tensor)
        val_data (list of tuples of torch Tensor)
        model (torch Module)
        optimizer (torch optimizer)
        logger: wrapper for TensorboardX logger
        saver: wrapper for torch saving
        num_epochs (int): number of epochs to train for
        batch_size (int)
        grad_clip (float): max norm of the gradients
    """
    
    params = {'batch_size': batch_size,
          'shuffle': True}
    process = psutil.Process(os.getpid())
    print('entered train', process.memory_info().rss)
    criterion = nn.BCEWithLogitsLoss()
    step = 0
    metrics = Metrics()
    test_generator = torch.utils.data.DataLoader(val_data, **params)
    print('PB memory used: ', process.memory_info().rss)
    for epoch in range(num_epochs):
        training_generator = torch.utils.data.DataLoader(train_data, **params)
        print("in epoch"+str(epoch))
        print("Prepare batches train")
        #train_batches = prepare_batches(train_data, batch_size)
        print("Prepare batches val")
        #val_batches = prepare_batches(val_data, batch_size)
        i=0
        # Training
        for data, labels in training_generator:
            item_inputs, label_inputs, item_ids, timestamp = data
            
            # rel = compute_corr(item_inputs, item_ids, corr_data)
            rel = torch.Tensor(corr_data[(item_ids-1).cpu().unsqueeze(1).repeat(1,item_ids.shape[-1],1),(item_inputs-1).cpu().unsqueeze(-1).repeat(1,1,item_inputs.shape[-1])]).cuda()
            time = computeRePos(timestamp, timespan)
            # skill_inputs = skill_inputs.cuda()
            # skill_ids = skill_ids.cuda()
            #item_ids = item_ids.cuda()
            preds, weights = model(item_inputs, label_inputs, item_ids, rel, time)
            loss = compute_loss(preds, labels, criterion)
            preds = torch.sigmoid(preds).detach().cpu()
            train_auc, train_acc = compute_auc(preds, labels.cpu())
            model.zero_grad()
            loss.backward()
            clip_grad_norm_(model.parameters(), grad_clip)
            optimizer.step()
            step += 1
            metrics.store({'loss/train': loss.item()})
            metrics.store({'auc/train': train_auc})

            # print(step)
            if step % 1000 == 0:
                print(metrics.average())
                print(step)

                # weights = {"weight/" + name: param for name, param in model.named_parameters()}
                # grads = {"grad/" + name: param.grad
                #         for name, param in model.named_parameters() if param.grad is not None}
                # logger.log_histograms(weights, step)
                # logger.log_histograms(grads, step)            
        # Logging
        torch.save(weights, 'weight_tensor_rel')
        # Validation

        model.eval()
        for data, labels in test_generator:
            item_inputs, label_inputs, item_ids, timestamp = data
            # rel = compute_corr(item_inputs, item_ids, corr_data)
            rel = torch.Tensor(corr_data[(item_ids-1).cpu().unsqueeze(1).repeat(1,item_ids.shape[-1],1),(item_inputs-1).cpu().unsqueeze(-1).repeat(1,1,item_inputs.shape[-1])]).cuda()
            time = computeRePos(timestamp, timespan)
            with torch.no_grad():
                preds,weights = model(item_inputs, label_inputs, item_ids, rel, time)
                preds = torch.sigmoid(preds).cpu()
            val_auc, val_acc = compute_auc(preds, labels.cpu())
            metrics.store({'auc/val': val_auc, 'acc/val': val_acc})
            gc.collect()
        model.train()

        # Save model

        average_metrics = metrics.average()
        logger.log_scalars(average_metrics, step)
        print(average_metrics)
        
        stop = saver.save(average_metrics['auc/val'], model)
        if stop:
            break

parser = argparse.ArgumentParser(description='Train RKT.')
parser.add_argument('--dataset', type=str)
parser.add_argument('--logdir', type=str, default='runs/rkt')
parser.add_argument('--savedir', type=str, default='./')
parser.add_argument('--patience', type=int, default=5)
parser.add_argument('--max_length', type=int, default=50)
parser.add_argument('--embed_size', type=int, default=64)
parser.add_argument('--num_attn_layers', type=int, default=1)
parser.add_argument('--num_heads', type=int, default=4)
parser.add_argument('--encode_pos', action='store_true')
parser.add_argument('--max_pos', type=int, default=10)
parser.add_argument('--drop_prob', type=float, default=0.1)
parser.add_argument('--batch_size', type=int, default=128)
parser.add_argument('--l1', type=float, default=0.5)
parser.add_argument('--l2', type=float, default=0.5)
parser.add_argument('--lr', type=float, default=1e-3)
parser.add_argument('--grad_clip', type=float, default=10)
parser.add_argument('--num_epochs', type=int, default=300)
parser.add_argument('--timespan', default=100000, type=int)

args = parser.parse_args(args=[])

# full_df = pd.read_csv('./', sep=",")
# train_df = pd.read_csv('../../KT-GAT/data/ed_net2_train.csv', sep=",")
# test_df = pd.read_csv('../../KT-GAT/data/ed_net2_test.csv', sep=",")
# # train_data_file = '../KT-GAT/data/ed_net.csv'
# print(len(train_data))

train_data, test_data, corr_data, pro_num, timestamp = get_data_assistments(batch_size=args.batch_size)

process = psutil.Process(os.getpid())
gc.enable()
memory_0= process.memory_info().rss
print('Memory for leading train_data and corr_data: ', memory_0)
# num_items = int(full_df["item_id"].max() + 1)
# num_skills = int(full_df["skill_id"].max() + 1)
num_items = pro_num
model = RKT(num_items, args.embed_size, args.num_attn_layers, args.num_heads,
              args.encode_pos, args.max_pos, args.drop_prob, args.l1, args.l2).cuda()
optimizer = Adam(model.parameters(), lr=args.lr)
memory_1= process.memory_info().rss
print('Memory for model definition: ', memory_1-memory_0)


# Reduce batch size until it fits on GPU
while True:
    #try:
        # Train
    param_str = (f'{args.dataset},'
                 f'batch_size={args.batch_size},'
                 f'max_length={args.max_length},'
                 f'encode_pos={args.encode_pos},'
                 f'max_pos={args.max_pos}')
    logger = Logger(os.path.join(args.logdir, param_str))
    saver = Saver(args.savedir, param_str, patience = 20)
    print('before train', process.memory_info().rss)
    train(train_data, test_data, pro_num, corr_data, timestamp, args.timespan, model, optimizer, logger, saver, args.num_epochs,
          args.batch_size, args.grad_clip)
    break
    #except RuntimeError:
     #   args.batch_size = args.batch_size // 2
      #  print(RuntimeError)
       # print(f'Batch does not fit on gpu, reducing size to {args.batch_size}')

logger.close()

param_str = (f'{args.dataset},'
              f'batch_size={args.batch_size},'
              f'max_length={args.max_length},'
              f'encode_pos={args.encode_pos},'
              f'max_pos={args.max_pos}')
saver = Saver(args.savedir, param_str)
model = saver.load()

# Predict on test set
print("pre eval")
model.eval()
print("post eval")
correct = np.empty(0)
i=0
test_preds= np.empty(0)
for data, labels in test_data:
    item_inputs, label_inputs, item_ids, timestamp = data
    rel = torch.Tensor(corr_data[(item_ids-1).cpu().unsqueeze(1).repeat(1,item_ids.shape[-1],1),(item_inputs-1).cpu().unsqueeze(-1).repeat(1,1,item_inputs.shape[-1])]).cuda()
    # skill_inputs = skill_inputs.cuda()
    time = computeRePos(timestamp, args.timespan)
    # skill_ids = skill_ids.cuda()
    with torch.no_grad():
        preds,weights = model(item_inputs, label_inputs, item_ids, rel, time)
        preds = torch.sigmoid(preds[labels >= 0]).flatten().cpu().numpy()
        test_preds = np.concatenate([test_preds, preds])
        if(i%100):
            print(test_preds.shape)
    labels = labels[labels>=0].float()
    correct = np.concatenate([correct, labels.cpu()])
    if(i%100):
        print(correct.shape)
    i+=1

print(correct.shape)
print(test_preds.shape)
print("auc_test = ", roc_auc_score(correct, test_preds))
#print("acc_test = ", accuracy_score(correct, test_preds))

[['2013-05-22T14:44:43.483000' '2013-06-05T13:00:49.989000'
  '2013-06-05T13:00:44.274000' ... '-001-01-01T00:00:00.000000'
  '-001-01-01T00:00:00.000000' '-001-01-01T00:00:00.000000']
 ['2012-10-29T20:10:34.873000' '2012-10-29T20:11:44.094000'
  '2012-10-29T20:11:19.607000' ... '-001-01-01T00:00:00.000000'
  '-001-01-01T00:00:00.000000' '-001-01-01T00:00:00.000000']
 ['2013-05-01T15:27:59.539000' '2013-05-01T15:26:42.812000'
  '2013-05-01T15:23:37.397000' ... '-001-01-01T00:00:00.000000'
  '-001-01-01T00:00:00.000000' '-001-01-01T00:00:00.000000']
 ...
 ['2013-08-31T20:48:18.656000' '2013-08-31T21:26:43.716000'
  '2013-08-31T21:26:51.992000' ... '-001-01-01T00:00:00.000000'
  '-001-01-01T00:00:00.000000' '-001-01-01T00:00:00.000000']
 ['2013-08-31T20:48:15.280000' '2013-08-31T20:53:57.370000'
  '2013-08-31T20:50:45.586000' ... '-001-01-01T00:00:00.000000'
  '-001-01-01T00:00:00.000000' '-001-01-01T00:00:00.000000']
 ['2013-08-31T23:00:25.289000' '2013-08-31T23:02:02.601000'
  '2013-08



Corr_data computation
Memory for leading train_data and corr_data:  3307851776
Memory for model definition:  0
before train 3307851776
entered train 3307851776
PB memory used:  3307851776
in epoch0
Prepare batches train
Prepare batches val
Step 172, {'loss/train': 0.6037166267633438, 'auc/train': 0.5892482111656282, 'auc/val': 0.6532213510857279, 'acc/val': 0.701852509887192}
{'loss/train': 0.6037166267633438, 'auc/train': 0.5892482111656282, 'auc/val': 0.6532213510857279, 'acc/val': 0.701852509887192}
in epoch1
Prepare batches train
Prepare batches val
Step 344, {'loss/train': 0.5769526161426721, 'auc/train': 0.6605854956250131, 'auc/val': 0.6735065843476017, 'acc/val': 0.7050610927701146}
{'loss/train': 0.5769526161426721, 'auc/train': 0.6605854956250131, 'auc/val': 0.6735065843476017, 'acc/val': 0.7050610927701146}
in epoch2
Prepare batches train
Prepare batches val
Step 516, {'loss/train': 0.5574156000863674, 'auc/train': 0.69930380240209, 'auc/val': 0.6889951848486534, 'acc/val': 

Step 4300, {'loss/train': 0.4704232110187065, 'auc/train': 0.8089085062555554, 'auc/val': 0.690045202861349, 'acc/val': 0.7021676946869964}
{'loss/train': 0.4704232110187065, 'auc/train': 0.8089085062555554, 'auc/val': 0.690045202861349, 'acc/val': 0.7021676946869964}
in epoch25
Prepare batches train
Prepare batches val
Step 4472, {'loss/train': 0.4666040730337764, 'auc/train': 0.8127105473886616, 'auc/val': 0.6860974595785896, 'acc/val': 0.6987462955190038}
{'loss/train': 0.4666040730337764, 'auc/train': 0.8127105473886616, 'auc/val': 0.6860974595785896, 'acc/val': 0.6987462955190038}
in epoch26
Prepare batches train
Prepare batches val
