In [None]:
import os
import csv
import sys
# Put the parent directory first on the module search path; presumably this is
# what lets the `environments` package imported below resolve when the
# notebook is run from its own folder.
sys.path.insert(0, '..')

import pandas as pd
import tensorflow as tf
import numpy as np

from environments.education.DKT_model import get_custom_DKT_model
# Record the TensorFlow version in the notebook output.
print(tf.__version__)


# Load the dataset

In [None]:
# Folder with the saved artefacts; also handed to get_custom_DKT_model later on.
saved_data_folder = '../education/saved_data'
# Text file listing the known problems, one integer id per line.
all_problems_file_name = saved_data_folder + '/all_problems.txt'

# Train / test CSV files, relative to the notebook's working directory.
train_file_name, test_file_name = (
    'DKT_data/builder_train.csv',
    'DKT_data/builder_test.csv',
)

In [None]:
# Problem vocabulary: the position of a problem in this list is the index that
# load_file_to_df assigns to it.
all_problems = []

with open(all_problems_file_name, 'r') as filehandle:
    for line in filehandle:
        # strip() rather than line[:-1]: a final line without a trailing
        # newline would otherwise lose its last digit, and blank lines are
        # skipped instead of crashing int('').
        problem = line.strip()
        if problem:
            all_problems.append(int(problem))

n_problems = len(all_problems)
n_features = 2*n_problems # two input features per problem (see `problem_answer` in create_dataset_from_df)
batch_size = 100 # Batch size
val_fraction = 0.2 # Fraction of the training batches held out for validation
MASK = -1. # Padding value used by create_dataset_from_df when batching sequences

In [None]:
def load_file_to_df(file_name, all_problems):
    """Parse a three-rows-per-student CSV file into a long-format DataFrame.

    Every student occupies three consecutive rows of the file:
      1. the number of problems the student attempted (first cell),
      2. the identifiers of those problems,
      3. the matching answers, read as integers.

    Students whose attempt count is 1 or less are skipped entirely and do not
    consume a student id.

    Args:
        file_name: Path of the CSV file to read.
        all_problems: Sequence of known problem identifiers. Each problem is
            encoded as the position of its first occurrence in this sequence.

    Returns:
        pandas.DataFrame with one row per attempt and the columns
        ``student_id`` (consecutive integers starting at 0), ``problem_id``
        (index into ``all_problems``) and ``correctness``.

    Raises:
        ValueError: If a problem in the file is not listed in ``all_problems``
            or a cell cannot be parsed as an integer.
        IndexError: If a row holds fewer cells than the announced attempt count.
    """
    # Build the id -> index lookup once instead of scanning the list for every
    # attempt. setdefault keeps the first occurrence, matching list.index().
    problem_to_index = {}
    for index, known_problem in enumerate(all_problems):
        problem_to_index.setdefault(known_problem, index)

    student_number = 0
    skip_student = False
    num_student_problems = 0

    students_ids = []
    students_problems = []
    students_answers = []

    with open(file_name, 'r', newline='') as csvfile:
        for line_num, row in enumerate(csv.reader(csvfile)):
            row_kind = line_num % 3
            if row_kind == 0:
                # Header row of a student record: number of attempts.
                num_student_problems = int(row[0])
                skip_student = num_student_problems <= 1
            elif skip_student:
                continue
            elif row_kind == 1:
                # Problem row: only the first `num_student_problems` cells are used.
                for i in range(num_student_problems):
                    problem = int(row[i])
                    if problem not in problem_to_index:
                        raise ValueError(
                            f"problem {problem} is not listed in all_problems")
                    students_ids.append(student_number)
                    students_problems.append(problem_to_index[problem])
            else:
                # Answer row: closes the record, so the student id advances here.
                for i in range(num_student_problems):
                    students_answers.append(int(row[i]))
                student_number += 1

    data_dict = {
        'student_id': students_ids,
        'problem_id': students_problems,
        'correctness': students_answers,
    }
    return pd.DataFrame(data_dict)

In [None]:
def create_dataset_from_df(df, n_problems, n_features, batch_size=32, shuffle=True,
                           reshuffle_each_iteration=False):
    """Turn a long-format attempts DataFrame into a padded, batched tf.data.Dataset.

    For every student the attempts are converted into
      * inputs: one-hot vectors (depth ``2 * n_problems``) of
        ``problem_id * 2 + correctness`` for all attempts but the last, and
      * targets: for all attempts but the first, the one-hot problem id
        (depth ``n_problems``) concatenated with the correctness label.
    Sequences are padded per batch with the module-level ``MASK`` value and
    incomplete final batches are dropped.

    Args:
        df: DataFrame with ``student_id``, ``problem_id`` and ``correctness``
            columns. It is not modified.
        n_problems: Number of distinct problems; fixes the one-hot depths.
        n_features: Unused; the input depth is derived as ``2 * n_problems``.
            Kept so existing callers keep working.
        batch_size: Number of students per batch.
        shuffle: Whether to shuffle students before batching.
        reshuffle_each_iteration: Forwarded to ``Dataset.shuffle``. Defaults to
            False so that the element order is identical on every pass. A
            ``take``/``skip`` split of the returned dataset then stays disjoint
            across epochs; with True, held-out students would drift into the
            training part on later epochs. To shuffle per epoch, shuffle the
            training split after taking it.

    Returns:
        Tuple ``(dataset, nb_users, length)``: the batched dataset, the number
        of students in ``df`` and the number of full batches
        (``nb_users // batch_size``).
    """
    # Derive the combined token on a new frame so the caller's df is untouched.
    encoded = df.assign(problem_answer=df['problem_id'] * 2 + df['correctness'])

    # One (inputs, target problems, target labels) triple per student; inputs
    # are shifted by one step relative to the targets.
    seq = encoded.groupby('student_id').apply(
        lambda r: (
            r['problem_answer'].values[:-1],
            r['problem_id'].values[1:],
            r['correctness'].values[1:],
        )
    )
    nb_users = len(seq)

    dataset = tf.data.Dataset.from_generator(
        generator=lambda: seq,
        output_types=(tf.int32, tf.int32, tf.float32)
    )

    if shuffle:
        dataset = dataset.shuffle(
            buffer_size=nb_users,
            reshuffle_each_iteration=reshuffle_each_iteration,
        )

    features_depth = n_problems*2
    skill_depth = n_problems

    dataset = dataset.map(
        lambda feat, skill, label: (
            tf.one_hot(feat, depth=features_depth),
            tf.concat(
                values=[tf.one_hot(skill, depth=skill_depth), tf.expand_dims(label, -1)],
                axis=-1,
            )
        )
    )

    dataset = dataset.padded_batch(
        batch_size=batch_size,
        padding_values=(MASK, MASK),
        padded_shapes=([None, None], [None, None]),
        drop_remainder=True
    )

    # drop_remainder=True above means only full batches are produced.
    length = nb_users // batch_size
    return dataset, nb_users, length

In [None]:
def train_val_split(dataset, total_length, val_fraction):
    """Split a dataset into a leading training part and a trailing validation part.

    The first ``train_length`` elements (via ``dataset.take``) form the
    training set and the rest (via ``dataset.skip``) the validation set. The
    two parts are only disjoint if ``dataset`` yields its elements in the same
    order on every iteration.

    Args:
        dataset: Object exposing ``take(n)`` and ``skip(n)``.
        total_length: Number of elements in ``dataset``.
        val_fraction: Fraction of the elements reserved for validation.

    Returns:
        Tuple ``(train_set, val_set, train_length, val_length)``.
    """
    # Remove binary floating-point noise before truncating: for example
    # 10 * (1 - 0.9) evaluates to 0.9999999999999998, which a plain int()
    # would turn into an empty training set.
    train_length = int(round(total_length * (1 - val_fraction), 9))
    train_set = dataset.take(train_length)
    val_set = dataset.skip(train_length)
    return train_set, val_set, train_length, total_length - train_length

In [None]:
# Parse the raw train / test files into long-format DataFrames.
all_train_data_df = load_file_to_df(train_file_name, all_problems)
test_data_df = load_file_to_df(test_file_name, all_problems)
n_features = len(all_problems)*2
n_problems = len(all_problems)

all_train_dataset, all_train_total, all_train_batches = create_dataset_from_df(all_train_data_df, n_problems, n_features, batch_size = batch_size)
# The test set is built without shuffling: create_dataset_from_df drops the
# last incomplete batch, so a shuffled order would discard a different set of
# students on each run and make the test metrics irreproducible.
test_dataset, test_total, test_batches  = create_dataset_from_df(test_data_df, n_problems, n_features, batch_size = batch_size, shuffle = False)

# The split is made on whole batches (all_train_batches), not on students.
# NOTE(review): take/skip only gives disjoint train / validation sets if
# all_train_dataset iterates in the same order every time.
train_dataset, val_dataset, train_batches, val_batches = train_val_split(all_train_dataset, all_train_batches, val_fraction)

In [None]:
# Report the number of students each split actually contains. The split is
# made on whole batches and incomplete batches are dropped, so the sizes are
# batch counts times batch_size rather than fractions of the student count.
print("Training set size: %d" % (train_batches * batch_size))
print("Validation set size: %d" % (val_batches * batch_size))
print("Testing set size: %d" % (test_batches * batch_size))
print("Number of skills: %d" % n_problems)
# Counts every student loaded from both files, including those that fall into
# a dropped incomplete batch, so it can exceed the sum of the sizes above.
print("Total number of students: %d" % (all_train_total + test_total))

# Load the student model used in the experiment

In [None]:
# Build the DKT student model, pointing it at the saved-data folder configured above.
student_model = get_custom_DKT_model(
    saved_data_folder=saved_data_folder,
)

### Train model

In [None]:
save_model_folder = 'model_weights'
save_model_name = 'model_weights_test'
log_dir = "logs" # Path to save the logs.

# makedirs(exist_ok=True) tolerates an already existing folder but, unlike a
# bare `except: pass`, still surfaces real failures such as missing permissions.
os.makedirs(save_model_folder, exist_ok=True)
# A log file is written inside log_dir during training, so create the folder
# up front rather than relying on the logger to do it.
os.makedirs(log_dir, exist_ok=True)

save_model_file = f"{save_model_folder}/{save_model_name}"
epochs = 20 # Number of epochs to train
verbose = 1
shuffle = True

In [None]:
# Callbacks handed to training: a CSV log of the run, a weights-only checkpoint
# that is only overwritten when the monitored metric improves, and TensorBoard
# event files written to the same log folder.
callbacks = [
    tf.keras.callbacks.CSVLogger(filename=f"{log_dir}/train.log"),
    tf.keras.callbacks.ModelCheckpoint(
        filepath=save_model_file,
        save_weights_only=True,
        save_best_only=True,
    ),
    tf.keras.callbacks.TensorBoard(log_dir=log_dir),
]

student_model.train(train_dataset, val_dataset, epochs, verbose, callbacks, shuffle)

### Evaluate Model

In [None]:
# Evaluate the student model on the test dataset. This is the last expression
# of the cell, so whatever it returns is shown as the cell's output.
student_model.evaluate(test_dataset, verbose)