In [8]:
import os
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import torch
%pwd
%cd /Users/wenxindong/Desktop/Stanford/CS329P/project/riiid-test-answer-prediction
%ls
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

/Users/wenxindong/Desktop/Stanford/CS329P/project/riiid-test-answer-prediction
[34mAutogluonModels[m[m/                       cv1_valid.pickle.zip
cv1_train.pickle                       cv1_valid_100k.pickle
cv1_train.pickle.zip                   example_sample_submission.csv
cv1_train_10000k.pickle                example_test.csv
cv1_train_10000k_preprocessed.pickle   lectures.csv
cv1_train_1000k.pickle                 questions.csv
cv1_train_1000k_preprocessed.pickle    riiid-test-answer-prediction.zip
cv1_train_100k.pickle                  [34mriiideducation[m[m/
cv1_train_100k_preprocessed.pickle     test_4920_users.pickle
cv1_train_preprocessed.pickle          test_4920_users_preprocessed.pickle
cv1_val_100k_preprocessed.pickle       train.csv
cv1_val_10k.pickle                     train_39360_users.pickle
cv1_val_10k_preprocessed.pickle        train_39360_users_preprocessed.pickle
cv1_val_preprocessed.pickle            valid_4920_users.pickle
cv1_valid.pickle               

In [25]:
train_pickle = 'train_39360_users.pickle'  #about one tenth of the training dataset
valid_pickle = 'test_4920_users.pickle'   # too lazy to change name to test_pickle
question_file = 'questions.csv'
lecture_file = 'lectures.csv'

# Read data
train = pd.read_pickle(train_pickle)
valid = pd.read_pickle(valid_pickle)
lectures = pd.read_csv(lecture_file)
questions = pd.read_csv(question_file)
#subsample
train = train[:len(train)//10]
valid = valid[:len(valid)//10]


In [26]:
#Sort by time
train = train.sort_values(by=['timestamp'])
train = train.reset_index(drop=True)
valid = valid.sort_values(by=['timestamp'])
valid = valid.reset_index(drop=True)

In [27]:
train = train.drop(np.where(train['content_type_id'] == 1)[0], axis=0)
train = train.reset_index(drop=True)
print("# of Training Entries: " + str(train.shape[0]))

valid = valid.drop(np.where(valid['content_type_id'] == 1)[0], axis=0)
valid = valid.reset_index(drop=True)



# of Training Entries: 989400


In [28]:
#This is to map question_id into parts (Note that if we also want to include the lecture part, we can do so by concatenating questions.csv with lectures.csv)
train['part'] = np.array(questions['part'])[np.array(train['content_id'])]
valid['part'] = np.array(questions['part'])[np.array(valid['content_id'])]

In [29]:
#Should run only one time
train = train.drop(['row_id', 'content_id', 'content_type_id', 'task_container_id', 'user_answer', 'prior_question_elapsed_time', 'prior_question_had_explanation'], axis=1)

#Sort lexiographically
train = train.sort_values(by=['user_id', 'timestamp'])
train = train.reset_index(drop=True)

train = train.drop(['timestamp'], axis=1)

In [30]:
#Should run only one time
valid = valid.drop(['row_id', 'content_id', 'content_type_id', 'task_container_id', 'user_answer', 'prior_question_elapsed_time', 'prior_question_had_explanation'], axis=1)

#Sort lexiographically
valid = valid.sort_values(by=['user_id', 'timestamp'])
valid = valid.reset_index(drop=True)

valid = valid.drop(['timestamp'], axis=1)

# Empirical

In [31]:
def prep_data_BKT(data_df, part_id):
  train0 = data_df[data_df['part']==part_id]
  train0.reset_index(inplace=True)
  map, counts = np.unique(train0['user_id'], return_counts=True)
  cum_counts = np.cumsum(counts)
  start_idxs = np.append(0, cum_counts)
  return train0, start_idxs
def evaluate_BKT(train0, C, P_L0, P_T, P_G, P_S, start_idxs):
  #for every student
  predictions = []
  for i in range(len(start_idxs)-1):
    L = P_L0
    student_history = np.array(train0["answered_correctly"][start_idxs[i]: start_idxs[i+1]])
    for answer in student_history:
      prediction = L*(1-P_S) + (1-L)*P_G
      predictions.append(prediction)
      P_L_obs= 0
      if answer==1:
        P_L_obs = (L*(1-P_S)) / (L*(1-P_S)+ (1-L)*P_G)
      else:
        P_L_obs = (L*(P_S)) / (L*(P_S)+ (1-L)*(1-P_G))
      L = P_L_obs + (1-P_L_obs)*P_T
  predictions = np.array(predictions)
  # print(predictions)
  # plt.scatter(C[np.where(C==1)]+np.where(C==1), predictions[np.where(C==1)], c = "red", s = 0.1)
  # plt.scatter(C[np.where(C==0)]+np.where(C==0), predictions[np.where(C==0)], c = "green", s = 0.1)
  # plt.show()
  bce_loss = np.sum(- C*np.log(predictions) - (1-C)*np.log(1-predictions))/len(predictions)
  acc = np.sum(C*(predictions>0.5) + (1-C)*(predictions<0.5))/len(predictions)
  return bce_loss, acc, predictions

def get_best_ki(student_history):
    len_history = len(student_history)
    total_ones = np.sum(student_history)
    max_acc = 0
    best_i = 0
    num_mistakes_before = 0
    num_correct_before = 0
    for i in range(len_history+1):
      #i = position of first green check 
      acc =num_mistakes_before + (total_ones - num_correct_before)
      if acc>max_acc:
        best_i  = i
        max_acc = acc
      if i<len_history and student_history[i] == 0:
        num_mistakes_before+=1
      if i<len_history and student_history[i] == 1:
        num_correct_before+=1
    return best_i

def fit_BKT(train0, start_idxs):

  K = []
  for i in range(len(start_idxs)-1):
    student_history = np.array(train0["answered_correctly"][start_idxs[i]: start_idxs[i+1]])
    best_i = get_best_ki(student_history)
    student_ki = [0 for _ in range(best_i)] + [1 for _ in range(len(student_history) -  best_i)]
    K.extend(student_ki)
  
  K = np.array(K)
  C = np.array(train0["answered_correctly"])
  P_L0 = np.mean(K[start_idxs[:-1]])
  P_T = np.sum(K[1:]*(1-K[:-1])) / np.sum((1-K[:-1]))
  P_G = np.sum(C*(1-K)) / np.sum((1-K))
  P_S = np.sum((1-C)*(K)) / np.sum((K))
  return P_L0, P_T, P_G, P_S, C, K

all_predictions = []
all_targets = []
for part in range(1,8):
  print("training BKT model for part {}".format(part))
  #training
  train0, start_idxs = prep_data_BKT(train, part)
  print("there are {} rows, {} students".format(len(train0), len(start_idxs)))
  P_L0, P_T, P_G, P_S, C, K = fit_BKT(train0, start_idxs)
  bce_loss, acc, predictions = evaluate_BKT(train0, C,  P_L0, P_T, P_G, P_S, start_idxs)
  print("Training BCE losss: {:.3f}, acc: {:.3f}".format(bce_loss, acc))  

  #testing
  valid0, start_idxs_valid = prep_data_BKT(valid, part)
  _, _, _, _, C, _ = fit_BKT(valid0, start_idxs_valid)
  bce_loss, acc, predictions = evaluate_BKT(valid0, C, P_L0, P_T, P_G, P_S, start_idxs_valid)
  all_predictions.extend(predictions)
  all_targets.extend(C)
  print("BCE losss: {:.3f}, acc: {:.3f}".format(bce_loss, acc))

training BKT model for part 1
there are 76299 rows, 2806 students
Training BCE losss: 0.545, acc: 0.761
BCE losss: 0.582, acc: 0.732
training BKT model for part 2
there are 188335 rows, 3471 students
Training BCE losss: 0.583, acc: 0.725
BCE losss: 0.586, acc: 0.723
training BKT model for part 3
there are 87788 rows, 2009 students
Training BCE losss: 0.577, acc: 0.724
BCE losss: 0.566, acc: 0.734
training BKT model for part 4
there are 83160 rows, 1946 students
Training BCE losss: 0.609, acc: 0.693
BCE losss: 0.592, acc: 0.711
training BKT model for part 5
there are 396050 rows, 3750 students
Training BCE losss: 0.650, acc: 0.639
BCE losss: 0.656, acc: 0.630
training BKT model for part 6
there are 108879 rows, 1976 students
Training BCE losss: 0.614, acc: 0.683
BCE losss: 0.625, acc: 0.679
training BKT model for part 7
there are 48889 rows, 1815 students
Training BCE losss: 0.589, acc: 0.711
BCE losss: 0.578, acc: 0.727


In [41]:
#calculate overall accuracy and AUC
accuracy = np.sum((np.array(all_predictions) >0) +0 == np.array(all_targets)+0) / len(all_targets)
print(f"overall accuracy on the test set is {accuracy}")
from sklearn.metrics import roc_auc_score

auc = roc_auc_score(all_targets, all_predictions)
print(f"overall auc score is {auc}")

overall accuracy on the test set is 0.6721045417469034
overall auc score is 0.6094982534491316
