In [3]:
import numpy as np
import pandas as pd
from datetime import date, datetime
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split,KFold, cross_val_score
from sklearn import metrics
from sklearn.metrics import roc_auc_score
import pickle
from scipy import stats
import pylab
import nn_util

In [13]:
# Load the pickled training DataFrame from the working directory.
# NOTE: pickle.load can execute arbitrary code while unpickling, so only
# open 'df_train.pickle' when it comes from a trusted source.
with open('df_train.pickle','rb') as pickle_file:
    df_train = pickle.load(pickle_file)

In [4]:
# Preview the first rows of the loaded training frame.
df_train.head()

Unnamed: 0,AveCarelessness,AveCorrect,AveKnow,AveResBored,AveResConf,AveResEngcon,AveResFrust,AveResGaming,AveResOfftask,ITEST_id,...,timeOver80,timeSinceSkill,timeTaken,totalFrAttempted,totalFrPastWrongCount,totalFrPercentPastWrong,totalFrSkillOpportunities,totalFrSkillOpportunitiesByScaffolding,totalFrTimeOnSkill,totalTimeByPercentCorrectForskill
0,0.183276,0.483902,0.352416,0.208389,0.115905,0.679126,0.112408,0.196561,0.156503,8,...,0,0.0,49.0,0,0,0.0,0,0.0,0.0,0.0
1,0.183276,0.483902,0.352416,0.208389,0.115905,0.679126,0.112408,0.196561,0.156503,8,...,0,0.0,4.0,1,0,0.0,1,0.0,49.0,106.0
2,0.183276,0.483902,0.352416,0.208389,0.115905,0.679126,0.112408,0.196561,0.156503,8,...,0,0.0,6.0,2,0,0.0,0,0.0,0.0,0.0
3,0.183276,0.483902,0.352416,0.208389,0.115905,0.679126,0.112408,0.196561,0.156503,8,...,0,0.0,18.0,3,1,0.0,1,0.0,0.0,0.0
4,0.183276,0.483902,0.352416,0.208389,0.115905,0.679126,0.112408,0.196561,0.156503,8,...,0,0.0,2.0,3,1,1.0,1,1.0,6.0,77.999999


In [14]:
# Columns kept for the rest of the notebook; every other column is dropped.
selected_columns = [
    'ITEST_id', 'AveCorrect', 'skill', 'problemId', 'attemptCount',
    'correct', 'consecutiveErrorsInRow', 'frIsHelpRequest',
    'totalFrPercentPastWrong', 'frPast5WrongCount', 'confidence(FRUSTRATED)',
]
# Restrict to those columns and keep only the first 1024 rows.
df_train = df_train[selected_columns][:1024]

In [15]:
# Configuration for sequence building and the train/validation split.
TIMESTEPS = 30          # passed to nn_util.make_sequences as sequence_len
BATCH_SIZE = 128        # split sizes are computed as multiples of this
VALIDATION_SPLIT = 0.2  # approximate fraction of sequences held out for validation

print('Loading data')
# Work on a copy so that nothing below writes back into df_train. The NaN fill
# at the end assigns into df, and the nn_util helpers may modify their argument
# in place (not visible from here); with an alias instead of a copy, re-running
# this cell could normalize already-normalized values.
df = df_train.copy()
print(str(len(df)) + ' rows')

print('Normalizing data')
numeric_cols = ['confidence(FRUSTRATED)','attemptCount','consecutiveErrorsInRow',]
df = nn_util.z_standardize(df, columns=numeric_cols, clip_magnitude=3)  # Winsorize.
df = nn_util.rescale(df, columns=numeric_cols)  # Rescale to [0, 1] range.
for col in numeric_cols:
    col_std = df[col].std()
    # Written as `not (std >= .1)` rather than `std < .1` so that a NaN standard
    # deviation (e.g. a column that is entirely NaN) also triggers the warning;
    # any comparison against NaN is False, so `std < .1` would skip such columns.
    if not col_std >= .1:
        print('WARNING! Column has low standard deviation: ' + col)
        print(df[col].describe())
# Replace any NaNs left in the numeric columns with 0.
df[numeric_cols] = df[numeric_cols].fillna(0)

print('Making sequences')
# Build the sequence data from the normalized numeric columns using the project
# helper. ITEST_id is passed as the participant id column and TIMESTEPS as the
# sequence length; presumably sequences are built per participant — confirm
# against nn_util.make_sequences.
# NOTE(review): the meaning of y_i is not visible here; it is only saved to disk
# further down in this cell.
X, y_i = nn_util.make_sequences(df, numeric_cols,
                                participant_id_col='ITEST_id',
                                sequence_len=TIMESTEPS,
                                verbose=True)

print('Splitting training/validation')
# Validation set: the trailing sequences, sized as a whole number of batches
# covering roughly VALIDATION_SPLIT of the data (rounded down).
n_val = int(len(X) / BATCH_SIZE * VALIDATION_SPLIT) * BATCH_SIZE
# Slice from an explicit start index instead of X[-n_val:]. When n_val is 0
# (too few sequences for a single validation batch), X[-0:] is X[0:], i.e. ALL
# of X, which would put every sequence in validation and leave training empty.
val_X = X[len(X) - n_val:]
# Round up training set size, so there might be some overlap (< BATCH_SIZE) to ensure full coverage.
# Exact integer ceiling division; an `int(x + .999)` style round-up stops being a
# true ceiling once BATCH_SIZE exceeds 1000.
n_train_batches = -(-(len(X) - n_val) // BATCH_SIZE)
train_X = X[:n_train_batches * BATCH_SIZE]

print('Saving')
# Every array file name ends with the sequence length, e.g. '...-30steps.npy'.
steps_suffix = str(TIMESTEPS) + 'steps.npy'
for file_prefix, array in (('sequences_train-', train_X),
                           ('sequences_val-', val_X),
                           ('sequences_y_i-', y_i)):
    np.save(file_prefix + steps_suffix, array)
# Record which columns (and in what order) were used to build the sequences.
pd.DataFrame(data=numeric_cols).to_csv('numeric_cols.txt')


Loading data
1024 rows
Normalizing data
Making sequences
Splitting training/validation
Saving


In [18]:
# Inspect the dimensions of the training array.
# NOTE(review): the recorded output shows a last dimension of 2, while
# numeric_cols lists three columns. If the last axis is the feature axis, this
# output is stale — re-run the notebook top to bottom to confirm.
train_X.shape

(896, 30, 2)