In [1]:
import os
import random
import numpy as np
import pandas as pd
from collections import defaultdict
from tsfresh import extract_features

import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm

import gc
import lightgbm as lgb
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score, log_loss, confusion_matrix, accuracy_score, roc_auc_score, roc_curve
# NOTE(review): hard-coded absolute path to one user's machine; every relative
# file name used later in the notebook is resolved against this directory, so the
# notebook only runs where this path exists. Consider a configurable DATA_DIR.
%cd /Users/wenxindong/Desktop/Stanford/CS329P/project/riiid-test-answer-prediction

# Echo the working directory so the saved output records where files are read from.
%pwd

/Users/wenxindong/Desktop/Stanford/CS329P/project/riiid-test-answer-prediction


'/Users/wenxindong/Desktop/Stanford/CS329P/project/riiid-test-answer-prediction'

In [None]:
# Global seed shared by every random number generator seeded below
SEED = 123


def seed_everything(seed):
    """Seed the random number sources this notebook relies on.

    Seeds Python's `random` module and NumPy's legacy global generator, and
    exports PYTHONHASHSEED.

    Args:
        seed: integer seed; it is stringified for the environment variable.
    """
    # NOTE: PYTHONHASHSEED is read at interpreter start-up, so this assignment
    # only influences processes launched after this call.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    random.seed(seed)


seed_everything(SEED)

### Sample users for Train, Valid, Test sets

In [11]:
# Load the full interaction log; the file name is resolved against the current
# working directory.
train = pd.read_csv(filepath_or_buffer="train.csv")

In [24]:
# Sorted array of distinct user ids. The per-user counts that `return_counts=True`
# used to compute were discarded immediately, so they are no longer requested.
users = np.unique(train["user_id"])
print(f"there are {len(users)} unique users")

there are 393656 unique users


In [67]:
# NOTE(review): this cell re-seeds NumPy with 0 rather than the notebook-wide
# SEED; the value is kept so the sampled users (and the pickles derived from
# them) stay the same.
np.random.seed(0)
# randomly sample 1/8 users (rounded down to a multiple of 10 so the 8/1/1 split is exact)
num_samples_needed = (len(users)//80)*10
sampled_users = np.random.choice(users, size = (num_samples_needed,), replace=False)
print(sampled_users)
print(f"sampled {num_samples_needed} users")
# train, valid, test split = 8,1,1
# Integer arithmetic: num_samples_needed is a multiple of 10, so floor division
# is exact and avoids a float round-trip.
train_end = num_samples_needed // 10 * 8
valid_end = num_samples_needed // 10 * 9
train_users = set(sampled_users[:train_end])
valid_users = set(sampled_users[train_end:valid_end])
test_users = set(sampled_users[valid_end:])
print(f"{len(train_users)} train users, {len(valid_users)} validation users, {len(test_users)} test users")

# reset_index returns a new frame; the result must be assigned, otherwise the
# subsets keep the row labels of the full `train` frame.
train_copy = train[train["user_id"].isin(train_users)].reset_index(drop=True)
valid_copy = train[train["user_id"].isin(valid_users)].reset_index(drop=True)
test_copy = train[train["user_id"].isin(test_users)].reset_index(drop=True)
# Last expression of the cell: display the test subset.
test_copy

[ 992044884 1097305404 1460737564 ...  467215695   46051913 1191555820]
sampled 49200 users
39360 train users, 4920 validation users, 4920 test users


Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation
0,16608,0,157207,7900,0,0,0,1,,
1,16609,27842,157207,7876,0,1,2,0,18000.0,False
2,16610,51144,157207,175,0,2,1,0,25000.0,False
3,16611,69173,157207,1278,0,3,3,1,21000.0,False
4,16612,195730,157207,2065,0,4,2,1,16000.0,False
...,...,...,...,...,...,...,...,...,...,...
1214910,101215563,380148,2147044811,769,0,9,3,1,19000.0,True
1214911,101215564,418586,2147044811,10687,0,10,0,0,15000.0,True
1214912,101215565,454803,2147044811,1170,0,11,3,0,17000.0,True
1214913,101215566,489799,2147044811,1170,0,12,0,1,16000.0,True


## Save time series data to use in transformer models

In [None]:
# Persist each user-level split next to the raw data so later cells can reload
# it without re-sampling.
for split_frame, split_path in (
    (train_copy, "train_39360_users.pickle"),
    (valid_copy, "valid_4920_users.pickle"),
    (test_copy, "test_4920_users.pickle"),
):
    split_frame.to_pickle(split_path)

## Feature engineering

In [83]:
# Peek at the target column.
# NOTE(review): the saved output has 9897114 rows, the same count the feature
# loop reports after lectures are filtered out, so this cell was presumably last
# run after `train_copy` had been rebound to the preprocessed frame - confirm
# with a fresh Restart & Run All.
train_copy.loc[:, 'answered_correctly']

0          0
1          0
2          0
3          0
4          0
          ..
9897109    0
9897110    1
9897111    0
9897112    0
9897113    0
Name: answered_correctly, Length: 9897114, dtype: int64

In [87]:
# Peek at the elapsed-time column of the validation split.
# NOTE(review): the first saved value (25381.92...) looks like a mean-filled
# entry, which suggests this output was produced on the preprocessed frame -
# confirm with a fresh Restart & Run All.
valid_copy.loc[:, "prior_question_elapsed_time"]

0          25381.922455
1          23000.000000
2          35000.000000
3          29000.000000
4          20000.000000
               ...     
1267248    51000.000000
1267249    73000.000000
1267250    73000.000000
1267251    73000.000000
1267252    73000.000000
Name: prior_question_elapsed_time, Length: 1267253, dtype: float64

In [88]:
# Peek at the elapsed-time column of the test split.
# NOTE(review): the saved output is identical to the validation cell's output
# (1267253 rows) while the feature loop processed 1191168 test rows, so this
# output looks stale - re-run on a fresh kernel before trusting it.
test_copy.loc[:, 'prior_question_elapsed_time']

0          25381.922455
1          23000.000000
2          35000.000000
3          29000.000000
4          20000.000000
               ...     
1267248    51000.000000
1267249    73000.000000
1267250    73000.000000
1267251    73000.000000
1267252    73000.000000
Name: prior_question_elapsed_time, Length: 1267253, dtype: float64

In [69]:
# Function for user / question running statistics, computed with an explicit loop
def add_features(df, answered_correctly_u_count, answered_correctly_u_sum, elapsed_time_u_sum, explanation_u_sum, timestamp_u, timestamp_u_incorrect, answered_correctly_q_count, answered_correctly_q_sum, elapsed_time_q_sum, explanation_q_sum, answered_correctly_uq, update = True):
    """Append running user / question statistics to `df`, one row at a time.

    Rows are visited in the order they appear in `df`. For every row the
    features are first read from the state dictionaries (so they only reflect
    rows seen before the current one) and the dictionaries are then updated
    with the current row. All dictionaries are mutated in place, which lets the
    same state be carried across successive calls.

    Args:
        df: frame with the columns 'user_id', 'answered_correctly',
            'content_id', 'prior_question_elapsed_time',
            'prior_question_had_explanation' and 'timestamp'.
        answered_correctly_u_count: user -> number of rows seen.
        answered_correctly_u_sum: user -> sum of 'answered_correctly'.
        elapsed_time_u_sum: user -> sum of 'prior_question_elapsed_time'.
        explanation_u_sum: user -> sum of 'prior_question_had_explanation'.
        timestamp_u: user -> list with up to the 3 most recent timestamps.
        timestamp_u_incorrect: user -> list holding the timestamp of the most
            recent incorrectly answered row (at most one entry).
        answered_correctly_q_count: content_id -> number of rows seen.
        answered_correctly_q_sum: content_id -> sum of 'answered_correctly'.
        elapsed_time_q_sum: content_id -> sum of 'prior_question_elapsed_time'.
        explanation_q_sum: content_id -> sum of 'prior_question_had_explanation'.
        answered_correctly_uq: user -> content_id -> number of rows seen.
        update: when True the label-dependent state (the two
            `answered_correctly_*_sum` dictionaries and `timestamp_u_incorrect`)
            is updated as well; when False only label-free state is updated.

    The dictionaries must create missing keys on access (the caller in this
    notebook passes `defaultdict` instances).

    Returns:
        A new frame: `df` with the 11 feature columns appended. Features with
        no history yet are NaN (the user/question pair count is 0 instead).
    """
    n_rows = len(df)

    def _nan_column():
        # Features default to NaN and are only overwritten when history exists.
        return np.full(n_rows, np.nan, dtype = np.float32)

    # -----------------------------------------------------------------------
    # Client features
    answered_correctly_u_avg = _nan_column()
    elapsed_time_u_avg = _nan_column()
    explanation_u_avg = _nan_column()
    timestamp_u_recency_1 = _nan_column()
    timestamp_u_recency_2 = _nan_column()
    timestamp_u_recency_3 = _nan_column()
    timestamp_u_incorrect_recency = _nan_column()
    # -----------------------------------------------------------------------
    # Question features
    answered_correctly_q_avg = _nan_column()
    elapsed_time_q_avg = _nan_column()
    explanation_q_avg = _nan_column()
    # -----------------------------------------------------------------------
    # User Question
    answered_correctly_uq_count = np.zeros(n_rows, dtype = np.int32)
    # -----------------------------------------------------------------------

    # recency_k = time since the k-th most recent previous row of the same user
    recency_columns = (timestamp_u_recency_1, timestamp_u_recency_2, timestamp_u_recency_3)

    columns = ['user_id', 'answered_correctly', 'content_id', 'prior_question_elapsed_time', 'prior_question_had_explanation', 'timestamp']
    for num, row in tqdm(enumerate(df[columns].values), total = n_rows):
        user, correct, question, elapsed, explained, timestamp = row

        # Client features assignation
        # ------------------------------------------------------------------
        u_count = answered_correctly_u_count[user]
        if u_count != 0:
            answered_correctly_u_avg[num] = answered_correctly_u_sum[user] / u_count
            elapsed_time_u_avg[num] = elapsed_time_u_sum[user] / u_count
            explanation_u_avg[num] = explanation_u_sum[user] / u_count

        history = timestamp_u[user]
        for lookback, recency in enumerate(recency_columns, start = 1):
            if len(history) >= lookback:
                recency[num] = timestamp - history[-lookback]

        last_incorrect = timestamp_u_incorrect[user]
        if last_incorrect:
            timestamp_u_incorrect_recency[num] = timestamp - last_incorrect[-1]

        # ------------------------------------------------------------------
        # Question features assignation
        q_count = answered_correctly_q_count[question]
        if q_count != 0:
            answered_correctly_q_avg[num] = answered_correctly_q_sum[question] / q_count
            elapsed_time_q_avg[num] = elapsed_time_q_sum[question] / q_count
            explanation_q_avg[num] = explanation_q_sum[question] / q_count

        # ------------------------------------------------------------------
        # Client Question assignation
        answered_correctly_uq_count[num] = answered_correctly_uq[user][question]
        # ------------------------------------------------------------------
        # Client features updates
        answered_correctly_u_count[user] += 1
        elapsed_time_u_sum[user] += elapsed
        explanation_u_sum[user] += int(explained)
        # Keep a rolling window of the 3 most recent timestamps
        if len(history) == 3:
            history.pop(0)
        history.append(timestamp)
        # ------------------------------------------------------------------
        # Question features updates
        answered_correctly_q_count[question] += 1
        elapsed_time_q_sum[question] += elapsed
        explanation_q_sum[question] += int(explained)
        # ------------------------------------------------------------------
        # Client Question updates
        answered_correctly_uq[user][question] += 1
        # ------------------------------------------------------------------
        # Flag for training and inference
        if update:
            # ------------------------------------------------------------------
            # Client features updates
            answered_correctly_u_sum[user] += correct
            if correct == 0:
                # Only the latest incorrect timestamp is remembered
                if len(last_incorrect) == 1:
                    last_incorrect.pop(0)
                last_incorrect.append(timestamp)

            # ------------------------------------------------------------------
            # Question features updates
            answered_correctly_q_sum[question] += correct
            # ------------------------------------------------------------------

    # Reuse df's index: concat(axis=1) aligns on index labels, so a fresh
    # RangeIndex would misalign (and NaN-pad) any df without default labels.
    user_df = pd.DataFrame({'answered_correctly_u_avg': answered_correctly_u_avg, 'elapsed_time_u_avg': elapsed_time_u_avg, 'explanation_u_avg': explanation_u_avg,
                            'answered_correctly_q_avg': answered_correctly_q_avg, 'elapsed_time_q_avg': elapsed_time_q_avg, 'explanation_q_avg': explanation_q_avg,
                            'answered_correctly_uq_count': answered_correctly_uq_count, 'timestamp_u_recency_1': timestamp_u_recency_1, 'timestamp_u_recency_2': timestamp_u_recency_2,
                            'timestamp_u_recency_3': timestamp_u_recency_3, 'timestamp_u_incorrect_recency': timestamp_u_incorrect_recency},
                           index = df.index)

    df = pd.concat([df, user_df], axis = 1)
    return df
        
def update_features(df, answered_correctly_u_sum, answered_correctly_q_sum, timestamp_u_incorrect):
    """Fold the labels of `df` into the label-dependent state dictionaries.

    Only rows whose 'content_type_id' equals 0 are used. For each such row the
    label is added to the per-user and per-question sums, and an incorrect
    answer replaces the user's remembered "last incorrect" timestamp.

    Args:
        df: frame with the columns 'user_id', 'answered_correctly',
            'content_id', 'content_type_id' and 'timestamp'.
        answered_correctly_u_sum: user -> sum of 'answered_correctly'.
        answered_correctly_q_sum: content_id -> sum of 'answered_correctly'.
        timestamp_u_incorrect: user -> list with at most one timestamp.

    Returns:
        None; the three dictionaries are mutated in place.
    """
    wanted = ['user_id', 'answered_correctly', 'content_id', 'content_type_id', 'timestamp']
    for user, correct, question, content_type, timestamp in df[wanted].values:
        # Skip every row that is not flagged with content type 0
        if content_type != 0:
            continue

        # Per-user running sum of the label
        answered_correctly_u_sum[user] += correct
        if correct == 0:
            latest_miss = timestamp_u_incorrect[user]
            # The list never holds more than one entry: evict before storing
            if len(latest_miss) == 1:
                latest_miss.pop(0)
            latest_miss.append(timestamp)

        # Per-question running sum of the label
        answered_correctly_q_sum[question] += correct

    return

def read_and_preprocess(train, valid, test, questions_df):
    """Clean the three splits, join question metadata and add running features.

    Steps, applied identically to train / valid / test:
      1. keep only rows whose `content_type_id` is 0 (discards lectures);
      2. cast `prior_question_had_explanation` to int8 (missing -> 0);
      3. fill missing `prior_question_elapsed_time` with the *train* mean;
      4. left-join 'part' and 'tags' from `questions_df` on content_id;
      5. run `add_features` over train, then valid, then test, sharing one set
         of state dictionaries so later splits see the earlier history.

    Args:
        train, valid, test: raw interaction frames; they are not modified.
        questions_df: question metadata with at least the columns
            'question_id', 'part', 'bundle_id' and 'tags'. If None, it is read
            from the notebook-level `question_file` path instead.

    Returns:
        (train, valid, test, questions_df, prior_question_elapsed_time_mean,
        features_dicts), where `questions_df` has 'part' and 'bundle_id' cast
        to int32 and `features_dicts` maps each state name to its dictionary
        after all three splits have been processed.
    """
    def _clean(frame):
        # Filter by content_type_id to discard lectures; reset_index yields a
        # fresh frame, so the caller's data is left untouched.
        frame = frame.loc[frame.content_type_id == 0].reset_index(drop = True)
        # Changing dtype to avoid lightgbm error
        frame['prior_question_had_explanation'] = frame.prior_question_had_explanation.fillna(False).astype('int8')
        return frame

    train, valid, test = _clean(train), _clean(valid), _clean(test)

    # Fill prior question elapsed time with the train mean (mean() skips NaN).
    # Plain assignment instead of a chained `fillna(..., inplace=True)`, which
    # acts on a temporary object under pandas Copy-on-Write.
    prior_question_elapsed_time_mean = train['prior_question_elapsed_time'].mean()
    for frame in (train, valid, test):
        frame['prior_question_elapsed_time'] = frame['prior_question_elapsed_time'].fillna(prior_question_elapsed_time_mean)

    # Merge with question dataframe. Use the frame handed in by the caller
    # (previously it was ignored and the file was read a second time).
    if questions_df is None:
        questions_df = pd.read_csv(question_file)
    # astype returns a new frame, so the caller's questions_df is not mutated
    questions_df = questions_df.astype({'part': np.int32, 'bundle_id': np.int32})
    question_meta = questions_df[['question_id', 'part', 'tags']]

    def _with_question_meta(frame):
        return pd.merge(frame, question_meta, left_on = 'content_id', right_on = 'question_id', how = 'left')

    train = _with_question_meta(train)
    valid = _with_question_meta(valid)
    test = _with_question_meta(test)

    # State shared by the three add_features calls; the keys are exactly the
    # keyword names of add_features' state parameters.
    features_dicts = {
        # Client dictionaries
        'answered_correctly_u_count': defaultdict(int),
        'answered_correctly_u_sum': defaultdict(int),
        'elapsed_time_u_sum': defaultdict(int),
        'explanation_u_sum': defaultdict(int),
        # Question dictionaries
        'answered_correctly_q_count': defaultdict(int),
        'answered_correctly_q_sum': defaultdict(int),
        'elapsed_time_q_sum': defaultdict(int),
        'explanation_q_sum': defaultdict(int),
        # Client Question dictionary
        'answered_correctly_uq': defaultdict(lambda: defaultdict(int)),
        # Client timestamp histories
        'timestamp_u': defaultdict(list),
        'timestamp_u_incorrect': defaultdict(list)
    }

    print('User feature calculation started...')
    print('\n')
    # Order matters: valid and test continue from the state left by train.
    train = add_features(train, **features_dicts)
    valid = add_features(valid, **features_dicts)
    test = add_features(test, **features_dicts)

    gc.collect()
    print('User feature calculation completed...')
    print('\n')

    return train, valid, test, questions_df, prior_question_elapsed_time_mean, features_dicts


In [93]:
%%time

train_pickle = 'train_39360_users.pickle'
valid_pickle = 'valid_4920_users.pickle'
test_pickle = 'test_4920_users.pickle'
question_file = 'questions.csv'
#
# Read data
feld_needed = ['timestamp', 'user_id',  'content_id', 'content_type_id', 'answered_correctly','prior_question_elapsed_time', 'prior_question_had_explanation']
train_raw = pd.read_pickle(train_pickle)[feld_needed]
valid_raw = pd.read_pickle(valid_pickle)[feld_needed]
test_raw = pd.read_pickle(test_pickle)[feld_needed]
questions_df = pd.read_csv(question_file)
train_raw['prior_question_had_explanation'] = train_raw['prior_question_had_explanation'].astype('boolean')
valid_raw['prior_question_had_explanation'] = valid_raw['prior_question_had_explanation'].astype('boolean')
test_raw['prior_question_had_explanation'] = test_raw['prior_question_had_explanation'].astype('boolean')

train_ready, valid_ready, test_ready, questions_df, prior_question_elapsed_time_mean, features_dicts = read_and_preprocess(train_raw, valid_raw, test_raw, questions_df)
train_ready = train_ready.fillna(0)
valid_ready = valid_ready.fillna(0)
test_ready = test_ready.fillna(0)

User feature calculation started...




9897114it [08:16, 19936.39it/s]
1267253it [01:01, 20585.93it/s]
1191168it [01:00, 19787.30it/s]


User feature calculation completed...


CPU times: user 8min 24s, sys: 22.6 s, total: 8min 47s
Wall time: 10min 59s


In [94]:
def prepare_data(train, valid, test):
    """Reduce the three splits to the model features plus the target column.

    The columns to discard are derived from `train` (everything that is neither
    a feature nor the target) and the same list is then dropped from all three
    frames.

    Args:
        train, valid, test: preprocessed frames; they are not modified.

    Returns:
        (train_copy, valid_copy, test_copy): new frames without the discarded
        columns.
    """
    TARGET = 'answered_correctly'
    # Features to train and predict
    FEATURES = [
        'prior_question_elapsed_time', 'prior_question_had_explanation', 'part',
        'answered_correctly_u_avg', 'elapsed_time_u_avg', 'explanation_u_avg',
        'answered_correctly_q_avg', 'elapsed_time_q_avg', 'explanation_q_avg',
        'answered_correctly_uq_count',
        'timestamp_u_recency_1', 'timestamp_u_recency_2', 'timestamp_u_recency_3',
        'timestamp_u_incorrect_recency',
    ] #answer_correctness_at_t
    print(f'Traning with {train.shape[0]} rows and {len(FEATURES)} features')

    # Everything present in train that is neither a feature nor the target
    drop_cols = list(set(train.columns).difference(FEATURES, [TARGET]))
    train_copy, valid_copy, test_copy = (
        frame.drop(columns = drop_cols) for frame in (train, valid, test)
    )

    gc.collect()
    return train_copy, valid_copy, test_copy
   


In [95]:
# NOTE(review): this rebinds train_copy / valid_copy / test_copy, which earlier
# cells use for the raw sampled splits; cells that read these names show
# different data depending on execution order.
train_copy, valid_copy, test_copy = prepare_data(train_ready, valid_ready, test_ready)

# Persist the model-ready splits
train_copy.to_pickle("train_39360_users_preprocessed.pickle")
valid_copy.to_pickle("valid_4920_users_preprocessed.pickle")
test_copy.to_pickle("test_4920_users_preprocessed.pickle")

# Summary statistics as the cell's last expression so they are actually
# displayed (mid-cell, the result was computed and silently discarded).
train_copy.describe()

Traning with 9897114 rows and 14 features
