# Riiid Project: Minimally Viable Product (MVP)

## About the Project

- Build a better and more equitable model for education in a post-COVID-19 world.

## Goals

- Create algorithms for "Knowledge Tracing," the modeling of student knowledge over time. 
- Accurately predict how students will perform on future interactions. 

# Imports

In [1]:
# General Imports 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
%matplotlib inline

# Train Validate Test
from sklearn.model_selection import train_test_split

# Modeling Imports
from sklearn.cluster import KMeans

# Acquire and Prepare Files
import acquire
from prepare import prep_riiid

from sklearn.preprocessing import MinMaxScaler

# Warnings 
import warnings
warnings.filterwarnings("ignore")

# Acquire

In [2]:
df_train = pd.read_csv('train.csv')
df_validate = pd.read_csv('validate.csv')
df_test = pd.read_csv('test.csv')

df_train.shape, df_validate.shape, df_test.shape

((411517, 18), (50842, 18), (52868, 18))

In [3]:
# Train Data
df_train.head(2)

Unnamed: 0,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,lecture_id,tag,lecture_part,type_of,question_id,bundle_id,correct_answer,question_part,tags
0,0,1864702,5720,0,0,1,1,,,,,,,5720.0,5720.0,1.0,5.0,115
1,45951,1864702,5204,0,1,1,0,inf,False,,,,,5204.0,5204.0,3.0,5.0,173


In [4]:
# Validate Data
df_validate.head(2)

Unnamed: 0,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,lecture_id,tag,lecture_part,type_of,question_id,bundle_id,correct_answer,question_part,tags
0,15625124241,1864702,3209,0,2152,3,1,34660.0,True,,,,,3209.0,3207.0,3.0,4.0,157 169 162 38
1,15625124241,1864702,3208,0,2152,3,1,34660.0,True,,,,,3208.0,3207.0,3.0,4.0,113 169 162 38


In [5]:
# Test Data
df_test.head(2)

Unnamed: 0,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,lecture_id,tag,lecture_part,type_of,question_id,bundle_id,correct_answer,question_part,tags
0,20170378604,1864702,8811,0,2494,1,1,3000.0,True,,,,,8811.0,8811.0,1.0,5.0,45
1,20170394313,1864702,8643,0,2495,3,1,4000.0,True,,,,,8643.0,8643.0,3.0,5.0,15


# Prepare

### Running Function that does all prep at once

In [3]:
train, validate, test, train_s, validate_s, test_s = prep_riiid(df_train, df_validate, df_test)

In [4]:
train

Unnamed: 0,answered_correctly,prior_question_had_explanation,user_acc_mean,user_lectures_running_total,avg_user_q_time,mean_content_accuracy,mean_task_accuracy,mean_timestamp_accuracy,mean_priortime_accuracy
0,1,0,0.630049,0,11917302.0,0.818182,0.682248,6.526086e+09,21594.667829
1,0,0,0.630049,0,11917302.0,0.550000,0.534988,6.526086e+09,21594.667829
2,1,0,0.630049,0,11917302.0,0.444444,0.445216,6.526086e+09,21594.667829
3,1,0,0.630049,0,11917302.0,0.406250,0.543241,6.526086e+09,21594.667829
4,0,0,0.630049,0,11917302.0,0.687500,0.485282,6.526086e+09,21594.667829
...,...,...,...,...,...,...,...,...,...
411512,1,0,0.833333,0,21937.0,0.438849,0.594718,3.284642e+05,17300.000000
411513,0,0,0.833333,0,21937.0,0.546392,0.591160,3.284642e+05,17300.000000
411514,0,0,0.833333,0,21937.0,0.359574,0.606805,3.284642e+05,17300.000000
411515,0,0,0.833333,0,21937.0,0.061728,0.568949,3.284642e+05,17300.000000


In [5]:
train_s

Unnamed: 0,answered_correctly,prior_question_had_explanation,user_acc_mean,mean_content_accuracy,mean_task_accuracy,mean_timestamp_accuracy_scaled,mean_priortime_accuracy_scaled,user_lectures_running_total_scaled,avg_user_q_time_scaled
0,1,0,0.630049,0.818182,0.682248,0.120073,0.500746,0.0,0.002404
1,0,0,0.630049,0.550000,0.534988,0.120073,0.500746,0.0,0.002404
2,1,0,0.630049,0.444444,0.445216,0.120073,0.500746,0.0,0.002404
3,1,0,0.630049,0.406250,0.543241,0.120073,0.500746,0.0,0.002404
4,0,0,0.630049,0.687500,0.485282,0.120073,0.500746,0.0,0.002404
...,...,...,...,...,...,...,...,...,...
411512,1,0,0.833333,0.438849,0.594718,0.000005,0.401159,0.0,0.000003
411513,0,0,0.833333,0.546392,0.591160,0.000005,0.401159,0.0,0.000003
411514,0,0,0.833333,0.359574,0.606805,0.000005,0.401159,0.0,0.000003
411515,0,0,0.833333,0.061728,0.568949,0.000005,0.401159,0.0,0.000003


## Drop the columns merged from questions.csv and lectures.csv

In [62]:
# Print the columns in train/validate/test
df_train.columns

Index(['timestamp', 'user_id', 'content_id', 'content_type_id',
       'task_container_id', 'user_answer', 'answered_correctly',
       'prior_question_elapsed_time', 'prior_question_had_explanation',
       'lecture_id', 'tag', 'lecture_part', 'type_of', 'question_id',
       'bundle_id', 'correct_answer', 'question_part', 'tags'],
      dtype='object')

In [63]:
# Drop the columns merged from questions.csv and lectures.csv

cols = ['lecture_id', 'tag', 'lecture_part', 'type_of', 'question_id',
        'bundle_id', 'correct_answer', 'question_part', 'tags']

df_train = df_train.drop(columns = cols)
df_validate = df_validate.drop(columns = cols)
df_test = df_test.drop(columns = cols)

In [64]:
# Count the missing values in train
df_train.isnull().sum(axis=0)

timestamp                             0
user_id                               0
content_id                            0
content_type_id                       0
task_container_id                     0
user_answer                           0
answered_correctly                    0
prior_question_elapsed_time       10133
prior_question_had_explanation     1993
dtype: int64

In [65]:
df_validate.isnull().sum(axis=0)

timestamp                           0
user_id                             0
content_id                          0
content_type_id                     0
task_container_id                   0
user_answer                         0
answered_correctly                  0
prior_question_elapsed_time       897
prior_question_had_explanation      0
dtype: int64

In [66]:
df_test.isnull().sum(axis=0)

timestamp                           0
user_id                             0
content_id                          0
content_type_id                     0
task_container_id                   0
user_answer                         0
answered_correctly                  0
prior_question_elapsed_time       897
prior_question_had_explanation      0
dtype: int64

## Merge with new features - Part I

In [67]:
# Take a peek at the new features

train = MVP_Davila.sam_train_features(df_train)
validate = MVP_Davila.sam_valtest_features(train, df_validate)
test = MVP_Davila.sam_valtest_features(train, df_test)

train.shape, validate.shape, test.shape

((411517, 13), (50842, 12), (52868, 12))

## Handle the nulls in columns
- Fill the nulls in prior_question_had_explanation with False
- Fill the nulls in the prior_question_elapsed_time with string 0

In [68]:
# Handle the null values

train = MVP_Shi.handle_null(train)
validate = MVP_Shi.handle_null(validate)
test = MVP_Shi.handle_null(test)

## Handle the np.inf in column prior_question_elapsed_time

In [69]:
# Handle the inf values

train = MVP_Shi.handle_inf(train)
validate = MVP_Shi.handle_inf(validate)
test = MVP_Shi.handle_inf(test)

In [70]:
train.head()

Unnamed: 0,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,user_acc_mean,user_lectures_running_total,last_q_time,avg_user_q_time
0,0,1864702,5720,0,0,1,1,0.0,False,0.630049,0,0.0,11917302.0
1,45951,1864702,5204,0,1,1,0,0.0,False,0.630049,0,45951.0,11917302.0
2,74342,1864702,4094,0,2,1,1,43000.0,False,0.630049,0,28391.0,11917302.0
3,96778,1864702,9699,0,3,0,1,25000.0,False,0.630049,0,22436.0,11917302.0
4,132969,1864702,5889,0,4,2,0,19000.0,False,0.630049,0,36191.0,11917302.0


## Merge with new features - Part II

In [71]:
# Merge the new features genereated from Shi

train = MVP_Shi.merge_with_stats_train(train)
validate = MVP_Shi.merge_with_stats_valortest(train, test)
test = MVP_Shi.merge_with_stats_valortest(train, test)

train.shape, validate.shape, test.shape

((411517, 17), (52868, 16), (52868, 16))

In [72]:
train.head()

Unnamed: 0,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,user_acc_mean,user_lectures_running_total,last_q_time,avg_user_q_time,mean_content_accuracy,mean_task_accuracy,mean_timestamp_accuracy,mean_priortime_accuracy
0,0,1864702,5720,0,0,1,1,0.0,False,0.630049,0,0.0,11917302.0,0.818182,0.682248,6526086000.0,21594.667829
1,45951,1864702,5204,0,1,1,0,0.0,False,0.630049,0,45951.0,11917302.0,0.55,0.534988,6526086000.0,21594.667829
2,74342,1864702,4094,0,2,1,1,43000.0,False,0.630049,0,28391.0,11917302.0,0.444444,0.445216,6526086000.0,21594.667829
3,96778,1864702,9699,0,3,0,1,25000.0,False,0.630049,0,22436.0,11917302.0,0.40625,0.543241,6526086000.0,21594.667829
4,132969,1864702,5889,0,4,2,0,19000.0,False,0.630049,0,36191.0,11917302.0,0.6875,0.485282,6526086000.0,21594.667829


In [73]:
validate.head()

Unnamed: 0,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,user_acc_mean,user_lectures_running_total,avg_user_q_time,mean_content_accuracy,mean_task_accuracy,mean_timestamp_accuracy,mean_priortime_accuracy
0,20170378604,1864702,8811,0,2494,1,1,3000.0,True,0.630049,0,11917302.0,0.46875,0.529412,6526086000.0,21594.667829
1,20170394313,1864702,8643,0,2495,3,1,4000.0,True,0.630049,0,11917302.0,0.290323,0.578947,6526086000.0,21594.667829
2,20170433302,1864702,8609,0,2496,3,0,11000.0,True,0.630049,0,11917302.0,0.387755,0.588235,6526086000.0,21594.667829
3,20170477063,1864702,6183,0,2497,3,1,16000.0,True,0.630049,0,11917302.0,0.740741,0.5,6526086000.0,21594.667829
4,20170518116,1864702,8393,0,2498,2,0,18000.0,True,0.630049,0,11917302.0,0.428571,0.588235,6526086000.0,21594.667829


In [74]:
test.head()

Unnamed: 0,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,user_acc_mean,user_lectures_running_total,avg_user_q_time,mean_content_accuracy,mean_task_accuracy,mean_timestamp_accuracy,mean_priortime_accuracy
0,20170378604,1864702,8811,0,2494,1,1,3000.0,True,0.630049,0,11917302.0,0.46875,0.529412,6526086000.0,21594.667829
1,20170394313,1864702,8643,0,2495,3,1,4000.0,True,0.630049,0,11917302.0,0.290323,0.578947,6526086000.0,21594.667829
2,20170433302,1864702,8609,0,2496,3,0,11000.0,True,0.630049,0,11917302.0,0.387755,0.588235,6526086000.0,21594.667829
3,20170477063,1864702,6183,0,2497,3,1,16000.0,True,0.630049,0,11917302.0,0.740741,0.5,6526086000.0,21594.667829
4,20170518116,1864702,8393,0,2498,2,0,18000.0,True,0.630049,0,11917302.0,0.428571,0.588235,6526086000.0,21594.667829


# Explore

## Drop the lecture rows

In [75]:
print("The shape of train before dropping the lecture rows: ", train.shape)
train = MVP_Shi.drop_lecture_rows(train)
print("The shape of train before dropping the lecture rows: ", train.shape)

The shape of train before dropping the lecture rows:  (411517, 17)
The shape of train before dropping the lecture rows:  (403377, 17)


In [76]:
print("The shape of test before dropping the lecture rows: ", validate.shape)
validate = MVP_Shi.drop_lecture_rows(validate)
print("The shape of test before dropping the lecture rows: ", validate.shape)

The shape of test before dropping the lecture rows:  (52868, 16)
The shape of test before dropping the lecture rows:  (51971, 16)


In [77]:
print("The shape of test before dropping the lecture rows: ", test.shape)
test = MVP_Shi.drop_lecture_rows(test)
print("The shape of test before dropping the lecture rows: ", test.shape)

The shape of test before dropping the lecture rows:  (52868, 16)
The shape of test before dropping the lecture rows:  (51971, 16)


## Drop the redundant columns

In [78]:
print("The shape of the train BEFORE dropping the redundant columns: ", train.shape)
train = MVP_Shi.drop_columns_train(train)
print("The shape of the train AFTER dropping the redundant columns: ", train.shape)

The shape of the train BEFORE dropping the redundant columns:  (403377, 17)
The shape of the train AFTER dropping the redundant columns:  (403377, 9)


In [79]:
print("The shape of the test BEFORE dropping the redundant columns: ", validate.shape)
validate = MVP_Shi.drop_columns_valortest(validate)
print("The shape of the test AFTER dropping the redundant columns: ", validate.shape)

The shape of the test BEFORE dropping the redundant columns:  (51971, 16)
The shape of the test AFTER dropping the redundant columns:  (51971, 9)


In [80]:
print("The shape of the test BEFORE dropping the redundant columns: ", test.shape)
test = MVP_Shi.drop_columns_valortest(test)
print("The shape of the test AFTER dropping the redundant columns: ", test.shape)

The shape of the test BEFORE dropping the redundant columns:  (51971, 16)
The shape of the test AFTER dropping the redundant columns:  (51971, 9)


## Fill the missing values
- mean_content_accuracy
- mean_task_accuracy

In [81]:
train.isnull().sum(axis=0)

answered_correctly                0
prior_question_had_explanation    0
user_acc_mean                     0
user_lectures_running_total       0
avg_user_q_time                   0
mean_content_accuracy             0
mean_task_accuracy                0
mean_timestamp_accuracy           0
mean_priortime_accuracy           0
dtype: int64

In [82]:
validate.isnull().sum(axis=0)

answered_correctly                   0
prior_question_had_explanation       0
user_acc_mean                        0
user_lectures_running_total          0
avg_user_q_time                      0
mean_content_accuracy              303
mean_task_accuracy                1391
mean_timestamp_accuracy              0
mean_priortime_accuracy              0
dtype: int64

In [83]:
test.isnull().sum(axis=0)

answered_correctly                   0
prior_question_had_explanation       0
user_acc_mean                        0
user_lectures_running_total          0
avg_user_q_time                      0
mean_content_accuracy              303
mean_task_accuracy                1391
mean_timestamp_accuracy              0
mean_priortime_accuracy              0
dtype: int64

## Fill the missing values generated from merging
- So far, fill the null values with 0.5

In [84]:
validate = MVP_Shi.fill_nulls(validate)
test = MVP_Shi.fill_nulls(test)

In [85]:
# Check the nulls in the validate
validate.isnull().sum(axis=0)

answered_correctly                0
prior_question_had_explanation    0
user_acc_mean                     0
user_lectures_running_total       0
avg_user_q_time                   0
mean_content_accuracy             0
mean_task_accuracy                0
mean_timestamp_accuracy           0
mean_priortime_accuracy           0
dtype: int64

In [86]:
# Check the nulls in the test
test.isnull().sum(axis=0)

answered_correctly                0
prior_question_had_explanation    0
user_acc_mean                     0
user_lectures_running_total       0
avg_user_q_time                   0
mean_content_accuracy             0
mean_task_accuracy                0
mean_timestamp_accuracy           0
mean_priortime_accuracy           0
dtype: int64

## Scale the columns 

In [87]:
columns_to_scale = ['mean_timestamp_accuracy', 'mean_priortime_accuracy',
                    'user_lectures_running_total', 'avg_user_q_time']

scaler, train, validate, test = MVP_Shi.scale(train, validate, test, columns_to_scale)

In [88]:
# Take a peek at the train
train.head()

Unnamed: 0,answered_correctly,prior_question_had_explanation,user_acc_mean,mean_content_accuracy,mean_task_accuracy,mean_timestamp_accuracy_scaled,mean_priortime_accuracy_scaled,user_lectures_running_total_scaled,avg_user_q_time_scaled
0,1,False,0.630049,0.818182,0.682248,0.120073,0.500746,0.0,0.002404
1,0,False,0.630049,0.55,0.534988,0.120073,0.500746,0.0,0.002404
2,1,False,0.630049,0.444444,0.445216,0.120073,0.500746,0.0,0.002404
3,1,False,0.630049,0.40625,0.543241,0.120073,0.500746,0.0,0.002404
4,0,False,0.630049,0.6875,0.485282,0.120073,0.500746,0.0,0.002404


In [89]:
# Take a peek at the validate
validate.head()

Unnamed: 0,answered_correctly,prior_question_had_explanation,user_acc_mean,mean_content_accuracy,mean_task_accuracy,mean_timestamp_accuracy_scaled,mean_priortime_accuracy_scaled,user_lectures_running_total_scaled,avg_user_q_time_scaled
0,1,True,0.630049,0.46875,0.529412,0.120073,0.500746,0.0,0.002404
1,1,True,0.630049,0.290323,0.578947,0.120073,0.500746,0.0,0.002404
2,0,True,0.630049,0.387755,0.588235,0.120073,0.500746,0.0,0.002404
3,1,True,0.630049,0.740741,0.5,0.120073,0.500746,0.0,0.002404
4,0,True,0.630049,0.428571,0.588235,0.120073,0.500746,0.0,0.002404


In [90]:
# Take a peek at the test
test.head()

Unnamed: 0,answered_correctly,prior_question_had_explanation,user_acc_mean,mean_content_accuracy,mean_task_accuracy,mean_timestamp_accuracy_scaled,mean_priortime_accuracy_scaled,user_lectures_running_total_scaled,avg_user_q_time_scaled
0,1,True,0.630049,0.46875,0.529412,0.120073,0.500746,0.0,0.002404
1,1,True,0.630049,0.290323,0.578947,0.120073,0.500746,0.0,0.002404
2,0,True,0.630049,0.387755,0.588235,0.120073,0.500746,0.0,0.002404
3,1,True,0.630049,0.740741,0.5,0.120073,0.500746,0.0,0.002404
4,0,True,0.630049,0.428571,0.588235,0.120073,0.500746,0.0,0.002404


## Convert the boolean values to 0 and 1

In [91]:
# Count the number of unique values in the column prior_question_had_explanation
train.prior_question_had_explanation.value_counts()

True     360193
False     43184
Name: prior_question_had_explanation, dtype: int64

In [92]:
# Count the number of unique values in the column prior_question_had_explanation
validate.prior_question_had_explanation.value_counts()

True     49754
False     2217
Name: prior_question_had_explanation, dtype: int64

In [93]:
# Count the number of unique values in the column prior_question_had_explanation
test.prior_question_had_explanation.value_counts()

True     49754
False     2217
Name: prior_question_had_explanation, dtype: int64

In [94]:
# Conver the boolean values to 0 and 1

train = MVP_Shi.boolean_to_num(train)
validate = MVP_Shi.boolean_to_num(validate)
test = MVP_Shi.boolean_to_num(test)

In [95]:
# After conversion
# Count the number of unique values in the column prior_question_had_explanation
train.prior_question_had_explanation.value_counts()

1    360193
0     43184
Name: prior_question_had_explanation, dtype: int64

In [96]:
# After conversion
# Count the number of unique values in the column prior_question_had_explanation
validate.prior_question_had_explanation.value_counts()

1    49754
0     2217
Name: prior_question_had_explanation, dtype: int64

In [97]:
# After conversion
# Count the number of unique values in the column prior_question_had_explanation
test.prior_question_had_explanation.value_counts()

1    49754
0     2217
Name: prior_question_had_explanation, dtype: int64

# Model

## Calculate the baseline

In [None]:
# Generate 

# Conclusions

### Future Investigations

### Resources

- https://www.kaggle.com/c/riiid-test-answer-prediction/overview/description
- https://www.kaggle.com/isaienkov/riiid-answer-correctness-prediction-eda-modeling

### Contact Us

Dani Bojado
- daniella.bojado@gmail.com 

Samuel Davila
- samuelrdavila@gmail.com

Yongliang Shi
- yongliang.michael.shi@gmail.com

Christopher Logan Ortiz
- christopher.logan.ortiz@gmail.com