# Riiid Project: Minimally Viable Product (MVP)

## About the Project

- Build a better and more equitable model for education in a post-COVID-19 world.

## Goals

- Create algorithms for "Knowledge Tracing," the modeling of student knowledge over time. 
- Accurately predict how students will perform on future interactions. 

# Imports

In [1]:
# General Imports 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
%matplotlib inline

# Train Validate Test
from sklearn.model_selection import train_test_split

# Modeling Imports
from sklearn.cluster import KMeans

# Acquire and Prepare Files
import acquire
from prepare import prep_riiid

from sklearn.preprocessing import MinMaxScaler

# Warnings: silence every warning so cell output stays uncluttered.
# NOTE(review): this blanket "ignore" filter also hides deprecation and
# data-related warnings from any library — consider narrowing it.
import warnings
warnings.filterwarnings("ignore")

# Acquire

In [2]:
# Read the train, validate, and test CSV files from the working directory,
# in that order, and bind each frame to its own name.
df_train, df_validate, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('train.csv', 'validate.csv', 'test.csv')
)

# Cell output: (rows, columns) of each loaded frame
df_train.shape, df_validate.shape, df_test.shape

((411517, 18), (50842, 18), (52868, 18))

In [3]:
# Train data: preview the first two rows as loaded from train.csv
df_train.head(2)

Unnamed: 0,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,lecture_id,tag,lecture_part,type_of,question_id,bundle_id,correct_answer,question_part,tags
0,0,1864702,5720,0,0,1,1,,,,,,,5720.0,5720.0,1.0,5.0,115
1,45951,1864702,5204,0,1,1,0,inf,False,,,,,5204.0,5204.0,3.0,5.0,173


In [4]:
# Validate data: preview the first two rows as loaded from validate.csv
df_validate.head(2)

Unnamed: 0,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,lecture_id,tag,lecture_part,type_of,question_id,bundle_id,correct_answer,question_part,tags
0,15625124241,1864702,3209,0,2152,3,1,34660.0,True,,,,,3209.0,3207.0,3.0,4.0,157 169 162 38
1,15625124241,1864702,3208,0,2152,3,1,34660.0,True,,,,,3208.0,3207.0,3.0,4.0,113 169 162 38


In [5]:
# Test data: preview the first two rows as loaded from test.csv
df_test.head(2)

Unnamed: 0,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,lecture_id,tag,lecture_part,type_of,question_id,bundle_id,correct_answer,question_part,tags
0,20170378604,1864702,8811,0,2494,1,1,3000.0,True,,,,,8811.0,8811.0,1.0,5.0,45
1,20170394313,1864702,8643,0,2495,3,1,4000.0,True,,,,,8643.0,8643.0,3.0,5.0,15


# Prepare

### Run the function that performs all preparation steps at once

In [6]:
# Produce the unscaled train/validate/test frames
# and their scaled counterparts (train_s/validate_s/test_s)
# NOTE(review): the recorded outputs further down show the same shape
# (51971, 9) and the same first rows for validate and test (and for
# validate_s and test_s), although the raw frames have 50842 and 52868 rows.
# Confirm that prep_riiid returns distinct validate and test data.

train, validate, test, train_s, validate_s, test_s = prep_riiid(df_train, df_validate, df_test)

In [16]:
# Print the shape of the prepared (unscaled) train frame
print(train.shape)

# Take a peek at the first rows of the prepared train frame
train.head()

(403377, 9)


Unnamed: 0,answered_correctly,prior_question_had_explanation,user_acc_mean,user_lectures_running_total,avg_user_q_time,mean_content_accuracy,mean_task_accuracy,mean_timestamp_accuracy,mean_priortime_accuracy
0,1,0,0.630049,0,11917302.0,0.818182,0.682248,6526086000.0,21594.667829
1,0,0,0.630049,0,11917302.0,0.55,0.534988,6526086000.0,21594.667829
2,1,0,0.630049,0,11917302.0,0.444444,0.445216,6526086000.0,21594.667829
3,1,0,0.630049,0,11917302.0,0.40625,0.543241,6526086000.0,21594.667829
4,0,0,0.630049,0,11917302.0,0.6875,0.485282,6526086000.0,21594.667829


In [17]:
# Print the shape of the scaled train frame
print(train_s.shape)

# Take a peek at the first rows of the scaled train frame
train_s.head()

(403377, 9)


Unnamed: 0,answered_correctly,prior_question_had_explanation,user_acc_mean,mean_content_accuracy,mean_task_accuracy,mean_timestamp_accuracy_scaled,mean_priortime_accuracy_scaled,user_lectures_running_total_scaled,avg_user_q_time_scaled
0,1,0,0.630049,0.818182,0.682248,0.120073,0.500746,0.0,0.002404
1,0,0,0.630049,0.55,0.534988,0.120073,0.500746,0.0,0.002404
2,1,0,0.630049,0.444444,0.445216,0.120073,0.500746,0.0,0.002404
3,1,0,0.630049,0.40625,0.543241,0.120073,0.500746,0.0,0.002404
4,0,0,0.630049,0.6875,0.485282,0.120073,0.500746,0.0,0.002404


In [18]:
# Print the shape of the prepared (unscaled) validate frame
print(validate.shape)

# Take a peek at the first rows of the prepared validate frame
validate.head()

(51971, 9)


Unnamed: 0,answered_correctly,prior_question_had_explanation,user_acc_mean,user_lectures_running_total,avg_user_q_time,mean_content_accuracy,mean_task_accuracy,mean_timestamp_accuracy,mean_priortime_accuracy
0,1,1,0.630049,0,11917302.0,0.46875,0.529412,6526086000.0,21594.667829
1,1,1,0.630049,0,11917302.0,0.290323,0.578947,6526086000.0,21594.667829
2,0,1,0.630049,0,11917302.0,0.387755,0.588235,6526086000.0,21594.667829
3,1,1,0.630049,0,11917302.0,0.740741,0.5,6526086000.0,21594.667829
4,0,1,0.630049,0,11917302.0,0.428571,0.588235,6526086000.0,21594.667829


In [21]:
# Print the shape of the scaled validate frame
print(validate_s.shape)

# Take a peek at the first rows of the scaled validate frame
validate_s.head()

(51971, 9)


Unnamed: 0,answered_correctly,prior_question_had_explanation,user_acc_mean,mean_content_accuracy,mean_task_accuracy,mean_timestamp_accuracy_scaled,mean_priortime_accuracy_scaled,user_lectures_running_total_scaled,avg_user_q_time_scaled
0,1,1,0.630049,0.46875,0.529412,0.120073,0.500746,0.0,0.002404
1,1,1,0.630049,0.290323,0.578947,0.120073,0.500746,0.0,0.002404
2,0,1,0.630049,0.387755,0.588235,0.120073,0.500746,0.0,0.002404
3,1,1,0.630049,0.740741,0.5,0.120073,0.500746,0.0,0.002404
4,0,1,0.630049,0.428571,0.588235,0.120073,0.500746,0.0,0.002404


In [22]:
# Print the shape of the prepared (unscaled) test frame
print(test.shape)

# Take a peek at the first rows of the prepared test frame
test.head()

(51971, 9)


Unnamed: 0,answered_correctly,prior_question_had_explanation,user_acc_mean,user_lectures_running_total,avg_user_q_time,mean_content_accuracy,mean_task_accuracy,mean_timestamp_accuracy,mean_priortime_accuracy
0,1,1,0.630049,0,11917302.0,0.46875,0.529412,6526086000.0,21594.667829
1,1,1,0.630049,0,11917302.0,0.290323,0.578947,6526086000.0,21594.667829
2,0,1,0.630049,0,11917302.0,0.387755,0.588235,6526086000.0,21594.667829
3,1,1,0.630049,0,11917302.0,0.740741,0.5,6526086000.0,21594.667829
4,0,1,0.630049,0,11917302.0,0.428571,0.588235,6526086000.0,21594.667829


In [23]:
# Print the shape of the scaled test frame
print(test_s.shape)

# Take a peek at the first rows of the scaled test frame
test_s.head()

(51971, 9)


Unnamed: 0,answered_correctly,prior_question_had_explanation,user_acc_mean,mean_content_accuracy,mean_task_accuracy,mean_timestamp_accuracy_scaled,mean_priortime_accuracy_scaled,user_lectures_running_total_scaled,avg_user_q_time_scaled
0,1,1,0.630049,0.46875,0.529412,0.120073,0.500746,0.0,0.002404
1,1,1,0.630049,0.290323,0.578947,0.120073,0.500746,0.0,0.002404
2,0,1,0.630049,0.387755,0.588235,0.120073,0.500746,0.0,0.002404
3,1,1,0.630049,0.740741,0.5,0.120073,0.500746,0.0,0.002404
4,0,1,0.630049,0.428571,0.588235,0.120073,0.500746,0.0,0.002404


# Feature Engineering

# Modeling

# Conclusions

### Future Investigations

### Resources

- https://www.kaggle.com/c/riiid-test-answer-prediction/overview/description
- https://www.kaggle.com/isaienkov/riiid-answer-correctness-prediction-eda-modeling

### Contact Us

Dani Bojado
- daniella.bojado@gmail.com 

Samuel Davila
- samuelrdavila@gmail.com

Yongliang Shi
- yongliang.michael.shi@gmail.com

Christopher Logan Ortiz
- christopher.logan.ortiz@gmail.com