# Introduction

In this competition, your challenge is to create algorithms for "Knowledge Tracing," the modeling of student knowledge over time. The goal is to accurately predict how students will perform on future interactions. You will apply your machine learning skills to Riiid’s EdNet data.

In this notebook we will first take a general look at the data. After the exploratory analysis of the data, I will create some models to compare their accuracy.

<div class="alert alert-block alert-info">
If you like this notebook or find it useful, please UPVOTE! 👍
</div>

### Importing libraries

In [None]:
import numpy as np
import pandas as pd

# Plot
import matplotlib.pyplot as plt
from matplotlib import pyplot
import seaborn as sns

# Training and test data
from sklearn.model_selection import train_test_split

# AUC score
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

# Model
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# Submission
import riiideducation

import warnings
warnings.filterwarnings("ignore")

# Loading Data

### Training data

row_id: ID code for the row.

timestamp: the time between this user interaction and the first event from that user.

user_id: ID code for the user.

content_id: ID code for the user interaction.

content_type_id: 0 if the event was a question being posed to the user, 1 if the event was the user watching a lecture.

task_container_id: ID code for the batch of questions or lectures. For example, a user might see three questions in a row before seeing the explanations for any of them. Those three would all share a task_container_id. Monotonically increasing for each user.

user_answer: the user's answer to the question, if any. Read -1 as null, for lectures.

answered_correctly: if the user responded correctly. Read -1 as null, for lectures.

prior_question_elapsed_time: How long it took a user to answer their previous question bundle, ignoring any lectures in between. The value is shared across a single question bundle, and is null for a user's first question bundle or lecture. Note that the time is the total time a user took to solve all the questions in the previous bundle.

prior_question_had_explanation: Whether or not the user saw an explanation and the correct response(s) after answering the previous question bundle, ignoring any lectures in between. The value is shared across a single question bundle, and is null for a user's first question bundle or lecture. Typically the first several questions a user sees were part of an onboarding diagnostic test where they did not get any feedback.

Since the training data is too large to fit in Kaggle's memory, I will load only the first 1M rows for the exploratory analysis.

In [None]:
# Read only the first million rows of the training file, pinning a compact
# dtype for every column so the frame stays small in memory.
df_train = pd.read_csv(
    '/kaggle/input/riiid-test-answer-prediction/train.csv',
    dtype={
        'row_id': 'int64',
        'timestamp': 'int64',
        'user_id': 'int32',
        'content_id': 'int16',
        'content_type_id': 'int8',
        'task_container_id': 'int16',
        'user_answer': 'int8',
        'answered_correctly': 'int8',
        'prior_question_elapsed_time': 'float32',
        'prior_question_had_explanation': 'boolean',
    },
    nrows=1_000_000,
)

# Exploratory Data Analysis

Summary table of the training data, showing each column's data type, number of missing values, number of unique values, and first three values.

In [None]:
def resumetable(df):
    """Print the shape of ``df`` and return a per-column summary table.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise. It is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with the columns ``Name``, ``dtypes``,
        ``Missing`` (null count), ``Uniques`` (distinct non-null values) and
        ``First Value`` / ``Second Value`` / ``Third Value`` (the values found
        in the first three rows, NaN when ``df`` has fewer rows).
    """
    print(f"Dataset Shape: {df.shape}")
    summary = pd.DataFrame({
        'Name': df.columns,
        'dtypes': df.dtypes.values,
        'Missing': df.isnull().sum().values,
        'Uniques': df.nunique().values,
    })
    sample_labels = ['First Value', 'Second Value', 'Third Value']
    for position, label in enumerate(sample_labels):
        # Positional access: the frame's index labels need not be 0, 1, 2
        # (e.g. after filtering rows), and short frames have no such rows.
        if position < len(df):
            summary[label] = df.iloc[position].values
        else:
            summary[label] = np.nan
    return summary

In [None]:
# Show the per-column summary (dtype, missing, uniques, first rows) of the training sample.
resumetable(df_train)

## Features overview

In [None]:
# Histogram of the timestamp column (50 bins).
fig, axis = plt.subplots(figsize=(15, 5))

fig.suptitle('Time between this interaction and first event', fontsize=18)
axis.hist(df_train['timestamp'], bins=50, color="skyblue")
axis.set_xlabel('timestamp', fontsize=15)
axis.set_ylabel('Count', fontsize=15)

plt.show()

In [None]:
# Distribution plot of the user_id column.
_, axis = plt.subplots(figsize=(15, 5))

p = sns.distplot(df_train['user_id'], ax=axis)
p.set_xlabel("user_id", fontsize=15)
p.set_ylabel("Probability", fontsize=15)
p.set_title("Code for the user", fontsize=18)

plt.show()

In [None]:
# Distribution plot of the content_id column.
_, axis = plt.subplots(figsize=(15, 5))

p = sns.distplot(df_train['content_id'], ax=axis)
p.set_xlabel("content_id", fontsize=15)
p.set_ylabel("Probability", fontsize=15)
p.set_title("The user interaction", fontsize=18)

plt.show()

In [None]:
# Distribution plot of the task_container_id column.
_, axis = plt.subplots(figsize=(15, 5))

p3 = sns.distplot(df_train['task_container_id'], ax=axis)
p3.set_xlabel("task_container_id", fontsize=15)
p3.set_ylabel("Probability", fontsize=15)
p3.set_title("Code for the batch of questions or lectures", fontsize=18)

plt.show()

In [None]:
# Distribution plot of prior_question_elapsed_time; nulls are dropped first.
_, axis = plt.subplots(figsize=(15, 5))

elapsed_without_nulls = df_train['prior_question_elapsed_time'].dropna()
p3 = sns.distplot(elapsed_without_nulls, ax=axis)
p3.set_xlabel("prior_question_elapsed_time", fontsize=15)
p3.set_ylabel("Probability", fontsize=15)
p3.set_title("How long it took a user to answer their previous question bundle", fontsize=18)

plt.show()

In [None]:
# Bar chart of content_type_id counts, each bar annotated with its share of all rows.
plt.figure(figsize=(12, 5))

freq = len(df_train)

# Pass the column as `x=`: a positional Series is not reliably interpreted as
# the x variable across seaborn versions.
g = sns.countplot(x=df_train['content_type_id'])
g.set_title("Type of content (0 = question, 1 = lecture)", fontsize = 18)
g.set_xlabel("content_type_id", fontsize = 15)
g.set_ylabel("Count", fontsize = 15)

for p in g.patches:
    height = p.get_height()
    # Write the percentage just above the centre of the bar.
    g.text(p.get_x() + p.get_width() / 2., height + 3,
          '{:1.2f}%'.format(height / freq * 100),
          ha = "center", fontsize = 18)

plt.show()

In [None]:
# Bar chart of user_answer counts, each bar annotated with its share of all rows.
plt.figure(figsize=(15, 5))

freq = len(df_train)

# Pass the column as `x=`: a positional Series is not reliably interpreted as
# the x variable across seaborn versions.
g = sns.countplot(x=df_train['user_answer'])
g.set_title("The user's answer to the question", fontsize = 18)
g.set_xlabel("user_answer", fontsize = 15)
g.set_ylabel("Count", fontsize = 15)

for p in g.patches:
    height = p.get_height()
    # Write the percentage just above the centre of the bar.
    g.text(p.get_x() + p.get_width() / 2., height + 3,
          '{:1.2f}%'.format(height / freq * 100),
          ha = "center", fontsize = 18)

plt.show()

In [None]:
# Bar chart of answered_correctly counts, each bar annotated with its share of all rows.
plt.figure(figsize=(15, 5))

freq = len(df_train)

# Pass the column as `x=`: a positional Series is not reliably interpreted as
# the x variable across seaborn versions.
g = sns.countplot(x=df_train['answered_correctly'])
g.set_title("If the user responded correctly", fontsize = 18)
g.set_xlabel("answered_correctly", fontsize = 15)
g.set_ylabel("Count", fontsize = 15)

for p in g.patches:
    height = p.get_height()
    # Write the percentage just above the centre of the bar.
    g.text(p.get_x() + p.get_width() / 2., height + 3,
          '{:1.2f}%'.format(height / freq * 100),
          ha = "center", fontsize = 18)

plt.show()

In [None]:
# Bar chart of prior_question_had_explanation counts, each bar annotated with
# its share of all rows (rows with a null value are still part of `freq`).
plt.figure(figsize=(12, 5))

freq = len(df_train)

# Pass the column as `x=`: a positional Series is not reliably interpreted as
# the x variable across seaborn versions.
g = sns.countplot(x=df_train['prior_question_had_explanation'])
g.set_title("Whether or not the user saw an explanation and the correct response (s) \n after answering the previous question bundle",
            fontsize = 18)
g.set_xlabel("prior_question_had_explanation", fontsize = 15)
g.set_ylabel("Count", fontsize = 15)

for p in g.patches:
    height = p.get_height()
    # Write the percentage just above the centre of the bar.
    g.text(p.get_x() + p.get_width() / 2., height + 3,
          '{:1.2f}%'.format(height / freq * 100),
          ha = "center", fontsize = 18)

plt.show()

In [None]:
# Scatter of elapsed time against timestamp; colour and marker both encode
# whether the previous bundle had an explanation.
_, axis = plt.subplots(figsize=(12, 5))
g = sns.scatterplot(
    x="timestamp",
    y="prior_question_elapsed_time",
    hue="prior_question_had_explanation",
    style="prior_question_had_explanation",
    data=df_train,
    ax=axis,
)
g.set_ylabel("prior_question_elapsed_time", fontsize=15)
g.set_xlabel("timestamp", fontsize=15)

plt.show()

Removing the rows where answered_correctly is null (encoded as -1, i.e. lectures).

In [None]:
# Keep only rows with a real answer; -1 marks a null answered_correctly.
train = df_train.loc[df_train['answered_correctly'] != -1]

In [None]:
# One scatter panel per prior_question_had_explanation value; colour and marker
# encode answered_correctly.
# relplot is figure-level and builds its own figure, so no plt.figure() call is
# made beforehand (it would only leave an empty extra figure in the output).
sns.relplot(
    data= train, x = "timestamp", y = "prior_question_elapsed_time",
    col = "prior_question_had_explanation", hue = "answered_correctly", style = "answered_correctly",
    kind="scatter"
);

# Preparing features

In [None]:
# Columns needed for feature building, with the dtype each one is read as.
used_data_types_dict = {
    'row_id': 'int64',
    'timestamp': 'int64',
    'user_id': 'int32',
    'content_id': 'int16',
    'answered_correctly': 'int8',
    # float32 rather than float16: float16 cannot hold values above 65504 and
    # turns them into inf, which would later be replaced by 0 and lost.
    'prior_question_elapsed_time': 'float32'
}

# Read the full training file, restricted to those columns; row_id becomes the index.
train_df = pd.read_csv(
    '/kaggle/input/riiid-test-answer-prediction/train.csv',
    usecols = list(used_data_types_dict),
    dtype=used_data_types_dict, 
    index_col = 0
)

In [None]:
# The first 90% of the rows feed the aggregate features; the remaining 10%
# become the modelling set.
split_at = int(9 / 10 * len(train_df))
features_df, train_df = train_df.iloc[:split_at], train_df.iloc[split_at:]

In [None]:
# Per-user mean / count / std of answered_correctly, using answered rows only
# (rows with -1 are excluded).
train_questions_only_df = features_df.loc[features_df['answered_correctly'] != -1]
grouped_by_user_df = train_questions_only_df.groupby('user_id')
user_answers_df = grouped_by_user_df['answered_correctly'].agg(['mean', 'count', 'std'])
user_answers_df.columns = ['mean_user_accuracy', 'questions_answered', 'std_user_accuracy']

In [None]:
# Per-content mean / count / std of answered_correctly.
grouped_by_content_df = train_questions_only_df.groupby('content_id')
content_answers_df = grouped_by_content_df['answered_correctly'].agg(['mean', 'count', 'std'])
content_answers_df.columns = ['mean_accuracy', 'question_asked', 'std_accuracy']

In [None]:
import gc

# Drop the intermediates that are no longer needed, then ask the collector to
# reclaim whatever it can.
del features_df, grouped_by_user_df, grouped_by_content_df

gc.collect()

In [None]:
# Model input columns. The *_user_accuracy / questions_answered columns come
# from user_answers_df, and mean_accuracy / question_asked / std_accuracy from
# content_answers_df; the other two are raw columns of the training file.
features = [
    'timestamp',
    'mean_user_accuracy', 
    'questions_answered',
    'std_user_accuracy',
    'mean_accuracy', 
    'question_asked',
    'std_accuracy',
    'prior_question_elapsed_time'
]
# Column the models learn to predict.
target = 'answered_correctly'

In [None]:
# Discard rows whose target is the null marker -1.
train_df = train_df.loc[train_df[target] != -1]

In [None]:
# Attach the per-user and per-content aggregates; left joins keep every
# training row even when no aggregate exists for its user or content.
train_df = pd.merge(train_df, user_answers_df, how='left', on='user_id')
train_df = pd.merge(train_df, content_answers_df, how='left', on='content_id')
train_df

In [None]:
# Keep just the model inputs followed by the target column.
train_df = train_df[[*features, target]]

In [None]:
# Treat infinities as missing, then encode every missing value as 0.
train_df = train_df.replace(to_replace=[np.inf, -np.inf], value=np.nan)
train_df = train_df.fillna(value=0)
train_df

### Reference:
https://www.kaggle.com/isaienkov/riiid-answer-correctness-prediction-eda-modeling

In [None]:
# Function to reduce the df size
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` to the narrowest dtype covering their range.

    Integer columns (int16/int32/int64) are cast to the smallest of
    int8/int16/int32/int64 whose limits contain the column's min and max.
    Float columns are cast to float16, then float32, on the same range test,
    falling back to float64. Only the value *range* is checked, so float
    downcasts can lose precision.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. Its columns are reassigned in place.
    verbose : bool, default True
        When true, print the final memory usage and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Inclusive bounds: a column whose extreme sits exactly on a
            # dtype's limit still fits that dtype.
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in (np.float16, np.float32):
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Range not covered by the narrower floats (this includes
                # NaN or infinite extremes, which fail every comparison).
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the ratio: a frame can report zero bytes before downcasting.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [None]:
# Reducing memory: downcast the numeric columns of the modelling frame
# (reduce_mem_usage returns the same frame it was given).
train_df = reduce_mem_usage(train_df)

# Model

I will use some models and compare them.

In [None]:
# Training and test data: 80% of the rows stay in train_df, 20% are held out
# in test_df; random_state fixes the shuffle so the split is reproducible.
train_df, test_df = train_test_split(train_df, random_state = 123, test_size = 0.2)

### Logistic Regression

In [None]:
# Logistic regression with default settings
model_LR = LogisticRegression()

# Fit on the training split
model_LR.fit(X=train_df[features], y=train_df[target])

In [None]:
# "No skill" baseline: a constant score of 0 for every training row.
ns_probs = [0] * len(train_df)

In [None]:
# predict probabilities, keeping the positive outcome only
LR_probs = model_LR.predict_proba(train_df[features])[:, 1]

# scores on the training rows (optimistic: the model was fitted on them)
ns_auc = roc_auc_score(train_df[target], ns_probs)
LR_auc = roc_auc_score(train_df[target], LR_probs)

# score on the held-out split, which the model has never seen
LR_test_auc = roc_auc_score(test_df[target], model_LR.predict_proba(test_df[features])[:, 1])

# result print
print('Logistic: ROC AUC = %.3f' % (LR_auc * 100))
print('Logistic (held-out): ROC AUC = %.3f' % (LR_test_auc * 100))

In [None]:
# ROC curve points for the no-skill baseline and for the logistic model
ns_fpr, ns_tpr, _ = roc_curve(train_df[target], ns_probs)
LR_fpr, LR_tpr, _ = roc_curve(train_df[target], LR_probs)

# default figure size for this and later plots
plt.rcParams["figure.figsize"] = (9, 5)

# dashed line for the baseline, solid line for the model
plt.plot(ns_fpr, ns_tpr, '--', label='No Skill')
plt.plot(LR_fpr, LR_tpr, '-', label='Logistic')

# axis labels and legend
plt.xlabel('False Positive Rate', fontsize=15)
plt.ylabel('True Positive Rate', fontsize=15)
plt.legend(fontsize=15)

plt.show()

### Extreme Gradient Boosting - XGBoost

In [None]:
# XGBoost classifier with default settings
model_XGB = XGBClassifier()

# Fit on the training split
model_XGB.fit(X=train_df[features], y=train_df[target])

In [None]:
# predict probabilities, keeping the positive outcome only
XGB_probs = model_XGB.predict_proba(train_df[features])[:, 1]

# scores on the training rows (optimistic: the model was fitted on them)
ns_auc = roc_auc_score(train_df[target], ns_probs)
XGB_auc = roc_auc_score(train_df[target], XGB_probs)

# score on the held-out split, which the model has never seen
XGB_test_auc = roc_auc_score(test_df[target], model_XGB.predict_proba(test_df[features])[:, 1])

# result print
print('XGBoost: ROC AUC = %.3f' % (XGB_auc * 100))
print('XGBoost (held-out): ROC AUC = %.3f' % (XGB_test_auc * 100))

In [None]:
# ROC curve points for the no-skill baseline and for the XGBoost model
ns_fpr, ns_tpr, _ = roc_curve(train_df[target], ns_probs)
XGB_fpr, XGB_tpr, _ = roc_curve(train_df[target], XGB_probs)

# default figure size for this and later plots
plt.rcParams["figure.figsize"] = (9, 5)

# dashed line for the baseline, solid red line for the model
plt.plot(ns_fpr, ns_tpr, '--', label='No Skill')
plt.plot(XGB_fpr, XGB_tpr, '-', label='XGBoost', color="red")

# axis labels and legend
plt.xlabel('False Positive Rate', fontsize=15)
plt.ylabel('True Positive Rate', fontsize=15)
plt.legend(fontsize=15)

plt.show()

### LightGBM

In [None]:
# LightGBM classifier with default settings
model_LGBM = LGBMClassifier()

# Fit on the training split
model_LGBM.fit(X=train_df[features], y=train_df[target])

In [None]:
# predict probabilities, keeping the positive outcome only
LGBM_probs = model_LGBM.predict_proba(train_df[features])[:, 1]

# scores on the training rows (optimistic: the model was fitted on them)
ns_auc = roc_auc_score(train_df[target], ns_probs)
LGBM_auc = roc_auc_score(train_df[target], LGBM_probs)

# score on the held-out split, which the model has never seen
LGBM_test_auc = roc_auc_score(test_df[target], model_LGBM.predict_proba(test_df[features])[:, 1])

# result print (labelled with the model actually being scored)
print('LightGBM: ROC AUC = %.3f' % (LGBM_auc * 100))
print('LightGBM (held-out): ROC AUC = %.3f' % (LGBM_test_auc * 100))

In [None]:
# ROC curve points for the no-skill baseline and for the LightGBM model
ns_fpr, ns_tpr, _ = roc_curve(train_df[target], ns_probs)
LGBM_fpr, LGBM_tpr, _ = roc_curve(train_df[target], LGBM_probs)

# default figure size for this and later plots
plt.rcParams["figure.figsize"] = (9, 5)

# dashed line for the baseline, solid green line for the model
plt.plot(ns_fpr, ns_tpr, '--', label='No Skill')
plt.plot(LGBM_fpr, LGBM_tpr, '-', label='LGBM', color="green")

# axis labels and legend
plt.xlabel('False Positive Rate', fontsize=15)
plt.ylabel('True Positive Rate', fontsize=15)
plt.legend(fontsize=12)

plt.show()

* The XGBoost and LightGBM models showed very close accuracy, with a slight advantage for XGBoost. However, XGBoost's processing time is much longer than LightGBM's, which is a disadvantage.
* With the time gain in LightGBM processing, I will adjust some parameters to see if we have an increase in accuracy.
* But before creating a new model with new parameters, let's see which features were most important for the previous LightGBM model.

In [None]:
# Pair each input column with the importance reported by the default LightGBM
# model, most important first.
feat_importance = pd.DataFrame({
    "feature": list(features),
    "value": model_LGBM.feature_importances_,
}).sort_values(by='value', ascending=False)

# Horizontal bar chart of the ranking
plt.figure(figsize=(8, 10))
ax = sns.barplot(data=feat_importance, x="value", y="feature")

In [None]:
# Reduced feature set used by the tuned LightGBM model
new_feat = ['timestamp', 'mean_accuracy', 'question_asked', 'prior_question_elapsed_time']

# Training inputs restricted to those columns
train_df_new = train_df.loc[:, new_feat]

In [None]:
# Creating the model
# Creating the model, using the parameter names of LightGBM's scikit-learn API
# (boosting_type / subsample / colsample_bytree) instead of their aliases.
model_LGBM_par = LGBMClassifier(
    objective='binary',
    boosting_type='gbdt',
    learning_rate = 0.05,
    max_depth = 8,
    num_leaves = 80,
    n_estimators = 400,
    subsample = 0.8,
    # Row subsampling is only applied when a bagging frequency is set;
    # resample at every iteration so the 0.8 fraction takes effect.
    subsample_freq = 1,
    colsample_bytree = 0.9)

# Training the model on the reduced feature set
model_LGBM_par.fit(train_df_new, train_df[target])

In [None]:
# predict probabilities, keeping the positive outcome only
LGBM_par_probs = model_LGBM_par.predict_proba(train_df_new)[:, 1]

# scores on the training rows (optimistic: the model was fitted on them)
ns_auc = roc_auc_score(train_df[target], ns_probs)
LGBM_par_auc = roc_auc_score(train_df[target], LGBM_par_probs)

# score on the held-out split, which the model has never seen
LGBM_par_test_auc = roc_auc_score(test_df[target], model_LGBM_par.predict_proba(test_df[new_feat])[:, 1])

# result print (labelled with the model actually being scored)
print('LightGBM (tuned): ROC AUC = %.3f' % (LGBM_par_auc * 100))
print('LightGBM (tuned, held-out): ROC AUC = %.3f' % (LGBM_par_test_auc * 100))

## Submission

In [None]:
# Create the competition environment object from the riiideducation module.
env = riiideducation.make_env()

# Iterator over the test batches; the submission loop unpacks each item as
# (test_df, sample_prediction_df).
iter_test = env.iter_test()

In [None]:
# Score every test batch with the tuned LightGBM model and hand the
# predictions back to the competition environment.
for (test_df, sample_prediction_df) in iter_test:
    # Attach the same per-user / per-content aggregates used for training.
    test_df = test_df.merge(user_answers_df, how = 'left', on = 'user_id')
    test_df = test_df.merge(content_answers_df, how = 'left', on = 'content_id')
    test_df['prior_question_had_explanation'] = test_df['prior_question_had_explanation'].fillna(value = False).astype(bool)

    # Encode missing / infinite inputs exactly as the training frame did
    # (inf -> NaN -> 0); filling with -1 here would give the model a value
    # it never saw for "missing" during training.
    model_inputs = test_df[new_feat].replace([np.inf, -np.inf], np.nan).fillna(0)

    test_df['answered_correctly'] = model_LGBM_par.predict_proba(model_inputs)[:,1]
    # Only question rows (content_type_id == 0) are submitted.
    env.predict(test_df.loc[test_df['content_type_id'] == 0, ['row_id', 'answered_correctly']])

# To be continued...