In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import gc

import riiideducation
from sklearn.metrics import roc_auc_score

from sklearn.feature_selection import RFE
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import KFold

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file in it.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Competition environment handle; later cells call env.iter_test() to get
# test batches and env.predict() to submit predictions.
env = riiideducation.make_env()

In [None]:
# Load the interaction log, reading only the selected column positions and
# forcing explicit dtypes for each of them.
train_csv_path = '/kaggle/input/riiid-test-answer-prediction/train.csv'
train_dtypes = {
    'timestamp': 'int64',
    'user_id': 'int32',
    'content_id': 'int16',
    'content_type_id': 'int8',
    'task_container_id': 'int16',
    'answered_correctly':'int8',
    'prior_question_elapsed_time': 'float32',
    'prior_question_had_explanation': 'boolean',
}
train = pd.read_csv(train_csv_path, usecols=[1, 2, 3, 4, 5, 7, 8, 9], dtype=train_dtypes)

In [None]:
# Load question metadata: only the question id and its part are kept.
questions_csv_path = '/kaggle/input/riiid-test-answer-prediction/questions.csv'
questions_dtypes = {"question_id":"int64", "part":"int8"}
questions_df = pd.read_csv(questions_csv_path, usecols=[0,3], dtype=questions_dtypes)

In [None]:
# Load the lecture metadata with pandas' default dtypes (all columns).
lectures_df = pd.read_csv('/kaggle/input/riiid-test-answer-prediction/lectures.csv')

In [None]:
# Replace the space in "solving question" so the generated dummy column name
# has no whitespace, then one-hot encode the lecture part and type.
lectures_df = pd.get_dummies(
    lectures_df.assign(
        type_of=lectures_df["type_of"].replace("solving question", "solving_question")
    ),
    columns=["part", "type_of"],
)

In [None]:
# Names of the one-hot columns created for lecture part and lecture type.
lecture_column_index = lectures_df.columns
part_lectures_columns = lecture_column_index[lecture_column_index.str.startswith("part")].tolist()
type_of_lectures_columns = lecture_column_index[lecture_column_index.str.startswith("type_of_")].tolist()

In [None]:
# Keep only lecture interactions (content_type_id == 1) and attach the one-hot
# lecture metadata to each of them.
train_lectures = train.loc[lambda frame: frame["content_type_id"] == 1].merge(
    lectures_df,
    how="left",
    left_on="content_id",
    right_on="lecture_id",
)

In [None]:
# Per user: how many lectures were watched for each part and each lecture type.
lecture_stat_columns = part_lectures_columns + type_of_lectures_columns
user_lecture_stats_part = train_lectures.groupby("user_id")[lecture_stat_columns].sum()

In [None]:
# Preview the per-user lecture counts.
user_lecture_stats_part.head()

In [None]:
# Add a 0/1 "<name>_boolean" flag next to every lecture count column, telling
# whether the user watched at least one lecture of that kind.
# Columns that are already flags are skipped so that re-running this cell does
# not pile up "<name>_boolean_boolean" columns.
count_columns = [
    name for name in user_lecture_stats_part.columns if not name.endswith("_boolean")
]
for column in count_columns:
    bool_column = column + "_boolean"
    user_lecture_stats_part[bool_column] = (user_lecture_stats_part[column] > 0).astype(int)

In [None]:
# Preview again, now including the "_boolean" flag columns added above.
user_lecture_stats_part.head()

In [None]:
# The merged lecture rows have been aggregated into user_lecture_stats_part;
# drop the reference and run the garbage collector.
del train_lectures
gc.collect()

In [None]:
# Keep only question rows (content_type_id == 0), order them by timestamp and
# renumber the index from zero.
train = (
    train.loc[train["content_type_id"] == 0]
    .sort_values("timestamp")
    .reset_index(drop=True)
)

In [None]:
# Preview the question-only, timestamp-sorted training rows.
train.head()

In [None]:
# Mean prior-question elapsed time; used later as the fill value for missing entries.
elapsed_mean = train["prior_question_elapsed_time"].mean()

In [None]:
# Average number of question rows per distinct user, for each task container:
# group1 = rows per container, group2 = distinct users per container.
group1 = train.groupby("task_container_id")["user_id"].count().to_frame("avg_questions")
group2 = train.groupby("task_container_id")["user_id"].nunique().to_frame("avg_questions")

group3 = group1.div(group2)

In [None]:
# Running total of the per-container averages, in task_container_id order.
group3["avg_question_seen"] = group3["avg_questions"].cumsum()

In [None]:
# Per-user mean of answered_correctly over all question rows.
results_u_final = (
    train.groupby("user_id")["answered_correctly"]
    .mean()
    .to_frame("answered_correctly_user")
)

In [None]:
# Per-user mean of the "prior question had explanation" flag over all question rows.
results_u2_final = train.groupby("user_id")[["prior_question_had_explanation"]].agg(["mean"])
results_u2_final.columns = ["explanation_mean_user"]

In [None]:
# Mean of the per-user explanation rates; used later as the fill value for users without one.
prior_mean_user = results_u2_final["explanation_mean_user"].mean()

In [None]:
# Attach question metadata (question_id, part) to every training row.
train = train.merge(questions_df, how="left", left_on="content_id", right_on="question_id")

In [None]:
# Per-question mean of answered_correctly.
results_q_final = (
    train.groupby("question_id")["answered_correctly"]
    .mean()
    .to_frame("quest_pct")
)

In [None]:
# Per-question number of training rows (counted through the non-null "part" column).
results_q2_final = (
    train.groupby("question_id")["part"]
    .count()
    .to_frame("quest_count")
)

In [None]:
# Start the question feature table: metadata plus how often each question appears.
question2 = questions_df.merge(results_q2_final, how="left", on="question_id")

In [None]:
# Add each question's mean correctness to the question feature table.
question2 = question2.merge(results_q_final, how="left", on="question_id")

In [None]:
# Round the per-question correctness to five decimal places.
question2["quest_pct"] = question2["quest_pct"].round(5)

In [None]:
# These columns are not used after this point.
train = train.drop(columns=["timestamp", "content_type_id", "question_id", "part"])

In [None]:
# Validation set: the last five rows of every user (rows are in timestamp
# order, so these are each user's most recent interactions).
validation = train.groupby("user_id").tail(5)

In [None]:
# Remove the validation rows from the training frame.
train = train.loc[~train.index.isin(validation.index)]

In [None]:
# User statistics for scoring the validation rows, computed from the rows
# that remain after the validation rows were removed.
results_u_val = (
    train.groupby("user_id")["answered_correctly"]
    .mean()
    .to_frame("answered_correctly_user")
)

results_u2_val = train.groupby("user_id")[["prior_question_had_explanation"]].agg(["mean"])
results_u2_val.columns = ["explanation_mean_user"]

In [None]:
# Model-fitting set: the last 18 remaining rows of every user; those rows are
# then removed from the history frame.
X = train.groupby("user_id").tail(18)
train = train.loc[~train.index.isin(X.index)]

In [None]:
# Row-count check: total number of rows across the history, fitting and validation splits.
len(train) + len(X) + len(validation)

In [None]:
# User statistics for the fitting rows, computed from the history that is left
# once both the validation and the fitting rows have been removed.
results_u_X = (
    train.groupby("user_id")["answered_correctly"]
    .mean()
    .to_frame("answered_correctly_user")
)

results_u2_X = train.groupby("user_id")[["prior_question_had_explanation"]].agg(["mean"])
results_u2_X.columns = ["explanation_mean_user"]

In [None]:
# The remaining history frame is not referenced by any later cell; drop it and
# run the garbage collector.
del(train)
gc.collect()

In [None]:
# Attach task-container, user and lecture features to the fitting rows.
X = (
    X.merge(group3, how="left", left_on="task_container_id", right_index=True)
    .merge(results_u_X, how="left", on="user_id")
    .merge(results_u2_X, how="left", on="user_id")
    .merge(user_lecture_stats_part, how="left", on="user_id")
)

In [None]:
# Attach task-container, user and lecture features to the validation rows.
validation = (
    validation.merge(group3, how="left", left_on="task_container_id", right_index=True)
    .merge(results_u_val, how="left", on="user_id")
    .merge(results_u2_val, how="left", on="user_id")
    .merge(user_lecture_stats_part, how="left", on="user_id")
)

In [None]:
from sklearn.preprocessing import LabelEncoder

# The encoder object is kept so that any other reference to `lb_make` in the
# notebook keeps working; it is no longer used for the encoding in this cell.
lb_make = LabelEncoder()

# Encode the flag with a fixed mapping (missing -> False, False -> 0, True -> 1).
# Fitting a LabelEncoder separately on each frame only yields that mapping when
# the frame happens to contain both values, and the previous chained
# `frame.col.fillna(..., inplace=True)` form is not guaranteed to write back
# to the frame.
for frame in (X, validation):
    explanation_flag = frame["prior_question_had_explanation"].fillna(False)
    frame["prior_question_had_explanation"] = explanation_flag
    frame["prior_question_had_explanation_enc"] = explanation_flag.astype("int64")

In [None]:
# Mean of the per-question correctness; used later as the fill value for unknown questions.
content_mean = question2["quest_pct"].mean()

In [None]:
# Questions seen fewer than 3 times get a fixed correctness of 0.65.
question2.loc[question2["quest_count"] < 3, "quest_pct"] = .65

# Questions seen fewer than 21 times have their correctness clamped to [0.2, 0.95].
too_low = (question2["quest_pct"] < .2) & (question2["quest_count"] < 21)
question2.loc[too_low, "quest_pct"] = .2

too_high = (question2["quest_pct"] > .95) & (question2["quest_count"] < 21)
question2.loc[too_high, "quest_pct"] = .95

In [None]:
# Attach the question features to both splits.
X = X.merge(question2, how="left", left_on="content_id", right_on="question_id")
validation = validation.merge(question2, how="left", left_on="content_id", right_on="question_id")

# Shift "part" down by one in both splits.
for frame in (X, validation):
    frame["part"] = frame["part"] - 1

In [None]:
# Separate the target column from the features for both splits.
target_column = "answered_correctly"

y, X = X[target_column], X.drop(columns=[target_column])

y_val, X_val = validation[target_column], validation.drop(columns=[target_column])

In [None]:
# Model input columns, in the order the models are trained on.
feature_columns = [
    'answered_correctly_user', 'explanation_mean_user', 'quest_pct', 'avg_question_seen',
    'prior_question_elapsed_time','prior_question_had_explanation_enc', 'part',
    'part_1', 'part_2', 'part_3', 'part_4', 'part_5', 'part_6', 'part_7',
    'type_of_concept', 'type_of_intention', 'type_of_solving_question', 'type_of_starter',
    'part_1_boolean', 'part_2_boolean', 'part_3_boolean', 'part_4_boolean', 'part_5_boolean', 'part_6_boolean', 'part_7_boolean',
    'type_of_concept_boolean', 'type_of_intention_boolean', 'type_of_solving_question_boolean', 'type_of_starter_boolean',
]

X = X[feature_columns]

X_val = X_val[feature_columns]

In [None]:
# Fill missing feature values in the fitting set with one dict-based fillna.
# The earlier per-column `X[col].fillna(..., inplace=True)` calls operate on a
# column selected from the frame, which is not guaranteed to write back to X
# (it is a no-op under pandas Copy-on-Write).
lecture_count_columns = [f'part_{part}' for part in range(1, 8)] + [
    'type_of_concept', 'type_of_intention', 'type_of_solving_question', 'type_of_starter',
]
lecture_flag_columns = [f'{name}_boolean' for name in lecture_count_columns]

fill_values = {
    'answered_correctly_user': 0.65,
    'explanation_mean_user': prior_mean_user,
    'quest_pct': content_mean,
    'part': 4,
    'avg_question_seen': 1,
    'prior_question_elapsed_time': elapsed_mean,
    'prior_question_had_explanation_enc': 0,
}
# Users with no lecture rows have no lecture statistics: treat them as zero.
fill_values.update(dict.fromkeys(lecture_count_columns + lecture_flag_columns, 0))

X = X.fillna(fill_values)

In [None]:
# Fill missing feature values in the validation set with one dict-based fillna.
# The earlier per-column `X_val[col].fillna(..., inplace=True)` calls operate
# on a column selected from the frame, which is not guaranteed to write back
# to X_val (it is a no-op under pandas Copy-on-Write).
val_lecture_count_columns = [f'part_{part}' for part in range(1, 8)] + [
    'type_of_concept', 'type_of_intention', 'type_of_solving_question', 'type_of_starter',
]
val_lecture_flag_columns = [f'{name}_boolean' for name in val_lecture_count_columns]

val_fill_values = {
    'answered_correctly_user': 0.65,
    'explanation_mean_user': prior_mean_user,
    'quest_pct': content_mean,
    'part': 4,
    'avg_question_seen': 1,
    'prior_question_elapsed_time': elapsed_mean,
    'prior_question_had_explanation_enc': 0,
}
# Users with no lecture rows have no lecture statistics: treat them as zero.
val_fill_values.update(dict.fromkeys(val_lecture_count_columns + val_lecture_flag_columns, 0))

X_val = X_val.fillna(val_fill_values)

In [None]:
# Hyperparameters for the LightGBM model trained on all features.
params = {
    'num_leaves': 31, 
    'n_estimators': 200, 
    'max_depth': 8, 
    'min_child_samples': 356, 
    'learning_rate': 0.2982483634778906, 
    'min_data_in_leaf': 82, 
    'bagging_fraction': 0.6545628633239445, 
    'feature_fraction': 0.9164482379289846,
    'random_state': 666
}

# Three models on the full feature set. Each one is fitted and scored on the
# validation set in turn, in the order LightGBM, XGBoost, logistic regression.
full_model = LGBMClassifier(**params)
full_xgb = XGBClassifier(random_state=666)
full_lr = LogisticRegression(random_state=666)

for score_label, estimator in (
    ("LGB roc auc", full_model),
    ("XGB roc auc", full_xgb),
    ("LR roc auc", full_lr),
):
    estimator.fit(X, y)
    preds = estimator.predict_proba(X_val)[:,1]
    print(score_label, roc_auc_score(y_val, preds))

In [None]:
import optuna
from optuna.samplers import TPESampler

In [None]:
# Recursive feature elimination with a decision tree: keep 14 features and
# reduce both splits to them (X and X_val become plain arrays from here on).
rfe = RFE(
    estimator=DecisionTreeClassifier(random_state=666),
    n_features_to_select=14,
).fit(X, y)
X, X_val = rfe.transform(X), rfe.transform(X_val)

In [None]:
# Optuna TPE sampler with a fixed seed; it is passed to the (currently
# commented-out) optuna.create_study calls below.
sampler = TPESampler(seed=666)

# def create_model(trial):
#     num_leaves = trial.suggest_int("num_leaves", 2, 31)
#     n_estimators = trial.suggest_int("n_estimators", 20, 300)
#     max_depth = trial.suggest_int('max_depth', 3, 9)
#     min_child_samples = trial.suggest_int('min_child_samples', 100, 1200)
#     learning_rate = trial.suggest_uniform('learning_rate', 0.0001, 0.99)
#     min_data_in_leaf = trial.suggest_int('min_data_in_leaf', 5, 90)
#     bagging_fraction = trial.suggest_uniform('bagging_fraction', 0.0001, 1.0)
#     feature_fraction = trial.suggest_uniform('feature_fraction', 0.0001, 1.0)
#     model = LGBMClassifier(
#         num_leaves=num_leaves,
#         n_estimators=n_estimators, 
#         max_depth=max_depth, 
#         min_child_samples=min_child_samples, 
#         min_data_in_leaf=min_data_in_leaf,
#         learning_rate=learning_rate,
#         feature_fraction=feature_fraction,
#         random_state=666
# )
#     return model

# def objective(trial):
#     model = create_model(trial)
#     model.fit(X, y)
#     preds = model.predict_proba(X_val)[:,1]
#     score = roc_auc_score(y_val, preds)
#     return score

# # run optuna 
# study = optuna.create_study(direction="maximize", sampler=sampler)
# study.optimize(objective, n_trials=350)
# params = study.best_params
# params['random_state'] = 666

#  ↑ The 9-hour run-time limit was reached after trial 286 finished.

# Referring to the previous attempt, narrow down the range of hyperparameters
def create_model(trial):
    """Build an LGBMClassifier from hyperparameters sampled by an Optuna trial.

    Args:
        trial: Optuna trial object; its ``suggest_int`` / ``suggest_uniform``
            methods are used to draw each hyperparameter from the narrowed
            search ranges.

    Returns:
        An unfitted ``LGBMClassifier`` configured with the sampled values and
        ``random_state=666``.
    """
    # The suggest calls run in the same order as before so that a seeded
    # sampler draws the same sequence.
    sampled = {
        "num_leaves": trial.suggest_int("num_leaves", 26, 32),
        "n_estimators": trial.suggest_int("n_estimators", 280, 350),
        "max_depth": trial.suggest_int('max_depth', 7, 9),
        # NOTE(review): the LightGBM parameter docs list min_child_samples as
        # an alias of min_data_in_leaf, so these two overlap - confirm which
        # one is meant to be tuned.
        "min_child_samples": trial.suggest_int('min_child_samples', 1000, 1200),
        "learning_rate": trial.suggest_uniform('learning_rate', 0.1, 0.5),
        "min_data_in_leaf": trial.suggest_int('min_data_in_leaf', 25, 90),
        # Previously sampled but never handed to the model, although the final
        # model is built with LGBMClassifier(**params), which does include it.
        # NOTE(review): per the LightGBM docs bagging only takes effect when
        # bagging_freq is non-zero - confirm whether that should be set too.
        "bagging_fraction": trial.suggest_uniform('bagging_fraction', 0.1, 1.0),
        "feature_fraction": trial.suggest_uniform('feature_fraction', 0.1, 1.0),
    }
    return LGBMClassifier(random_state=666, **sampled)

def objective(trial):
    """Optuna objective: validation ROC AUC of a model built for ``trial``.

    Fits the model returned by ``create_model(trial)`` on the module-level
    ``X`` / ``y`` and scores its positive-class probabilities on
    ``X_val`` / ``y_val``.
    """
    candidate = create_model(trial)
    candidate.fit(X, y)
    positive_proba = candidate.predict_proba(X_val)[:,1]
    return roc_auc_score(y_val, positive_proba)

# run optuna 
# study = optuna.create_study(direction="maximize", sampler=sampler)
# study.optimize(objective, n_trials=200)
# params = study.best_params
# params['random_state'] = 666

In [None]:
# Hyperparameters pinned for the models trained on the RFE-selected features.
params = dict(
    num_leaves=29,
    n_estimators=300,
    max_depth=9,
    min_child_samples=1089,
    learning_rate=0.306030368670154,
    min_data_in_leaf=65,
    bagging_fraction=0.49498535405259425,
    feature_fraction=0.9235503880887722,
    random_state=666,
)

In [None]:
# Fit one LightGBM model with the pinned hyperparameters on the selected
# features and show its validation ROC AUC.
model = LGBMClassifier(**params).fit(X, y)

preds = model.predict_proba(X_val)[:,1]
roc_auc_score(y_val, preds)

In [None]:
# Wrap features and targets in DataFrames for the cross-validation cell below.
X, X_val = pd.DataFrame(X), pd.DataFrame(X_val)

y, y_val = pd.DataFrame(y), pd.DataFrame(y_val)

In [None]:
# Train one LightGBM model per fold of a shuffled 5-fold split. For every fold,
# keep the fitted model and its validation-set probabilities, and print the
# ROC AUC on the fold's held-out rows.
models, preds = [], []
feature_array, target_array = X.values, y.values
splitter = KFold(n_splits=5, random_state=666, shuffle=True)

for n, (fit_idx, holdout_idx) in enumerate(splitter.split(y)):
    print(f'Fold {n}')
    model = LGBMClassifier(**params)
    model.fit(feature_array[fit_idx], target_array[fit_idx])

    pred = model.predict_proba(X_val)[:, 1]
    preds.append(pred)
    holdout_proba = model.predict_proba(feature_array[holdout_idx])[:, 1]
    print('Fold roc auc:', roc_auc_score(target_array[holdout_idx], holdout_proba))
    models.append(model)

In [None]:
# Average the validation probabilities of all fold models.
# np.mean builds a new array, so the per-fold arrays in `preds` are left
# untouched (the previous `predictions = preds[0]` followed by `+=` overwrote
# preds[0] in place), and the number of folds is no longer hard-coded.
predictions = np.mean(preds, axis=0)

print("ROC AUC", roc_auc_score(y_val, predictions))

In [None]:
# Iterator over the test batches; each item is a (test_df, sample_prediction_df) pair.
iter_test = env.iter_test()

In [None]:
# Columns fed to the models, in training order. Defined once here instead of
# being repeated for every predict call.
base_features = [
    'answered_correctly_user', 'explanation_mean_user', 'quest_pct', 'avg_question_seen',
    'prior_question_elapsed_time','prior_question_had_explanation_enc', 'part',
]
lecture_features = [
    'part_1', 'part_2', 'part_3', 'part_4', 'part_5', 'part_6', 'part_7',
    'type_of_concept', 'type_of_intention', 'type_of_solving_question', 'type_of_starter',
    'part_1_boolean', 'part_2_boolean', 'part_3_boolean', 'part_4_boolean', 'part_5_boolean', 'part_6_boolean', 'part_7_boolean',
    'type_of_concept_boolean', 'type_of_intention_boolean', 'type_of_solving_question_boolean', 'type_of_starter_boolean',
]
inference_features = base_features + lecture_features

# Fill values for rows whose user, question or task container has no
# precomputed statistics (same defaults as used for the training features).
test_fill_values = {
    'answered_correctly_user': 0.65,
    'explanation_mean_user': prior_mean_user,
    'quest_pct': content_mean,
    'part': 4,
    'avg_question_seen': 1,
    'prior_question_elapsed_time': elapsed_mean,
    'prior_question_had_explanation': False,
}
test_fill_values.update(dict.fromkeys(lecture_features, 0))

# Blend weights. The XGBoost weight was previously written as 0.75, which made
# the weights sum to 1.675; with 0.075 they sum to exactly 1.0, so the blended
# value stays a probability.
WEIGHT_KFOLD_LGB = 0.75
WEIGHT_FULL_LGB = 0.125
WEIGHT_FULL_XGB = 0.075
WEIGHT_FULL_LR = 0.05

for (test_df, sample_prediction_df) in iter_test:
    # Cap task container ids at 9999 before looking up the container feature.
    test_df['task_container_id'] = test_df.task_container_id.mask(test_df.task_container_id > 9999, 9999)
    test_df = pd.merge(test_df, group3, left_on=['task_container_id'], right_index= True, how="left")
    test_df = pd.merge(test_df, question2, left_on = 'content_id', right_on = 'question_id', how = 'left')
    test_df = pd.merge(test_df, results_u_final, on=['user_id'],  how="left")
    test_df = pd.merge(test_df, results_u2_final, on=['user_id'],  how="left")
    test_df = pd.merge(test_df, user_lecture_stats_part, on=['user_id'], how="left")

    # Shift "part" first (as done for the training features), then fill every
    # gap in a single call instead of chained per-column inplace fills.
    test_df['part'] = test_df['part'] - 1
    test_df = test_df.fillna(test_fill_values)

    # Fixed False -> 0 / True -> 1 mapping. Refitting a LabelEncoder on each
    # batch would map True to 0 whenever a batch contains no False value.
    test_df['prior_question_had_explanation_enc'] = test_df['prior_question_had_explanation'].astype('int64')

    model_input = test_df[inference_features]

    # Models trained on the full feature set.
    full_preds = full_model.predict_proba(model_input)[:, 1]
    full_preds_xgb = full_xgb.predict_proba(model_input)[:, 1]
    full_preds_lr = full_lr.predict_proba(model_input)[:, 1]

    # Fold models trained on the RFE-selected features; average over all folds.
    X_test = rfe.transform(model_input)
    preds = [model.predict_proba(X_test)[:,1] for model in models]
    predictions = np.mean(preds, axis=0)

    test_df['answered_correctly'] = (
        predictions * WEIGHT_KFOLD_LGB
        + full_preds * WEIGHT_FULL_LGB
        + full_preds_xgb * WEIGHT_FULL_XGB
        + full_preds_lr * WEIGHT_FULL_LR
    )
    # Only question rows (content_type_id == 0) are submitted.
    env.predict(test_df.loc[test_df['content_type_id'] == 0, ['row_id', 'answered_correctly']])