## **LightGBM with FeatureV3 Training**

In [2]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import os
import glob

import gc
import xgboost as xgb
from tqdm import tqdm
import shutil
import copy
from riiid_feature_maker import *

import warnings 
warnings.filterwarnings('ignore')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### **feature V3**

In [3]:
SAMPLE_NUM = 50000000

In [4]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split

base_dir = os.path.join("..","..")
df = pd.read_csv(os.path.join(base_dir,"input","riiid-test-answer-prediction","train.csv"))

target = "answered_correctly"
features = ['row_id',
            'timestamp',
            'user_id',
            'content_id',
            'content_type_id',
            'task_container_id',
            'user_answer',
            'prior_question_elapsed_time',
            'prior_question_had_explanation']

df = df[df[target].notna()]
df = df[df['content_type_id'] == 0]
df = df[df[target] != -1]
df = df.sample(n=SAMPLE_NUM)

print("data:",len(df))
print(df[features+[target]].isna().sum(),"\n")

X = df[features]
y = df[target]

X_train, X_test, y_train, y_test = train_test_split(X,y,
                                                    test_size=0.3,
                                                    random_state=0,
                                                    stratify=y)
del X,y,df
gc.collect()

X_train, X_valid, y_train, y_valid = train_test_split(X_train,y_train,
                                                      test_size=0.3,
                                                      random_state=0,
                                                      stratify=y_train)
train_df = X_train
valid_df = X_valid
test_df = X_test

train_df[target] = y_train
valid_df[target] = y_valid
test_df[target] = y_test

print("train:",len(train_df))
print(train_df[features+[target]].isna().sum(),"\n")
print("test:",len(test_df))
print(test_df[features+[target]].isna().sum(),"\n")
print("valid:",len(valid_df))
print(valid_df[features+[target]].isna().sum(),"\n")

del X_train, X_valid, y_train, y_valid, X_test, y_test
gc.collect()

data: 50000000
row_id                                 0
timestamp                              0
user_id                                0
content_id                             0
content_type_id                        0
task_container_id                      0
user_answer                            0
prior_question_elapsed_time       198008
prior_question_had_explanation    198008
answered_correctly                     0
dtype: int64 

train: 24500000
row_id                                0
timestamp                             0
user_id                               0
content_id                            0
content_type_id                       0
task_container_id                     0
user_answer                           0
prior_question_elapsed_time       97155
prior_question_had_explanation    97155
answered_correctly                    0
dtype: int64 

test: 15000000
row_id                                0
timestamp                             0
user_id                           

0

In [5]:
features, train_df = make_base_features(train_df)
train_df = reduce_mem_usage(train_df)
_       , test_df = make_base_features(test_df)
test_df = reduce_mem_usage(test_df)
_       , valid_df = make_base_features(valid_df)
valid_df = reduce_mem_usage(valid_df)

print("train:",len(train_df))
print(train_df[features+[target]].isna().sum(),"\n")
print("test:",len(test_df))
print(test_df[features+[target]].isna().sum(),"\n")
print("valid:",len(valid_df))
print(valid_df[features+[target]].isna().sum(),"\n")

Mem. usage decreased to 864.51 Mb (0.0% reduction)
Mem. usage decreased to 529.29 Mb (0.0% reduction)
Mem. usage decreased to 370.50 Mb (0.0% reduction)
train: 24500000
row_id                                0
timestamp                             0
user_id                               0
content_id                            0
content_type_id                       0
task_container_id                     0
user_answer                           0
answered_correctly                    0
prior_question_elapsed_time       97155
prior_question_had_explanation    97155
answered_correctly                    0
answered_correctly                    0
dtype: int64 

test: 15000000
row_id                                0
timestamp                             0
user_id                               0
content_id                            0
content_type_id                       0
task_container_id                     0
user_answer                           0
answered_correctly                    0
p

In [6]:
content_dict = init_content_dict(train_df)
new_features, train_df = add_ContentFeatures(train_df,content_dict)
_           , test_df = add_ContentFeatures(test_df,content_dict)
_           , valid_df = add_ContentFeatures(valid_df,content_dict)
features = features + new_features

print("train:",len(train_df))
print(train_df[features+[target]].isna().sum(),"\n")
print("test:",len(test_df))
print(test_df[features+[target]].isna().sum(),"\n")
print("valid:",len(valid_df))
print(valid_df[features+[target]].isna().sum(),"\n")

train: 24500000
row_id                                   0
timestamp                                0
user_id                                  0
content_id                               0
content_type_id                          0
task_container_id                        0
user_answer                              0
answered_correctly                       0
prior_question_elapsed_time          97155
prior_question_had_explanation       97155
answered_correctly                       0
answered_correctly_contents_mean         0
answered_correctly_contents_sum          0
answered_correctly_contents_count        0
answered_correctly                       0
dtype: int64 

test: 15000000
row_id                                   0
timestamp                                0
user_id                                  0
content_id                               0
content_type_id                          0
task_container_id                        0
user_answer                              0
answered

In [7]:
user_dict = init_user_dict(train_df)
new_features, train_df = add_UserFeatures(train_df,user_dict)
_           , test_df = add_UserFeatures(test_df,user_dict)
_           , valid_df = add_UserFeatures(valid_df,user_dict)
features = features + new_features

print("train:",len(train_df))
print(train_df[features+[target]].isna().sum(),"\n")
print("test:",len(test_df))
print(test_df[features+[target]].isna().sum(),"\n")
print("valid:",len(valid_df))
print(valid_df[features+[target]].isna().sum(),"\n")

train: 24500000
row_id                                   0
timestamp                                0
user_id                                  0
content_id                               0
content_type_id                          0
task_container_id                        0
user_answer                              0
answered_correctly                       0
prior_question_elapsed_time          97155
prior_question_had_explanation       97155
answered_correctly                       0
answered_correctly_contents_mean         0
answered_correctly_contents_sum          0
answered_correctly_contents_count        0
answered_correctly_users_sum             0
answered_correctly_users_count           0
answered_correctly_users_mean            0
answered_correctly                       0
dtype: int64 

test: 15000000
row_id                                   0
timestamp                                0
user_id                                  0
content_id                               0
content_

In [8]:
lecture_dict = init_lecture_dict(train_df)
new_features, train_df = add_LectureLvFeatures(train_df,lecture_dict)
_           , test_df = add_LectureLvFeatures(test_df,lecture_dict)
_           , valid_df = add_LectureLvFeatures(valid_df,lecture_dict)
features = features + new_features

print("train:",len(train_df))
print(train_df[features+[target]].isna().sum(),"\n")
print("test:",len(test_df))
print(test_df[features+[target]].isna().sum(),"\n")
print("valid:",len(valid_df))
print(valid_df[features+[target]].isna().sum(),"\n")

train: 24500000
row_id                                   0
timestamp                                0
user_id                                  0
content_id                               0
content_type_id                          0
task_container_id                        0
user_answer                              0
answered_correctly                       0
prior_question_elapsed_time          97155
prior_question_had_explanation       97155
answered_correctly                       0
answered_correctly_contents_mean         0
answered_correctly_contents_sum          0
answered_correctly_contents_count        0
answered_correctly_users_sum             0
answered_correctly_users_count           0
answered_correctly_users_mean            0
lecture_user_mean                        0
lecture_user_count                       0
lecture_user_sum                         0
answered_correctly                       0
dtype: int64 

test: 15000000
row_id                                   0
timestam

In [9]:
questions = pd.read_csv(os.path.join(base_dir,"input","riiid-test-answer-prediction","questions.csv"))
new_features ,train_df = add_QuestionFeatures(train_df, questions)
train_df = reduce_mem_usage(train_df)
_            ,test_df = add_QuestionFeatures(test_df,questions)
test_df = reduce_mem_usage(test_df)
_            ,valid_df = add_QuestionFeatures(valid_df,questions)
valid_df = reduce_mem_usage(valid_df)
features = features + new_features

print("train:",len(train_df))
print(train_df[features+[target]].isna().sum(),"\n")
print("test:",len(test_df))
print(test_df[features+[target]].isna().sum(),"\n")
print("valid:",len(valid_df))
print(valid_df[features+[target]].isna().sum(),"\n")

Mem. usage decreased to 1285.08 Mb (53.0% reduction)
Mem. usage decreased to 801.09 Mb (52.1% reduction)
Mem. usage decreased to 560.76 Mb (52.1% reduction)
train: 24500000
row_id                                   0
timestamp                                0
user_id                                  0
content_id                               0
content_type_id                          0
task_container_id                        0
user_answer                              0
answered_correctly                       0
prior_question_elapsed_time          97155
prior_question_had_explanation       97155
answered_correctly                       0
answered_correctly_contents_mean         0
answered_correctly_contents_sum          0
answered_correctly_contents_count        0
answered_correctly_users_sum             0
answered_correctly_users_count           0
answered_correctly_users_mean            0
lecture_user_mean                        0
lecture_user_count                       0
lecture_us

In [10]:
lectures = pd.read_csv(os.path.join(base_dir,"input","riiid-test-answer-prediction","lectures.csv"))
new_features ,train_df = add_LectureFeatures(train_df, lectures)
train_df = reduce_mem_usage(train_df)
_            ,test_df = add_LectureFeatures(test_df,lectures)
test_df = reduce_mem_usage(test_df)
_            ,valid_df = add_LectureFeatures(valid_df,lectures)
valid_df = reduce_mem_usage(valid_df)
features = features + new_features

print("train:",len(train_df))
print(train_df[features+[target]].isna().sum(),"\n")
print("test:",len(test_df))
print(test_df[features+[target]].isna().sum(),"\n")
print("valid:",len(valid_df))
print(valid_df[features+[target]].isna().sum(),"\n")

Mem. usage decreased to 1425.27 Mb (22.8% reduction)
Mem. usage decreased to 886.92 Mb (22.5% reduction)
Mem. usage decreased to 620.84 Mb (22.5% reduction)
train: 24500000
row_id                                      0
timestamp                                   0
user_id                                     0
content_id                                  0
content_type_id                             0
task_container_id                           0
user_answer                                 0
answered_correctly                          0
prior_question_elapsed_time             97155
prior_question_had_explanation          97155
answered_correctly                          0
answered_correctly_contents_mean            0
answered_correctly_contents_sum             0
answered_correctly_contents_count           0
answered_correctly_users_sum                0
answered_correctly_users_count              0
answered_correctly_users_mean               0
lecture_user_mean                           0

In [13]:
target = "answered_correctly"
features = [
 #'timestamp',
 'user_id',
 'content_id',
 'content_type_id',
 'task_container_id',
 #'user_answer',
 #'answered_correctly',
 'prior_question_elapsed_time',
 'prior_question_had_explanation',
 #'answered_correctly',
 'answered_correctly_users_mean',
 'answered_correctly_users_sum',
 'answered_correctly_users_count',
 'answered_correctly_contents_mean',
 'answered_correctly_contents_sum',
 'answered_correctly_contents_count',
 "lecture_user_mean",
 "lecture_user_count",
 "lecture_user_sum",
 #"lecture_part",
 #"lecture_type",
 #"lecture_tag"
 "question_part"
]

In [14]:
lgb_train = lgb.Dataset(train_df[features], train_df[target])
lgb_valid = lgb.Dataset(valid_df[features], valid_df[target])

In [15]:
params = {
    'objective': 'binary', 
    'seed': 42,
    'metric': 'auc',
    'num_leaves': 200,
    'feature_fraction': 0.75,
    'bagging_freq': 10,
    'bagging_fraction': 0.80
}
    
model = lgb.train(
    params = params,
    train_set = lgb_train,
    num_boost_round = 10000,
    valid_sets = [lgb_train, lgb_valid],
    early_stopping_rounds = 50,
    verbose_eval = 5
)

dir = os.path.join(base_dir,"input","riiid-test-answer-prediction","riiid-model-lightgbm")
if not(os.path.exists(dir)):
    os.makedirs(dir)
    
model_path = os.path.join(dir,"featureV3-LightGBM_trained_%dsamples.mdl"%(SAMPLE_NUM) )         
model.save_model(model_path)

Training until validation scores don't improve for 50 rounds
[5]	training's auc: 0.665455	valid_1's auc: 0.640604
[10]	training's auc: 0.669467	valid_1's auc: 0.645452
[15]	training's auc: 0.67015	valid_1's auc: 0.645503
[20]	training's auc: 0.671878	valid_1's auc: 0.647167
[25]	training's auc: 0.672666	valid_1's auc: 0.647742
[30]	training's auc: 0.673353	valid_1's auc: 0.648322
[35]	training's auc: 0.674666	valid_1's auc: 0.649606
[40]	training's auc: 0.6756	valid_1's auc: 0.650523
[45]	training's auc: 0.676462	valid_1's auc: 0.651438
[50]	training's auc: 0.677172	valid_1's auc: 0.652184
[55]	training's auc: 0.677494	valid_1's auc: 0.652451
[60]	training's auc: 0.677878	valid_1's auc: 0.652862
[65]	training's auc: 0.678259	valid_1's auc: 0.653281
[70]	training's auc: 0.678508	valid_1's auc: 0.653505
[75]	training's auc: 0.678702	valid_1's auc: 0.65367
[80]	training's auc: 0.678932	valid_1's auc: 0.653935
[85]	training's auc: 0.67908	valid_1's auc: 0.654032
[90]	training's auc: 0.6792

<lightgbm.basic.Booster at 0x1e29f0dcc48>

In [16]:
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt

y_pred = model.predict(train_df[features])
y_true = train_df[target]
y_pred = y_pred[y_true.notna()]
y_true = y_true[y_true.notna()]

fpr_all, tpr_all, thresholds_all = roc_curve(y_true, y_pred,
                                             drop_intermediate=False)

print("AUC test score:",roc_auc_score(y_true, y_pred))
fig = plt.figure(figsize=(5,5))
plt.plot(fpr_all, tpr_all)
plt.xlabel('FPR: False positive rate')
plt.ylabel('TPR: True positive rate')
plt.grid()
plt.show()

AUC test score: 0.693773298644684


<Figure size 500x500 with 1 Axes>