## **LightGBM Training**

In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import os
import glob
import pickle

import gc
import xgboost as xgb
from tqdm import tqdm
import shutil
import copy

import lightgbm as lgb
from riiid_feature_maker import *

from sklearn.preprocessing import LabelEncoder

In [3]:
# Root of the project tree, two directory levels above this notebook.
base_dir = os.path.join("..", "..")
train_csv_path = os.path.join(
    base_dir, "input", "riiid-test-answer-prediction", "train.csv"
)

# Pipeline: read the CSV (first column becomes the index), derive the base
# features, drop every row that still contains a missing value, then pass the
# frame through reduce_mem_usage.
# NOTE(review): make_base_features / reduce_mem_usage come from the star import
# of riiid_feature_maker; their exact behaviour is not visible here -- confirm.
train_df = reduce_mem_usage(
    make_base_features(
        pd.read_csv(train_csv_path, low_memory=False, index_col=0)
    ).dropna()
)

  mask |= (ar1 == a)


Mem. usage decreased to 3111.84 Mb (0.0% reduction)


### **Training**

In [4]:
# Label column the model is trained to predict.
target = "answered_correctly"

# Columns used as model inputs. 'answered_correctly' is the target itself and
# 'user_answer' is likewise kept out of the feature list.
features = [
    'timestamp',
    'user_id',
    'content_id',
    'content_type_id',
    'task_container_id',
    'prior_question_elapsed_time',
    'prior_question_had_explanation',
]

In [6]:
import gc
from sklearn.model_selection import train_test_split
import lightgbm as lgb

# Number of rows held out for validation: it is passed to train_test_split as
# an absolute `test_size`, so every remaining row ends up in the training set.
SAMPLE_NUM = 50000000

# Shuffled random split with a fixed seed so the partition is reproducible.
train_X, valid_X, train_y, valid_y = train_test_split(
    train_df[features],
    train_df[target],
    test_size=SAMPLE_NUM,
    random_state=42,
    shuffle=True,
)

# Wrap both splits as LightGBM datasets. The validation set is tied to the
# training set through `reference`, as LightGBM expects for validation data.
lgb_train = lgb.Dataset(train_X, train_y)
lgb_valid = lgb.Dataset(valid_X, valid_y, reference=lgb_train)

# The full frame is no longer needed once the splits exist; release it.
# Note this makes the cell non-rerunnable without reloading train_df first.
del train_df
gc.collect()

print("train:")
display(train_X, train_y)
print()
print("test:")
display(valid_X, valid_y)

train:


Unnamed: 0_level_0,timestamp,user_id,content_id,content_type_id,task_container_id,prior_question_elapsed_time,prior_question_had_explanation
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
29968150,412502254,642128146,7920,0,17,25000.0,1.0
39551908,34076,840924702,5423,0,1,37000.0,0.0
73090439,93931834,1554975395,5615,0,43,7000.0,1.0
101090788,32668165017,2145021464,1093,0,1209,48000.0,1.0
70741776,6953849537,1505210223,3539,0,740,24667.0,1.0
...,...,...,...,...,...,...,...
13635087,3567063780,295141054,7307,0,1068,78400.0,0.0
21587207,8641163471,463496241,6140,0,122,14000.0,1.0
58232773,3598106583,1234799703,2161,0,941,25333.0,1.0
58102622,23756086751,1231865084,5989,0,108,22000.0,1.0


row_id
29968150     0
39551908     1
73090439     0
101090788    0
70741776     1
            ..
13635087     0
21587207     0
58232773     1
58102622     0
67243333     1
Name: answered_correctly, Length: 48878794, dtype: int8


test:


Unnamed: 0_level_0,timestamp,user_id,content_id,content_type_id,task_container_id,prior_question_elapsed_time,prior_question_had_explanation
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
14462522,8138293542,312223007,4221,0,1130,11000.0,1.0
23782006,1904469150,508723404,6396,0,192,34000.0,1.0
55551688,317850275,1178726418,4501,0,96,10000.0,1.0
99608375,349881076,2114328526,6978,0,40,1200.0,1.0
14972159,19506934890,322752292,3162,0,2501,30000.0,1.0
...,...,...,...,...,...,...,...
35431369,4762198,757419644,10538,0,39,23000.0,1.0
28051853,777394728,598797453,1873,0,216,33000.0,1.0
25749726,19812367896,550460296,218,0,1537,18000.0,1.0
91297628,7123596287,1938140930,3648,0,2102,15000.0,1.0


row_id
14462522    0
23782006    1
55551688    0
99608375    1
14972159    1
           ..
35431369    1
28051853    1
25749726    1
91297628    1
75449473    1
Name: answered_correctly, Length: 50000000, dtype: int8

In [7]:
# LightGBM hyper-parameters: binary objective evaluated with AUC, fixed seed.
params = {
    'objective': 'binary',
    'seed': 42,
    'metric': 'auc',
    'num_leaves': 200,
    'feature_fraction': 0.75,
    'bagging_freq': 10,
    'bagging_fraction': 0.80
}

# Train for at most 10000 boosting rounds, evaluating on both the training and
# validation sets. Training stops once the validation score has not improved
# for 10 rounds; metrics are logged every 50 rounds.
# NOTE(review): `early_stopping_rounds` / `verbose_eval` were removed from
# lgb.train in LightGBM 4.0 -- switch to the lgb.early_stopping /
# lgb.log_evaluation callbacks when upgrading.
model = lgb.train(
    params = params,
    train_set = lgb_train,
    num_boost_round = 10000,
    valid_sets = [lgb_train, lgb_valid],
    early_stopping_rounds = 10,
    verbose_eval = 50
)

# Output directory for trained models. The competition folder name is spelled
# "riiid-test-answer-prediction", matching the path train.csv is read from.
model_dir = os.path.join(base_dir, "input", "riiid-test-answer-prediction", "riiid-model-lightgbm")
os.makedirs(model_dir, exist_ok=True)

# NOTE(review): SAMPLE_NUM is the validation-set size used in the split, not
# the number of training rows, so the "trained_%dsamples" tag in the file name
# is misleading. The name is kept unchanged so existing consumers still find it.
model_path = os.path.join(model_dir, "featureV1-LightGBM_trained_%dsamples.mdl" % (SAMPLE_NUM))
model.save_model(model_path)

Training until validation scores don't improve for 10 rounds
[50]	training's auc: 0.621585	valid_1's auc: 0.621217
[100]	training's auc: 0.624885	valid_1's auc: 0.624326
[150]	training's auc: 0.626263	valid_1's auc: 0.625455
[200]	training's auc: 0.627339	valid_1's auc: 0.626257
[250]	training's auc: 0.628259	valid_1's auc: 0.626899
[300]	training's auc: 0.629097	valid_1's auc: 0.627434
[350]	training's auc: 0.629785	valid_1's auc: 0.627865
[400]	training's auc: 0.630356	valid_1's auc: 0.628152
[450]	training's auc: 0.630912	valid_1's auc: 0.628408
[500]	training's auc: 0.631418	valid_1's auc: 0.628623
[550]	training's auc: 0.631954	valid_1's auc: 0.628873
[600]	training's auc: 0.632462	valid_1's auc: 0.629085
[650]	training's auc: 0.632931	valid_1's auc: 0.629262
[700]	training's auc: 0.633405	valid_1's auc: 0.629454
[750]	training's auc: 0.633769	valid_1's auc: 0.629557
[800]	training's auc: 0.634218	valid_1's auc: 0.629715
[850]	training's auc: 0.634669	valid_1's auc: 0.629884
[900]

<lightgbm.basic.Booster at 0x16ebad92e08>

In [10]:
# Persist the trained booster under the competition input folder.
# `model_dir` is used instead of `dir` to avoid shadowing the Python builtin,
# and exist_ok=True replaces the separate existence check.
model_dir = os.path.join(base_dir, "input", "riiid-test-answer-prediction", "riiid-model-lightgbm")
os.makedirs(model_dir, exist_ok=True)

# NOTE(review): SAMPLE_NUM is the validation-set size used in the split, not
# the number of training rows; the file name is kept unchanged for
# compatibility with whatever loads it.
model_path = os.path.join(model_dir, "featureV1-LightGBM_trained_%dsamples.mdl" % (SAMPLE_NUM))
model.save_model(model_path)

<lightgbm.basic.Booster at 0x16ebad92e08>