## **XGBoost Training**

In [16]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import os
import glob
import pickle
import matplotlib.pyplot as plt
import matplotlib as mpl
mpl.rcParams['agg.path.chunksize'] = 100000

import gc
import xgboost as xgb
from tqdm import tqdm
import shutil
import copy

from sklearn.preprocessing import LabelEncoder

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### **Utils**

In [17]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype that holds their range.

    Only columns whose dtype is one of int16/int32/int64/float16/float32/float64
    are considered; every other column (object, bool, int8, unsigned, ...) is
    left untouched.

    Integer columns are cast to the first of int8, int16, int32, int64 whose
    inclusive [min, max] range contains the column's observed min and max.
    Float columns are cast to float16 or float32 when their observed range
    fits, otherwise to float64.

    Note: the float downcast only checks the *range*, not the precision.
    float16 keeps roughly three significant decimal digits, so values may be
    rounded by the cast.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        When true, print the final memory footprint and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in, with columns downcast.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Candidates are ordered narrowest first so the first fit is the smallest.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:  # handle one column at a time
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        # Pick the most compact dtype from the column's observed min and max.
        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the dtype's limit still fits.
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            # No candidate matches only when min/max are NaN (empty column);
            # the column is then left as it is.
        else:
            # Falls back to float64 when the range does not fit a narrower
            # float (this includes all-NaN and infinite columns).
            chosen = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    chosen = candidate
                    break
            df[col] = df[col].astype(chosen)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Avoid dividing by zero for a frame that occupies no memory at all.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [18]:
# Project root, two directory levels above this notebook's working directory.
base_dir = os.path.join("..", "..")

# Read the full training CSV, using its first column as the index.
train_csv_path = os.path.join(base_dir, "input", "riiideducation", "train.csv")
train_df = pd.read_csv(train_csv_path, index_col=0, low_memory=False)

In [19]:
# Convert the explanation flag to float so it is numeric for the steps below.
explanation_col = "prior_question_had_explanation"
train_df[explanation_col] = train_df[explanation_col].astype("float")

# Discard every row that has a missing value in any column, then shrink the
# numeric dtypes to cut the frame's memory footprint.
train_df = reduce_mem_usage(train_df.dropna())

Mem. usage decreased to 3111.84 Mb (58.8% reduction)


### **Training**

In [20]:
# Label column the classifier is trained to predict.
target = "answered_correctly"

# Input columns fed to the model. 'user_answer' and the label column
# 'answered_correctly' are not included as features.
features = [
    "timestamp",
    "user_id",
    "content_id",
    "content_type_id",
    "task_container_id",
    "prior_question_elapsed_time",
    "prior_question_had_explanation",
]

In [22]:
# Randomly down-sample the training rows (fixed seed for reproducibility).
# The requested size is capped at the number of rows available so that a
# smaller frame does not make ``sample`` raise ValueError.
n_train_samples = 50000000
train_df = train_df.sample(n=min(n_train_samples, len(train_df)), random_state=0)

In [23]:
# Train an XGBoost binary classifier on the sampled rows.
# - ``verbosity`` is the XGBoost logging knob; ``verbose`` is not a constructor
#   parameter and would only be forwarded as an unknown booster parameter.
# - ``eval_metric`` is set on the estimator rather than passed to ``fit()``,
#   where it is deprecated in newer XGBoost releases.
# NOTE(review): "gpu_hist" needs a CUDA-enabled XGBoost build; recent releases
# spell this tree_method="hist" with device="cuda" - confirm against the
# installed version before changing it.
model = xgb.XGBClassifier(
    objective="binary:logistic",
    tree_method="gpu_hist",
    random_state=0,
    verbosity=2,
    n_estimators=1000,
    eval_metric="logloss",
)

# No eval_set is supplied, so there is no per-round evaluation output;
# verbose=False keeps fit() quiet either way.
model.fit(X=train_df[features], y=train_df[target], verbose=False)

# Persist the fitted model; create the output directory if it is missing.
model_dir = os.path.join(base_dir, "input", "riiideducation", "models")
os.makedirs(model_dir, exist_ok=True)

model_path = os.path.join(model_dir, "featureV1-XGBoost.mdl")
model.save_model(model_path)