## Settings

In [1]:
# 一つ上の階層からモジュールを参照できるようにする
import sys
sys.path.append('..')

In [2]:
# モジュールの変更を自動的に反映する
%load_ext autoreload
%autoreload 2

In [3]:
import numpy as np
import pandas as pd
from inputs import load_data
# # XGB
from models import XGB
import warnings
warnings.simplefilter('ignore', FutureWarning)
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, LeaveOneOut
from scipy.stats import uniform
# Torch
import torch
from torch import nn
from torch.utils.data import DataLoader
from torch.utils.data import random_split
from models import TrainingDataset, Torch, transform_labels, restore_labels, train_model, validate_model, \
                   TestDataset, predict_test
# Keras
from keras import Sequential, Input
from keras.utils import to_categorical
from keras.layers import Dense, Dropout
from keras.optimizers import Adam

## XGB

In [4]:
train_x, train_y, test_x = load_data()

In [5]:
model = XGB(random_state=1)

In [6]:
tr_x = train_x#[:80]
va_x = train_x#[80:]
tr_y = train_y#[:80]
va_y = train_y#[80:]

In [7]:
early_stopping = 5
model.fit(tr_x, tr_y, va_x, va_y,
          early_stopping_rounds=early_stopping,
          verbose=False)

In [8]:
preds = model.predict(test_x)

In [9]:
index = test_x.index.to_numpy().reshape(-1, 1)
preds = preds.reshape(-1, 1)
submission = np.concatenate((index, preds), axis=1)
submission = pd.DataFrame(submission)

In [10]:
submission.to_csv('xgb.csv', index=False, header=False)

### Parameter Tuning

#### Grid Search CV

In [11]:
param_grid = {
    'random_state': [1],
    'learning_rate': [0.1],
    'n_estimators': [1000],
    'early_stopping_rounds': [50],
    'max_depth': [20*(i+1) for i in range(5)],
    'min_child_weight': [0.1, 0.5, 1, 2, 3, 5],
    # 'gamma': [0, 0.1, 0.5, 1, 2, 3, 5],
    # 'alpha': [0, 0.1, 0.5, 1, 2, 3, 5],
    # 'subsample': np.linspace(0.1, 1, 10),
    'colsample': np.linspace(0.1, 1, 10),
    'lambda': [0, 0.1, 0.5, 1, 2, 3, 5]
}

In [43]:
model = XGB()
# Stratified KFold を使う場合
gs = GridSearchCV(model, param_grid, cv=4, n_jobs=3, scoring='accuracy', verbose=2, refit=True)
# # LeaveOneOut を使う場合
# gs = GridSearchCV(model, param_grid, cv=LeaveOneOut(), n_jobs=3, scoring='accuracy', verbose=2, refit=True)
gs = gs.fit(tr_x, tr_y)

Fitting 4 folds for each of 2100 candidates, totalling 8400 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  35 tasks      | elapsed:   13.4s
[Parallel(n_jobs=3)]: Done 156 tasks      | elapsed:   23.7s
[Parallel(n_jobs=3)]: Done 359 tasks      | elapsed:   41.9s
[Parallel(n_jobs=3)]: Done 642 tasks      | elapsed:  1.1min
[Parallel(n_jobs=3)]: Done 1007 tasks      | elapsed:  1.7min
[Parallel(n_jobs=3)]: Done 1452 tasks      | elapsed:  2.4min
[Parallel(n_jobs=3)]: Done 1979 tasks      | elapsed:  3.2min
[Parallel(n_jobs=3)]: Done 2586 tasks      | elapsed:  4.2min
[Parallel(n_jobs=3)]: Done 3275 tasks      | elapsed:  5.3min
[Parallel(n_jobs=3)]: Done 4044 tasks      | elapsed:  6.7min
[Parallel(n_jobs=3)]: Done 4895 tasks      | elapsed:  8.1min
[Parallel(n_jobs=3)]: Done 5826 tasks      | elapsed:  9.7min
[Parallel(n_jobs=3)]: Done 6839 tasks      | elapsed: 11.3min
[Parallel(n_jobs=3)]: Done 7932 tasks      | elapsed: 13.3min
[Parallel(n_jobs=3)]: Done 8400 out of 8400 | elapsed: 14.2mi

Parameters: { colsample, early_stopping_rounds } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




In [44]:
print(f'best pararms are {gs.best_params_}')
print(f'best score is {gs.best_score_}')

best pararms are {'colsample': 0.1, 'early_stopping_rounds': 50, 'lambda': 1, 'learning_rate': 0.1, 'max_depth': 20, 'min_child_weight': 1, 'n_estimators': 1000, 'random_state': 1}
best score is 0.756054131054131


In [45]:
model_best = gs.best_estimator_
preds = model_best.predict(test_x)

In [46]:
index = test_x.index.to_numpy().reshape(-1, 1)
preds = preds.reshape(-1, 1)
submission = np.concatenate((index, preds), axis=1)
submission = pd.DataFrame(submission)

In [47]:
submission.to_csv('xgb_gs.csv', index=False, header=False)

#### Random Search CV

In [12]:
param_distributions = {
    'random_state': [1],
    'learning_rate': uniform(loc=0.0001, scale=1),
    'max_depth': [5*(i+1) for i in range(40)],
    'n_estimators': [5*(i+1) for i in range(40)],
    'early_stopping_rounds': [5*(i+1) for i in range(40)]
}

In [21]:
model = XGB()
# Stratified KFold を使う場合
rs = RandomizedSearchCV(model, param_distributions, n_iter=100, n_jobs=3, scoring='accuracy', cv=4)
# # LeaveOneOut を使う場合
# rs = RandomizedSearchCV(model, param_distributions, n_iter=100, n_jobs=3, scoring='accuracy', cv=LeaveOneOut())
search = rs.fit(tr_x, tr_y)

Parameters: { early_stopping_rounds, leaning_rate } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




In [22]:
print(f'best pararms are {rs.best_params_}')
print(f'best score is {rs.best_score_}')

best pararms are {'early_stopping_rounds': 80, 'leaning_rate': 0.34780176766029813, 'max_depth': 120, 'n_estimators': 35, 'random_state': 1}
best score is 0.7375356125356125


In [23]:
model_best = rs.best_estimator_
preds = model_best.predict(test_x)

In [24]:
index = test_x.index.to_numpy().reshape(-1, 1)
preds = preds.reshape(-1, 1)
submission = np.concatenate((index, preds), axis=1)
submission = pd.DataFrame(submission)

In [25]:
submission.to_csv('xgb_rs.csv', index=False, header=False)

#### HyperOpt

In [25]:
tr_x = train_x[:85]
va_x = train_x[85:]
tr_y = train_y[:85]
va_y = train_y[85:]

In [26]:
from sklearn.metrics import log_loss
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

In [27]:
def score(params):
    params['max_depth'] = int(params['max_depth'])
    
    model = XGB()
    model.fit(tr_x, tr_y)
    va_pred = model.predict_proba(va_x)
    # print(f'va_y: {va_y_}')
    # print(f'va_pred: {va_pred}')
    score = log_loss(va_y_, va_pred)
    print(f'params: {params}, logloss:{score:.4f}')
    
    history.append((params, score))
    return {'loss': score, 'status': STATUS_OK}

In [28]:
space = {
    'random_state': hp.choice('random_state', [1]),
    'learning_rate': hp.choice('learning_rate', [0.1]),
    'n_estimators': hp.choice('n_estimators', [1000]),
    'early_stopping_rounds': hp.choice('early_stopping_rounds', [50]),
    'max_depth': hp.quniform('max_depth', 0, 100, 5),
    'min_child_weight': hp.uniform('min_child_weight', 0, 10),
    'gamma': hp.uniform('gamma', 0, 10),
    'alpha': hp.uniform('alpha', 0, 10),
    'subsample': hp.quniform('subsample', 0, 1, 0.1),
    'colsample': hp.quniform('colsample', 0, 1, 0.1),
    'lambda': hp.uniform('lambda', 0, 10)
}

In [29]:
transformed = [transform_labels(i) for i in va_y]
va_y_ = np.identity(len(tr_y.unique()))[transformed]

In [30]:
%%capture
max_evals = 500
trials = Trials()
history = []
fmin(score, space, algo=tpe.suggest, trials=trials, max_evals=max_evals, verbose=False)

In [31]:
history = sorted(history, key=lambda tpl: tpl[1])
best = history[0]
print(f'best params: {best[0]}, score: {best[1]:.4f}')

best params: {'alpha': 2.000017767073363, 'colsample': 0.5, 'early_stopping_rounds': 50, 'gamma': 5.4447788686676235, 'lambda': 9.954496848033637, 'learning_rate': 0.1, 'max_depth': 80, 'min_child_weight': 0.4577317776079548, 'n_estimators': 1000, 'random_state': 1, 'subsample': 0.30000000000000004}, score: 1.5618


In [22]:
model = XGB()
model.set_params(**best[0])
model.fit(train_x, train_y)
preds = model.predict(test_x)

Parameters: { colsample, early_stopping_rounds } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




In [120]:
index = test_x.index.to_numpy().reshape(-1, 1)
preds = preds.reshape(-1, 1)
submission = np.concatenate((index, preds), axis=1)
submission = pd.DataFrame(submission)

In [125]:
submission.to_csv('xgb_ho.csv', index=False, header=False)

## Torch

In [39]:
dataset = TrainingDataset(target_transform = transform_labels)

In [40]:
train_ratio = 0.95
train_size = int(train_ratio * len(dataset))
test_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, test_size])

In [41]:
batch_size = 10

train_dataloader = DataLoader(train_dataset, batch_size=batch_size)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

for X, y in val_dataloader:
    print('Shape of X', X.shape)
    print('Shape of y', y.shape, y.dtype)
    break

Shape of X torch.Size([6, 9])
Shape of y torch.Size([6]) torch.int64


In [42]:
input_size = 9
output_size = 6
model = Torch(input_size, output_size)

In [43]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

In [44]:
epochs = 5
for t in range(epochs):
    print(f'Epoch {t+1}\n-------------------------------')
    train_model(train_dataloader, model, loss_fn, optimizer)
    validate_model(val_dataloader, model, loss_fn)
print('Done!')

Epoch 1
-------------------------------
loss: 1.796193  [    0/  101]
loss: 1.775526  [   20/  101]
loss: 1.795517  [   40/  101]
loss: 1.787526  [   60/  101]
loss: 1.784987  [   80/  101]
loss: 1.875845  [  100/  101]
Test Error: 
Accuracy: 16.7%, Avg loss: 0.306299 

Epoch 2
-------------------------------
loss: 1.795148  [    0/  101]
loss: 1.775630  [   20/  101]
loss: 1.796246  [   40/  101]
loss: 1.788041  [   60/  101]
loss: 1.785111  [   80/  101]
loss: 1.871614  [  100/  101]
Test Error: 
Accuracy: 16.7%, Avg loss: 0.305924 

Epoch 3
-------------------------------
loss: 1.794282  [    0/  101]
loss: 1.775714  [   20/  101]
loss: 1.796897  [   40/  101]
loss: 1.788494  [   60/  101]
loss: 1.785205  [   80/  101]
loss: 1.868060  [  100/  101]
Test Error: 
Accuracy: 16.7%, Avg loss: 0.305613 

Epoch 4
-------------------------------
loss: 1.793576  [    0/  101]
loss: 1.775770  [   20/  101]
loss: 1.797454  [   40/  101]
loss: 1.788868  [   60/  101]
loss: 1.785265  [   80/  10

In [45]:
test_dataset = TestDataset()
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)
preds = predict_test(test_dataloader, model)

In [46]:
preds = [restore_labels(p) for p in preds]
preds = np.array(preds)

In [47]:
# preds

In [48]:
_, _, test_x = load_data()
index = test_x.index.to_numpy().reshape(-1, 1)
preds = preds.reshape(-1, 1)
submission = np.concatenate((index, preds), axis=1)
submission = pd.DataFrame(submission)

In [49]:
submission.to_csv('torch.csv', index=False, header=False)

## Keras

In [212]:
TRAINING_FOR_SUBMISSION = True
validation_split = 0.0 if TRAINING_FOR_SUBMISSION else 0.4

In [213]:
train_x, train_y, test_x = load_data()

In [214]:
num_classes = len(train_y.unique())
train_y = pd.get_dummies(train_y)

In [215]:
input_shape = (train_x.shape[1],)
model = Sequential([
    Input(shape=input_shape),
    Dense(64, activation='relu'),
    Dense(64, activation='relu'),
    Dense(num_classes, activation='softmax')
])
model.summary()

Model: "sequential_22"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_66 (Dense)             (None, 64)                640       
_________________________________________________________________
dense_67 (Dense)             (None, 64)                4160      
_________________________________________________________________
dense_68 (Dense)             (None, 6)                 390       
Total params: 5,190
Trainable params: 5,190
Non-trainable params: 0
_________________________________________________________________


In [216]:
learning_rate = 1e-3
optimizer = Adam(learning_rate=learning_rate)

In [217]:
batch_size = 10
epochs = 5

model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
model.fit(train_x, train_y, batch_size=batch_size, epochs=epochs, validation_split=validation_split)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7ff84bd31290>

In [218]:
classes = train_y.columns.tolist()
preds = model.predict(test_x).argmax(axis=1)
preds = [classes[pred] for pred in preds]
preds = np.array(preds)
preds = preds.reshape(-1, 1)

In [222]:
# preds

In [220]:
index = test_x.index.to_numpy().reshape(-1, 1)
submission = np.concatenate((index, preds), axis=1)
submission = pd.DataFrame(submission)

In [221]:
submission.to_csv('keras.csv', index=False, header=False)