## Settings

In [1]:
# 一つ上の階層からモジュールを参照できるようにする
import sys
sys.path.append('..')

In [2]:
# モジュールの変更を自動的に反映する
%load_ext autoreload
%autoreload 2

In [3]:
import numpy as np
import pandas as pd
from inputs import load_data
# # XGB
from models import XGB
# Torch
import torch
from torch import nn
from torch.utils.data import DataLoader
from torch.utils.data import random_split
from models import TrainingDataset, Torch, transform_labels, restore_labels, train_model, validate_model, \
                   TestDataset, predict_test
# Keras
from keras import Sequential, Input
from keras.utils import to_categorical
from keras.layers import Dense, Dropout
from keras.optimizers import Adam

## XGB

In [4]:
import warnings
warnings.simplefilter('ignore', FutureWarning)

In [5]:
train_x, train_y, test_x = load_data()

In [6]:
model = XGB(random_state=1)

In [44]:
tr_x = train_x#[:80]
va_x = train_x#[80:]
tr_y = train_y#[:80]
va_y = train_y#[80:]

In [8]:
early_stopping = 5
model.fit(tr_x, tr_y, va_x, va_y,
          early_stopping_rounds=early_stopping,
          verbose=False)

In [9]:
preds = model.predict(test_x)

In [10]:
index = test_x.index.to_numpy().reshape(-1, 1)
preds = preds.reshape(-1, 1)
submission = np.concatenate((index, preds), axis=1)
submission = pd.DataFrame(submission)

In [11]:
submission.to_csv('xgb.csv', index=False, header=False)

### Parameter Tuning

In [36]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, LeaveOneOut
from scipy.stats import uniform

#### Grid Search CV

In [28]:
param_grid = dict(random_state: [1],
                  learning_rate: [10**(-i) for i in range(5)],
                  max_depth: [20*(i+1) for i in range(5)],
                  n_estimators: [20*(i+1) for i in range(5)],
                  early_stopping_rounds: [20*(i+1) for i in range(5)])

In [30]:
model = XGB()
# Stratified KFold を使う場合
gs = GridSearchCV(model, param_grid, cv=4, n_jobs=3, scoring='accuracy', verbose=2, refit=True)
# # LeaveOneOut を使う場合
# gs = GridSearchCV(model, param_grid, cv=LeaveOneOut(), n_jobs=3, scoring='accuracy', verbose=2, refit=True)
gs = gs.fit(tr_x, tr_y)

Fitting 107 folds for each of 625 candidates, totalling 66875 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  40 tasks      | elapsed:   10.8s
[Parallel(n_jobs=3)]: Done 1842 tasks      | elapsed:   26.8s
[Parallel(n_jobs=3)]: Done 4522 tasks      | elapsed:  1.1min
[Parallel(n_jobs=3)]: Done 6786 tasks      | elapsed:  1.7min
[Parallel(n_jobs=3)]: Done 9706 tasks      | elapsed:  2.7min
[Parallel(n_jobs=3)]: Done 13266 tasks      | elapsed:  3.7min
[Parallel(n_jobs=3)]: Done 17482 tasks      | elapsed:  4.7min
[Parallel(n_jobs=3)]: Done 22338 tasks      | elapsed:  6.2min
[Parallel(n_jobs=3)]: Done 27850 tasks      | elapsed:  7.8min
[Parallel(n_jobs=3)]: Done 34002 tasks      | elapsed:  9.5min
[Parallel(n_jobs=3)]: Done 40810 tasks      | elapsed: 11.5min
[Parallel(n_jobs=3)]: Done 48258 tasks      | elapsed: 13.6min
[Parallel(n_jobs=3)]: Done 56362 tasks      | elapsed: 15.6min
[Parallel(n_jobs=3)]: Done 64752 tasks      | elapsed: 18.0min
[Parallel(n_jobs=3)]: Done 66875 out of 66875 | e

Parameters: { early_stopping_rounds } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




In [54]:
print(f'best pararms are {gs.best_params_}')
print(f'best score is {gs.best_score_}')

best pararms are {'early_stopping_rounds': 20, 'learning_rate': 1, 'max_depth': 20, 'n_estimators': 20, 'random_state': 1}
best score is 0.7383177570093458


In [55]:
model_best = gs.best_estimator_
preds = model_best.predict(test_x)

In [56]:
index = test_x.index.to_numpy().reshape(-1, 1)
preds = preds.reshape(-1, 1)
submission = np.concatenate((index, preds), axis=1)
submission = pd.DataFrame(submission)

In [57]:
submission.to_csv('xgb_gs.csv', index=False, header=False)

#### Random Search CV

In [49]:
param_distributions = dict(random_state = [1],
                           leaning_rate = uniform(loc=0.0001, scale=1),
                           max_depth = [5*(i+1) for i in range(40)],
                           n_estimators = [5*(i+1) for i in range(40)],
                           early_stopping_rounds = [5*(i+1) for i in range(40)])

In [52]:
model = XGB()
# Stratified KFold を使う場合
rs = RandomizedSearchCV(model, param_distributions, n_iter=100, n_jobs=3, scoring='accuracy', cv=4)
# # LeaveOneOut を使う場合
# rs = RandomizedSearchCV(model, param_distributions, n_iter=100, n_jobs=3, scoring='accuracy', cv=LeaveOneOut())
search = rs.fit(tr_x, tr_y)

Parameters: { early_stopping_rounds, leaning_rate } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




In [53]:
print(f'best pararms are {rs.best_params_}')
print(f'best score is {rs.best_score_}')

best pararms are {'early_stopping_rounds': 17.524791616074804, 'leaning_rate': 0.3980323864698272, 'max_depth': 5, 'n_estimators': 50, 'random_state': 1}
best score is 0.7383177570093458


In [58]:
model_best = rs.best_estimator_
preds = model_best.predict(test_x)

In [59]:
index = test_x.index.to_numpy().reshape(-1, 1)
preds = preds.reshape(-1, 1)
submission = np.concatenate((index, preds), axis=1)
submission = pd.DataFrame(submission)

In [60]:
submission.to_csv('xgb_rs.csv', index=False, header=False)

#### HyperOpt

## Torch

In [39]:
dataset = TrainingDataset(target_transform = transform_labels)

In [40]:
train_ratio = 0.95
train_size = int(train_ratio * len(dataset))
test_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, test_size])

In [41]:
batch_size = 10

train_dataloader = DataLoader(train_dataset, batch_size=batch_size)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

for X, y in val_dataloader:
    print('Shape of X', X.shape)
    print('Shape of y', y.shape, y.dtype)
    break

Shape of X torch.Size([6, 9])
Shape of y torch.Size([6]) torch.int64


In [42]:
input_size = 9
output_size = 6
model = Torch(input_size, output_size)

In [43]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

In [44]:
epochs = 5
for t in range(epochs):
    print(f'Epoch {t+1}\n-------------------------------')
    train_model(train_dataloader, model, loss_fn, optimizer)
    validate_model(val_dataloader, model, loss_fn)
print('Done!')

Epoch 1
-------------------------------
loss: 1.796193  [    0/  101]
loss: 1.775526  [   20/  101]
loss: 1.795517  [   40/  101]
loss: 1.787526  [   60/  101]
loss: 1.784987  [   80/  101]
loss: 1.875845  [  100/  101]
Test Error: 
Accuracy: 16.7%, Avg loss: 0.306299 

Epoch 2
-------------------------------
loss: 1.795148  [    0/  101]
loss: 1.775630  [   20/  101]
loss: 1.796246  [   40/  101]
loss: 1.788041  [   60/  101]
loss: 1.785111  [   80/  101]
loss: 1.871614  [  100/  101]
Test Error: 
Accuracy: 16.7%, Avg loss: 0.305924 

Epoch 3
-------------------------------
loss: 1.794282  [    0/  101]
loss: 1.775714  [   20/  101]
loss: 1.796897  [   40/  101]
loss: 1.788494  [   60/  101]
loss: 1.785205  [   80/  101]
loss: 1.868060  [  100/  101]
Test Error: 
Accuracy: 16.7%, Avg loss: 0.305613 

Epoch 4
-------------------------------
loss: 1.793576  [    0/  101]
loss: 1.775770  [   20/  101]
loss: 1.797454  [   40/  101]
loss: 1.788868  [   60/  101]
loss: 1.785265  [   80/  10

In [45]:
test_dataset = TestDataset()
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)
preds = predict_test(test_dataloader, model)

In [46]:
preds = [restore_labels(p) for p in preds]
preds = np.array(preds)

In [47]:
# preds

In [48]:
_, _, test_x = load_data()
index = test_x.index.to_numpy().reshape(-1, 1)
preds = preds.reshape(-1, 1)
submission = np.concatenate((index, preds), axis=1)
submission = pd.DataFrame(submission)

In [49]:
submission.to_csv('torch.csv', index=False, header=False)

## Keras

In [212]:
TRAINING_FOR_SUBMISSION = True
validation_split = 0.0 if TRAINING_FOR_SUBMISSION else 0.4

In [213]:
train_x, train_y, test_x = load_data()

In [214]:
num_classes = len(train_y.unique())
train_y = pd.get_dummies(train_y)

In [215]:
input_shape = (train_x.shape[1],)
model = Sequential([
    Input(shape=input_shape),
    Dense(64, activation='relu'),
    Dense(64, activation='relu'),
    Dense(num_classes, activation='softmax')
])
model.summary()

Model: "sequential_22"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_66 (Dense)             (None, 64)                640       
_________________________________________________________________
dense_67 (Dense)             (None, 64)                4160      
_________________________________________________________________
dense_68 (Dense)             (None, 6)                 390       
Total params: 5,190
Trainable params: 5,190
Non-trainable params: 0
_________________________________________________________________


In [216]:
learning_rate = 1e-3
optimizer = Adam(learning_rate=learning_rate)

In [217]:
batch_size = 10
epochs = 5

model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
model.fit(train_x, train_y, batch_size=batch_size, epochs=epochs, validation_split=validation_split)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7ff84bd31290>

In [218]:
classes = train_y.columns.tolist()
preds = model.predict(test_x).argmax(axis=1)
preds = [classes[pred] for pred in preds]
preds = np.array(preds)
preds = preds.reshape(-1, 1)

In [222]:
# preds

In [220]:
index = test_x.index.to_numpy().reshape(-1, 1)
submission = np.concatenate((index, preds), axis=1)
submission = pd.DataFrame(submission)

In [221]:
submission.to_csv('keras.csv', index=False, header=False)