## Settings

In [1]:
# 一つ上の階層からモジュールを参照できるようにする
import sys
sys.path.append('..')

In [2]:
# モジュールの変更を自動的に反映する
%load_ext autoreload
%autoreload 2

In [3]:
import numpy as np
import pandas as pd
from inputs import load_data
# # XGB
from models import XGB
# Torch
import torch
from torch import nn
from torch.utils.data import DataLoader
from torch.utils.data import random_split
from models import TrainingDataset, Torch, transform_labels, restore_labels, train_model, validate_model, \
                   TestDataset, predict_test
# Keras
from keras import Sequential, Input
from keras.utils import to_categorical
from keras.layers import Dense, Dropout
from keras.optimizers import Adam

## XGB

In [4]:
import warnings
warnings.simplefilter('ignore', FutureWarning)

In [5]:
train_x, train_y, test_x = load_data()

In [6]:
model = XGB(random_state=1)

In [7]:
tr_x = train_x#[:80]
va_x = train_x#[80:]
tr_y = train_y#[:80]
va_y = train_y#[80:]

In [8]:
early_stopping = 5
model.fit(tr_x, tr_y, va_x, va_y,
          early_stopping_rounds=early_stopping,
          verbose=False)

In [9]:
preds = model.predict(test_x)

In [10]:
index = test_x.index.to_numpy().reshape(-1, 1)
preds = preds.reshape(-1, 1)
submission = np.concatenate((index, preds), axis=1)
submission = pd.DataFrame(submission)

In [11]:
submission.to_csv('xgb.csv', index=False, header=False)

### Grid Search CV

In [32]:
from sklearn.model_selection import GridSearchCV
param_grid = {
    'random_state': [1],
    'learning_rate': [10**(-i) for i in range(5)],
    'max_depth': [20*(i+1) for i in range(5)],
    'n_estimators': [20*(i+1) for i in range(5)],
    'early_stopping_rounds': [20*(i+1) for i in range(5)]
}

In [33]:
model = XGB()
gs = GridSearchCV(model, param_grid, cv=4, n_jobs=3, scoring='accuracy', verbose=2, refit=True)
gs = gs.fit(tr_x, tr_y)

Fitting 4 folds for each of 625 candidates, totalling 2500 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  40 tasks      | elapsed:    9.3s
[Parallel(n_jobs=3)]: Done 978 tasks      | elapsed:   23.4s


Parameters: { early_stopping_rounds } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[Parallel(n_jobs=3)]: Done 2500 out of 2500 | elapsed:   45.8s finished


### Random Search CV

## Torch

In [39]:
dataset = TrainingDataset(target_transform = transform_labels)

In [40]:
train_ratio = 0.95
train_size = int(train_ratio * len(dataset))
test_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, test_size])

In [41]:
batch_size = 10

train_dataloader = DataLoader(train_dataset, batch_size=batch_size)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

for X, y in val_dataloader:
    print('Shape of X', X.shape)
    print('Shape of y', y.shape, y.dtype)
    break

Shape of X torch.Size([6, 9])
Shape of y torch.Size([6]) torch.int64


In [42]:
input_size = 9
output_size = 6
model = Torch(input_size, output_size)

In [43]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

In [44]:
epochs = 5
for t in range(epochs):
    print(f'Epoch {t+1}\n-------------------------------')
    train_model(train_dataloader, model, loss_fn, optimizer)
    validate_model(val_dataloader, model, loss_fn)
print('Done!')

Epoch 1
-------------------------------
loss: 1.796193  [    0/  101]
loss: 1.775526  [   20/  101]
loss: 1.795517  [   40/  101]
loss: 1.787526  [   60/  101]
loss: 1.784987  [   80/  101]
loss: 1.875845  [  100/  101]
Test Error: 
Accuracy: 16.7%, Avg loss: 0.306299 

Epoch 2
-------------------------------
loss: 1.795148  [    0/  101]
loss: 1.775630  [   20/  101]
loss: 1.796246  [   40/  101]
loss: 1.788041  [   60/  101]
loss: 1.785111  [   80/  101]
loss: 1.871614  [  100/  101]
Test Error: 
Accuracy: 16.7%, Avg loss: 0.305924 

Epoch 3
-------------------------------
loss: 1.794282  [    0/  101]
loss: 1.775714  [   20/  101]
loss: 1.796897  [   40/  101]
loss: 1.788494  [   60/  101]
loss: 1.785205  [   80/  101]
loss: 1.868060  [  100/  101]
Test Error: 
Accuracy: 16.7%, Avg loss: 0.305613 

Epoch 4
-------------------------------
loss: 1.793576  [    0/  101]
loss: 1.775770  [   20/  101]
loss: 1.797454  [   40/  101]
loss: 1.788868  [   60/  101]
loss: 1.785265  [   80/  10

In [45]:
test_dataset = TestDataset()
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)
preds = predict_test(test_dataloader, model)

In [46]:
preds = [restore_labels(p) for p in preds]
preds = np.array(preds)

In [47]:
# preds

In [48]:
_, _, test_x = load_data()
index = test_x.index.to_numpy().reshape(-1, 1)
preds = preds.reshape(-1, 1)
submission = np.concatenate((index, preds), axis=1)
submission = pd.DataFrame(submission)

In [49]:
submission.to_csv('torch.csv', index=False, header=False)

## Keras

In [212]:
TRAINING_FOR_SUBMISSION = True
validation_split = 0.0 if TRAINING_FOR_SUBMISSION else 0.4

In [213]:
train_x, train_y, test_x = load_data()

In [214]:
num_classes = len(train_y.unique())
train_y = pd.get_dummies(train_y)

In [215]:
input_shape = (train_x.shape[1],)
model = Sequential([
    Input(shape=input_shape),
    Dense(64, activation='relu'),
    Dense(64, activation='relu'),
    Dense(num_classes, activation='softmax')
])
model.summary()

Model: "sequential_22"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_66 (Dense)             (None, 64)                640       
_________________________________________________________________
dense_67 (Dense)             (None, 64)                4160      
_________________________________________________________________
dense_68 (Dense)             (None, 6)                 390       
Total params: 5,190
Trainable params: 5,190
Non-trainable params: 0
_________________________________________________________________


In [216]:
learning_rate = 1e-3
optimizer = Adam(learning_rate=learning_rate)

In [217]:
batch_size = 10
epochs = 5

model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
model.fit(train_x, train_y, batch_size=batch_size, epochs=epochs, validation_split=validation_split)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7ff84bd31290>

In [218]:
classes = train_y.columns.tolist()
preds = model.predict(test_x).argmax(axis=1)
preds = [classes[pred] for pred in preds]
preds = np.array(preds)
preds = preds.reshape(-1, 1)

In [222]:
# preds

In [220]:
index = test_x.index.to_numpy().reshape(-1, 1)
submission = np.concatenate((index, preds), axis=1)
submission = pd.DataFrame(submission)

In [221]:
submission.to_csv('keras.csv', index=False, header=False)