In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn import metrics
from xgboost import XGBClassifier

import torch
import torch.nn.functional as F
import torch.nn as nn

import logging
FORMAT = '%(asctime)s %(levelname)s: %(message)s'
logging.basicConfig(level=logging.DEBUG, filename='myLog.log', format=FORMAT)

In [2]:
train_data = pd.read_csv('train_values.csv', index_col=0)
test_data = pd.read_csv('test_values.csv', index_col=0)
print(f'train data size: {train_data.shape[0]}')
print(f'train data size: {test_data.shape[0]}')

train data size: 260601
train data size: 86868


In [3]:
# # Generate report
# import pandas_profiling
# total_data = pd.concat([train_data, test_data])
# total_data.head()
# profile = pandas_profiling.ProfileReport(total_data)
# profile.to_file('total_data_profile.html')

Heal with geo_level_1_id, geo_level_2_id, geo_level_3_id

Use autoencoder method

In [4]:
class TwoOutputNet(torch.nn.Module):
    def __init__(self, D_in, H, D_out_1, D_out_2):
        super(TwoOutputNet, self).__init__()
        self.fc1 = nn.Linear(D_in, H)
        self.fc2 = nn.Linear(H, D_out_1)
        self.fc3 = nn.Linear(H, D_out_2)
        
    def forward(self, x):
        x = self.fc1(x)
        output_1 = torch.sigmoid(self.fc2(x))
        output_2 = torch.sigmoid(self.fc3(x))
        return output_1, output_2, x

def train_geo_autoencoder():
    tmp = pd.concat([train_data,test_data])
    x = pd.get_dummies(tmp['geo_level_3_id']).to_numpy()
    y_1 = pd.get_dummies(tmp['geo_level_2_id']).to_numpy()
    y_2 = pd.get_dummies(tmp['geo_level_1_id']).to_numpy()

    x = torch.from_numpy(x)
    y_1 = torch.from_numpy(y_1)
    y_2 = torch.from_numpy(y_2)
    x = x.type(torch.FloatTensor)
    y_1 = y_1.type(torch.FloatTensor)
    y_2 = y_2.type(torch.FloatTensor)
    
    dataset = torch.utils.data.TensorDataset(x, y_1, y_2)
    dataloader = torch.utils.data.DataLoader(dataset=dataset, 
                                             batch_size=128,
                                             shuffle=True, 
                                             num_workers=2)

    model = TwoOutputNet(x.shape[1], 16, y_1.shape[1], y_2.shape[1])

    loss_f = torch.nn.BCELoss()
    loss_f_1 = torch.nn.BCELoss()

    lr = 1e-2
    optimzer = torch.optim.Adam(model.parameters(), lr=lr)

    for e in range(10):
        for i, (x_data, y_1_data, y_2_data) in enumerate(dataloader):

            out1, out2, _ = model(x_data)

            loss = loss_f(out1, y_1_data) + loss_f_1(out2, y_2_data)
            if not i%1000:
                print(f'epoch: {e}, i: {i}, loss: {loss.item()}')

            optimzer.zero_grad()
            loss.backward()
            optimzer.step()
            
    torch.save(model.state_dict(), 'model.m')
    return model

def output_ae(model, x):
    x = torch.tensor(x)
    x = x.type(torch.FloatTensor)
    return model(x)[-1]

def load_model(model_path):
    w = torch.load(model_path)
    m = TwoOutputNet(w['fc1.weight'].shape[1], 
                     w['fc1.bias'].shape[0], 
                     w['fc2.bias'].shape[0], 
                     w['fc3.bias'].shape[0]
                    )
    m.load_state_dict(torch.load(model_path), strict=False)
    return m

In [5]:
need_train_autoencoder = False

if need_train_autoencoder:
    model = train_geo_autoencoder()
else:
    model = load_model('model.m')

In [6]:
total_data = pd.concat([train_data,test_data])
train_size = train_data.shape[0]
geo3 = pd.get_dummies(total_data['geo_level_3_id']).to_numpy()

tmp = []

for i, v in enumerate(geo3):
    ans = output_ae(model, v)
    ans = ans.detach().numpy() 
    tmp.append(ans)

tmp = np.array(tmp).T

for i in range(16):
    train_data[f'geo_fea_{i}'] = tmp[i][:train_size]
    test_data[f'geo_fea_{i}'] = tmp[i][train_size:]


In [7]:
train_data = train_data.drop(['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id'], axis=1)
test_data = test_data.drop(['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id'], axis=1)

In [8]:
y = pd.read_csv('train_labels.csv', index_col=0)
y_train_data = pd.merge(train_data, y, on='building_id')

In [9]:
def heal_with_feature(table):
    return pd.get_dummies(table)

def generate_submiss(model, name):
    heal_with_test_data = heal_with_feature(test_data)
    pred = model.predict(heal_with_test_data)
    heal_with_test_data['damage_grade'] = pred
    heal_with_test_data = heal_with_test_data[['damage_grade']]
    heal_with_test_data.to_csv(f'{name}_test.csv')

In [10]:
dum_data = heal_with_feature(y_train_data)
X_train, X_test, y_train, y_test = train_test_split(dum_data.drop(columns=['damage_grade']), 
                                                    dum_data['damage_grade'], 
                                                    test_size=0.2, 
                                                    stratify=dum_data['damage_grade'],
                                                    random_state=1114)

In [None]:
## randomsearchCV
param_grid = {
        'n_estimators': range(500, 1501, 100),
        'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
        'colsample_bylevel': [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
        'min_child_weight': [0.5, 1.0, 3.0, 5.0, 7.0, 10.0],
        'gamma': [0, 0.25, 0.5, 1.0],
        'reg_lambda': [0.1, 1.0, 5.0, 10.0, 50.0, 100.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [2, 4, 6, 8, 10]
}
param = {
#     'colsample_bytree': 0.8,
    'learning_rate': 0.1,
#     'max_depth': 10,
    'tree_method': 'gpu_hist',
    'predictor': 'gpu_predictor'
}
m = XGBClassifier(**param)
gs = RandomizedSearchCV(m, param_grid, n_jobs=-1, cv=6, scoring='f1_micro', n_iter=100)
gs.fit(X_train, y_train)

In [14]:
predictions = gs.predict(X_test)
logging.info(str(gs.best_params_))
print(gs.best_params_)
print(metrics.accuracy_score(y_test, predictions))
print(metrics.f1_score(y_test, predictions, average='micro'))
print(metrics.confusion_matrix(y_test, predictions))
print(metrics.classification_report(y_test, predictions))

{'subsample': 0.9, 'reg_lambda': 50.0, 'n_estimators': 1000, 'min_child_weight': 5.0, 'max_depth': 10, 'gamma': 0.5, 'colsample_bytree': 0.8, 'colsample_bylevel': 0.7}
0.7523263176071066
0.7523263176071067
[[ 2833  2131    61]
 [ 1111 25107  3434]
 [  119  6053 11272]]
              precision    recall  f1-score   support

           1       0.70      0.56      0.62      5025
           2       0.75      0.85      0.80     29652
           3       0.76      0.65      0.70     17444

    accuracy                           0.75     52121
   macro avg       0.74      0.69      0.71     52121
weighted avg       0.75      0.75      0.75     52121



In [15]:
generate_submiss(gs, 'xgb_autoencoder')