In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Silence every Python warning for the rest of the kernel session (no category
# or module filter is given, so the 'ignore' action applies to all of them).
# NOTE(review): this also hides deprecation and convergence warnings from the
# libraries used below — consider narrowing the filter.
warnings.filterwarnings('ignore')


In [2]:
# Read the processed training table and the table to predict on, in that order.
ec_train, ec_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../../dataset/processed/targetECTrain.csv',
        '../../dataset/processed/targetEC.csv',
    )
)

In [3]:
# Feature matrix: every column of the training table except the target.
X = ec_train.drop(columns=['damage_grade'])

# Target shifted down by one (the submission cell adds the 1 back after
# predicting). The subtraction builds a NEW Series instead of using `y -= 1`:
# the in-place form operates on the Series taken from `ec_train` and, on pandas
# versions without copy-on-write, writes through to `ec_train` itself, so
# re-running this cell would shift the labels a second time.
y = ec_train['damage_grade'] - 1

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the labelled rows for evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=69420,
)

In [5]:
import optuna
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier
from sklearn.metrics import f1_score

In [None]:
def objective(trial):
    """Optuna objective: train an XGBClassifier on sampled hyperparameters.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample one hyperparameter configuration.

    Returns
    -------
    float
        Macro-averaged F1 score of the fitted model on ``(X_test, y_test)``.

    Notes
    -----
    Reads the notebook-level ``X_train``, ``y_train``, ``X_test`` and
    ``y_test``.
    NOTE(review): the split scored here is the same one the notebook later
    uses to report the final score, so that reported score is selected-on and
    likely optimistic.
    """
    params = {
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4, 0.5, 0.6, 0.7, 0.8, 1.0]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.008, 0.01, 0.012, 0.014, 0.016, 0.018, 0.02]),
        'n_estimators': trial.suggest_int('n_estimators', 100, 10000),
        'max_depth': trial.suggest_int('max_depth', 5, 20),
    }

    # `n_jobs` is XGBClassifier's thread-count parameter. The previous
    # `n_thread` keyword is not a recognised parameter, so the intended cap of
    # 6 threads per model was never applied.
    model = XGBClassifier(**params, random_state=69420, n_jobs=6)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    return f1_score(y_test, y_pred, average='macro')


# Search for the configuration that maximises the objective's return value.
# The run is bounded by 100 trials and a 3600-second timeout, with up to
# 6 trials evaluated concurrently.
study = optuna.create_study(direction='maximize')
study.optimize(
    objective,
    n_trials=100,
    timeout=3600,
    n_jobs=6,
)


In [16]:
# Report the best objective value found by the study and the hyperparameters
# that produced it.
best_score = study.best_value
best_config = study.best_params
print(f"""\
Best value: {best_score} 
Best params: {best_config}
""")

Best value: 0.6720495353186127 
Best params: {'colsample_bytree': 0.3, 'subsample': 0.4, 'learning_rate': 0.02, 'n_estimators': 1055, 'max_depth': 15}



In [18]:
# Hyperparameters copied from the best Optuna trial (study value
# 0.6720495353186127), pinned here so later cells do not need the study object.
# NOTE(review): random_state is 48568 here, while the tuning objective trained
# with random_state=69420 — the study value above was not obtained with this seed.
params = dict(
    colsample_bytree=0.3,
    subsample=0.4,
    learning_rate=0.02,
    n_estimators=1055,
    max_depth=15,
    random_state=48568,
)

In [19]:
# Refit the pinned configuration on the training split, then print its
# macro-averaged F1 on the held-out split.
xgb_best = XGBClassifier(**params)
xgb_best.fit(X_train, y_train)

y_pred = xgb_best.predict(X_test)
holdout_macro_f1 = f1_score(y_test, y_pred, average='macro')

print(holdout_macro_f1)

0.6741916884077715


In [20]:
# Final model: same pinned hyperparameters, trained on every labelled row
# (training and held-out splits together).
xgb_full = XGBClassifier(**params)

xgb_full.fit(X=X, y=y)

In [22]:
# Predict on the unlabelled table and add back the 1 that was subtracted from
# the target before training.
y_pred_sub = xgb_full.predict(ec_test) + 1

# Row ids are simply the positional index 0..n-1 of the test table.
n_test_rows = len(ec_test)
submission_df = pd.DataFrame({
    'id': np.arange(n_test_rows),
    'damage_grade': y_pred_sub,
})

submission_df.head()

Unnamed: 0,id,damage_grade
0,0,5
1,1,5
2,2,5
3,3,5
4,4,2


In [23]:
# Load an earlier submission file so its predictions can be compared with the
# new ones.
last_best_path = "../../dataset/submission_ec_xgb_67.csv"
last_best_df = pd.read_csv(last_best_path)

last_best_df.head()

Unnamed: 0,id,damage_grade
0,0,5
1,1,5
2,2,5
3,3,5
4,4,2


In [25]:
# Print the predicted-class distribution of the new submission, then of the
# earlier one, for a quick side-by-side sanity check.
for frame in (submission_df, last_best_df):
    print(frame["damage_grade"].value_counts())

5    121289
3     41103
2     30464
4     28125
1     21101
Name: damage_grade, dtype: int64
5    122409
3     41195
2     30588
4     27001
1     20889
Name: damage_grade, dtype: int64


In [None]:
# Persist the new submission; the DataFrame index is not written to the file.
submission_df.to_csv(
    path_or_buf='../../dataset/submission_ec_xgb_67-4.csv',
    index=False,
)