# In [None]:
import pandas as pd
import numpy as np

import preprocessing1

# from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import balanced_accuracy_score
import xgboost as xgb
import itertools

import warnings
warnings.filterwarnings('ignore')

## import
# Read the raw training and test tables; both paths are relative to the
# current working directory.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('./raw_data/train.csv', './raw_data/test.csv')
)

# Single-value grid for a quick smoke test; uncomment it to use in place of the full grid below.
# params_dict = {
#     'max_depth': [10],
#     'n_estimators': [1000],
#     'eta': [0.1],
#     'gamma':[0.5],
#     'subsample': [0.95],
#     'colsample_bytree': [0.8],
#     'min_child_weight':[3],
#     'reg_lambda': [0.01]
# }

# Hyper-parameter search grid: each keyword names an XGBoost parameter and
# maps to the list of candidate values to try. Insertion order matters
# because the keys and values are later zipped back together.
params_dict = dict(
    max_depth=[8, 10, 12, 15],
    n_estimators=[800, 1000],
    eta=[0.01, 0.1, 0.3],
    gamma=[0.1, 0.3, 0.5, 1],
    subsample=[0.7, 0.8, 0.9, 0.95],
    colsample_bytree=[0.8, 0.9, 1],
    min_child_weight=[1, 3, 5],
    reg_lambda=[0.01, 0.1, 1],
    max_delta_step=[1, 2, 3, 5, 10],
)


# ---------------------------------------------------------------------------
# Exhaustive grid search over `params_dict`, scored by the mean balanced
# accuracy across a seeded 5-fold split of `df_train`.
#
# Results are left in the module-level names `best_score` and `best_params`.
# ---------------------------------------------------------------------------
best_score = 0
best_params = {}
best_model = None

kf = KFold(n_splits=5, random_state=42, shuffle=True)

# The split is seeded and the cleaning step is never given the
# hyper-parameters, so every fold is cleaned once up front instead of once
# per parameter combination.
fold_data = []
for train_index, valid_index in kf.split(df_train):
    train_set, valid_set = df_train.iloc[train_index], df_train.iloc[valid_index]

    ## clean
    # Statistics are derived from the training part only and then applied to
    # the validation part.
    train_clean, median_dict, bound_dict, median_neigh, median_prope = \
        preprocessing1.train_clean_1_(train_set)
    val_clean = preprocessing1.test_clean_1_(valid_set, median_dict, bound_dict,
                                             median_neigh, median_prope, test=False)

    X_train, y_train = (train_clean.drop('price', axis=1)).values, (train_clean['price']).values
    X_valid, y_valid = (val_clean.drop('price', axis=1)).values, (val_clean['price']).values
    fold_data.append((X_train, y_train, X_valid, y_valid))

for combination in itertools.product(*params_dict.values()):
    params = dict(zip(params_dict.keys(), combination))

    # Every searched parameter is forwarded (previously `max_delta_step` was
    # in the grid but never reached the model), and the objective matches the
    # one used for the final model ('multi:prob' is not an XGBoost objective).
    model = xgb.XGBClassifier(
        tree_method='auto',
        enable_categorical=True,
        objective='multi:softprob',
        eval_metric='mlogloss',
        **params)

    # One score per fold; must be created outside the fold loop so the mean
    # below covers all folds rather than only the last one.
    acc_val = []
    for X_train, y_train, X_valid, y_valid in fold_data:
        ## fit model and evaluate
        # NOTE(review): recent xgboost releases deprecate passing
        # `early_stopping_rounds` to fit() in favour of the constructor --
        # confirm against the installed version.
        model.fit(X_train, y_train,
                  early_stopping_rounds=10, eval_set=[(X_valid, y_valid)], verbose=False)
        y_val_pred = model.predict(X_valid)
        acc_val.append(balanced_accuracy_score(y_valid, y_val_pred))

    # Update
    mean_acc = np.mean(acc_val)
    print('Accuracy: \n  ', mean_acc, '\nParams: \n  ', params)
    if mean_acc > best_score:
        best_score = mean_acc
        best_params = params

print("Best score:", best_score)
print("Best parameters:", best_params)


## test output
# Clean the full training set, then apply the statistics it produced to the
# test set.
(train_clean, median_dict, bound_dict,
 median_neigh, median_prope) = preprocessing1.train_clean_1_(df_train)
test_clean = preprocessing1.test_clean_1_(
    df_test, median_dict, bound_dict, median_neigh, median_prope, test=True)

# 'price' is the target column; everything else is a feature.
X_train = train_clean.drop(columns='price').values
y_train = train_clean['price'].values
X_test = test_clean.values

# Fixed settings combined with whatever the search stored in `best_params`.
fixed_settings = {
    'tree_method': 'auto',
    'enable_categorical': True,
    'objective': 'multi:softprob',
    'eval_metric': 'mlogloss',
}
best_model = xgb.XGBClassifier(**fixed_settings, **best_params)

best_model.fit(X_train, y_train, verbose=False)
y_pred = best_model.predict(X_test)

# Write one row per prediction, with ids numbered from 0 in prediction order.
df_predictions = pd.DataFrame({
    'id': range(len(y_pred)),
    'Price': y_pred,
})
df_predictions.to_csv('submission1.csv', index=False)
