In [None]:
import pandas as pd
import numpy as np

import preprocessing2

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import balanced_accuracy_score
# import xgboost as xgb
import itertools

import warnings
# Notebook-wide settings: suppress every warning category and let pandas
# print all columns of a DataFrame instead of truncating wide frames.
warnings.filterwarnings(action='ignore')
pd.set_option('display.max_columns', None)

## import
# Read the raw training and test tables from the local raw_data directory.
df_train, df_test = (
    pd.read_csv('./raw_data/train.csv'),
    pd.read_csv('./raw_data/test.csv'),
)


# Hyper-parameter grid for the random forest search. Each key is a
# RandomForestClassifier keyword argument and each list holds the candidate
# values; the key order determines the order in which combinations are tried.
params_dict = {
    'max_depth': [10, 15, 20],
    'n_estimators': [500, 1000],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4, 6],
    'max_features': ['sqrt', 'log2', 0.5],
    'class_weight': ['balanced', 'balanced_subsample'],
}

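# Single-combination grid kept for quick test runs; uncomment it to replace
# the full grid defined above.
# params_dict = {
#     'max_depth': [10],
#     'n_estimators': [1000],
#     'min_samples_split': [5],
#     'min_samples_leaf': [6],
#     'max_features': ['sqrt'],
#     'class_weight': ['balanced']}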

# Best cross-validated configuration found so far. best_score starts at 0, so
# a combination is only recorded if its mean balanced accuracy is positive.
best_score = 0
best_params = {}
best_model = None

# Shuffled 5-fold split with a fixed seed, so every parameter combination is
# evaluated on the same folds.
kf = KFold(n_splits=5, random_state=42, shuffle=True)

# Exhaustive grid search over every combination of the values in params_dict.
for combination in itertools.product(*params_dict.values()):
    params = dict(zip(params_dict.keys(), combination))

    # The grid keys are RandomForestClassifier keyword arguments, so the
    # combination can be passed straight through.
    model = RandomForestClassifier(**params)

    # One score per fold. This list must be created once per combination,
    # outside the fold loop; resetting it inside the loop would leave only the
    # last fold's score and make the "mean" below a single-fold estimate.
    acc_val = []

    for train_index, valid_index in kf.split(df_train):
        train_set, valid_set = df_train.iloc[train_index], df_train.iloc[valid_index]

        ## clean
        # Clean the training fold, then pass everything train_clean_2_ returned
        # alongside the cleaned frame on to test_clean_2_ for the validation fold.
        (train_clean, median_dict, bound_dict, median_neigh, median_prope,
         glob_med_neigh, glob_med_prope) = preprocessing2.train_clean_2_(train_set)
        val_clean = preprocessing2.test_clean_2_(
            valid_set, median_dict, bound_dict,
            median_neigh, median_prope, glob_med_neigh, glob_med_prope, test=False)
        X_train, y_train = train_clean.drop('price', axis=1).values, train_clean['price'].values
        X_valid, y_valid = val_clean.drop('price', axis=1).values, val_clean['price'].values

        ## fit model and evaluate
        # fit() retrains the forest from scratch on each call, so reusing the
        # same estimator object across folds does not leak state between them.
        model.fit(X_train, y_train)
        y_val_pred = model.predict(X_valid)
        acc_val.append(balanced_accuracy_score(y_valid, y_val_pred))

    # Update
    # Mean balanced accuracy across all folds for this combination.
    mean_acc = np.mean(acc_val)
    print('Accuracy: \n  ', mean_acc, '\nParams: \n  ', params)
    # Strict '>' keeps the earliest combination when scores tie.
    if mean_acc > best_score:
        best_score = mean_acc
        best_params = params

print("Best score:", best_score)
print("Best parameters:", best_params)


## test output
# Clean the full training set, then clean the test set using the values
# returned by the training-set cleaning step.
(train_clean, median_dict, bound_dict, median_neigh, median_prope,
 glob_med_neigh, glob_med_prope) = preprocessing2.train_clean_2_(df_train)
test_clean = preprocessing2.test_clean_2_(
    df_test, median_dict, bound_dict,
    median_neigh, median_prope, glob_med_neigh, glob_med_prope, test=True)

# Feature matrix is every column except 'price'; 'price' is the target.
X_train = train_clean.drop('price', axis=1).values
y_train = train_clean['price'].values
X_test = test_clean.values

# Refit a forest with the best parameters found by the search on all of the
# cleaned training data, then predict the test set.
best_model = RandomForestClassifier(**best_params)
best_model.fit(X_train, y_train)
y_pred = best_model.predict(X_test)

# Submission file: one row per prediction, with 'id' numbered from 0.
df_predictions = pd.DataFrame(data={'id': range(len(y_pred)), 'Price': y_pred})
df_predictions.to_csv('submission2.csv', index=False)