In [2]:
import pandas as pd
import numpy as np

import preprocessing2

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import balanced_accuracy_score
# import xgboost as xgb
import itertools

import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)

## import
df_train = pd.read_csv('./raw_data/train.csv')
df_test = pd.read_csv('./raw_data/test.csv')


# params_dict = {
#     'max_depth': [10, 15, 20],
#     'n_estimators': [500, 1000],
#     'min_samples_split': [2,5,10],
#     'min_samples_leaf': [1,2,4,6],
#     'max_features': ['sqrt', 'log2', 0.5],
#     'class_weight': ['balanced', 'balanced_subsample']}

params_dict = {
    'max_depth': [20],
    'n_estimators': [1000],
    'min_samples_split': [5],
    'min_samples_leaf': [6],
    'max_features': ['sqrt'],
    'class_weight': ['balanced']}

best_score = 0
best_params = {}
best_model = None

kf = KFold(n_splits=5, random_state=42, shuffle=True)

for combination in itertools.product(*params_dict.values()):
    params = dict(zip(params_dict.keys(), combination))

    model = RandomForestClassifier(
        max_depth=params['max_depth'],
        n_estimators=params['n_estimators'],
        min_samples_split=params['min_samples_split'],
        min_samples_leaf=params['min_samples_leaf'],
        max_features=params['max_features'],
        class_weight=params['class_weight'])

    for fold, (train_index, valid_index) in enumerate(kf.split(df_train), 0):
        acc_val = []

        train_set, valid_set = df_train.iloc[train_index], df_train.iloc[valid_index]

        ## clean
        train_clean, median_dict, bound_dict, median_neigh, median_prope, glob_med_neigh, glob_med_prope= preprocessing2.train_clean_2_(train_set) 
        val_clean = preprocessing2.test_clean_2_(valid_set, median_dict, bound_dict, 
                                                    median_neigh, median_prope, glob_med_neigh, glob_med_prope, test = False)
        X_train, y_train = (train_clean.drop('price', axis=1)).values, (train_clean['price']).values
        X_valid, y_valid = (val_clean.drop('price', axis=1)).values, (val_clean['price']).values

        ## fit model adn evaluate
        model.fit(X_train, y_train)
        y_val_pred = model.predict(X_valid)
        acc_val.append(balanced_accuracy_score(y_valid, y_val_pred))

    # Update 
    mean_acc = np.mean(acc_val)
    print('Accuracy: \n  ', mean_acc, '\nParams: \n  ', params)
    if mean_acc > best_score:
        best_score = mean_acc
        best_params = params

print("Best score:", best_score)
print("Best parameters:", best_params)


## test output
train_clean, median_dict, bound_dict, median_neigh, median_prope, glob_med_neigh, glob_med_prope= preprocessing2.train_clean_2_(df_train)
test_clean = preprocessing2.test_clean_2_(df_test, median_dict, bound_dict, 
                                            median_neigh, median_prope, glob_med_neigh, glob_med_prope, test = True)
X_train, y_train = (train_clean.drop('price', axis=1)).values, (train_clean['price']).values
X_test = test_clean.values

best_model = RandomForestClassifier(
    **best_params  
)

best_model.fit(X_train, y_train)
y_pred = best_model.predict(X_test)

df_predictions = pd.DataFrame({'id': range(len(y_pred)) , 'Price': y_pred})
df_predictions.to_csv('submission2.csv', index=False)





Accuracy: 
   0.5615195341523141 
Params: 
   {'max_depth': 20, 'n_estimators': 1000, 'min_samples_split': 5, 'min_samples_leaf': 6, 'max_features': 'sqrt', 'class_weight': 'balanced'}
Best score: 0.5615195341523141
Best parameters: {'max_depth': 20, 'n_estimators': 1000, 'min_samples_split': 5, 'min_samples_leaf': 6, 'max_features': 'sqrt', 'class_weight': 'balanced'}


In [3]:
import pandas as pd
import numpy as np

import preprocessing2

# from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import balanced_accuracy_score
import xgboost as xgb
import itertools

import warnings
warnings.filterwarnings('ignore')

## import
df_train = pd.read_csv('./raw_data/train.csv')
df_test = pd.read_csv('./raw_data/test.csv')

# test dict
params_dict = {
    'max_depth': [10],
    'n_estimators': [1000],
    'eta': [0.01],
    'gamma':[0.01],
    'subsample': [0.6],
    'colsample_bytree': [0.8],
    'min_child_weight':[1],
    'reg_lambda': [0.01],
    'max_delta_step': [7]
}

# params_dict = {
#     'max_depth': [8,10,12,15],
#     'n_estimators': [800, 1000],
#     'eta': [0.01, 0.1, 0.3],
#     'gamma':[0.1, 0.3, 0.5, 1],
#     'subsample': [0.7, 0.8, 0.9, 0.95],
#     'colsample_bytree': [0.8, 0.9, 1],
#     'min_child_weight':[1, 3, 5],
#     'reg_lambda': [0.01, 0.1, 1],
#     'max_delta_step': [1,2,3,5,10]}


best_score = 0
best_params = {}
best_model = None

kf = KFold(n_splits=5, random_state=42, shuffle=True)

for combination in itertools.product(*params_dict.values()):
    params = dict(zip(params_dict.keys(), combination))

    model = xgb.XGBClassifier(
        tree_method='auto',
        enable_categorical=True,
        objective='multi:prob',
        eval_metric='mlogloss',
        max_depth=params['max_depth'],
        n_estimators=params['n_estimators'],
        eta=params['eta'],
        gamma=params['gamma'],
        subsample=params['subsample'],
        colsample_bytree=params['colsample_bytree'],
        min_child_weight=params['min_child_weight'],
        reg_lambda=params['reg_lambda'])

    for fold, (train_index, valid_index) in enumerate(kf.split(df_train), 0):
        acc_val = []

        train_set, valid_set = df_train.iloc[train_index], df_train.iloc[valid_index]

        ## clean
        train_clean, median_dict, bound_dict, median_neigh, median_prope, glob_med_neigh, glob_med_prope= preprocessing2.train_clean_2_(train_set) 
        val_clean = preprocessing2.test_clean_2_(valid_set, median_dict, bound_dict, 
                                                    median_neigh, median_prope, glob_med_neigh, glob_med_prope, test = False)
        X_train, y_train = (train_clean.drop('price', axis=1)).values, (train_clean['price']).values
        X_valid, y_valid = (val_clean.drop('price', axis=1)).values, (val_clean['price']).values

        ## fit model adn evaluate
        model.fit(X_train, y_train, 
                    early_stopping_rounds=10, eval_set=[(X_valid, y_valid)], verbose=False)
        y_val_pred = model.predict(X_valid)
        acc_val.append(balanced_accuracy_score(y_valid, y_val_pred))

    # Update 
    mean_acc = np.mean(acc_val)
    print('Accuracy: \n  ', mean_acc, '\nParams: \n  ', params)
    if mean_acc > best_score:
        best_score = mean_acc
        best_params = params

print("Best score:", best_score)
print("Best parameters:", best_params)


## test output
train_clean, median_dict, bound_dict, median_neigh, median_prope, glob_med_neigh, glob_med_prope= preprocessing2.train_clean_2_(df_train)
test_clean = preprocessing2.test_clean_2_(df_test, median_dict, bound_dict, 
                                            median_neigh, median_prope, glob_med_neigh, glob_med_prope, test = True)
X_train, y_train = (train_clean.drop('price', axis=1)).values, (train_clean['price']).values
X_test = test_clean.values

best_model = xgb.XGBClassifier(
    tree_method='auto',
    enable_categorical=True,
    objective='multi:softmax',
    eval_metric='mlogloss',
    **best_params  
)

best_model.fit(X_train, y_train, verbose=False)
y_pred = best_model.predict(X_test)

df_predictions = pd.DataFrame({'id': range(len(y_pred)) , 'Price': y_pred})
df_predictions.to_csv('submission1.csv', index=False)


Accuracy: 
   0.4913496143761364 
Params: 
   {'max_depth': 10, 'n_estimators': 1000, 'eta': 0.01, 'gamma': 0.01, 'subsample': 0.6, 'colsample_bytree': 0.8, 'min_child_weight': 1, 'reg_lambda': 0.01, 'max_delta_step': 7}
Best score: 0.4913496143761364
Best parameters: {'max_depth': 10, 'n_estimators': 1000, 'eta': 0.01, 'gamma': 0.01, 'subsample': 0.6, 'colsample_bytree': 0.8, 'min_child_weight': 1, 'reg_lambda': 0.01, 'max_delta_step': 7}


In [None]:
# Accuracy: 
#    0.5671880745888057 
# Params: 
#    {'max_depth': 10, 'n_estimators': 1000, 'eta': 0.1, 'gamma': 0.5, 'subsample': 0.95, 'colsample_bytree': 0.8, 'min_child_weight': 3, 'reg_lambda': 0.01}

# Accuracy: 
#    0.5671024113732074 
# Params: 
#    {'max_depth': 9, 'n_estimators': 1200, 'eta': 0.1, 'gamma': 0.5, 'subsample': 0.95, 'colsample_bytree': 0.8, 'min_child_weight': 3, 'reg_lambda': 0.01}

# Accuracy: 
#    0.5421702331508867 
# Params: 
#    {'max_depth': 10, 'n_estimators': 1000, 'eta': 0.3, 'gamma': 0.5, 'subsample': 0.7, 'colsample_bytree': 0.8, 'min_child_weight': 3, 'reg_lambda': 0.01}

# Accuracy: 
#    0.5498288445186382 
# Params: 
#    {'max_depth': 10, 'n_estimators': 1000, 'eta': 0.3, 'gamma': 0.5, 'subsample': 0.8, 'colsample_bytree': 0.8, 'min_child_weight': 3, 'reg_lambda': 0.01}

# Accuracy: 
#    0.5594133712685273 
# Params: 
#    {'max_depth': 10, 'n_estimators': 1000, 'eta': 0.1, 'gamma': 0.5, 'subsample': 0.8, 'colsample_bytree': 0.8, 'min_child_weight': 3, 'reg_lambda': 0.01, 'max_delta_step': 5}

# Accuracy: 
#    0.5715560498031736 
# Params: 
#    {'max_depth': 10, 'n_estimators': 1000, 'eta': 0.01, 'gamma': 0.5, 'subsample': 0.8, 'colsample_bytree': 0.8, 'min_child_weight': 3, 'reg_lambda': 0.01, 'max_delta_step': 3}
## eta is boosting acc

# Accuracy: 
#    0.571546728960379 
# Params: 
#    {'max_depth': 10, 'n_estimators': 1000, 'eta': 0.01, 'gamma': 0.5, 'subsample': 0.8, 'colsample_bytree': 0.8, 'min_child_weight': 1, 'reg_lambda': 0.01, 'max_delta_step': 3}

# Accuracy: 
#    0.566775071158093 
# Params: 
#    {'max_depth': 10, 'n_estimators': 1000, 'eta': 0.01, 'gamma': 0.7, 'subsample': 0.8, 'colsample_bytree': 0.8, 'min_child_weight': 1, 'reg_lambda': 0.01, 'max_delta_step': 7}

# Accuracy: 
#    0.5730900839972636 
# Params: 
#    {'max_depth': 10, 'n_estimators': 1000, 'eta': 0.01, 'gamma': 0.1, 'subsample': 0.8, 'colsample_bytree': 0.8, 'min_child_weight': 1, 'reg_lambda': 0.01, 'max_delta_step': 7}

# Accuracy: 
#    0.57403912919536   __0.56127
# Params: 
#    {'max_depth': 10, 'n_estimators': 1000, 'eta': 0.01, 'gamma': 0.1, 'subsample': 0.5, 'colsample_bytree': 0.8, 'min_child_weight': 1, 'reg_lambda': 0.01, 'max_delta_step': 7}

# Accuracy: 
#    0.5748324286382628 
# Params: 
#    {'max_depth': 10, 'n_estimators': 1000, 'eta': 0.01, 'gamma': 0.1, 'subsample': 0.6, 'colsample_bytree': 0.8, 'min_child_weight': 1, 'reg_lambda': 0.01, 'max_delta_step': 7}

# Accuracy: 
#    0.5729162888383669 
# Params: 
#    {'max_depth': 10, 'n_estimators': 1000, 'eta': 0.01, 'gamma': 0.1, 'subsample': 0.7, 'colsample_bytree': 0.8, 'min_child_weight': 1, 'reg_lambda': 0.01, 'max_delta_step': 7}






In [None]:
# ## cross validation
# model = xgb.XGBClassifier(n_estimators=1000, 
#                           max_depth=7, 
#                           eta=0.1, 
#                           subsample=0.7, 
#                           colsample_bytree=0.8, 
#                           enable_categorical=True,
#                           objective = 'multi:softmax')
# X, y = train_clean.iloc[:, :-1].to_numpy(), train_clean.iloc[:, -1].to_numpy()

# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=42)

# model.fit(X_train, y_train)
# y_pred = np.round(model.predict(X_val))
# y_pred[y_pred == 6.0] = 5.0
# y_pred[y_pred == -1.0] = 0
# accuracy = accuracy_score(y_val, y_pred)
# print("Accuracy:", accuracy)


# ## test output
# X_test = test_clean.to_numpy()
# y_pred = model.predict(X_test)

# df_predictions = pd.DataFrame({'id': range(len(y_pred)) , 'Price': y_pred})
# df_predictions.to_csv('submission.csv', index=False)





## Accuracy: 0.5272020725388601
## Accuracy: 0.5310880829015544 -- bound [0,5]
## Accuracy: 0.5310880829015544 -- lower bound = 0


## Accuracy: 0.5660621761658031 ---classifier
## Accuracy: 0.5660621761658031 ---classifier  15.7s
## Accuracy: 0.5537564766839378
##              0.5589378238341969
##              0.5621761658031088


# 'calculated_host_listings_count_entire_homes', 'calculated_host_listings_count', 'host_total_listings_count',
#                               'minimum_nights', 'maximum_minimum_nights', 'minimum_maximum_nights',
#                               'minimum_nights_avg_ntm','maximum_nights', 
#                               'availability_60', 'availability_90', 'availability_365',   0.5550518134715026, 0.5712435233160622
#                               'number_of_reviews_l30d', 'number_of_reviews',   0.5718911917098446
#                               'bedrooms', 'beds',   0.5621761658031088
#                               'host_has_profile_pic'  0.5615284974093264


# 'number_of_reviews_l30d', 'number_of_reviews'

## Accuracy: 0.577    target encode
## Accuracy: 0.5841968911917098   --label
## Accuracy: 0.5841968911917098   -- category
## no kink outlier --> drop to 0.56


In [None]:
# # test dict
#     params_dict = {
#         'max_depth': [10],
#         'n_estimators': [1000],
#         'eta': [0.1],
#         'gamma':[0.5],
#         'subsample': [0.95],
#         'colsample_bytree': [0.8],
#         'min_child_weight':[3],
#         'reg_lambda': [0.01]
#     }

## Accuracy:  10
#    0.5675342354275558   0.55489

# 4m  ## 0.5673575129533679, 0.55479
# Best parameters: {'max_depth': 7, 'n_estimators': 1000, 'eta': 0.1, 'gamma': 0.1, 
# 'subsample': 0.7, 'colsample_bytree': 0.8, 'min_child_weight': 3, 'reg_lambda': 0.01}




#host_id cannot be added:0.5675342354275558 - 0.5601632618442313
## target 0.5283550346491853 -- 0.5404111858372997 -- 0.5675342354275558



# Accuracy: 
#    0.5721689470355151 
# Params: 
#    {'max_depth': 10, 'n_estimators': 1000, 'eta': 0.01, 'gamma': 0.5, 'subsample': 0.95, 'colsample_bytree': 0.8, 'min_child_weight': 3, 'reg_lambda': 0.1, 'max_delta_step': 5}

