In [46]:
import numpy as np
import pandas as pd
from numpy.random import randn

import matplotlib.pyplot as plt
import seaborn as sns
import dataprep as dp
from patsy import dmatrices
from statsmodels.stats.outliers_influence import variance_inflation_factor

from sklearn.model_selection import GridSearchCV, cross_val_score, KFold, train_test_split, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, r2_score, mean_squared_error
from sklearn.linear_model import LinearRegression, LogisticRegression, RANSACRegressor, Ridge, Lasso, ElasticNet
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor

import warnings
warnings.filterwarnings('ignore')

In [47]:
# Load the training data, the test data and the submission template.
# The 'id' column is removed from the training frame right after reading.
train = pd.read_csv('./data/train.csv').drop(columns=['id'])
test = pd.read_csv('./data/test.csv')
submission = pd.read_csv('./data/submission.csv')

# Training column names (without 'id'), kept for relabelling frames later.
column_names = train.columns.tolist()

In [55]:
# Display the training column names captured after 'id' was dropped.
column_names

['hour',
 'hour_bef_temperature',
 'hour_bef_precipitation',
 'hour_bef_windspeed',
 'hour_bef_humidity',
 'hour_bef_visibility',
 'hour_bef_ozone',
 'hour_bef_pm10',
 'hour_bef_pm2.5',
 'count']

In [48]:
# Work on a copy so the feature engineering below does not modify `train`.
ex = train.copy()

def encode(data, col, max_val):
    """Add sine/cosine encodings of a cyclic column to ``data``.

    Two columns, ``<col>_sin`` and ``<col>_cos``, are written to ``data``
    (existing columns with those names are overwritten), computed as
    ``sin(2*pi*data[col]/max_val)`` and ``cos(2*pi*data[col]/max_val)``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``col``. It is modified in place.
    col : str
        Name of the column to encode.
    max_val : number
        Period used to scale the column onto one full turn of the circle.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, returned for convenience.
    """
    # Compute the angle once; both features share it.
    angle = 2 * np.pi * data[col] / max_val
    data[col + '_sin'] = np.sin(angle)
    data[col + '_cos'] = np.cos(angle)
    return data

# Add the hour_sin / hour_cos columns to the working frame.
# NOTE(review): the period passed here is 23. If `hour` takes the values 0-23
# (not verifiable from this notebook), hours 0 and 23 receive identical sin/cos
# values with this period -- confirm whether 24 was intended.
ex = encode(ex, 'hour', 23)

In [49]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Remove two training rows, identified by index label, before imputing.
ex = ex.drop(index=[934, 1035])

# Labels for the imputed frame: the training columns plus the two cyclic
# hour features that were appended to `ex`.
column_names_cy = [*column_names, 'hour_sin', 'hour_cos']

# The imputer hands back an unlabelled array, so rebuild the DataFrame with
# the column names in one step (row labels become a fresh 0..n-1 range).
ex = pd.DataFrame(
    IterativeImputer(random_state=2021).fit_transform(ex),
    columns=column_names_cy,
)

In [50]:
col_name = [
    'hour',
    'hour_bef_temperature',
    'hour_bef_windspeed',
    'hour_bef_humidity',
    'hour_bef_visibility',
    'hour_bef_ozone',
]

# 1.5*IQR fences applied one column at a time. Each column's quartiles are
# computed on the rows that survived the filters of the preceding columns.
for feature in col_name:
    lower_quartile = ex[feature].quantile(0.25)
    upper_quartile = ex[feature].quantile(0.75)
    spread = upper_quartile - lower_quartile
    low_fence = lower_quartile - 1.5 * spread
    high_fence = upper_quartile + 1.5 * spread

    is_outlier = (ex[feature] < low_fence) | (ex[feature] > high_fence)
    ex = ex.drop(index=ex.index[is_outlier])

# Summarise the filtered columns (row count, dtypes, non-null counts).
ex[col_name].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1433 entries, 0 to 1456
Data columns (total 6 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   hour                  1433 non-null   float64
 1   hour_bef_temperature  1433 non-null   float64
 2   hour_bef_windspeed    1433 non-null   float64
 3   hour_bef_humidity     1433 non-null   float64
 4   hour_bef_visibility   1433 non-null   float64
 5   hour_bef_ozone        1433 non-null   float64
dtypes: float64(6)
memory usage: 78.4 KB


In [29]:
# Renumber the rows 0..n-1 and discard the old labels (drop=True); the outlier
# filter leaves gaps in the index.
ex = ex.reset_index(drop=True)

In [90]:
def objective_extratree(trial):
    """Optuna objective: hold-out RMSE of an ExtraTreesRegressor fit on ``ex``.

    Samples one hyperparameter configuration from ``trial``, fits the model on
    a fixed 67/33 split (``random_state=2021``) of the module-level frame
    ``ex``, and scores it on the held-out part.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to draw the hyperparameters.

    Returns
    -------
    float
        Root mean squared error on the validation split (lower is better).
    """
    params_et = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000, step=100),
        "max_depth": trial.suggest_int("max_depth", 1, 20),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 4),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 3),
        # 1.0 means "use every feature", which is what "auto" meant for
        # regressors; "auto" itself is rejected by scikit-learn >= 1.3.
        "max_features": trial.suggest_categorical("max_features", [1.0, "sqrt", "log2", 4, 5, 6, 7, 8]),
        # Kept for search-space compatibility; each trial fits a fresh model once.
        "warm_start": trial.suggest_categorical("warm_start", [True, False]),
        "random_state": 2021
    }

    # Every column except the target and 'hour_bef_visibility' is a feature.
    X = ex[ex.columns.difference(['count', 'hour_bef_visibility'])]
    # 1-D target: a single-column frame makes scikit-learn emit a
    # DataConversionWarning on every fit.
    y = ex['count']
    X_train_ori, X_val_ori, y_train_ori, y_val_ori = train_test_split(X, y, test_size=0.33, random_state=2021)

    model = ExtraTreesRegressor(**params_et)
    model.fit(X_train_ori, y_train_ori)

    pred = model.predict(X_val_ori)
    rmse = np.sqrt(mean_squared_error(y_val_ori, pred))

    return rmse

In [91]:
import optuna
from optuna.samplers import TPESampler

# Seeded TPE sampler so the sequence of sampled configurations is repeatable.
sampler = TPESampler(seed=2021)

# The objective returns an RMSE, so the study minimises it.
study = optuna.create_study(direction="minimize", sampler=sampler, study_name="et_optimizer")
study.optimize(objective_extratree, n_trials=30)

# Report the best validation score and the configuration that produced it.
best_trial = study.best_trial
print("Best Score:", study.best_value)
print("Best trial:", best_trial.params)

[32m[I 2021-08-28 17:27:34,975][0m A new study created in memory with name: et_optimizer[0m
[32m[I 2021-08-28 17:27:36,498][0m Trial 0 finished with value: 37.572102113021984 and parameters: {'n_estimators': 700, 'max_depth': 15, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'auto', 'warm_start': True}. Best is trial 0 with value: 37.572102113021984.[0m
[32m[I 2021-08-28 17:27:36,635][0m Trial 1 finished with value: 37.70226370398789 and parameters: {'n_estimators': 100, 'max_depth': 12, 'min_samples_split': 3, 'min_samples_leaf': 3, 'max_features': 7, 'warm_start': False}. Best is trial 0 with value: 37.572102113021984.[0m
[32m[I 2021-08-28 17:27:37,211][0m Trial 2 finished with value: 50.39114831041336 and parameters: {'n_estimators': 800, 'max_depth': 2, 'min_samples_split': 4, 'min_samples_leaf': 2, 'max_features': 8, 'warm_start': True}. Best is trial 0 with value: 37.572102113021984.[0m
[32m[I 2021-08-28 17:27:38,019][0m Trial 3 finished with value:

[32m[I 2021-08-28 17:27:54,340][0m Trial 29 finished with value: 37.572102113021984 and parameters: {'n_estimators': 700, 'max_depth': 15, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'auto', 'warm_start': False}. Best is trial 27 with value: 36.76976409664076.[0m


Best Score: 36.76976409664076
Best trial: {'n_estimators': 700, 'max_depth': 14, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'warm_start': False}


In [92]:
# Final model: fit on the whole cleaned training frame, predict the test set,
# and write the submission file.

# Copy the slice before encode() writes to it, so `ex` itself is not touched
# through a view. Index.difference() returns the labels sorted, so the feature
# order is alphabetical (ending with hour_cos, hour_sin).
X = encode(ex[ex.columns.difference(['count', 'hour_bef_visibility'])].copy(), 'hour', 23)
y = ex[['count']]

cols = X.columns.tolist()

it_test = test[test.columns.difference(['id', 'hour_bef_visibility'])].copy()
it_test = encode(it_test, 'hour', 23)
# encode() appends hour_sin then hour_cos, which is the reverse of the training
# order. The imputer returns a bare array, so column order is the only thing
# tying test features to training features -- align it explicitly.
it_test = it_test[cols]
# The imputer here is fit on the test features only.
X_test = IterativeImputer(random_state=2021).fit_transform(it_test)

# Hyperparameters are the best trial reported by the Optuna study.
reg = ExtraTreesRegressor(n_estimators=700, max_depth=14, min_samples_split=2, min_samples_leaf=1,
                          max_features='sqrt', warm_start=False, random_state=42)
# Pass a 1-D target; a single-column frame triggers a DataConversionWarning.
reg.fit(X, y.values.ravel())
predicts = reg.predict(X_test)

submission['count'] = predicts
submission.to_csv('et_cyclic.csv', index=False)