In [1]:
import numpy as np
import pandas as pd
from numpy.random import randn

import matplotlib.pyplot as plt
import seaborn as sns
import dataprep as dp
from patsy import dmatrices
from statsmodels.stats.outliers_influence import variance_inflation_factor

from sklearn.model_selection import GridSearchCV, cross_val_score, KFold, train_test_split, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, r2_score, mean_squared_error
from sklearn.linear_model import LinearRegression, LogisticRegression, RANSACRegressor, Ridge, Lasso, ElasticNet
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the three CSV files used by the notebook. The 'id' column is removed
# from the training frame right away; the remaining training column names are
# kept so they can be re-attached after steps that return bare arrays.
train = pd.read_csv('./data/train.csv').drop(columns=['id'])
test = pd.read_csv('./data/test.csv')
submission = pd.read_csv('./data/submission.csv')
column_names = train.columns.tolist()

In [3]:
# Remove two hand-picked training rows by index label.
# NOTE(review): the reason for excluding rows 934 and 1035 is not recorded in
# the notebook (presumably unusable/outlying rows) - TODO confirm and document.
# errors='ignore' keeps the cell idempotent: re-running it after the rows are
# already gone no longer raises KeyError.
train = train.drop(index=[934, 1035], errors='ignore')

In [4]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Impute every missing value in the training data (target column included)
# with a seeded IterativeImputer, working on a copy of `train`.
it_train = IterativeImputer(random_state=2021).fit_transform(train.copy())

# Wrap the imputer output in a DataFrame and restore the original column
# names; the new frame gets a fresh default index.
itImp = pd.DataFrame(it_train, columns=column_names)

# Sanity check: remaining missing values per column.
itImp.isna().sum()

hour                      0
hour_bef_temperature      0
hour_bef_precipitation    0
hour_bef_windspeed        0
hour_bef_humidity         0
hour_bef_visibility       0
hour_bef_ozone            0
hour_bef_pm10             0
hour_bef_pm2.5            0
count                     0
dtype: int64

In [5]:
# Columns that take part in the IQR-based outlier filter.
col_name = [
    'hour', 'hour_bef_temperature', 'hour_bef_windspeed', 'hour_bef_humidity',
    'hour_bef_visibility', 'hour_bef_ozone',
]

itImp_mid = itImp.copy()

# The columns are processed one after another, so the quartiles of each column
# are computed on the rows that survived the filters of the previous columns.
for feature in col_name:
    values = itImp_mid[feature]
    lower_q = values.quantile(0.25)
    upper_q = values.quantile(0.75)
    spread = upper_q - lower_q
    too_low = values < (lower_q - 1.5*spread)
    too_high = values > (upper_q+1.5*spread)
    flagged = itImp_mid[too_low | too_high]
    itImp_mid = itImp_mid.drop(flagged.index, axis=0)
itImp_mid[col_name].info()

Note: NumExpr detected 12 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
NumExpr defaulting to 8 threads.


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1433 entries, 0 to 1456
Data columns (total 6 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   hour                  1433 non-null   float64
 1   hour_bef_temperature  1433 non-null   float64
 2   hour_bef_windspeed    1433 non-null   float64
 3   hour_bef_humidity     1433 non-null   float64
 4   hour_bef_visibility   1433 non-null   float64
 5   hour_bef_ozone        1433 non-null   float64
dtypes: float64(6)
memory usage: 78.4 KB


In [6]:
def busyHourGen(data, col, busy_ranges=((6, 10), (16, 20))):
    """Add a 0/1 ``busy_hour`` indicator column to ``data`` and return it.

    A row is flagged 1 when its value in ``col`` lies strictly inside any of
    the ``busy_ranges`` (both bounds exclusive), otherwise 0. Missing values
    compare as False and are therefore flagged 0.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to annotate. It is modified in place (the ``busy_hour`` column
        is added or overwritten).
    col : str
        Name of the column holding the hour values.
    busy_ranges : iterable of (low, high) pairs, optional
        Open intervals that count as busy. The default reproduces the
        original hard-coded windows ``6 < hour < 10`` and ``16 < hour < 20``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in as ``data``.
    """
    hours = data[col]
    is_busy = np.zeros(len(hours), dtype=bool)
    for low, high in busy_ranges:
        # Vectorized comparison instead of a Python-level loop over rows.
        is_busy |= ((hours > low) & (hours < high)).to_numpy()
    # Assign a plain array so the result is positional, as with the original list.
    data['busy_hour'] = is_busy.astype('int64')
    return data

In [7]:
# busyHourGen adds the column to the frame it receives and returns that same
# frame, so `an` and `itImp_mid` refer to one object after this line.
an = busyHourGen(itImp_mid, 'hour') # an = dataset with the busy_hour column added

In [None]:
import optuna
from optuna.samplers import TPESampler

def objective_extratree(trial):
    """Optuna objective: hold-out RMSE of an ExtraTreesRegressor.

    Samples a hyper-parameter set from ``trial``, fits the model on 67% of
    ``an`` (all columns except 'count' and 'hour_bef_visibility' as features,
    'count' as target) and scores it on the remaining 33%.

    NOTE: a later cell in this notebook defines a function with the same
    name; whichever cell ran last is the one in effect.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to draw the hyper-parameters.

    Returns
    -------
    float
        Root mean squared error on the validation split (lower is better).
    """
    params_et = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000, step=100),
        "max_depth": trial.suggest_int("max_depth", 2, 20),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 4),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 3),
        # 1.0 (use every feature) stands in for the former "auto" choice:
        # "auto" was deprecated in scikit-learn 1.1 and is rejected by current
        # releases, while for regressors it meant "all features" anyway.
        "max_features": trial.suggest_categorical("max_features", [1.0, "sqrt", "log2", 4, 5, 6]),
        # NOTE(review): the estimator is created fresh for every trial, so
        # warm_start presumably cannot change the fit - consider dropping it
        # from the search space.
        "warm_start": trial.suggest_categorical("warm_start", [True, False]),
        "random_state": 2021
    }

    # Index.difference returns the remaining column names sorted.
    X = an[an.columns.difference(['count', 'hour_bef_visibility'])]
    # 1-D target: avoids scikit-learn's column-vector conversion warning.
    y = an['count']
    X_train_ori, X_val_ori, y_train_ori, y_val_ori = train_test_split(X, y, test_size=0.33, random_state=2021)

    model = ExtraTreesRegressor(**params_et)
    model.fit(X_train_ori, y_train_ori)

    pred = model.predict(X_val_ori)
    rmse = np.sqrt(mean_squared_error(y_val_ori, pred))

    return rmse

# Run 30 trials of the objective with a TPE sampler created with a fixed seed,
# minimizing the returned RMSE, then report the best result.
sampler = TPESampler(seed=2021)
study = optuna.create_study(direction="minimize", sampler=sampler, study_name="et_optimizer")
study.optimize(objective_extratree, n_trials=30)
print(f"Best Score: {study.best_value}")
print(f"Best trial: {study.best_trial.params}")

In [8]:
# Training features: every column of `an` except the target and the excluded
# visibility column (Index.difference returns the names sorted).
X = an[an.columns.difference(['count', 'hour_bef_visibility'])]
y = an[['count']]
colls = X.columns.tolist()

# Feature columns that exist in the raw test file (busy_hour is derived below).
raw_cols = [c for c in colls if c != 'busy_hour']

# Fit the imputer on the training features and only *transform* the test set,
# so test rows are filled in with relationships learned from training data
# instead of a separate model fitted on the test set itself.
test_imputer = IterativeImputer(random_state=2021).fit(X[raw_cols])
it_test = pd.DataFrame(test_imputer.transform(test[raw_cols]), columns=raw_cols, index=test.index)

# Derive busy_hour after imputation so it is computed from filled-in hours,
# then put the columns in the same order as the training features.
it_test = busyHourGen(it_test, 'hour')
it_test = it_test[colls]
# Keep X_test a DataFrame so prediction sees the same feature names as fitting.
X_test = it_test

optuna_ver = ExtraTreesRegressor(n_estimators=500, max_depth= 16, warm_start= True, random_state=2021)
# Pass a 1-D target to avoid scikit-learn's column-vector conversion warning.
optuna_ver.fit(X, y.to_numpy().ravel())
predicts = optuna_ver.predict(X_test)

# Write the predictions into the submission template and save it.
submission['count'] = predicts
submission.to_csv('et_dummyfeat3.csv', index=False)

In [31]:
# Fixed hyper-parameters for this submission.
par = {'n_estimators': 500, 'max_depth': 14, 'warm_start': True, 'random_state': 2021}

# Base training features (names sorted by Index.difference).
X = an[an.columns.difference(['count', 'hour_bef_visibility'])]
colls = X.columns.tolist()

# Feature columns that exist in the raw test file (busy_hour is derived below).
raw_cols = [c for c in colls if c != 'busy_hour']
# Fit the test-time imputer on the training features *before* X is replaced by
# the stacked array below; the test set is only transformed, never fitted on.
test_imputer = IterativeImputer(random_state=2021).fit(X[raw_cols])

# Prepend hour^3 and temperature^2 to the base features.
X = np.column_stack((X['hour']**3, X['hour_bef_temperature']**2, X))
y = an[['count']]

# Impute the raw test columns first, then derive busy_hour and the polynomial
# terms from the filled-in values. Building the powers before imputation would
# let e.g. temperature and temperature^2 be imputed independently and disagree.
it_test = pd.DataFrame(test_imputer.transform(test[raw_cols]), columns=raw_cols, index=test.index)
it_test = busyHourGen(it_test, 'hour')
it_test = it_test[colls]
X_test = np.column_stack((it_test['hour']**3, it_test['hour_bef_temperature']**2, it_test))

optuna_ver = ExtraTreesRegressor(**par)
# Pass a 1-D target to avoid scikit-learn's column-vector conversion warning.
optuna_ver.fit(X, y.to_numpy().ravel())
predicts = optuna_ver.predict(X_test)

# Write the predictions into the submission template and save it.
submission['count'] = predicts
submission.to_csv('et_dummyfeatwind.csv', index=False)

In [32]:
import optuna
from optuna.samplers import TPESampler

def objective_extratree(trial):
    """Score one Optuna trial by the hold-out RMSE of an ExtraTreesRegressor.

    Only ``max_depth`` (8 to 18) is drawn from ``trial``; the number of trees,
    ``warm_start`` and the random seed are fixed. Features are the columns of
    ``an`` other than 'count' and 'hour_bef_visibility', preceded by hour
    cubed and temperature squared. 33% of the rows are held out for scoring.

    Returns the root mean squared error on the held-out rows.
    """
    depth = trial.suggest_int("max_depth", 8, 18)

    # Index.difference returns the remaining column names sorted.
    base = an[an.columns.difference(['count', 'hour_bef_visibility'])]
    features = np.column_stack((base['hour']**3, base['hour_bef_temperature']**2, base))
    target = an[['count']]
    X_fit, X_hold, y_fit, y_hold = train_test_split(
        features, target, test_size=0.33, random_state=2021
    )

    regressor = ExtraTreesRegressor(
        n_estimators=500,
        max_depth=depth,
        warm_start=True,
        random_state=2021,
    )
    regressor.fit(X_fit, y_fit)

    holdout_mse = mean_squared_error(y_hold, regressor.predict(X_hold))
    return np.sqrt(holdout_mse)

# Run 20 trials of the objective with a TPE sampler created with a fixed seed,
# minimizing the returned RMSE, then report the best result.
sampler = TPESampler(seed=2021)
study = optuna.create_study(direction="minimize", sampler=sampler, study_name="et_optimizer")
study.optimize(objective_extratree, n_trials=20)
print(f"Best Score: {study.best_value}")
print(f"Best trial: {study.best_trial.params}")

[32m[I 2021-08-29 22:48:04,193][0m A new study created in memory with name: et_optimizer[0m
[32m[I 2021-08-29 22:48:05,239][0m Trial 0 finished with value: 37.01949207166466 and parameters: {'max_depth': 14}. Best is trial 0 with value: 37.01949207166466.[0m
[32m[I 2021-08-29 22:48:06,344][0m Trial 1 finished with value: 36.9560743984034 and parameters: {'max_depth': 16}. Best is trial 1 with value: 36.9560743984034.[0m
[32m[I 2021-08-29 22:48:07,050][0m Trial 2 finished with value: 37.127289306955106 and parameters: {'max_depth': 9}. Best is trial 1 with value: 36.9560743984034.[0m
[32m[I 2021-08-29 22:48:07,920][0m Trial 3 finished with value: 36.96020202182806 and parameters: {'max_depth': 11}. Best is trial 1 with value: 36.9560743984034.[0m
[32m[I 2021-08-29 22:48:09,013][0m Trial 4 finished with value: 37.043538502379135 and parameters: {'max_depth': 18}. Best is trial 1 with value: 36.9560743984034.[0m
[32m[I 2021-08-29 22:48:09,720][0m Trial 5 finished with 

Best Score: 36.9560743984034
Best trial: {'max_depth': 16}
