# Model Performance Test

## Test Coverage

1. RandomForest (Baseline)
2. RandomForest (GridSearch)
3. XGBoost
4. lightgbm
5. Ridge
6. Lasso

## Result

1. RandomForest (Baseline) : **-129.66053879429143**
2. RandomForest (GridSearch) : **-129.66053879429143**
3. XGBoost : **-134.52866928479915**
4. lightgbm : **-128.65809393573224**
5. Ridge : **-130.7595200519923**
6. Lasso : **-128.2108916930039**

## Import Module

In [29]:
import pandas as pd
import numpy as np
from os.path import join as Join

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge, Lasso
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

from sklearn.model_selection import cross_val_score, GridSearchCV, StratifiedKFold

## Data Load

In [2]:
DATA_ROOT = ''
DATA_ROOT = Join(DATA_ROOT, '../../../../competition_data/parking_data/')

TRAIN_ROOT = Join(DATA_ROOT, 'train_version_4.csv')
TEST_ROOT = Join(DATA_ROOT, 'test_version_4.csv')
SUBMISSION_ROOT = Join(DATA_ROOT, 'sample_submission.csv')

print(f"DATA_ROOT : {DATA_ROOT}")
print(f"TRAIN_ROOT : {TRAIN_ROOT}")
print(f"TEST_ROOT : {TEST_ROOT}")
print(f"SUBMISSION_ROOT : {SUBMISSION_ROOT}")

DATA_ROOT : ../../../../competition_data/parking_data/
TRAIN_ROOT : ../../../../competition_data/parking_data/train_version_4.csv
TEST_ROOT : ../../../../competition_data/parking_data/test_version_4.csv
SUBMISSION_ROOT : ../../../../competition_data/parking_data/sample_submission.csv


In [3]:
train = pd.read_csv(TRAIN_ROOT)
test = pd.read_csv(TEST_ROOT)
submission = pd.read_csv(SUBMISSION_ROOT)

print("Data Loaded!")

Data Loaded!


## Data Split

In [5]:
train = train.set_index('단지코드')
test = test.set_index('단지코드')

In [6]:
X_train = train.drop(['등록차량수'], axis=1)
y_train = train['등록차량수']
X_test = test

## Model Performance

In [13]:
k_fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

### [Test 1] RandomForest (Baseline)

In [8]:
model_1 = RandomForestRegressor(n_jobs=-1, random_state=42)

In [9]:
print(f"[RandomForest (Baseline) : {cross_val_score(model_1, X_train, y_train, cv=10, scoring='neg_mean_absolute_error').mean()}")

[RandomForest (Baseline) : -137.1207619047619


### [Test 2] RandomForest (GridSearch)

In [18]:
model_2 = RandomForestRegressor()

param_grid = {
    "max_depth" : [11, 15, 19],
    "n_estimators" : [50, 100, 300],
    "random_state" : [42]
}

gs = GridSearchCV(
    model_2, param_grid,
    cv=k_fold, scoring='neg_mean_absolute_error', verbose=1
)

gs.fit(X_train, y_train)

print(f"Best Score : {gs.best_score_}")
print(f"Best Params : {gs.best_params_}")

Fitting 5 folds for each of 9 candidates, totalling 45 fits




Best Score : -129.66053879429143
Best Params : {'max_depth': 15, 'n_estimators': 100, 'random_state': 42}


In [19]:
model_2 = RandomForestRegressor(max_depth=15, n_estimators=100, random_state=42)

### [Test 3] XGBoost

In [23]:
model_3 = XGBRegressor()

param_grid = {
    "n_estimators" : [50, 100, 300, 500, 1000],
    "eval_metric" : ['mae'],
    "learning_rate" : [0.01, 0.001],
    "max_depth" : [7, 9, 11, 15],
    "use_label_encoder" : [False],
    "seed" : [42],
    "reg_lambda" : [1.7256912198205319]
}

gs = GridSearchCV(
    model_3, param_grid,
    cv=k_fold, scoring='neg_mean_absolute_error'
)

gs.fit(X_train, y_train)

print(f"Best Score : {gs.best_score_}")
print(f"Best params : {gs.best_params_}")



Best Score : -134.52866928479915
Best params : {'eval_metric': 'mae', 'learning_rate': 0.01, 'max_depth': 9, 'n_estimators': 500, 'reg_lambda': 1.7256912198205319, 'seed': 42, 'use_label_encoder': False}


In [24]:
model_3 = XGBRegressor(eval_metric='mae', learning_rate=0.01, max_depth=9, n_estimators=500, reg_lambda=1.7256912198205319, seed=42, use_label_encoder=False)

### [Test 4] lightGBM

In [27]:
model_4 = LGBMRegressor()

param_grid = {
    "n_estimators" : [1000, 1500, 2000],
    "learning_rate" : [0.01],
    "max_depth" : [3, 5, 9, 11],
    "random_state" : [42]
}

gs = GridSearchCV(
    model_4, param_grid,
    cv=k_fold, scoring='neg_mean_absolute_error'
)

gs.fit(X_train, y_train)

print(f"Best Score : {gs.best_score_}")
print(f"Best params : {gs.best_params_}")



Best Score : -128.65809393573224
Best params : {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 1000, 'random_state': 42}


In [28]:
model_4 = LGBMRegressor(n_estimators=1000, max_depth=5, learning_rate=0.01, random_state=42)

### [Test 5] Ridge

In [33]:
model_5 = Ridge()

param_grid = {
    "alpha" : [0.01, 0.05, 0.1],
    "normalize" : [True],
    "max_iter" : [1000],
    "solver" : ['svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'],
    "random_state" : [42]
}

gs = GridSearchCV(
    model_5, param_grid,
    cv=k_fold, scoring='neg_mean_absolute_error'
)

gs.fit(X_train, y_train)

print(f"Best Score : {gs.best_score_}")
print(f"Best Params : {gs.best_params_}")



Best Score : -130.7595200519923
Best Params : {'alpha': 0.05, 'max_iter': 1000, 'normalize': True, 'random_state': 42, 'solver': 'sag'}


### [Test 6] Lasso

In [36]:
model_5 = Lasso()

param_grid = {
    "alpha" : [0.1, 0.5, 0.7, 1.0],
    "normalize" : [True],
    "max_iter" : [1000],
    "warm_start" : [True],
    "selection" : ['cyclic', 'random'],
    "random_state" : [42]
}

gs = GridSearchCV(
    model_5, param_grid,
    cv=k_fold, scoring='neg_mean_absolute_error'
)

gs.fit(X_train, y_train)

print(f"Best Score : {gs.best_score_}")
print(f"Best Params : {gs.best_params_}")

Best Score : -128.2108916930039
Best Params : {'alpha': 0.5, 'max_iter': 1000, 'normalize': True, 'random_state': 42, 'selection': 'random', 'warm_start': True}




In [37]:
model_6 = Lasso(alpha=0.5, max_iter=1000, normalize=True, random_state=42, selection='random', warm_start=True)

## Train

In [38]:
model_6.fit(X_train, y_train)

Lasso(alpha=0.5, normalize=True, random_state=42, selection='random',
      warm_start=True)

## Inference & Submit

In [39]:
predict = model_6.predict(X_test)

In [43]:
submission['num'] = predict

In [44]:
submission.to_csv('lasso_with_version_4.csv', index=False)