# Model Performance Test

## Test Coverage

1. RandomForest (Baseline)
2. RandomForest (GridSearch)
3. XGBoost
4. lightgbm
5. Ridge
6. Lasso

## Result

### Version 5

1. RandomForest (Baseline) : **-135.3542569213732**
2. RandomForest (GridSearch) : **-129.82509272712304**
3. XGBoost : **-133.05619049563796**
4. lightgbm : **-129.29797683448098**
5. Ridge : **-129.53765260865381**
6. Lasso : **-126.23847596987002**

### version 6

1. RandomForest (Baseline) : **-131.47390586932448**
2. RandomForest (GridSearch) : **-124.0621623994042**
3. XGBoost : **-124.64631778252223**
4. lightgbm : fit 불가
5. Ridge : **-129.52106875849282**
6. Lasso : **-126.43471733505991**

## Import Module

In [1]:
import pandas as pd
import numpy as np
from os.path import join as Join

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge, Lasso
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

from sklearn.model_selection import cross_val_score, GridSearchCV, StratifiedKFold

## Data Load

In [2]:
DATA_ROOT = ''
DATA_ROOT = Join(DATA_ROOT, '../../../../competition_data/parking_data/')

TRAIN_5_ROOT = Join(DATA_ROOT, 'train_version_5.csv')
TEST_5_ROOT = Join(DATA_ROOT, 'test_version_5.csv')
TRAIN_6_ROOT = Join(DATA_ROOT, 'train_version_6.csv')
TEST_6_ROOT = Join(DATA_ROOT, 'test_version_6.csv')
SUBMISSION_ROOT = Join(DATA_ROOT, 'sample_submission.csv')

print(f"DATA_ROOT : {DATA_ROOT}")
print(f"TRAIN_5_ROOT : {TRAIN_5_ROOT}")
print(f"TEST_5_ROOT : {TEST_5_ROOT}")
print(f"TRAIN_6_ROOT : {TRAIN_6_ROOT}")
print(f"TEST_6_ROOT : {TEST_6_ROOT}")
print(f"SUBMISSION_ROOT : {SUBMISSION_ROOT}")

DATA_ROOT : ../../../../competition_data/parking_data/
TRAIN_5_ROOT : ../../../../competition_data/parking_data/train_version_5.csv
TEST_5_ROOT : ../../../../competition_data/parking_data/test_version_5.csv
TRAIN_6_ROOT : ../../../../competition_data/parking_data/train_version_6.csv
TEST_6_ROOT : ../../../../competition_data/parking_data/test_version_6.csv
SUBMISSION_ROOT : ../../../../competition_data/parking_data/sample_submission.csv


In [3]:
train_5 = pd.read_csv(TRAIN_5_ROOT)
test_5 = pd.read_csv(TEST_5_ROOT)
train_6 = pd.read_csv(TRAIN_6_ROOT)
test_6 = pd.read_csv(TEST_6_ROOT)
submission = pd.read_csv(SUBMISSION_ROOT)

print("Data Loaded!")

Data Loaded!


## Data Split

In [4]:
train_5 = train_5.set_index('단지코드')
test_5 = test_5.set_index('단지코드')

train_6 = train_6.set_index('단지코드')
test_6 = test_6.set_index('단지코드')

In [5]:
X_train_5 = train_5.drop(['등록차량수'], axis=1)
y_train_5 = train_5['등록차량수']
X_test_5 = test_5

X_train_6 = train_6.drop(['등록차량수'], axis=1)
y_train_6 = train_6['등록차량수']
X_test_6 = test_6

## Model Performance

In [6]:
k_fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

### [Test 1] RandomForest (Baseline)

In [7]:
model_1 = RandomForestRegressor(n_jobs=-1, random_state=42)

#### Version 5

In [11]:
print(f"[RandomForest (Baseline) : {cross_val_score(model_1, X_train_5, y_train_5, cv=10, scoring='neg_mean_absolute_error').mean()}")

[RandomForest (Baseline) : -135.3542569213732


#### Version 6

In [12]:
print(f"[RandomForest (Baseline) : {cross_val_score(model_1, X_train_6, y_train_6, cv=10, scoring='neg_mean_absolute_error').mean()}")

[RandomForest (Baseline) : -131.47390586932448


### [Test 2] RandomForest (GridSearch)

#### Version 5

In [15]:
model_2 = RandomForestRegressor()

param_grid = {
    "max_depth" : [11, 15, 19],
    "n_estimators" : [300, 500, 700],
    "random_state" : [42]
}

gs = GridSearchCV(
    model_2, param_grid,
    cv=k_fold, scoring='neg_mean_absolute_error'
)

gs.fit(X_train_5, y_train_5)

print(f"Best Score : {gs.best_score_}")
print(f"Best Params : {gs.best_params_}")



Best Score : -129.82509272712304
Best Params : {'max_depth': 15, 'n_estimators': 500, 'random_state': 42}


#### Version 6

In [17]:
model_2 = RandomForestRegressor()

param_grid = {
    "max_depth" : [15, 19, 21],
    "n_estimators" : [100, 300, 500],
    "random_state" : [42]
}

gs = GridSearchCV(
    model_2, param_grid,
    cv=k_fold, scoring='neg_mean_absolute_error'
)

gs.fit(X_train_6, y_train_6)

print(f"Best Score : {gs.best_score_}")
print(f"Best Params : {gs.best_params_}")



Best Score : -124.0621623994042
Best Params : {'max_depth': 19, 'n_estimators': 300, 'random_state': 42}


### [Test 3] XGBoost

#### Version 5

In [18]:
model_3 = XGBRegressor()

param_grid = {
    "n_estimators" : [50, 100, 300, 500, 1000],
    "eval_metric" : ['mae'],
    "learning_rate" : [0.01, 0.001],
    "max_depth" : [7, 9, 11, 15],
    "use_label_encoder" : [False],
    "seed" : [42],
    "reg_lambda" : [1.7256912198205319]
}

gs = GridSearchCV(
    model_3, param_grid,
    cv=k_fold, scoring='neg_mean_absolute_error'
)

gs.fit(X_train_5, y_train_5)

print(f"Best Score : {gs.best_score_}")
print(f"Best params : {gs.best_params_}")



Best Score : -133.05619049563796
Best params : {'eval_metric': 'mae', 'learning_rate': 0.01, 'max_depth': 11, 'n_estimators': 1000, 'reg_lambda': 1.7256912198205319, 'seed': 42, 'use_label_encoder': False}


#### Version 6

In [20]:
model_3 = XGBRegressor()

param_grid = {
    "n_estimators" : [1000, 1500],
    "eval_metric" : ['mae'],
    "learning_rate" : [0.01, 0.001],
    "max_depth" : [5, 7, 9],
    "use_label_encoder" : [False],
    "seed" : [42],
    "reg_lambda" : [1.7256912198205319]
}

gs = GridSearchCV(
    model_3, param_grid,
    cv=k_fold, scoring='neg_mean_absolute_error'
)

gs.fit(X_train_6, y_train_6)

print(f"Best Score : {gs.best_score_}")
print(f"Best params : {gs.best_params_}")



Best Score : -124.64631778252223
Best params : {'eval_metric': 'mae', 'learning_rate': 0.01, 'max_depth': 7, 'n_estimators': 1000, 'reg_lambda': 1.7256912198205319, 'seed': 42, 'use_label_encoder': False}


### [Test 4] lightGBM

#### Version 5

In [21]:
model_4 = LGBMRegressor()

param_grid = {
    "n_estimators" : [1000, 1500, 2000],
    "learning_rate" : [0.01],
    "max_depth" : [3, 5, 9, 11],
    "random_state" : [42]
}

gs = GridSearchCV(
    model_4, param_grid,
    cv=k_fold, scoring='neg_mean_absolute_error'
)

gs.fit(X_train_5, y_train_5)

print(f"Best Score : {gs.best_score_}")
print(f"Best params : {gs.best_params_}")



Best Score : -129.29797683448098
Best params : {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 1000, 'random_state': 42}


### [Test 5] Ridge

#### Version 5

In [24]:
model_5 = Ridge()

param_grid = {
    "alpha" : [0.01, 0.05, 0.1],
    "normalize" : [True],
    "max_iter" : [1000],
    "solver" : ['svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'],
    "random_state" : [42]
}

gs = GridSearchCV(
    model_5, param_grid,
    cv=k_fold, scoring='neg_mean_absolute_error'
)

gs.fit(X_train_5, y_train_5)

print(f"Best Score : {gs.best_score_}")
print(f"Best Params : {gs.best_params_}")



Best Score : -129.53765260865381
Best Params : {'alpha': 0.05, 'max_iter': 1000, 'normalize': True, 'random_state': 42, 'solver': 'svd'}


#### Version 6

In [25]:
model_5 = Ridge()

param_grid = {
    "alpha" : [0.01, 0.05, 0.1],
    "normalize" : [True],
    "max_iter" : [1000],
    "solver" : ['svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'],
    "random_state" : [42]
}

gs = GridSearchCV(
    model_5, param_grid,
    cv=k_fold, scoring='neg_mean_absolute_error'
)

gs.fit(X_train_6, y_train_6)

print(f"Best Score : {gs.best_score_}")
print(f"Best Params : {gs.best_params_}")



Best Score : -129.52106875849282
Best Params : {'alpha': 0.05, 'max_iter': 1000, 'normalize': True, 'random_state': 42, 'solver': 'svd'}


### [Test 6] Lasso

#### Version 5

In [27]:
model_6 = Lasso()

param_grid = {
    "alpha" : [0.1, 0.5, 0.7, 1.0],
    "normalize" : [True],
    "max_iter" : [1000],
    "warm_start" : [True],
    "selection" : ['cyclic', 'random'],
    "random_state" : [42]
}

gs = GridSearchCV(
    model_6, param_grid,
    cv=k_fold, scoring='neg_mean_absolute_error'
)

gs.fit(X_train_5, y_train_5)

print(f"Best Score : {gs.best_score_}")
print(f"Best Params : {gs.best_params_}")



Best Score : -126.23847596987002
Best Params : {'alpha': 0.5, 'max_iter': 1000, 'normalize': True, 'random_state': 42, 'selection': 'random', 'warm_start': True}


#### Version 6

In [28]:
model_6 = Lasso()

param_grid = {
    "alpha" : [0.1, 0.5, 0.7, 1.0],
    "normalize" : [True],
    "max_iter" : [1000],
    "warm_start" : [True],
    "selection" : ['cyclic', 'random'],
    "random_state" : [42]
}

gs = GridSearchCV(
    model_6, param_grid,
    cv=k_fold, scoring='neg_mean_absolute_error'
)

gs.fit(X_train_6, y_train_6)

print(f"Best Score : {gs.best_score_}")
print(f"Best Params : {gs.best_params_}")



Best Score : -126.43471733505991
Best Params : {'alpha': 0.5, 'max_iter': 1000, 'normalize': True, 'random_state': 42, 'selection': 'cyclic', 'warm_start': True}


## Train

In [34]:
model = Lasso(alpha=0.5, max_iter=1000, normalize=True, random_state=42, selection='cyclic', warm_start=True)

In [35]:
model.fit(X_train_6, y_train_6)

Lasso(alpha=0.5, normalize=True, random_state=42, warm_start=True)

## Inference & Submit

In [36]:
predict = model.predict(X_test_6)

In [37]:
submission['num'] = predict

In [40]:
submission.to_csv('Lasso_with_version_6.csv', index=False)