# Result Optimization

- Train data와 Test data의 분포 편향이 존재할 수 있음.
    - 최종 예측 값에서 30을 빼고 음수 값들은 모두 0으로 처리해서 제출해보자.
    - [Sejong-Kaggle-Challengers 모임 간 논의된 방안 중 하나.](https://github.com/Sejong-Kaggle-Challengers/MAIN/blob/main/%5B17-2%5D%EC%A3%BC%EC%B0%A8%EC%88%98%EC%9A%94%EC%98%88%EC%B8%A1/%EC%A3%BC%EC%B0%A8%EC%88%98%EC%9A%94%EB%B3%B4%EA%B3%A0%EC%84%9C.pdf)

## Import Module

In [7]:
import pandas as pd
import numpy as np
from os.path import join as Join
from tqdm.notebook import tqdm

from sklearn.linear_model import Lasso

## Data Load

In [2]:
DATA_ROOT = ''
DATA_ROOT = Join(DATA_ROOT, '../../../../competition_data/parking_data/')

TRAIN_ROOT = Join(DATA_ROOT, 'train_version_5.csv')
TEST_ROOT = Join(DATA_ROOT, 'test_version_5.csv')
SUBMISSION_ROOT = Join(DATA_ROOT, 'sample_submission.csv')

print(f"DATA_ROOT : {DATA_ROOT}")
print(f"TRAIN_ROOT : {TRAIN_ROOT}")
print(f"TEST_ROOT : {TEST_ROOT}")
print(f"SUBMISSION_ROOT : {SUBMISSION_ROOT}")

DATA_ROOT : ../../../../competition_data/parking_data/
TRAIN_ROOT : ../../../../competition_data/parking_data/train_version_5.csv
TEST_ROOT : ../../../../competition_data/parking_data/test_version_5.csv
SUBMISSION_ROOT : ../../../../competition_data/parking_data/sample_submission.csv


In [3]:
train = pd.read_csv(TRAIN_ROOT)
test = pd.read_csv(TEST_ROOT)
submission = pd.read_csv(SUBMISSION_ROOT)

print("Data Loaded!")

Data Loaded!


## Data Split

In [6]:
X_train = train.drop(['단지코드', '등록차량수'], axis=1)
y_train = train['등록차량수']
X_test = test.drop(['단지코드'], axis=1)

print(f"X_train : {X_train.shape}")
print(f"y_train : {y_train.shape}")
print(f"X_test : {X_test.shape}")

X_train : (421, 29)
y_train : (421,)
X_test : (150, 29)


## Train

In [8]:
model = Lasso(alpha=0.5, max_iter=1000, normalize=True, random_state=42, selection='random', warm_start=True)

In [9]:
model.fit(X_train, y_train)

Lasso(alpha=0.5, normalize=True, random_state=42, selection='random',
      warm_start=True)

## Inference & Processing & submission

In [10]:
predict = model.predict(X_test)

In [12]:
submission['num'] = predict

In [14]:
submission['num'] -= 30

In [15]:
submission.loc[submission.num < 0, 'num'] = 0

In [17]:
submission.to_csv('result_optimization.csv', index=False)