In [39]:
random_state = 42

import numpy as np
import pandas as pd
from sklearn import metrics

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
fe = fm.FontEntry(
    fname=r'/usr/share/fonts/truetype/nanum/NanumGothic.ttf', # ttf 파일이 저장되어 있는 경로
    name='NanumBarunGothic')                        # 이 폰트의 원하는 이름 설정
fm.fontManager.ttflist.insert(0, fe)              # Matplotlib에 폰트 추가
plt.rcParams.update({'font.size': 10, 'font.family': 'NanumBarunGothic'}) # 폰트 설정
plt.rc('font', family='NanumBarunGothic')

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

In [40]:
train = pd.read_csv("../data/pre_train.csv")
test = pd.read_csv("../data/pre_test.csv")
y = pd.read_csv("../data/target.csv")

In [41]:
scaler = StandardScaler()

train[train.columns] = scaler.fit_transform(train)
test[test.columns] = scaler.transform(test)

In [42]:
y = y.values.ravel()
y = np.log1p(y)

In [43]:
x_train, x_test, y_train, y_test = train_test_split(train, y, test_size=0.2, random_state=random_state)

In [44]:
print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)

(893296, 88) (893296,) (223325, 88) (223325,)


In [45]:
import lightgbm as lgb

model = lgb.LGBMRegressor(n_estimators=10000,
                          objective='regression',
                          max_depth=-1,
                          learning_rate=0.1,
                          subsample=0.8,
                          colsample_bytree=0.8,
                          random_state=42)

> training's rmse: 3520.41   
> valid_1's rmse: 5888.11

In [46]:
model.fit(
    x_train, y_train, # 학습 데이터를 입력합니다.
    eval_set=[(x_train, y_train), (x_test, y_test)], # 평가셋을 지정합니다.
    eval_metric ='rmse', # 평가과정에서 사용할 평가함수를 지정합니다.
    callbacks=[lgb.early_stopping(100), lgb.log_evaluation(period=100, show_stdv=True)], # 앞서 지정했던 callback함수와 동일하게 지정합니다.
    )

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.144082 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5653
[LightGBM] [Info] Number of data points in the train set: 893296, number of used features: 88
[LightGBM] [Info] Start training from score 10.753211
Training until validation scores don't improve for 100 rounds
[100]	training's rmse: 0.159947	training's l2: 0.0255831	valid_1's rmse: 0.160393	valid_1's l2: 0.025726
[200]	training's rmse: 0.137393	training's l2: 0.0188768	valid_1's rmse: 0.138424	valid_1's l2: 0.0191611
[300]	training's rmse: 0.12651	training's l2: 0.0160048	valid_1's rmse: 0.127924	valid_1's l2: 0.0163645
[400]	training's rmse: 0.118858	training's l2: 0.0141271	valid_1's rmse: 0.120541	valid_1's l2: 0.0145301
[500]	training's rmse: 0.112441	training's l2: 0.0126429	valid_1's rmse: 0.114408	valid_1's l2: 0.0130893
[600]	training's rmse: 0.10767	training's l2: 0.0115928	valid_1's 

In [47]:
import pickle

# 학습된 모델을 저장합니다. Pickle 라이브러리를 이용하겠습니다.
with open('model/saved_model_02.pkl', 'wb') as f:
    pickle.dump(model, f)

In [48]:
test_pred = model.predict(test)
test_pred = np.expm1(test_pred)

In [50]:
preds_df = pd.DataFrame(test_pred.astype(int), columns=["target"])

In [51]:
preds_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9272 entries, 0 to 9271
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   target  9272 non-null   int64
dtypes: int64(1)
memory usage: 72.6 KB


In [52]:
preds_df.to_csv('../data/sub/output_lightgbm_001.csv', index=False)