## Import

In [16]:
# visualization
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
# Register the NanumGothic TTF file with Matplotlib under the family name
# 'NanumBarunGothic', then select that family (and font size 10) for all plots.
fe = fm.FontEntry(
    name='NanumBarunGothic',                                   # family name used by the rc settings below
    fname=r'/usr/share/fonts/truetype/nanum/NanumGothic.ttf',  # location of the TTF file
)
fm.fontManager.ttflist.insert(0, fe)  # put the entry at the front of Matplotlib's font list
plt.rcParams['font.size'] = 10
plt.rcParams['font.family'] = 'NanumBarunGothic'
plt.rc('font', family='NanumBarunGothic')
import seaborn as sns

# utils
import pandas as pd
import numpy as np
from tqdm import tqdm
import pickle
import warnings;warnings.filterwarnings('ignore')

# Model
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics

import eli5
from eli5.sklearn import PermutationImportance
from sklearn.preprocessing import MinMaxScaler

## Data Load

In [17]:
# Load the train and test CSV files. Adjust the paths to match your environment.
train_path = '/data/ephemeral/home/code/new_train.csv'
test_path  = '/data/ephemeral/home/code/new_test.csv'
dt, dt_test = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

In [41]:
# Preview the first rows of the test set.
# NOTE(review): this cell is numbered In [41] and its recorded output already shows the
# cleaned column names, so it appears to have been executed after the renaming cell below.
dt_test.head()

Unnamed: 0,층,건축년도,등기신청일자,거래유형,경비비관리형태,세대전기계약방법,청소비관리형태,기타_의무_임대_임의_1_2_3_4,구,계약년도,...,전용면적,단지분류,세대타입_분양형태,관리방식,복도유형,난방방식,전용면적별세대현황_60_이하,전용면적별세대현황_60__85_이하,85__135_이하,135_초과
0,0.117647,0.03,0.1,0.1,2.0,1.0,2.0,1.0,25,2023,...,0.234808,5.0,1.0,0.5,4.0,3.0,0.005391,0.048714,0.0,0.0
1,0.191176,0.33,0.1,1.0,3.0,2.0,3.0,3.0,25,2023,...,0.332337,5.0,1.0,1.0,4.0,2.0,0.0,0.0,0.154667,0.0
2,0.264706,0.02,0.1,1.0,2.0,1.0,3.0,3.0,25,2023,...,0.514734,5.0,1.0,0.5,3.0,2.0,0.0,0.0,0.19,0.0
3,0.25,0.02,0.1,1.0,2.0,1.0,3.0,3.0,25,2023,...,0.419594,5.0,1.0,0.5,3.0,2.0,0.0,0.0,0.19,0.0
4,0.132353,0.02,0.1,1.0,2.0,1.0,3.0,3.0,25,2023,...,0.319308,5.0,1.0,0.5,3.0,2.0,0.0,0.0,0.19,0.0


In [18]:
# Preview the first five rows of the training set.
dt.head(5)

Unnamed: 0,층,건축년도,등기신청일자,거래유형,경비비관리형태,세대전기계약방법,청소비관리형태,기타/의무/임대/임의=1/2/3/4,target,구,...,전용면적,단지분류,세대타입(분양형태),관리방식,복도유형,난방방식,전용면적별세대현황(60㎡이하),전용면적별세대현황(60㎡~85㎡이하),85㎡~135㎡이하,135㎡초과
0,0.09589,0.03,0.1,0.5,2.0,1.0,2.0,1.0,124000,25,...,0.168839,5.0,1.0,0.5,4.0,3.0,0.00402,0.048714,0.0,0.0
1,0.109589,0.03,0.1,0.5,2.0,1.0,2.0,1.0,123500,25,...,0.168839,5.0,1.0,0.5,4.0,3.0,0.00402,0.048714,0.0,0.0
2,0.123288,0.03,0.1,0.5,2.0,1.0,2.0,1.0,91500,25,...,0.10852,5.0,1.0,0.5,4.0,3.0,0.00402,0.048714,0.0,0.0
3,0.109589,0.03,0.1,0.5,2.0,1.0,2.0,1.0,130000,25,...,0.168839,5.0,1.0,0.5,4.0,3.0,0.00402,0.048714,0.0,0.0
4,0.082192,0.03,0.1,0.5,2.0,1.0,2.0,1.0,117000,25,...,0.168839,5.0,1.0,0.5,4.0,3.0,0.00402,0.048714,0.0,0.0


In [19]:
import re

# Column-name sanitizer (avoids errors when the names are passed to the model).
def clean_column_names(columns):
    """Return a list of sanitized copies of the given column labels.

    Each label is processed in three steps:
    1. every character that is neither a word character nor whitespace becomes '_';
    2. every run of whitespace becomes a single '_';
    3. leading and trailing underscores are stripped.

    Underscores in the middle of a label are not collapsed, so adjacent special
    characters produce consecutive underscores.

    Args:
        columns: iterable of string column labels.

    Returns:
        list[str]: the cleaned labels, in the same order as the input.
    """
    def _sanitize(label):
        without_specials = re.sub(r'[^\w\s]', '_', label)
        without_spaces = re.sub(r'\s+', '_', without_specials)
        return without_spaces.strip('_')

    return [_sanitize(label) for label in columns]

# Sanitize the column labels of both the training and the test frame,
# then print the training columns to check the result.
for frame in (dt, dt_test):
    frame.columns = clean_column_names(frame.columns)
print(dt.columns)

Index(['층', '건축년도', '등기신청일자', '거래유형', '경비비관리형태', '세대전기계약방법', '청소비관리형태',
       '기타_의무_임대_임의_1_2_3_4', 'target', '구', '계약년도', '성수기여부', '전용면적', '단지분류',
       '세대타입_분양형태', '관리방식', '복도유형', '난방방식', '전용면적별세대현황_60_이하',
       '전용면적별세대현황_60__85_이하', '85__135_이하', '135_초과'],
      dtype='object')


In [20]:
# Separate the explanatory variables from the target column.
X_train = dt.drop(columns=['target'])
y_train = dt['target']

In [21]:
# Preview the first five rows of the feature matrix (the frame with 'target' dropped).
X_train.head(5)

Unnamed: 0,층,건축년도,등기신청일자,거래유형,경비비관리형태,세대전기계약방법,청소비관리형태,기타_의무_임대_임의_1_2_3_4,구,계약년도,...,전용면적,단지분류,세대타입_분양형태,관리방식,복도유형,난방방식,전용면적별세대현황_60_이하,전용면적별세대현황_60__85_이하,85__135_이하,135_초과
0,0.09589,0.03,0.1,0.5,2.0,1.0,2.0,1.0,25,2017,...,0.168839,5.0,1.0,0.5,4.0,3.0,0.00402,0.048714,0.0,0.0
1,0.109589,0.03,0.1,0.5,2.0,1.0,2.0,1.0,25,2017,...,0.168839,5.0,1.0,0.5,4.0,3.0,0.00402,0.048714,0.0,0.0
2,0.123288,0.03,0.1,0.5,2.0,1.0,2.0,1.0,25,2017,...,0.10852,5.0,1.0,0.5,4.0,3.0,0.00402,0.048714,0.0,0.0
3,0.109589,0.03,0.1,0.5,2.0,1.0,2.0,1.0,25,2018,...,0.168839,5.0,1.0,0.5,4.0,3.0,0.00402,0.048714,0.0,0.0
4,0.082192,0.03,0.1,0.5,2.0,1.0,2.0,1.0,25,2018,...,0.168839,5.0,1.0,0.5,4.0,3.0,0.00402,0.048714,0.0,0.0


In [22]:
# Preview the first five target values.
y_train.head(5)

0    124000
1    123500
2     91500
3    130000
4    117000
Name: target, dtype: int64

## Train Valid Split -> 나중에는 K fold 적용하기

In [23]:
# Hold-out split: 80% of the rows for training, 20% for validation.
# The split is taken directly from `dt` rather than from `X_train`/`y_train`: the previous
# form overwrote its own inputs, so re-running this cell split the already-reduced
# training set again. With the same data and random_state the first run is unchanged.
X_train, X_val, y_train, y_val = train_test_split(
    dt.drop(columns=['target']),
    dt['target'],
    test_size=0.2,
    random_state=2023,
)

## Model

In [24]:
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [35]:
# LightGBM regressor configured with n_estimators=2000, num_leaves=100 and a fixed seed.
gbm = lgb.LGBMRegressor(random_state=42, num_leaves=100, n_estimators=2000)

In [36]:
# Fit on the training split while evaluating RMSE on both the training and validation splits.
eval_pairs = [(X_train, y_train), (X_val, y_val)]                # evaluation sets
progress_logger = lgb.log_evaluation(period=10, show_stdv=True)  # log metrics every 10 iterations
gbm.fit(
    X_train,
    y_train,
    eval_set=eval_pairs,
    eval_metric='rmse',
    callbacks=[progress_logger],
)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1025
[LightGBM] [Info] Number of data points in the train set: 895057, number of used features: 20
[LightGBM] [Info] Start training from score 58000.483999
[10]	training's rmse: 24214.1	training's l2: 5.86321e+08	valid_1's rmse: 24484.7	valid_1's l2: 5.99499e+08
[20]	training's rmse: 17927.6	training's l2: 3.21401e+08	valid_1's rmse: 18202.7	valid_1's l2: 3.31337e+08
[30]	training's rmse: 15844.6	training's l2: 2.51051e+08	valid_1's rmse: 16149.6	valid_1's l2: 2.6081e+08
[40]	training's rmse: 14847.4	training's l2: 2.20444e+08	valid_1's rmse: 15178.9	valid_1's l2: 2.304e+08
[50]	training's rmse: 14282.4	training's l2: 2.03988e+08	valid_1's rmse: 14655.8	valid_1's l2: 2.14792e+08
[60]	training's rmse: 13893.7	training's l2: 1.93034e+08	valid_1's rmse: 14298	valid_1's l2: 2.04432e+08
[70]	training's rmse: 13575	training's l2: 1.84281e+08	v

In [37]:
# Predict on the validation split and report the RMSE.
y_pred = gbm.predict(X_val)
val_mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(val_mse)
print(f"RMSE: {rmse:.4f}")

RMSE: 10231.0343


In [38]:
# Persist the fitted model to disk with joblib.
import joblib
joblib.dump(value=gbm, filename='lightgbm_n2000_nlf100_rs42_new_data.pkl')

['lightgbm_n2000_nlf100_rs42_new_data.pkl']

In [39]:
# Reload the saved model. The file was written with joblib.dump, so it is read back with
# joblib.load (joblib is imported in the model-saving cell) instead of pickle.load, which
# is not the documented reader for joblib files.
# NOTE: joblib.load unpickles the file, so only load model files from a trusted source.
model = joblib.load('lightgbm_n2000_nlf100_rs42_new_data.pkl')

In [42]:
%%time
# dt_test is used unchanged as the feature matrix; the drop of 'target' below is disabled.
#X_test = dt_test.drop(['target'], axis=1)
X_test = dt_test

# Run inference on the test dataset with the reloaded model.
real_test_pred = model.predict(X_test)

CPU times: user 1.23 s, sys: 60 μs, total: 1.23 s
Wall time: 138 ms


In [43]:
print(real_test_pred)          # Print the predictions to confirm that values were produced.

[180202.39925088 348947.57535085 575489.22835535 ... 123371.71375923
 101549.80537691 103329.16219269]


In [44]:
# Save the predictions to a CSV file with a single 'target' column.
# Round to the nearest integer before casting: astype(int) on its own truncates toward
# zero, which would shift each positive prediction down by up to 1.
preds_df = pd.DataFrame(np.round(real_test_pred).astype(int), columns=["target"])
preds_df.to_csv('output_lightgbm_n2000_nlf100_rs42_new_data.csv', index=False)