# Score: 19447

In [1]:
# visualization
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
fe = fm.FontEntry(
    fname=r'/usr/share/fonts/truetype/nanum/NanumGothic.ttf', # ttf 파일이 저장되어 있는 경로
    name='NanumBarunGothic')                        # 이 폰트의 원하는 이름 설정
fm.fontManager.ttflist.insert(0, fe)              # Matplotlib에 폰트 추가
plt.rcParams.update({'font.size': 10, 'font.family': 'NanumBarunGothic'}) # 폰트 설정
plt.rc('font', family='NanumBarunGothic')
import seaborn as sns

# utils
import pandas as pd
import numpy as np
from tqdm import tqdm
import pickle
import warnings;warnings.filterwarnings('ignore')
import gdown
import joblib

# Model
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, GroupKFold, TimeSeriesSplit
import lightgbm as lgb

import eli5
from eli5.sklearn import PermutationImportance

# 코드 셀 실행 후 경고를 무시
# import warnings
# warnings.filterwarnings(action='ignore')

In [2]:
train_path = '../../data/train.csv'
test_path  = '../../data/test.csv'
dt_train = pd.read_csv(train_path)
dt_test = pd.read_csv(test_path)

In [3]:
# train/test 구분을 위한 칼럼을 하나 만들어 줍니다.
dt_train['is_test'] = 0
dt_test['is_test'] = 1
df = pd.concat([dt_train, dt_test])     # 하나의 데이터로 만들어줍니다.

In [4]:
df['is_public'] = df["k-전용면적별세대현황(60㎡이하)"].notnull()

In [5]:
df['구'] = df['시군구'].map(lambda x : x.split()[1])
df['동'] = df['시군구'].map(lambda x : x.split()[2])
del df['시군구']

mean_val_by_gu = df.groupby("구")["target"].mean().sort_values(ascending=False)
mean_val_by_dong = df.groupby("동")["target"].mean().sort_values(ascending=False)
std_val_by_gu = df.groupby("구")["target"].std().sort_values(ascending=False)
std_val_by_dong = df.groupby("동")["target"].std().sort_values(ascending=False)

order1_mean_gu = mean_val_by_gu.index.tolist()
order2_mean_dong = mean_val_by_dong.index.tolist()
order3_std_gu = std_val_by_gu.index.tolist()
order4_std_dong = std_val_by_dong.index.tolist()

# Create a dictionary to map district names to their corresponding label encoded values
gu_mapping1 = {district: label for label, district in enumerate(order1_mean_gu)}
gu_mapping2 = {district: label for label, district in enumerate(order3_std_gu)}
dong_mapping1 = {dong: label for label, dong in enumerate(order2_mean_dong)}
dong_mapping2 = {dong: label for label, dong in enumerate(order4_std_dong)}

df['구_encoded'] = df["구"].map(gu_mapping1)
df['구_std'] = df["구"].map(gu_mapping2)
df['동_encoded'] = df["동"].map(dong_mapping1)
df['동_std'] = df["동"].map(dong_mapping2)

In [6]:
df['계약년'] = df['계약년월'].astype('str').map(lambda x : x[:4]).astype('int')

In [7]:
drop_col = ['번지', '본번', '부번', '계약일', 'k-전화번호', 'k-팩스번호', 'k-관리방식', 'k-복도유형', 'k-시행사', 'k-사용검사일-사용승인일', 'k-홈페이지', 'k-등록일자', 'k-수정일자', '고용보험관리번호', '경비비관리형태', '세대전기계약방법', '청소비관리형태', '기타/의무/임대/임의=1/2/3/4', '단지승인일', '사용허가여부', '관리비 업로드', '단지신청일', 'k-관리비부과면적', '주차대수', '건축면적', '해제사유발생일', '단지소개기존clob', 'k-135㎡초과', '중개사소재지', '등기신청일자', '거래유형', 'k-전체동수', 'k-전체세대수', 'k-연면적', 'k-주거전용면적', 'k-전용면적별세대현황(60㎡이하)', 'k-전용면적별세대현황(60㎡~85㎡이하)', 'k-85㎡~135㎡이하', '좌표X', '좌표Y', 'k-단지분류(아파트,주상복합등등)', 'k-세대타입(분양형태)', 'k-난방방식','k-건설사(시공사)', '계약년월']
df.drop(drop_col, axis=1, inplace=True)

In [8]:
def info_df(df):
    temp_df = pd.DataFrame()
    temp_df['col_name'] = df.columns
    

    for i in range(len(df.columns)):
        x = df.iloc[:, i]
        temp_df.loc[i, 'data_type'] = x.dtype
        temp_df.loc[i, 'have_null'] = any(x.isna())
        temp_df.loc[i, 'null_count'] = x.isna().sum()
        temp_df.loc[i, 'null_ratio'] = x.isna().sum() / x.shape[0]
        temp_df.loc[i, 'nunique'] = x.nunique()
    return temp_df

In [9]:
# 먼저, 연속형 변수와 범주형 변수를 위 info에 따라 분리해주겠습니다.
continuous_columns = []
categorical_columns = []

for column in df.columns:
    if pd.api.types.is_numeric_dtype(df[column]):
        continuous_columns.append(column)
    else:
        categorical_columns.append(column)

print("연속형 변수:", continuous_columns)
print("범주형 변수:", categorical_columns)

# 범주형 변수에 대한 보간
df[categorical_columns] = df[categorical_columns].fillna('NULL')

연속형 변수: ['전용면적(㎡)', '층', '건축년도', 'target', 'is_test', 'is_public', '구_encoded', '구_std', '동_encoded', '동_std', '계약년']
범주형 변수: ['아파트명', '도로명', '구', '동']


In [10]:
# Set the display option to show all rows
pd.set_option('display.max_rows', None)
info_df(df)

Unnamed: 0,col_name,data_type,have_null,null_count,null_ratio,nunique
0,아파트명,object,False,0.0,0.0,6550.0
1,전용면적(㎡),float64,False,0.0,0.0,14670.0
2,층,int64,False,0.0,0.0,73.0
3,건축년도,int64,False,0.0,0.0,60.0
4,도로명,object,False,0.0,0.0,9245.0
5,target,float64,True,9272.0,0.008219,14530.0
6,is_test,int64,False,0.0,0.0,2.0
7,is_public,bool,False,0.0,0.0,2.0
8,구,object,False,0.0,0.0,25.0
9,동,object,False,0.0,0.0,337.0


In [11]:
pd.reset_option('display.max_rows')

In [12]:
df_train = df.loc[df['is_test']==0, :]
df_test = df.loc[df['is_test']==1, :]

df_train.drop(['is_test'], axis=1, inplace=True)
df_test.drop(['is_test'], axis=1, inplace=True)
print(df_train.shape, df_test.shape)

(1118822, 14) (9272, 14)


In [13]:
# dt_test의 target은 일단 0으로 임의로 채워주도록 하겠습니다.
df_test['target'] = 0

In [14]:
# 변수 삭제 및 파생변수 제작으로 추가된 변수들이 존재하기에, 다시한번 연속형과 범주형 칼럼을 분리해주겠습니다.
continuous_columns_v2 = []
categorical_columns_v2 = []

for column in df_train.columns:
    if pd.api.types.is_numeric_dtype(df_train[column]):
        continuous_columns_v2.append(column)
    else:
        categorical_columns_v2.append(column)

print("연속형 변수:", continuous_columns_v2)
print("범주형 변수:", categorical_columns_v2)

# 아래에서 범주형 변수들을 대상으로 레이블인코딩을 진행해 주겠습니다.

연속형 변수: ['전용면적(㎡)', '층', '건축년도', 'target', 'is_public', '구_encoded', '구_std', '동_encoded', '동_std', '계약년']
범주형 변수: ['아파트명', '도로명', '구', '동']


In [15]:
# 각 변수에 대한 LabelEncoder를 저장할 딕셔너리
label_encoders = {}

# Implement Label Encoding
for col in tqdm( categorical_columns_v2 ):
    lbl = LabelEncoder()

    # Label-Encoding을 fit
    lbl.fit( df_train[col].astype(str) )
    df_train[col] = lbl.transform(df_train[col].astype(str))
    label_encoders[col] = lbl           # 나중에 후처리를 위해 레이블인코더를 저장해주겠습니다.

    # Test 데이터에만 존재하는 새로 출현한 데이터를 신규 클래스로 추가해줍니다.
    for label in np.unique(df_test[col]):
      if label not in lbl.classes_: # unseen label 데이터인 경우
        lbl.classes_ = np.append(lbl.classes_, label) # 미처리 시 ValueError발생하니 주의하세요!

    df_test[col] = lbl.transform(df_test[col].astype(str))

  0%|          | 0/4 [00:00<?, ?it/s]

100%|██████████| 4/4 [00:01<00:00,  3.22it/s]


In [16]:
def preprocess_feature_name(feature_name):
  """특수 문자를 제거하고 소문자로 변환합니다."""
  feature_name = feature_name.replace("-", "_")
  feature_name = feature_name.replace(",", "_")
  feature_name = feature_name.replace(".", "_")
  feature_name = feature_name.replace("(", "_")
  feature_name = feature_name.replace(")", "_")
  feature_name = feature_name.lower()
  return feature_name

def apply_preprocessed_feature_names(df_train):
  """데이터 프레임의 feature 이름을 수정합니다."""
  df_train.columns = [preprocess_feature_name(feature) for feature in df_train.columns]
  return df_train

# 데이터 프레임에 적용
df_train = apply_preprocessed_feature_names(df_train.copy())
df_test = apply_preprocessed_feature_names(df_test.copy())

In [17]:
df_train_before2021 = df_train[df_train['계약년'] <= 2020].reset_index()
df_train_after2021 = df_train[df_train['계약년'] > 2020].reset_index()

print(len(df_train_before2021), len(df_train_after2021))

1045943 72879


In [18]:
y = df_train_before2021['target']
X = df_train_before2021.drop(['target', 'index'], axis=1)
X_test = df_test.drop(['target'], axis=1)

In [19]:
gbm_list = []
for random_state in [0, 1, 42, 2023, 2024]:
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=random_state)
    
    gbm = lgb.LGBMRegressor(n_estimators=2000, max_depth=20, num_leaves=100,
                        min_child_samples=60, feature_fraction=0.8,
                        bagging_fraction=0.8)

    gbm.fit(X_train, y_train,
            eval_set=[(X_train, y_train), (X_val, y_val)],
            eval_metric='rmse',
            callbacks=[lgb.early_stopping(stopping_rounds=50),
                    lgb.log_evaluation(period=50, show_stdv=True)])
    
    gbm_list.append(gbm)



You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1651
[LightGBM] [Info] Number of data points in the train set: 836754, number of used features: 13
[LightGBM] [Info] Start training from score 54855.987417
Training until validation scores don't improve for 50 rounds
[50]	training's rmse: 9961.39	training's l2: 9.92293e+07	valid_1's rmse: 10219.4	valid_1's l2: 1.04437e+08
[100]	training's rmse: 8380.95	training's l2: 7.02404e+07	valid_1's rmse: 8669.63	valid_1's l2: 7.51624e+07
[150]	training's rmse: 7777.61	training's l2: 6.04912e+07	valid_1's rmse: 8105.08	valid_1's l2: 6.56924e+07
[200]	training's rmse: 7389.48	training's l2: 5.46045e+07	valid_1's rmse: 7749.55	valid_1's l2: 6.00555e+07
[250]	training's rmse: 7103.65	training's l2: 5.04618e+07	valid_1's rmse: 7488.97	valid_1's l2: 5.60847e+07
[300]	training's rmse: 6871.86	training's l2: 4.72224e+07	valid_1's rmse: 7285.63	valid_1's l

In [20]:
for e in gbm_list:
    print(e.best_score_)

defaultdict(<class 'collections.OrderedDict'>, {'training': OrderedDict([('rmse', 5205.2440958930765), ('l2', 27094566.09782974)]), 'valid_1': OrderedDict([('rmse', 6156.852579249154), ('l2', 37906833.68260696)])})
defaultdict(<class 'collections.OrderedDict'>, {'training': OrderedDict([('rmse', 5206.426318134855), ('l2', 27106875.006167267)]), 'valid_1': OrderedDict([('rmse', 6125.788312370296), ('l2', 37525282.447972514)])})
defaultdict(<class 'collections.OrderedDict'>, {'training': OrderedDict([('rmse', 5199.173867131504), ('l2', 27031408.900663164)]), 'valid_1': OrderedDict([('rmse', 6159.627756618054), ('l2', 37941014.10009957)])})
defaultdict(<class 'collections.OrderedDict'>, {'training': OrderedDict([('rmse', 5208.129284664733), ('l2', 27124610.645782374)]), 'valid_1': OrderedDict([('rmse', 6069.698534871405), ('l2', 36841240.30422008)])})
defaultdict(<class 'collections.OrderedDict'>, {'training': OrderedDict([('rmse', 5201.1759322568605), ('l2', 27052231.078288015)]), 'valid

In [21]:
y = df_train_after2021['target']
X = df_train_after2021.drop(['target', 'index'], axis=1)

In [22]:
gbm_transfer_list = []
for i, random_state in enumerate([0, 1, 42, 2023, 2024]):
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=random_state)

    gbm = gbm_list[i]

    gbm.fit(X_train, y_train,
            eval_set=[(X_train, y_train), (X_val, y_val)],
            eval_metric='rmse',
            callbacks=[lgb.early_stopping(stopping_rounds=10),
                    lgb.log_evaluation(period=10, show_stdv=True)])
    
    gbm_transfer_list.append(gbm)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1623
[LightGBM] [Info] Number of data points in the train set: 58303, number of used features: 13
[LightGBM] [Info] Start training from score 102978.207622
Training until validation scores don't improve for 10 rounds
[10]	training's rmse: 38589.5	training's l2: 1.48915e+09	valid_1's rmse: 38857.9	valid_1's l2: 1.50993e+09
[20]	training's rmse: 25096.1	training's l2: 6.29813e+08	valid_1's rmse: 25532	valid_1's l2: 6.51883e+08
[30]	training's rmse: 20732.1	training's l2: 4.2982e+08	valid_1's rmse: 21302.9	valid_1's l2: 4.53815e+08
[40]	training's rmse: 18890.1	training's l2: 3.56835e+08	valid_1's rmse: 19626.2	valid_1's l2: 3.85188e+08
[50]	training's rmse: 17790.5	training's l2: 3.16503e+08	valid_1's rmse: 18717.5	valid_1's l2: 3.50345e+08
[60]	training's rmse: 17030	training's l2: 2.90021e+08	valid_1's rmse: 18114.8	valid_1's l2: 3.28145

In [25]:
for e in gbm_transfer_list:
    real_test_pred = e.predict(X_test)
    preds_df = pd.DataFrame(real_test_pred.astype(int), columns=["target"])
    display(preds_df.describe())    



Unnamed: 0,target
count,9272.0
mean,105073.8
std,74563.8
min,2815.0
25%,62157.25
50%,85567.0
75%,124132.8
max,1165343.0




Unnamed: 0,target
count,9272.0
mean,105089.1
std,74172.95
min,2919.0
25%,61943.75
50%,85459.0
75%,124573.5
max,1080015.0




Unnamed: 0,target
count,9272.0
mean,104963.3
std,74565.58
min,3052.0
25%,62008.5
50%,85883.5
75%,123612.2
max,1177325.0




Unnamed: 0,target
count,9272.0
mean,104952.1
std,74267.68
min,3898.0
25%,62166.5
50%,85373.5
75%,124012.2
max,1182063.0




Unnamed: 0,target
count,9272.0
mean,104979.7
std,75000.5
min,2987.0
25%,62033.75
50%,85516.5
75%,124311.8
max,1186502.0


In [26]:

real_test_pred = gbm_transfer_list[-1].predict(X_test)
preds_df = pd.DataFrame(real_test_pred.astype(int), columns=["target"])
preds_df.to_csv('transfer_learning.csv', index=False)

