In [55]:
'''
!pip install eli5==0.13.0
!apt-get install -y fonts-nanum
'''

'\n!pip install eli5==0.13.0\n!apt-get install -y fonts-nanum\n'

In [56]:

# visualization
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
fe = fm.FontEntry(
    fname=r'/usr/share/fonts/truetype/nanum/NanumGothic.ttf', # ttf 파일이 저장되어 있는 경로
    name='NanumBarunGothic')                        # 이 폰트의 원하는 이름 설정
fm.fontManager.ttflist.insert(0, fe)              # Matplotlib에 폰트 추가
plt.rcParams.update({'font.size': 10, 'font.family': 'NanumBarunGothic'}) # 폰트 설정
plt.rc('font', family='NanumBarunGothic')
import seaborn as sns

# utils
import pandas as pd
import numpy as np
from tqdm import tqdm
import pickle
import warnings;warnings.filterwarnings('ignore')
import seaborn as sns
import matplotlib.pyplot as plt

# Model
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics

import eli5
from eli5.sklearn import PermutationImportance


In [57]:
train_path = '/data/ephemeral/home/train.csv'
test_path  = '/data/ephemeral/home/test.csv'
subway_path = '/data/ephemeral/home/subway_feature.csv'
bus_path = '/data/ephemeral/home/bus_feature.csv'
sub_path = '/data/ephemeral/home/realprice_file.csv'
sub_data = pd.read_csv(sub_path)
df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)
subway = pd.read_csv(subway_path)
bus = pd.read_csv(bus_path)

In [58]:
# train/test 구분을 위한 칼럼을 하나 만들어 줍니다.
df_train['is_test'] = 0
df_test['is_test'] = 1
concat = pd.concat([df_train, df_test])

In [59]:
concat['is_test'].value_counts()

0    1118822
1       9272
Name: is_test, dtype: int64

In [60]:
concat = concat.rename(columns={'전용면적(㎡)':'전용면적'})

In [61]:
concat['등기신청일자'] = pd.to_numeric(concat['등기신청일자'], errors='coerce').astype('Int64')
print(concat['등기신청일자'])

0           <NA>
1           <NA>
2           <NA>
3           <NA>
4           <NA>
          ...   
9267        <NA>
9268        <NA>
9269        <NA>
9270        <NA>
9271    20230905
Name: 등기신청일자, Length: 1128094, dtype: Int64


In [62]:
concat['해제사유발생여부'] = pd.notnull(concat['해제사유발생일']).astype(int)
print(concat[['해제사유발생여부']])

      해제사유발생여부
0            0
1            0
2            0
3            0
4            0
...        ...
9267         1
9268         0
9269         0
9270         0
9271         0

[1128094 rows x 1 columns]


In [63]:
concat['등기신청일자여부'] = pd.notnull(concat['등기신청일자']).astype(int)
print(concat[['등기신청일자여부']])

      등기신청일자여부
0            0
1            0
2            0
3            0
4            0
...        ...
9267         0
9268         0
9269         0
9270         0
9271         1

[1128094 rows x 1 columns]


최종선택 column
- 아파트명, 도로명, 전용면적, 계약년월, 건축년도
제거 고민 기준
- 해제사유발생여부, 등기신청여부

In [64]:
concat = concat.assign(year=concat['계약년월'].astype('str').str[:4].astype(int)).query('year > 2013')

In [65]:
merged_df = pd.merge(concat, sub_data, on='계약년월', how='left')

In [66]:
merged_df['실거래지수']

0         101.0
1         101.0
2         101.0
3         103.0
4         103.0
          ...  
731133    157.9
731134    157.9
731135    159.8
731136    161.4
731137    161.4
Name: 실거래지수, Length: 731138, dtype: float64

In [67]:
concat = merged_df

In [68]:
selected_columns = ['도로명', '전용면적', '실거래지수','is_test','target']
selected_concat = concat[selected_columns]

In [69]:
selected_concat

Unnamed: 0,도로명,전용면적,실거래지수,is_test,target
0,언주로 3,79.97,101.0,0,124000.0
1,언주로 3,79.97,101.0,0,123500.0
2,언주로 3,54.98,101.0,0,91500.0
3,언주로 3,79.97,103.0,0,130000.0
4,언주로 3,79.97,103.0,0,117000.0
...,...,...,...,...,...
731133,신내역로1길 85,84.65,157.9,1,
731134,신내역로1길 85,84.62,157.9,1,
731135,신내역로1길 85,101.65,159.8,1,
731136,신내역로1길 85,84.94,161.4,1,


In [70]:
selected_concat['도로명'] = selected_concat['도로명'].astype('category')

In [71]:
selected_concat.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 731138 entries, 0 to 731137
Data columns (total 5 columns):
 #   Column   Non-Null Count   Dtype   
---  ------   --------------   -----   
 0   도로명      731138 non-null  category
 1   전용면적     731138 non-null  float64 
 2   실거래지수    731138 non-null  float64 
 3   is_test  731138 non-null  int64   
 4   target   721866 non-null  float64 
dtypes: category(1), float64(3), int64(1)
memory usage: 29.6 MB


In [72]:
# LabelEncoder 객체 생성
label_encoder = LabelEncoder()

# '도로명' 열 라벨 인코딩
selected_concat['도로명_encoded'] = label_encoder.fit_transform(selected_concat['도로명'])

# 결과 출력
print(selected_concat[['도로명', '도로명_encoded']])

              도로명  도로명_encoded
0           언주로 3         6066
1           언주로 3         6066
2           언주로 3         6066
3           언주로 3         6066
4           언주로 3         6066
...           ...          ...
731133  신내역로1길 85         5461
731134  신내역로1길 85         5461
731135  신내역로1길 85         5461
731136  신내역로1길 85         5461
731137  신내역로1길 85         5461

[731138 rows x 2 columns]


In [73]:
selected_concat = selected_concat.drop(['도로명'], axis=1)

In [74]:
selected_concat.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 731138 entries, 0 to 731137
Data columns (total 5 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   전용면적         731138 non-null  float64
 1   실거래지수        731138 non-null  float64
 2   is_test      731138 non-null  int64  
 3   target       721866 non-null  float64
 4   도로명_encoded  731138 non-null  int64  
dtypes: float64(3), int64(2)
memory usage: 33.5 MB


In [75]:
#selected_concat['도로명_encoded'] = selected_concat['도로명_encoded'].astype('category')

In [76]:
selected_concat.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 731138 entries, 0 to 731137
Data columns (total 5 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   전용면적         731138 non-null  float64
 1   실거래지수        731138 non-null  float64
 2   is_test      731138 non-null  int64  
 3   target       721866 non-null  float64
 4   도로명_encoded  731138 non-null  int64  
dtypes: float64(3), int64(2)
memory usage: 33.5 MB


In [77]:
# 이제 다시 train과 test dataset을 분할해줍니다. 위에서 제작해 놓았던 is_test 칼럼을 이용합니다.
dt_train = selected_concat.query('is_test==0')
dt_test = selected_concat.query('is_test==1')

# 이제 is_test 칼럼은 drop해줍니다.
dt_train.drop(['is_test'], axis = 1, inplace=True)
dt_test.drop(['is_test'], axis = 1, inplace=True)
print(dt_train.shape, dt_test.shape)

(721866, 4) (9272, 4)


In [78]:
import lightgbm

x, y, test = dt_train[['전용면적','도로명_encoded','실거래지수']], dt_train[['target']], dt_test[['전용면적','도로명_encoded','실거래지수']]

pred = lightgbm.LGBMRegressor(n_estimators=100, learning_rate=0.01, max_depth=5).fit(x, y).predict(test)

pd.DataFrame(pred.astype(int), columns=["target"]).to_csv('sub_f2.csv', index=False)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001727 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 614
[LightGBM] [Info] Number of data points in the train set: 721866, number of used features: 3
[LightGBM] [Info] Start training from score 65880.745652
