In [72]:
# visualization
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
fe = fm.FontEntry(
    fname=r'/usr/share/fonts/truetype/nanum/NanumGothic.ttf', # ttf 파일이 저장되어 있는 경로
    name='NanumBarunGothic')                        # 이 폰트의 원하는 이름 설정
fm.fontManager.ttflist.insert(0, fe)              # Matplotlib에 폰트 추가
plt.rcParams.update({'font.size': 10, 'font.family': 'NanumBarunGothic'}) # 폰트 설정
plt.rc('font', family='NanumBarunGothic')
import seaborn as sns

# utils
import pandas as pd
import numpy as np
from tqdm import tqdm
import pickle
import warnings;warnings.filterwarnings('ignore')

# Model
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics

import eli5
from eli5.sklearn import PermutationImportance

In [73]:
train_path = "../modified_data/v04_train.csv"
test_path = "../modified_data/v04_test.csv"

In [74]:
train = pd.read_csv(train_path, index_col=0)
test = pd.read_csv(test_path,index_col=0)

In [75]:
train["is_test"] = 0
test["is_test"] = 1
concat = pd.concat([train, test])

# hash encoding 

In [76]:
!pip install category_encoders

[0m

In [77]:
import category_encoders as ce

In [78]:
column_date = ["k-사용검사일-사용승인일","k-등록일자", "k-수정일자", "단지승인일", "단지신청일", "등기신청일자"]

for column in column_date:
    print(column, concat[column].dtypes)
    concat[column] = pd.to_datetime(concat[column])
    print(concat[column].dtypes)

continuous_columns = []
categorical_columns = []
for column in concat.columns:
    if pd.api.types.is_numeric_dtype(concat[column]) or pd.api.types.is_datetime64_ns_dtype(concat[column]):
        continuous_columns.append(column)
    else:
        categorical_columns.append(column)

print("연속형 변수:", continuous_columns)
print("범주형 변수:", categorical_columns)

k-사용검사일-사용승인일 object
datetime64[ns]
k-등록일자 object
datetime64[ns]
k-수정일자 object
datetime64[ns]
단지승인일 object
datetime64[ns]
단지신청일 object
datetime64[ns]
등기신청일자 float64
datetime64[ns]
연속형 변수: ['본번', '부번', '전용면적(㎡)', '계약년월', '계약일', '층', '건축년도', '등기신청일자', 'k-전체동수', 'k-전체세대수', 'k-사용검사일-사용승인일', 'k-연면적', 'k-주거전용면적', 'k-관리비부과면적', 'k-전용면적별세대현황(60㎡이하)', 'k-전용면적별세대현황(60㎡~85㎡이하)', 'k-85㎡~135㎡이하', 'k-135㎡초과', 'k-등록일자', 'k-수정일자', '건축면적', '주차대수', '단지승인일', '사용허가여부', '관리비 업로드', '좌표X', '좌표Y', '단지신청일', 'target', 'is_test']
범주형 변수: ['번지', '아파트명', '거래유형', 'k-단지분류(아파트,주상복합등등)', 'k-세대타입(분양형태)', 'k-복도유형', 'k-난방방식', 'k-건설사(시공사)', '기타/의무/임대/임의=1/2/3/4', '시', '구', '동']


In [79]:
hash_encoder = ce.HashingEncoder(cols=["아파트명"], n_components=8)  # n_components는 해시 테이블의 크기를 결정합니다.
df_encoded = hash_encoder.fit_transform(concat)

In [80]:
df_encoded.head()

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,번지,본번,...,사용허가여부,관리비 업로드,좌표X,좌표Y,단지신청일,target,시,구,동,is_test
0,0,0,0,0,0,0,1,0,658-1,658.0,...,1.0,0.0,127.05721,37.476763,2022-11-17 10:19:06,124000.0,서울특별시,강남구,개포동,0
1,0,0,0,0,0,0,1,0,658-1,658.0,...,1.0,0.0,127.05721,37.476763,2022-11-17 10:19:06,123500.0,서울특별시,강남구,개포동,0
2,0,0,0,0,0,0,1,0,658-1,658.0,...,1.0,0.0,127.05721,37.476763,2022-11-17 10:19:06,91500.0,서울특별시,강남구,개포동,0
3,0,0,0,0,0,0,1,0,658-1,658.0,...,1.0,0.0,127.05721,37.476763,2022-11-17 10:19:06,130000.0,서울특별시,강남구,개포동,0
4,0,0,0,0,0,0,1,0,658-1,658.0,...,1.0,0.0,127.05721,37.476763,2022-11-17 10:19:06,117000.0,서울특별시,강남구,개포동,0


In [81]:
df_encoded.columns

Index(['col_0', 'col_1', 'col_2', 'col_3', 'col_4', 'col_5', 'col_6', 'col_7',
       '번지', '본번', '부번', '전용면적(㎡)', '계약년월', '계약일', '층', '건축년도', '등기신청일자',
       '거래유형', 'k-단지분류(아파트,주상복합등등)', 'k-세대타입(분양형태)', 'k-복도유형', 'k-난방방식',
       'k-전체동수', 'k-전체세대수', 'k-건설사(시공사)', 'k-사용검사일-사용승인일', 'k-연면적', 'k-주거전용면적',
       'k-관리비부과면적', 'k-전용면적별세대현황(60㎡이하)', 'k-전용면적별세대현황(60㎡~85㎡이하)',
       'k-85㎡~135㎡이하', 'k-135㎡초과', 'k-등록일자', 'k-수정일자', '건축면적', '주차대수',
       '기타/의무/임대/임의=1/2/3/4', '단지승인일', '사용허가여부', '관리비 업로드', '좌표X', '좌표Y',
       '단지신청일', 'target', '시', '구', '동', 'is_test'],
      dtype='object')

# 단지분류, 세대타입, 복도유형, 난방방식, 기타/의무/임대/임의 ,  거래유형

In [82]:
df_encoded = pd.get_dummies(df_encoded, columns=['k-단지분류(아파트,주상복합등등)', 'k-세대타입(분양형태)', 'k-복도유형', 'k-난방방식', '기타/의무/임대/임의=1/2/3/4', '거래유형'])


In [83]:
df_encoded.head(1)

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,번지,본번,...,k-난방방식_개별난방,k-난방방식_기타,k-난방방식_중앙난방,k-난방방식_지역난방,기타/의무/임대/임의=1/2/3/4_기타,기타/의무/임대/임의=1/2/3/4_의무,기타/의무/임대/임의=1/2/3/4_임대,기타/의무/임대/임의=1/2/3/4_임의,거래유형_중개거래,거래유형_직거래
0,0,0,0,0,0,0,1,0,658-1,658.0,...,1,0,0,0,0,0,0,1,0,0


In [84]:
df_encoded.drop(["번지", "본번", "부번", "k-건설사(시공사)", "시"], axis=1, inplace=True)

In [85]:
df_encoded.columns

Index(['번지', '본번', '부번', '아파트명', '전용면적(㎡)', '계약년월', '계약일', '층', '건축년도',
       '등기신청일자', '거래유형', 'k-단지분류(아파트,주상복합등등)', 'k-세대타입(분양형태)', 'k-복도유형',
       'k-난방방식', 'k-전체동수', 'k-전체세대수', 'k-건설사(시공사)', 'k-사용검사일-사용승인일', 'k-연면적',
       'k-주거전용면적', 'k-관리비부과면적', 'k-전용면적별세대현황(60㎡이하)', 'k-전용면적별세대현황(60㎡~85㎡이하)',
       'k-85㎡~135㎡이하', 'k-135㎡초과', 'k-등록일자', 'k-수정일자', '건축면적', '주차대수',
       '기타/의무/임대/임의=1/2/3/4', '단지승인일', '사용허가여부', '관리비 업로드', '좌표X', '좌표Y',
       '단지신청일', 'target', '시', '구', '동', 'is_test'],
      dtype='object')

In [89]:
label_encoders = {}

for col in tqdm( ["구","동"]):
    lbl = LabelEncoder()

    # Label-Encoding을 fit
    lbl.fit( df_encoded[col].astype(str) )
    df_encoded[col] = lbl.transform(df_encoded[col].astype(str))
    label_encoders[col] = lbl           # 나중에 후처리를 위해 레이블인코더를 저장해주겠습니다.

    # Test 데이터에만 존재하는 새로 출현한 데이터를 신규 클래스로 추가해줍니다.
    for label in np.unique(df_encoded[col]):
      if label not in lbl.classes_: # unseen label 데이터인 경우
        lbl.classes_ = np.append(lbl.classes_, label) # 미처리 시 ValueError발생하니 주의하세요!

    df_encoded[col] = lbl.transform(df_encoded[col].astype(str))

 50%|█████     | 1/2 [00:01<00:01,  1.63s/it]


ValueError: y contains previously unseen labels: '8'

In [None]:
modified_train = df_encoded[df_encoded["is_test"]==0]
modified_test = df_encoded[df_encoded["is_test"]==1]
modified_train.head()
modified_train.drop(["is_test"], axis=1, inplace=True)
modified_test.head()
modified_test.drop(["is_test"], axis=1, inplace=True)

modified_test.to_csv("../modified_data/v05_test.csv")
modified_train.to_csv("../modified_data/v05_train.csv")