# Version 3-1

## Test Coverage

1. Train에서 공급유형이 장기전세, 공공분양, 공공임대(5년)인 example을 아예 Drop한다.
2. 각 단지코드 내에 임대건물구분 -> '아파트'이면 1, '상가&아파트'이면 0 으로 매핑
3. '자격유형별 평균 임대료' feature 추가

## Import Module

In [1]:
import pandas as pd
import numpy as np
import os
from tqdm.notebook import tqdm

## Data Load

In [2]:
# Locations of the competition CSVs, all relative to the notebook's working directory.
DATA_ROOT = "../../../competition_data/parking_data/"

TRAIN_ROOT = os.path.join(DATA_ROOT, "train.csv")
TEST_ROOT = os.path.join(DATA_ROOT, "test.csv")
AGE_GENDER_INFO_ROOT = os.path.join(DATA_ROOT, "age_gender_info.csv")

# Echo the resolved paths; each label matches the variable it prints
# (the last one was previously mislabelled as SUBMISSION_ROOT).
print(f"DATA_ROOT : {DATA_ROOT}")
print(f"TRAIN_ROOT : {TRAIN_ROOT}")
print(f"TEST_ROOT : {TEST_ROOT}")
print(f"AGE_GENDER_INFO_ROOT : {AGE_GENDER_INFO_ROOT}")

DATA_ROOT : ../../../competition_data/parking_data/
TRAIN_ROOT : ../../../competition_data/parking_data/train.csv
TEST_ROOT : ../../../competition_data/parking_data/test.csv
SUBMISSION_ROOT : ../../../competition_data/parking_data/age_gender_info.csv


In [3]:
# Read the three input tables in one pass, in the order train / test / age-gender info.
train, test, age_gender_info = [
    pd.read_csv(csv_path)
    for csv_path in (TRAIN_ROOT, TEST_ROOT, AGE_GENDER_INFO_ROOT)
]

print("Data Loaded!")

Data Loaded!


## Preprocessing (Version 1)

### 지역명 숫자로 매핑

In [4]:
# Assign each region name an integer code, numbered in order of first appearance in train.
local_map = {region: code for code, region in enumerate(train['지역'].unique())}

# Apply the same train-derived codes to both tables.
# NOTE(review): a region present only in test has no entry in local_map and
# would come out as NaN - confirm test regions are a subset of train's.
for frame in (train, test):
    frame['지역'] = frame['지역'].map(local_map)

### 전용면적 처리

In [5]:
# Round every 전용면적 value down to the nearest multiple of 5.
for frame in (train, test):
    frame['전용면적'] = (frame['전용면적'] // 5) * 5

## Preprocessing (Version 2)

### `'-'` -> NULL, dtype을 float으로 변경

In [6]:
columns = ['임대보증금', '임대료']

# For each table and each money column: turn the '-' placeholder into NaN,
# then cast the whole column to float.
for frame in (train, test):
    for col in columns:
        frame.loc[frame[col] == '-', col] = np.nan
        frame[col] = frame[col].astype(float)

### NULL 값 처리

#### 임대보증금, 임대료

In [7]:
# Missing deposit / rent values are replaced with 0 in both tables.
rent_cols = ['임대보증금', '임대료']
for frame in (train, test):
    frame[rent_cols] = frame[rent_cols].fillna(0)

#### 지하철, 버스

In [8]:
# Missing subway-station / bus-stop counts are replaced with 0 in both tables.
cols = ['도보 10분거리 내 지하철역 수(환승노선 수 반영)', '도보 10분거리 내 버스정류장 수']
for frame in (train, test):
    frame[cols] = frame[cols].fillna(0)

#### 자격유형

In [9]:
# Fill the missing 자격유형 in test with a fixed value per complex:
# 'A' for complex C2411 and 'C' for complex C2253.
qualification_missing = test.자격유형.isnull()
test.loc[(test.단지코드 == 'C2411') & qualification_missing, '자격유형'] = 'A'
test.loc[(test.단지코드 == 'C2253') & qualification_missing, '자격유형'] = 'C'

#### 중복 example 제거

In [10]:
# Remove fully identical rows from both tables.
train, test = [frame.drop_duplicates() for frame in (train, test)]

### 자격유형 병합

- 'J', 'L', 'K', 'N', 'M', 'O' -> '행복주택_공급대상'
    - 공급유형이 행복주택인 경우에서만 나타남
- 'H', 'B', 'E', 'G' -> '국민임대_공급대상'
    - E는 영구임대인 경우도 있긴 하지만 국민임대인 경우가 더 많아서 여기에 포함시킴
- 'C', 'I', 'F' -> '영구임대_공급대상'

In [11]:
# Merged 자격유형 label -> the original single-letter codes folded into it.
# The three code lists do not overlap, so the order of application is irrelevant.
qualification_groups = {
    '행복주택_공급대상': ['J', 'L', 'K', 'N', 'M', 'O'],
    '국민임대_공급대상': ['H', 'B', 'E', 'G'],
    '영구임대_공급대상': ['C', 'I', 'F'],
}

for merged_label, member_codes in qualification_groups.items():
    for frame in (train, test):
        frame.loc[frame.자격유형.isin(member_codes), '자격유형'] = merged_label

### 공급유형 병합

- '공공분양', '공공임대(10년)', '공공임대(분납)' -> '공공임대(10년/분납/분양)'

In [12]:
# Collapse the public-rental variants into a single 공급유형 label.
merged_supply_label = '공공임대(10년/분납/분양)'

# Train deliberately leaves '공공분양' unmerged: a later step removes train rows
# by the exact labels '장기전세', '공공분양' and '공공임대(5년)', and relabelling
# '공공분양' here would make that filter silently skip those rows.
train.loc[train.공급유형.isin(['공공임대(10년)', '공공임대(분납)']), '공급유형'] = merged_supply_label
test.loc[test.공급유형.isin(['공공분양', '공공임대(10년)', '공공임대(분납)']), '공급유형'] = merged_supply_label

## Preprocessing (Version 3)

### [Test 1] Train에서 공급유형이 장기전세, 공공분양, 공공임대(5년)인 example을 아예 Drop한다.

In [13]:
# Remove train rows whose 공급유형 is exactly one of the three excluded labels.
# NOTE(review): only rows still carrying one of these exact labels when this
# cell runs are removed.
supply_type = train.공급유형
drop_mask = (
    (supply_type == '장기전세')
    | (supply_type == '공공분양')
    | (supply_type == '공공임대(5년)')
)
train = train.drop(train.index[drop_mask])

train.공급유형.unique().tolist()

['국민임대', '공공임대(50년)', '영구임대', '임대상가', '공공임대(10년/분납/분양)', '행복주택']

### [Test 3] '자격유형별 평균 임대료' feature 추가

In [14]:
# Row-level working table for the [Test 3] feature.
train_test_3 = train[['단지코드', '자격유형', '임대료']].copy()

# Attach to every row the mean 임대료 of its 자격유형 group.
# A single groupby/transform replaces the per-category loop that rebuilt a
# boolean mask twice per category.
# NOTE(review): 임대료 had its missing values filled with 0 earlier, so those
# zeros are included in each group's mean - confirm that is intended.
train_test_3['평균임대료(자격유형)'] = train_test_3.groupby('자격유형')['임대료'].transform('mean')

train_test_3

  0%|          | 0/5 [00:00<?, ?it/s]

Unnamed: 0,단지코드,자격유형,임대료,평균임대료(자격유형)
0,C2483,A,103680.0,211018.228867
1,C2483,A,103680.0,211018.228867
2,C2483,A,184330.0,211018.228867
3,C2483,A,184330.0,211018.228867
4,C2483,A,184330.0,211018.228867
...,...,...,...,...
2945,C2437,영구임대_공급대상,107530.0,93702.237762
2946,C2532,A,116090.0,211018.228867
2948,C2532,A,142310.0,211018.228867
2950,C2532,A,142310.0,211018.228867


In [15]:
# Row-level working table for the [Test 3] feature (test side).
test_test_3 = test[['단지코드', '자격유형', '임대료']].copy()

# Attach to every row the mean 임대료 of its 자격유형 group.
# A single groupby/transform replaces the per-category loop that rebuilt a
# boolean mask twice per category.
# NOTE(review): these means are computed from the test rows themselves, so the
# same 자격유형 gets a different value here than on the train side - confirm
# that is intended rather than reusing the train means.
test_test_3['평균임대료(자격유형)'] = test_test_3.groupby('자격유형')['임대료'].transform('mean')

test_test_3

  0%|          | 0/5 [00:00<?, ?it/s]

Unnamed: 0,단지코드,자격유형,임대료,평균임대료(자격유형)
0,C1072,국민임대_공급대상,189840.0,202304.563107
1,C1072,A,249930.0,191891.669627
2,C1072,국민임대_공급대상,249930.0,202304.563107
3,C1072,국민임대_공급대상,249930.0,202304.563107
4,C1072,국민임대_공급대상,296780.0,202304.563107
...,...,...,...,...
1017,C1267,행복주택_공급대상,0.0,107397.631579
1018,C2189,국민임대_공급대상,106400.0,202304.563107
1019,C2189,국민임대_공급대상,106400.0,202304.563107
1020,C2189,국민임대_공급대상,144600.0,202304.563107


## Aggregation

### 단지코드 별로 모두 같은 값을 가지는 feature

- `Version 2`에서 사용한 방법

In [16]:
# Features expected to hold one value per 단지코드.
unique_cols = [
    '총세대수', '지역', '공가수',
    '도보 10분거리 내 지하철역 수(환승노선 수 반영)',
    '도보 10분거리 내 버스정류장 수',
    '단지내주차면수', '등록차량수',
]
# The test table is aggregated without '등록차량수'.
test_unique_cols = [col for col in unique_cols if col != '등록차량수']

# De-duplicate while 단지코드 is still a regular column. drop_duplicates()
# ignores the index, so de-duplicating after set_index would merge two
# different complexes that happen to share identical feature values.
train_agg = train[['단지코드'] + unique_cols].drop_duplicates().set_index('단지코드')
test_agg = test[['단지코드'] + test_unique_cols].drop_duplicates().set_index('단지코드')

In [17]:
# Display the aggregated train column names as a plain list.
[*train_agg.columns]

['총세대수',
 '지역',
 '공가수',
 '도보 10분거리 내 지하철역 수(환승노선 수 반영)',
 '도보 10분거리 내 버스정류장 수',
 '단지내주차면수',
 '등록차량수']

### 단지코드 별로 다양한 값을 가지는 feature

#### '전용면적', '전용면적별세대수'

In [18]:
# Per-row copy of the area columns; each value is then replaced by the mean of
# its 단지코드 group, so every row of a complex ends up carrying the same values.
train_dedicated = train[['단지코드', '전용면적', '전용면적별세대수']].copy()

# One groupby/transform per column replaces the per-complex loops. Assigning
# the whole column also avoids writing float means into a slice of an existing
# column through .loc, which relies on silent dtype upcasting (deprecated in
# pandas 2.1+) when the column is integer-typed.
# NOTE(review): '전용면적별세대수' is presumably integer-typed in the raw CSV - confirm.
per_complex = train_dedicated.groupby('단지코드')
for col in ('전용면적', '전용면적별세대수'):
    train_dedicated[col] = per_complex[col].transform('mean')

  0%|          | 0/421 [00:00<?, ?it/s]

  0%|          | 0/421 [00:00<?, ?it/s]

In [19]:
# Per-row copy of the area columns; each value is then replaced by the mean of
# its 단지코드 group, so every row of a complex ends up carrying the same values.
test_dedicated = test[['단지코드', '전용면적', '전용면적별세대수']].copy()

# One groupby/transform per column replaces the per-complex loops. Assigning
# the whole column also avoids writing float means into a slice of an existing
# column through .loc, which relies on silent dtype upcasting (deprecated in
# pandas 2.1+) when the column is integer-typed.
# NOTE(review): '전용면적별세대수' is presumably integer-typed in the raw CSV - confirm.
per_complex = test_dedicated.groupby('단지코드')
for col in ('전용면적', '전용면적별세대수'):
    test_dedicated[col] = per_complex[col].transform('mean')

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

In [20]:
# Collapse the repeated per-row values into one row per distinct
# (단지코드, 전용면적, 전용면적별세대수) combination, then index by 단지코드.
train_dedicated_agg = train_dedicated.drop_duplicates().set_index('단지코드')

train_dedicated_agg

Unnamed: 0_level_0,전용면적,전용면적별세대수
단지코드,Unnamed: 1_level_1,Unnamed: 2_level_1
C2483,46.875000,112.500000
C2515,41.428571,77.857143
C1407,38.125000,114.250000
C1945,49.000000,127.000000
C1470,41.250000,174.000000
...,...,...
C2586,25.000000,18.000000
C2035,33.000000,98.400000
C2020,25.000000,13.333333
C2437,20.000000,90.000000


In [21]:
# Collapse the repeated per-row values into one row per distinct
# (단지코드, 전용면적, 전용면적별세대수) combination, then index by 단지코드.
test_dedicated_agg = test_dedicated.drop_duplicates().set_index('단지코드')

test_dedicated_agg

Unnamed: 0_level_0,전용면적,전용면적별세대수
단지코드,Unnamed: 1_level_1,Unnamed: 2_level_1
C1072,46.250000,94.250000
C1128,43.333333,150.444444
C1456,41.250000,66.625000
C1840,41.250000,148.250000
C1332,43.750000,162.125000
...,...,...
C2456,32.500000,86.500000
C1266,33.000000,118.200000
C2152,25.000000,60.000000
C1267,26.363636,60.909091


#### [Test 3]

In [24]:
def reshape_cat_features(data, cast_col, value_col, index_col='단지코드'):
    """Pivot a categorical column into one wide column per category.

    Rows are first de-duplicated on (index_col, cast_col), and a helper column
    named ``counter`` filled with 1 is added, so passing ``value_col='counter'``
    produces presence indicators (1 where the pair occurs, 0 otherwise).

    Args:
        data: DataFrame containing ``index_col`` and ``cast_col`` (and
            ``value_col``, unless it is ``'counter'``).
        cast_col: Column whose distinct values become the output columns.
        value_col: Column supplying the cell values of the pivoted table.
        index_col: Column that becomes the index of the result. Defaults to
            '단지코드', the key that was previously hard-coded.

    Returns:
        DataFrame indexed by ``index_col`` with columns named
        ``"<cast_col>_<category>"``; missing combinations are filled with 0.
    """
    deduped = data.drop_duplicates([index_col, cast_col]).assign(counter=1)
    res = deduped.pivot(index=index_col, columns=cast_col, values=value_col).fillna(0)
    res.columns.name = None
    # f-string formatting keeps this working for non-string category values
    # (e.g. integers), where plain string concatenation raises TypeError.
    return res.rename(columns={col: f"{cast_col}_{col}" for col in res.columns})

##### 자격유형

- Version 2와 같이 feature로 올려서 처리

In [25]:
# Keep only the key and the category, then spread 자격유형 into one
# presence-indicator column per category.
train_test_3_qualify = train_test_3.loc[:, ['단지코드', '자격유형']].copy()

train_test_3_qualify_agg = reshape_cat_features(train_test_3_qualify, '자격유형', 'counter')

train_test_3_qualify_agg

Unnamed: 0_level_0,자격유형_A,자격유형_D,자격유형_국민임대_공급대상,자격유형_영구임대_공급대상,자격유형_행복주택_공급대상
단지코드,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
C1000,1.0,0.0,0.0,0.0,0.0
C1004,0.0,1.0,0.0,1.0,0.0
C1005,1.0,0.0,0.0,0.0,0.0
C1013,1.0,0.0,0.0,0.0,0.0
C1014,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...
C2663,0.0,0.0,1.0,0.0,0.0
C2666,1.0,0.0,0.0,0.0,0.0
C2670,1.0,0.0,0.0,0.0,0.0
C2680,1.0,0.0,0.0,0.0,0.0


In [26]:
# Keep only the key and the category, then spread 자격유형 into one
# presence-indicator column per category.
test_test_3_qualify = test_test_3.loc[:, ['단지코드', '자격유형']].copy()

test_test_3_qualify_agg = reshape_cat_features(test_test_3_qualify, '자격유형', 'counter')

test_test_3_qualify_agg

Unnamed: 0_level_0,자격유형_A,자격유형_D,자격유형_국민임대_공급대상,자격유형_영구임대_공급대상,자격유형_행복주택_공급대상
단지코드,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
C1003,0.0,0.0,0.0,0.0,1.0
C1006,0.0,1.0,0.0,1.0,0.0
C1016,1.0,0.0,0.0,0.0,0.0
C1019,1.0,0.0,0.0,0.0,0.0
C1030,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...
C2653,1.0,0.0,0.0,0.0,0.0
C2675,1.0,0.0,0.0,0.0,0.0
C2676,0.0,1.0,0.0,1.0,0.0
C2688,0.0,0.0,1.0,1.0,0.0


##### '임대료'와 '평균임대료(자격유형)'

- 단지코드별로 평균값을 취한 값을 사용하기

In [27]:
# Per-row copy of the rent columns; each value is then replaced by the mean of
# its 단지코드 group, so every row of a complex ends up carrying the same values.
train_test_3_mean = train_test_3[['단지코드', '임대료', '평균임대료(자격유형)']].copy()

# One groupby/transform per column replaces the two per-complex loops, each of
# which rebuilt a boolean mask over the whole table twice per complex.
per_complex = train_test_3_mean.groupby('단지코드')
for col in ('임대료', '평균임대료(자격유형)'):
    train_test_3_mean[col] = per_complex[col].transform('mean')

  0%|          | 0/421 [00:00<?, ?it/s]

  0%|          | 0/421 [00:00<?, ?it/s]

In [28]:
# Per-row copy of the rent columns; each value is then replaced by the mean of
# its 단지코드 group, so every row of a complex ends up carrying the same values.
test_test_3_mean = test_test_3[['단지코드', '임대료', '평균임대료(자격유형)']].copy()

# One groupby/transform per column replaces the two per-complex loops, each of
# which rebuilt a boolean mask over the whole table twice per complex.
per_complex = test_test_3_mean.groupby('단지코드')
for col in ('임대료', '평균임대료(자격유형)'):
    test_test_3_mean[col] = per_complex[col].transform('mean')

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

In [29]:
# Collapse the repeated per-row means to distinct rows and index them by 단지코드.
train_test_3_mean = train_test_3_mean.drop_duplicates().set_index('단지코드')

print(f"train_test_3_mean : {train_test_3_mean.shape}")

train_test_3_mean

train_test_3_mean : (421, 2)


Unnamed: 0_level_0,임대료,평균임대료(자격유형)
단지코드,Unnamed: 1_level_1,Unnamed: 2_level_1
C2483,167910.000000,211018.228867
C2515,139557.142857,211018.228867
C1407,165956.250000,211018.228867
C1945,200914.000000,180192.614679
C1470,120150.000000,211018.228867
...,...,...
C2586,111180.000000,124577.164179
C2035,158800.000000,211018.228867
C2020,135976.666667,124577.164179
C2437,107530.000000,93702.237762


In [30]:
# Collapse the repeated per-row means to distinct rows and index them by 단지코드.
test_test_3_mean = test_test_3_mean.drop_duplicates().set_index('단지코드')

print(f"test_test_3_mean : {test_test_3_mean.shape}")

test_test_3_mean

test_test_3_mean : (150, 2)


Unnamed: 0_level_0,임대료,평균임대료(자격유형)
단지코드,Unnamed: 1_level_1,Unnamed: 2_level_1
C1072,265843.750000,201002.951422
C1128,245516.666667,197676.610449
C1456,222290.000000,191891.669627
C1840,123150.000000,191891.669627
C1332,292880.000000,202304.563107
...,...,...
C2456,155082.500000,202304.563107
C1266,199878.000000,202304.563107
C2152,0.000000,80207.857143
C1267,84548.181818,145593.550558


##### '자격유형', '임대료', '평균임대료(자격유형)' 하나로 병합

In [31]:
# Place the 자격유형 indicator columns and the rent means side by side,
# aligned on the 단지코드 index.
train_parts = [train_test_3_qualify_agg, train_test_3_mean]
test_parts = [test_test_3_qualify_agg, test_test_3_mean]

train_test_3_agg = pd.concat(train_parts, axis='columns')
test_test_3_agg = pd.concat(test_parts, axis='columns')

print(f"train_test_3_agg : {train_test_3_agg.shape}\ntest_test_3_agg : {test_test_3_agg.shape}")

train_test_3_agg

train_test_3_agg : (421, 7)
test_test_3_agg : (150, 7)


Unnamed: 0_level_0,자격유형_A,자격유형_D,자격유형_국민임대_공급대상,자격유형_영구임대_공급대상,자격유형_행복주택_공급대상,임대료,평균임대료(자격유형)
단지코드,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
C1000,1.0,0.0,0.0,0.0,0.0,119635.000000,211018.228867
C1004,0.0,1.0,0.0,1.0,0.0,20939.000000,18740.447552
C1005,1.0,0.0,0.0,0.0,0.0,222046.666667,211018.228867
C1013,1.0,0.0,0.0,0.0,0.0,134726.000000,211018.228867
C1014,1.0,0.0,0.0,0.0,0.0,160488.571429,211018.228867
...,...,...,...,...,...,...,...
C2663,0.0,0.0,1.0,0.0,0.0,269070.000000,180192.614679
C2666,1.0,0.0,0.0,0.0,0.0,133145.000000,211018.228867
C2670,1.0,0.0,0.0,0.0,0.0,116117.500000,211018.228867
C2680,1.0,0.0,0.0,0.0,0.0,129816.666667,211018.228867


### Final Aggregation

In [32]:
# Final feature tables: per-complex constants, area means and the [Test 3]
# features joined column-wise on the 단지코드 index.
train_blocks = [train_agg, train_dedicated_agg, train_test_3_agg]
test_blocks = [test_agg, test_dedicated_agg, test_test_3_agg]

train_agg = pd.concat(train_blocks, axis='columns')
test_agg = pd.concat(test_blocks, axis='columns')

print(f"train_agg : {train_agg.shape}\ntest_agg : {test_agg.shape}")

train_agg

train_agg : (421, 16)
test_agg : (150, 15)


Unnamed: 0_level_0,총세대수,지역,공가수,도보 10분거리 내 지하철역 수(환승노선 수 반영),도보 10분거리 내 버스정류장 수,단지내주차면수,등록차량수,전용면적,전용면적별세대수,자격유형_A,자격유형_D,자격유형_국민임대_공급대상,자격유형_영구임대_공급대상,자격유형_행복주택_공급대상,임대료,평균임대료(자격유형)
단지코드,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
C2483,900,0,38.0,0.0,3.0,1425.0,1015.0,46.875000,112.500000,1.0,0.0,0.0,0.0,0.0,167910.000000,211018.228867
C2515,545,1,17.0,0.0,3.0,624.0,205.0,41.428571,77.857143,1.0,0.0,0.0,0.0,0.0,139557.142857,211018.228867
C1407,1216,2,13.0,1.0,1.0,1285.0,1064.0,38.125000,114.250000,1.0,0.0,0.0,0.0,0.0,165956.250000,211018.228867
C1945,755,3,6.0,1.0,3.0,734.0,730.0,49.000000,127.000000,0.0,0.0,1.0,0.0,0.0,200914.000000,180192.614679
C1470,696,4,14.0,0.0,2.0,645.0,553.0,41.250000,174.000000,1.0,0.0,0.0,0.0,0.0,120150.000000,211018.228867
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C2586,90,9,7.0,0.0,3.0,66.0,57.0,25.000000,18.000000,0.0,0.0,0.0,0.0,1.0,111180.000000,124577.164179
C2035,492,5,24.0,0.0,1.0,521.0,246.0,33.000000,98.400000,1.0,0.0,0.0,0.0,0.0,158800.000000,211018.228867
C2020,40,8,7.0,1.0,2.0,25.0,19.0,25.000000,13.333333,0.0,0.0,0.0,0.0,1.0,135976.666667,124577.164179
C2437,90,11,12.0,0.0,1.0,30.0,16.0,20.000000,90.000000,0.0,0.0,0.0,1.0,0.0,107530.000000,93702.237762


## To CSV

In [34]:
# Output locations for the version 3-1 feature tables, next to the input data.
TRAIN_VERSION_3_ROOT = os.path.join(DATA_ROOT, 'train_version_3_1.csv')
TEST_VERSION_3_ROOT = os.path.join(DATA_ROOT, 'test_version_3_1.csv')

# Write each aggregated table to its CSV; the 단지코드 index is written as well.
for frame, out_path in ((train_agg, TRAIN_VERSION_3_ROOT), (test_agg, TEST_VERSION_3_ROOT)):
    frame.to_csv(out_path)