# Basic EDA

- competition 코드 공유에 공개되어있는 EDA를 참고하여 확인한 내용입니다.
- [주차수요 예측 EDA & Catboost Baseline(LB: 118.6101)](https://dacon.io/competitions/official/235745/codeshare/2851?page=1&dtype=recent)


## Import module

In [1]:
import pandas as pd
import numpy as np
import os
from tqdm.notebook import tqdm

import matplotlib
from matplotlib import font_manager, rc
from matplotlib import pyplot as plt
import platform

# Pick a font that can render Korean text in matplotlib figures.
# platform.system() reports 'Windows' on Windows hosts; the previous
# comparison against 'Window' could never match, so the Malgun Gothic branch
# was unreachable.
if platform.system() == 'Windows':
    font_name = font_manager.FontProperties(fname='c:/Windows/Fonts/malgun.ttf').get_name()
    rc('font', family=font_name)
else:
    # 'AppleChthic' looked like a misspelling of the macOS font 'AppleGothic'.
    # NOTE(review): this branch also runs on Linux, where that font may be absent.
    rc('font', family='AppleGothic')

# Draw minus signs with the ASCII hyphen so the chosen font can render them.
matplotlib.rcParams['axes.unicode_minus'] = False

## Data Search

### Data Load & Basic Search

In [2]:
# Directory holding the competition CSV files, relative to this notebook.
DATA_ROOT = ''
DATA_ROOT = os.path.join(DATA_ROOT, "../../../competition_data/parking_data/")

# Paths of the individual input files.
TRAIN_ROOT = os.path.join(DATA_ROOT, "train.csv")
TEST_ROOT = os.path.join(DATA_ROOT, "test.csv")
AGE_GENDER_INFO_ROOT = os.path.join(DATA_ROOT, "age_gender_info.csv")

print(f"DATA_ROOT : {DATA_ROOT}")
print(f"TRAIN_ROOT : {TRAIN_ROOT}")
print(f"TEST_ROOT : {TEST_ROOT}")
# Label the value with the variable actually printed (it was shown as
# SUBMISSION_ROOT although it is the age/gender info path).
print(f"AGE_GENDER_INFO_ROOT : {AGE_GENDER_INFO_ROOT}")

DATA_ROOT : ../../../competition_data/parking_data/
TRAIN_ROOT : ../../../competition_data/parking_data/train.csv
TEST_ROOT : ../../../competition_data/parking_data/test.csv
SUBMISSION_ROOT : ../../../competition_data/parking_data/age_gender_info.csv


In [3]:
# Read the train, test and age/gender tables into DataFrames.
train, test, age_gender_info = (
    pd.read_csv(csv_path)
    for csv_path in (TRAIN_ROOT, TEST_ROOT, AGE_GENDER_INFO_ROOT)
)

print("Data Loaded!")

Data Loaded!


- train에서 Null을 포함하는 features
    - 임대보증금
    - 임대료
    - 도보 10분거리 내 지하철역 수(환승노선 수 반영)
    - 도보 10분거리 내 버스정류장 수

In [4]:
# Print the dtype and non-null count of every train column.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2952 entries, 0 to 2951
Data columns (total 15 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   단지코드                          2952 non-null   object 
 1   총세대수                          2952 non-null   int64  
 2   임대건물구분                        2952 non-null   object 
 3   지역                            2952 non-null   object 
 4   공급유형                          2952 non-null   object 
 5   전용면적                          2952 non-null   float64
 6   전용면적별세대수                      2952 non-null   int64  
 7   공가수                           2952 non-null   float64
 8   자격유형                          2952 non-null   object 
 9   임대보증금                         2383 non-null   object 
 10  임대료                           2383 non-null   object 
 11  도보 10분거리 내 지하철역 수(환승노선 수 반영)  2741 non-null   float64
 12  도보 10분거리 내 버스정류장 수            2948 non-null   float64
 13  단지내

- test에서 Null을 포함하는 features
    - 자격유형
    - 임대보증금
    - 임대료
    - 도보 10분거리 내 지하철역 수(환승노선 수 반영)

In [5]:
# Print the dtype and non-null count of every test column.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1022 entries, 0 to 1021
Data columns (total 14 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   단지코드                          1022 non-null   object 
 1   총세대수                          1022 non-null   int64  
 2   임대건물구분                        1022 non-null   object 
 3   지역                            1022 non-null   object 
 4   공급유형                          1022 non-null   object 
 5   전용면적                          1022 non-null   float64
 6   전용면적별세대수                      1022 non-null   int64  
 7   공가수                           1022 non-null   float64
 8   자격유형                          1020 non-null   object 
 9   임대보증금                         842 non-null    object 
 10  임대료                           842 non-null    object 
 11  도보 10분거리 내 지하철역 수(환승노선 수 반영)  980 non-null    float64
 12  도보 10분거리 내 버스정류장 수            1022 non-null   float64
 13  단지내

### 숫자이어야 하는 feature에서 숫자가 아닌 값 확인

- `임대보증금`, `임대료` 의 dtype이 object이다.

In [6]:
# For each rent column of the train set, collect the distinct values that
# cannot be parsed as a number (the reason the column has dtype object).
for col in ['임대보증금', '임대료']:
    not_nums = []

    for val in tqdm(train[col].unique().tolist()):
        try:
            float(val)
        except (TypeError, ValueError):
            # Only a failed numeric conversion marks the value as non-numeric;
            # a bare `except` would also hide KeyboardInterrupt and real bugs.
            not_nums.append(val)

    print(f"\nCharacter Values in 'train['{col}']' : \n{not_nums}\n")

  0%|          | 0/958 [00:00<?, ?it/s]


Character Values in 'train['임대보증금']' : 
['-']



  0%|          | 0/996 [00:00<?, ?it/s]


Character Values in 'train['임대료']' : 
['-']



In [7]:
# For each rent column of the test set, collect the distinct values that
# cannot be parsed as a number (the reason the column has dtype object).
for col in ['임대보증금', '임대료']:
    not_nums = []

    for val in tqdm(test[col].unique().tolist()):
        try:
            float(val)
        except (TypeError, ValueError):
            # Only a failed numeric conversion marks the value as non-numeric;
            # a bare `except` would also hide KeyboardInterrupt and real bugs.
            not_nums.append(val)

    print(f"\nCharacter Values in 'test['{col}']' : \n{not_nums}\n")

  0%|          | 0/405 [00:00<?, ?it/s]


Character Values in 'test['임대보증금']' : 
['-']



  0%|          | 0/405 [00:00<?, ?it/s]


Character Values in 'test['임대료']' : 
['-']



### '-'을 null로 바꾸고, dtype을 float로 변경

In [8]:
# Turn the '-' placeholder into NaN, then cast both rent columns to float.
columns = ['임대보증금', '임대료']

for frame in (train, test):
    for col in columns:
        is_dash = frame[col] == '-'
        frame.loc[is_dash, col] = np.nan
        frame[col] = frame[col].astype(float)

## NULL 값이 있는 변수 탐색

### `임대보증금`, `임대료`

- 임대보증금이 null이면 임대료도 null이다.
- '-' 인 경우도 모두 null로 만들어주었으므로,
    - null을 모두 0으로 만들자

In [9]:
# For train rows missing each rent column, show how the supply type and
# qualification type are distributed.
columns = ['공급유형', '자격유형']
bases = ['임대보증금', '임대료']
separator = "=" * 40

for base in bases:
    print(separator)
    missing_rows = train[base].isnull()

    for col in columns:
        val_counts = train.loc[missing_rows, col].value_counts()
        print(f"\n[Train '{col}' value counts based on '{base}'] : \n{val_counts}\n")

    print(separator)


[Train '공급유형' value counts based on '임대보증금'] : 
임대상가    562
국민임대      8
공공분양      7
행복주택      4
Name: 공급유형, dtype: int64


[Train '자격유형' value counts based on '임대보증금'] : 
D    569
H      8
K      4
Name: 자격유형, dtype: int64


[Train '공급유형' value counts based on '임대료'] : 
임대상가    562
장기전세      9
국민임대      8
공공분양      7
행복주택      4
Name: 공급유형, dtype: int64


[Train '자격유형' value counts based on '임대료'] : 
D    569
A      9
H      8
K      4
Name: 자격유형, dtype: int64



In [10]:
# For test rows missing each rent column, show how the supply type and
# qualification type are distributed.
columns = ['공급유형', '자격유형']
bases = ['임대보증금', '임대료']
separator = "=" * 40

for base in bases:
    print(separator)
    missing_rows = test[base].isnull()

    for col in columns:
        val_counts = test.loc[missing_rows, col].value_counts()
        print(f"\n[Test '{col}' value counts based on '{base}'] : \n{val_counts}\n")

    print(separator)


[Test '공급유형' value counts based on '임대보증금'] : 
임대상가    177
영구임대      5
행복주택      4
Name: 공급유형, dtype: int64


[Test '자격유형' value counts based on '임대보증금'] : 
D    180
L      4
C      2
Name: 자격유형, dtype: int64


[Test '공급유형' value counts based on '임대료'] : 
임대상가    177
영구임대      5
행복주택      4
Name: 공급유형, dtype: int64


[Test '자격유형' value counts based on '임대료'] : 
D    180
L      4
C      2
Name: 자격유형, dtype: int64



In [11]:
# Missing deposit / rent values are replaced by 0 in both datasets.
rent_cols = ['임대보증금', '임대료']
for frame in (train, test):
    frame[rent_cols] = frame[rent_cols].fillna(0)

### `도보 10분거리 내 지하철역 수(환승노선 수 반영)`, `도보 10분거리 내 버스정류장 수`

- NULL 값을 0으로 대체

#### Train

In [12]:
# Complex codes in train that have at least one missing subway count.
subway_col = '도보 10분거리 내 지하철역 수(환승노선 수 반영)'
subway_null_codes = train[train[subway_col].isnull()].단지코드.unique()
print(f"Null이 포함된 example의 code 종류의 수 : {len(subway_null_codes)}")

# .item() requires value_counts() to hold exactly one entry, so this line
# relies on every row of those complexes sharing a single (null) value.
subway_rows = train.loc[train.단지코드.isin(subway_null_codes), subway_col]
print(f"해당 code인 example 수 (Null인 example 수) : {subway_rows.value_counts(dropna=False).item()}")

Null이 포함된 example의 code 종류의 수 : 20
해당 code인 example 수 (Null인 example 수) : 211


In [13]:
# Complex codes in train that have at least one missing bus-stop count.
bus_col = '도보 10분거리 내 버스정류장 수'
bus_null_codes = train[train[bus_col].isnull()].단지코드.unique()
print(f"Null이 포함된 example의 code 종류의 수 : {len(bus_null_codes)}")

# .item() requires value_counts() to hold exactly one entry, so this line
# relies on every row of those complexes sharing a single (null) value.
bus_rows = train.loc[train.단지코드.isin(bus_null_codes), bus_col]
print(f"해당 code인 example 수 (Null인 example 수) : {bus_rows.value_counts(dropna=False).item()}")

Null이 포함된 example의 code 종류의 수 : 1
해당 code인 example 수 (Null인 example 수) : 4


#### Test

In [14]:
# Complex codes in test that have at least one missing subway count.
subway_col = '도보 10분거리 내 지하철역 수(환승노선 수 반영)'
subway_null_codes = test[test[subway_col].isnull()].단지코드.unique()
print(f"Null이 포함된 example의 code 종류의 수 : {len(subway_null_codes)}")

# .item() requires value_counts() to hold exactly one entry, so this line
# relies on every row of those complexes sharing a single (null) value.
subway_rows = test.loc[test.단지코드.isin(subway_null_codes), subway_col]
print(f"해당 code인 example 수 (Null인 example 수) : {subway_rows.value_counts(dropna=False).item()}")

Null이 포함된 example의 code 종류의 수 : 5
해당 code인 example 수 (Null인 example 수) : 42


#### fillna

In [15]:
# Missing subway / bus-stop counts are replaced by 0 in both datasets.
cols = ['도보 10분거리 내 지하철역 수(환승노선 수 반영)', '도보 10분거리 내 버스정류장 수']
for frame in (train, test):
    frame[cols] = frame[cols].fillna(0)

### `자격유형`

In [16]:
# Show the test rows whose 자격유형 is missing.
test[test.자격유형.isnull()]

Unnamed: 0,단지코드,총세대수,임대건물구분,지역,공급유형,전용면적,전용면적별세대수,공가수,자격유형,임대보증금,임대료,도보 10분거리 내 지하철역 수(환승노선 수 반영),도보 10분거리 내 버스정류장 수,단지내주차면수
196,C2411,962,아파트,경상남도,국민임대,46.9,240,25.0,,71950000.0,37470.0,0.0,2.0,840.0
258,C2253,1161,아파트,강원도,영구임대,26.37,745,0.0,,2249000.0,44770.0,0.0,2.0,173.0


#### `C2411` 자격유형 양상

- A만 존재하므로 `A`로 채우면 될 듯 하다.

In [17]:
# List the distinct 자격유형 values present in complex C2411.
test.loc[test.단지코드 == 'C2411', '자격유형'].unique().tolist()

['A', nan]

In [18]:
# Fill the missing 자격유형 of complex C2411 with 'A'.
c2411_missing = (test.단지코드 == 'C2411') & test.자격유형.isnull()
test.loc[c2411_missing, '자격유형'] = 'A'

#### `C2253` 자격유형 양상

- 임대보증금, 임대료가 존재하면 자격유형이 C, 없으면 D로 설정되어있는 양상을 보인다.
- Nan이 있는 example은 임대보증금, 임대료가 존재하므로 `C`로 채우면 될 듯 하다.

In [19]:
# List the distinct 자격유형 values present in complex C2253.
test.loc[test.단지코드 == 'C2253', '자격유형'].unique().tolist()

[nan, 'C', 'D']

In [20]:
# Show every test row of complex C2253 to compare 자격유형 against the rent columns.
test[test.단지코드 == 'C2253']

Unnamed: 0,단지코드,총세대수,임대건물구분,지역,공급유형,전용면적,전용면적별세대수,공가수,자격유형,임대보증금,임대료,도보 10분거리 내 지하철역 수(환승노선 수 반영),도보 10분거리 내 버스정류장 수,단지내주차면수
258,C2253,1161,아파트,강원도,영구임대,26.37,745,0.0,,2249000.0,44770.0,0.0,2.0,173.0
259,C2253,1161,아파트,강원도,영구임대,31.32,239,0.0,C,3731000.0,83020.0,0.0,2.0,173.0
260,C2253,1161,아파트,강원도,영구임대,31.32,149,0.0,C,3731000.0,83020.0,0.0,2.0,173.0
261,C2253,1161,상가,강원도,임대상가,13.77,1,0.0,D,0.0,0.0,0.0,2.0,173.0
262,C2253,1161,상가,강원도,임대상가,22.89,1,0.0,D,0.0,0.0,0.0,2.0,173.0
263,C2253,1161,상가,강원도,임대상가,22.91,1,0.0,D,0.0,0.0,0.0,2.0,173.0
264,C2253,1161,상가,강원도,임대상가,23.79,1,0.0,D,0.0,0.0,0.0,2.0,173.0
265,C2253,1161,상가,강원도,임대상가,23.79,1,0.0,D,0.0,0.0,0.0,2.0,173.0
266,C2253,1161,상가,강원도,임대상가,23.86,1,0.0,D,0.0,0.0,0.0,2.0,173.0
267,C2253,1161,상가,강원도,임대상가,23.86,1,0.0,D,0.0,0.0,0.0,2.0,173.0


In [21]:
# Fill the missing 자격유형 of complex C2253 with 'C'.
c2253_missing = (test.단지코드 == 'C2253') & test.자격유형.isnull()
test.loc[c2253_missing, '자격유형'] = 'C'

## 중복 확인

In [22]:
# Compare the train shape before and after removing fully duplicated rows.
train_dedup_shape = train.drop_duplicates().shape
print(f"Train Shape : {train.shape}")
print(f"Drop Duplicates Train shape : {train_dedup_shape}")
print(f"Number of Duplicates in Train: {train.shape[0] - train_dedup_shape[0]}")

Train Shape : (2952, 15)
Drop Duplicates Train shape : (2632, 15)
Number of Duplicates in Train: 320


In [23]:
# Compare the test shape before and after removing fully duplicated rows.
test_dedup_shape = test.drop_duplicates().shape
print(f"Test Shape : {test.shape}")
print(f"Drop Duplicates Test shape : {test_dedup_shape}")
print(f"Number of Duplicates in Test: {test.shape[0] - test_dedup_shape[0]}")

Test Shape : (1022, 14)
Drop Duplicates Test shape : (949, 14)
Number of Duplicates in Test: 73


### Drop Duplicates

In [24]:
# Keep only the first occurrence of each fully duplicated row in both datasets.
train, test = train.drop_duplicates(), test.drop_duplicates()

### 하나의 단지 코드에서 Unique 값 확인 (C2483)

In [25]:
# Show every train row of complex C2483 as a worked example.
train[train.단지코드=='C2483']

Unnamed: 0,단지코드,총세대수,임대건물구분,지역,공급유형,전용면적,전용면적별세대수,공가수,자격유형,임대보증금,임대료,도보 10분거리 내 지하철역 수(환승노선 수 반영),도보 10분거리 내 버스정류장 수,단지내주차면수,등록차량수
0,C2483,900,아파트,경상북도,국민임대,39.72,134,38.0,A,15667000.0,103680.0,0.0,3.0,1425.0,1015.0
1,C2483,900,아파트,경상북도,국민임대,39.72,15,38.0,A,15667000.0,103680.0,0.0,3.0,1425.0,1015.0
2,C2483,900,아파트,경상북도,국민임대,51.93,385,38.0,A,27304000.0,184330.0,0.0,3.0,1425.0,1015.0
3,C2483,900,아파트,경상북도,국민임대,51.93,15,38.0,A,27304000.0,184330.0,0.0,3.0,1425.0,1015.0
4,C2483,900,아파트,경상북도,국민임대,51.93,41,38.0,A,27304000.0,184330.0,0.0,3.0,1425.0,1015.0
5,C2483,900,아파트,경상북도,국민임대,51.95,89,38.0,A,27304000.0,184330.0,0.0,3.0,1425.0,1015.0
6,C2483,900,아파트,경상북도,국민임대,51.95,135,38.0,A,27304000.0,184330.0,0.0,3.0,1425.0,1015.0
7,C2483,900,아파트,경상북도,국민임대,59.88,86,38.0,A,30357000.0,214270.0,0.0,3.0,1425.0,1015.0


In [32]:
# Split the columns by whether complex C2483 holds a single distinct value in them.
c2483_is_constant = train[train.단지코드 == 'C2483'].nunique() == 1
constant_cols = list(train.columns[c2483_is_constant])
# A list comprehension keeps the original column order; the previous set
# difference printed the names in arbitrary order.
varying_cols = [name for name in train.columns if name not in constant_cols]

print(f"전체 변수들 : \n{train.columns.tolist()}\n")
print(f"C2483에서 유일한 값을 가지는 변수들 : \n {constant_cols} \n")
# The label previously read "C2438", which is not the complex being inspected.
print(f"C2483에서 중복된 값을 가지는 변수들 : \n {varying_cols} \n")

전체 변수들 : 
['단지코드', '총세대수', '임대건물구분', '지역', '공급유형', '전용면적', '전용면적별세대수', '공가수', '자격유형', '임대보증금', '임대료', '도보 10분거리 내 지하철역 수(환승노선 수 반영)', '도보 10분거리 내 버스정류장 수', '단지내주차면수', '등록차량수']

C2483에서 유일한 값을 가지는 변수들 : 
 ['단지코드', '총세대수', '임대건물구분', '지역', '공급유형', '공가수', '자격유형', '도보 10분거리 내 지하철역 수(환승노선 수 반영)', '도보 10분거리 내 버스정류장 수', '단지내주차면수', '등록차량수'] 

C2438에서 중복된 값을 가지는 변수들 : 
 ['임대보증금', '임대료', '전용면적별세대수', '전용면적'] 



In [34]:
# Count the distinct values (NaN included) of every column within each complex.
train.groupby(['단지코드']).nunique(dropna=False)

Unnamed: 0_level_0,총세대수,임대건물구분,지역,공급유형,전용면적,전용면적별세대수,공가수,자격유형,임대보증금,임대료,도보 10분거리 내 지하철역 수(환승노선 수 반영),도보 10분거리 내 버스정류장 수,단지내주차면수,등록차량수
단지코드,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
C1000,1,1,1,1,5,6,1,1,3,3,1,1,1,1
C1004,1,2,1,2,15,3,1,2,3,3,1,1,1,1
C1005,1,1,1,1,3,3,1,1,3,3,1,1,1,1
C1013,1,1,1,1,4,5,1,1,3,3,1,1,1,1
C1014,1,1,1,1,6,7,1,1,4,4,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C2663,1,1,1,1,5,6,1,1,4,4,1,1,1,1
C2666,1,1,1,1,4,6,1,1,2,2,1,1,1,1
C2670,1,1,1,1,4,3,1,1,3,3,1,1,1,1
C2680,1,1,1,1,2,3,1,1,2,2,1,1,1,1


- 단지코드가 423개 이므로 groupby한 결과의 unique값 개수의 총합 값이 `423`보다 크면, 그 feature는 `unique값이 2개 이상 존재하는 feature`이다.
    - 임대건물구분, 공급유형, 전용면적, 전용면적별세대수, 자격유형, 임대보증금, 임대료
- 단지코드별로 집계할 때
    - unique값이 1개씩 존재하는 feature 들은 그대로 사용,
    - 여러개 존재하는 feature들은 각 항목들을 feature로 만들어 사용

In [35]:
# Sum the per-complex distinct counts for each column.
train.groupby(['단지코드']).nunique(dropna=False).sum(axis=0)

총세대수                             423
임대건물구분                           456
지역                               423
공급유형                             488
전용면적                            1898
전용면적별세대수                        2230
공가수                              423
자격유형                             510
임대보증금                           1277
임대료                             1289
도보 10분거리 내 지하철역 수(환승노선 수 반영)     423
도보 10분거리 내 버스정류장 수               423
단지내주차면수                          423
등록차량수                            423
dtype: int64

## 단지코드 별 집계

### 단지 코드 내에 unique 값이 하나만 존재하는 feature들

In [46]:
# Columns that hold a single value per complex code.
unique_cols = ['총세대수', '지역', '공가수',
    '도보 10분거리 내 지하철역 수(환승노선 수 반영)',
    '도보 10분거리 내 버스정류장 수',
    '단지내주차면수', '등록차량수']
# The test set has no target column (등록차량수).
test_unique_cols = [col for col in unique_cols if col != '등록차량수']

# Build one row per complex. The complex code is part of the duplicate key
# because drop_duplicates() ignores the index: de-duplicating after
# set_index() would merge two different complexes that happen to share the
# same values. Row order (first appearance) is unchanged.
train_agg = train.drop_duplicates(subset=['단지코드'] + unique_cols).set_index('단지코드')[unique_cols]
test_agg = test.drop_duplicates(subset=['단지코드'] + test_unique_cols).set_index('단지코드')[test_unique_cols]

In [47]:
# Display the per-complex train table.
train_agg

Unnamed: 0_level_0,총세대수,지역,공가수,도보 10분거리 내 지하철역 수(환승노선 수 반영),도보 10분거리 내 버스정류장 수,단지내주차면수,등록차량수
단지코드,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
C2483,900,경상북도,38.0,0.0,3.0,1425.0,1015.0
C2515,545,경상남도,17.0,0.0,3.0,624.0,205.0
C1407,1216,대전광역시,13.0,1.0,1.0,1285.0,1064.0
C1945,755,경기도,6.0,1.0,3.0,734.0,730.0
C1470,696,전라북도,14.0,0.0,2.0,645.0,553.0
...,...,...,...,...,...,...,...
C2586,90,제주특별자치도,7.0,0.0,3.0,66.0,57.0
C2035,492,강원도,24.0,0.0,1.0,521.0,246.0
C2020,40,부산광역시,7.0,1.0,2.0,25.0,19.0
C2437,90,충청북도,12.0,0.0,1.0,30.0,16.0


In [48]:
# Display the per-complex test table.
test_agg

Unnamed: 0_level_0,총세대수,지역,공가수,도보 10분거리 내 지하철역 수(환승노선 수 반영),도보 10분거리 내 버스정류장 수,단지내주차면수
단지코드,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
C1072,754,경기도,14.0,0.0,2.0,683.0
C1128,1354,경기도,9.0,0.0,3.0,1216.0
C1456,619,부산광역시,18.0,0.0,16.0,547.0
C1840,593,전라북도,7.0,0.0,3.0,543.0
C1332,1297,경기도,11.0,0.0,2.0,1112.0
...,...,...,...,...,...,...
C2456,349,제주특별자치도,17.0,0.0,4.0,270.0
C1266,596,충청북도,35.0,0.0,1.0,593.0
C2152,120,강원도,9.0,0.0,1.0,40.0
C1267,675,경상남도,38.0,0.0,1.0,467.0


### 단지코드 내에 unique 값이 2개 이상 존재하는 feature들

- 임대건물구분, 공급유형, 전용면적, 전용면적별세대수, 자격유형, 임대보증금, 임대료

In [59]:
def reshape_cat_features(data, cast_col, value_col, index_col='단지코드'):
    """Pivot a categorical column into one wide column per category.

    Rows are first de-duplicated on (index_col, cast_col) and given a helper
    column ``counter`` equal to 1 (overwriting any existing column of that
    name), so passing ``value_col='counter'`` yields 0/1 indicator columns.

    Args:
        data: DataFrame containing ``index_col`` and ``cast_col``.
        cast_col: Categorical column whose values become the new columns.
        value_col: Column supplying the cell values of the pivoted table.
        index_col: Column identifying one output row; defaults to '단지코드'.

    Returns:
        DataFrame indexed by ``index_col`` with columns named
        ``"<cast_col>_<category>"``; missing combinations are filled with 0.
    """
    deduped = data.drop_duplicates([index_col, cast_col]).assign(counter=1)
    wide = deduped.pivot(index=index_col, columns=cast_col, values=value_col).fillna(0)
    wide.columns.name = None
    # Format with an f-string so non-string categories (e.g. integers) do not
    # raise TypeError as plain string concatenation would.
    wide = wide.rename(columns={category: f"{cast_col}_{category}" for category in wide.columns})
    return wide

#### 임대건물구분

In [60]:
# Preview the per-complex indicator columns for 임대건물구분 (train).
reshape_cat_features(data=train, cast_col='임대건물구분', value_col='counter')

Unnamed: 0_level_0,임대건물구분_상가,임대건물구분_아파트
단지코드,Unnamed: 1_level_1,Unnamed: 2_level_1
C1000,0.0,1.0
C1004,1.0,1.0
C1005,0.0,1.0
C1013,0.0,1.0
C1014,0.0,1.0
...,...,...
C2663,0.0,1.0
C2666,0.0,1.0
C2670,0.0,1.0
C2680,0.0,1.0


In [61]:
# Preview the per-complex indicator columns for 임대건물구분 (test).
reshape_cat_features(data=test, cast_col='임대건물구분', value_col='counter')

Unnamed: 0_level_0,임대건물구분_상가,임대건물구분_아파트
단지코드,Unnamed: 1_level_1,Unnamed: 2_level_1
C1003,0.0,1.0
C1006,1.0,1.0
C1016,0.0,1.0
C1019,0.0,1.0
C1030,0.0,1.0
...,...,...
C2653,0.0,1.0
C2675,0.0,1.0
C2676,1.0,1.0
C2688,0.0,1.0


#### 공급유형

In [62]:
# Side-by-side row counts of each 공급유형 value in train and test.
pd.concat([train.공급유형.value_counts(), test.공급유형.value_counts()], axis=1)

Unnamed: 0,공급유형,공급유형.1
국민임대,1730,619.0
임대상가,285,112.0
행복주택,203,121.0
공공임대(10년),203,34.0
영구임대,149,44.0
공공임대(50년),31,13.0
공공임대(분납),12,6.0
장기전세,9,
공공분양,7,
공공임대(5년),3,


In [63]:
# Merge the supply types into two combined categories, in both datasets.
supply_groups = {
    '공공임대(5년/10년/분납/분양)': ['공공임대(5년)', '공공분양', '공공임대(10년)', '공공임대(분납)'],
    '국민임대/장기전세': ['장기전세', '국민임대'],
}

for merged_name, members in supply_groups.items():
    for frame in (train, test):
        frame.loc[frame.공급유형.isin(members), '공급유형'] = merged_name

In [64]:
# Distinct 공급유형 values in train after merging.
set(train.공급유형)

{'공공임대(50년)', '공공임대(5년/10년/분납/분양)', '국민임대/장기전세', '영구임대', '임대상가', '행복주택'}

In [65]:
# Distinct 공급유형 values in test after merging.
set(test.공급유형)

{'공공임대(50년)', '공공임대(5년/10년/분납/분양)', '국민임대/장기전세', '영구임대', '임대상가', '행복주택'}

In [66]:
# Preview the per-complex indicator columns for 공급유형 (train).
reshape_cat_features(data=train, cast_col='공급유형', value_col='counter')

Unnamed: 0_level_0,공급유형_공공임대(50년),공급유형_공공임대(5년/10년/분납/분양),공급유형_국민임대/장기전세,공급유형_영구임대,공급유형_임대상가,공급유형_행복주택
단지코드,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
C1000,0.0,0.0,1.0,0.0,0.0,0.0
C1004,0.0,0.0,0.0,1.0,1.0,0.0
C1005,0.0,0.0,1.0,0.0,0.0,0.0
C1013,0.0,0.0,1.0,0.0,0.0,0.0
C1014,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...
C2663,0.0,0.0,1.0,0.0,0.0,0.0
C2666,0.0,0.0,1.0,0.0,0.0,0.0
C2670,0.0,0.0,1.0,0.0,0.0,0.0
C2680,0.0,0.0,1.0,0.0,0.0,0.0


In [67]:
# Preview the per-complex indicator columns for 공급유형 (test).
reshape_cat_features(data=test, cast_col='공급유형', value_col='counter')

Unnamed: 0_level_0,공급유형_공공임대(50년),공급유형_공공임대(5년/10년/분납/분양),공급유형_국민임대/장기전세,공급유형_영구임대,공급유형_임대상가,공급유형_행복주택
단지코드,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
C1003,0.0,0.0,0.0,0.0,0.0,1.0
C1006,0.0,0.0,0.0,1.0,1.0,0.0
C1016,0.0,0.0,1.0,0.0,0.0,0.0
C1019,0.0,0.0,1.0,0.0,0.0,0.0
C1030,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...
C2653,0.0,0.0,1.0,0.0,0.0,0.0
C2675,0.0,0.0,1.0,0.0,0.0,0.0
C2676,0.0,0.0,0.0,1.0,1.0,0.0
C2688,0.0,0.0,1.0,1.0,0.0,0.0


#### 자격유형

- feature 생성 컨셉 : 특정자격 유형이 많으면 그 단지는 등록차량수가 적을 것이다.
    - ex) 소득수준이 낮은 자격유형의 세대가 많으면 주차수요가 적을 가능성이 높을 것으로 예상가능하다.
- 문제점 : 특성이 비슷한 항목끼리 묶어야 의미있을 듯 한데, 비식별화 되어있어서 각 코드값이 무엇인지 정확히 알 수 없다.
- 단지코드별로 자격유형별 세대수를 알 수 있으면 좋을 것 같지만 이 데이터에서는 파악이 불가능하다.
- 자격유형별 소득수준을 간접적으로 파악하기 위해 자격유형별 임대보증금의 평균, 임대료의 평균을 feature로 사용해 볼 수 있을 것 같다.

In [68]:
# Side-by-side row counts of each 자격유형 value in train and test.
qualification_counts = [train.자격유형.value_counts(), test.자격유형.value_counts()]
pd.concat(qualification_counts, axis=1)

Unnamed: 0,자격유형,자격유형.1
A,1775,569.0
D,292,114.0
H,154,92.0
J,105,81.0
C,92,35.0
I,49,7.0
E,37,10.0
K,33,16.0
L,33,12.0
N,29,10.0


In [69]:
# Distinct (임대건물구분, 공급유형) pairs among train rows with 자격유형 'B'.
train.loc[train.자격유형=='B', ['임대건물구분', '공급유형']].drop_duplicates()

Unnamed: 0,임대건물구분,공급유형
26,아파트,국민임대/장기전세


In [71]:
# Distinct (자격유형, 임대건물구분) pairs among train rows of the 국민임대/장기전세 supply type.
train.loc[train.공급유형 == '국민임대/장기전세', ['자격유형', '임대건물구분']].drop_duplicates()

Unnamed: 0,자격유형,임대건물구분
0,A,아파트
26,B,아파트
823,E,아파트
886,G,아파트
898,H,아파트


In [72]:
# Row counts per 자격유형 within the 국민임대/장기전세 supply type.
train.loc[train.공급유형 == '국민임대/장기전세', '자격유형'].value_counts()

A    1524
H     154
E      34
B      18
G       9
Name: 자격유형, dtype: int64

국민임대/장기전세

- (일반) 해당지역 거주 무주택세대구성원
- (특별/우선)3자녀 이상 가구
- (특별/우선)국가유공자
- (특별/우선)영구임대 입주자
- (특별/우선)비닐간이공작물 거주자
- (특별/우선)신혼부부(혼인기간 5년이내)
- (특별/우선)사업지구 철거민
- (특별/우선)기타 공급대상(고령자, 노부모부양자, 장애인, 파독근로자 등)
- 건수가 가장 많은 A는 `해당지역 거주 무주택세대구성원`으로 예상

In [73]:
# Look at the 자격유형 distribution of the other supply types as well,
# starting with 영구임대.
train.loc[train.공급유형.isin(['영구임대']), '자격유형'].value_counts()

C    92
I    49
E     3
F     3
A     2
Name: 자격유형, dtype: int64

영구임대

- (일반)생계급여 또는 의료급여 수급자
- (일반)국가유공자
- (특별/우선)수급자 선정기준의 소득인정액 이하인 국가유공자
- (특별/우선)귀환국군포로
- (특별/우선)수급자 신혼부부

In [74]:
# Row counts per 자격유형 within the merged 공공임대(5년/10년/분납/분양) supply type.
train.loc[train.공급유형.isin(['공공임대(5년/10년/분납/분양)']), '자격유형'].value_counts()

A    218
D      7
Name: 자격유형, dtype: int64

공공임대(5년/10년/분납/분양)

- (일반)해당지역 거주무주택세대구성원
- (특별/우선)다자녀 특별
- (특별/우선)신혼부부 특별
- (특별/우선)생애최초 특별
- (특별/우선)노부모부양 특별
- (특별/우선)국가유공자 특별
- (특별/우선)기타 특별(장애인, 철거민 등)
- A는 `(일반)해당지역 거주무주택세대구성원`으로 추정됨

In [75]:
# Row counts per 자격유형 within the 행복주택 supply type.
train.loc[train.공급유형.isin(['행복주택']), '자격유형'].value_counts()

J    105
K     33
L     33
N     29
M      2
O      1
Name: 자격유형, dtype: int64

행복주택

- 대학생(취준생 포함), 사회초년생(재취준생 포함), 신혼부부(예비신혼부부, 대학생, 취준생 신혼부부 포함), 고령자, 주거급여수급자, 산업단지 근로자
- 이렇게 6개 항목이랑 매핑 될듯

In [76]:
train.loc[train.자격유형.isin(['J', 'L', 'K', 'N', 'M', 'O']), '공급유형'].value_counts()
# 'J', 'L', 'K', 'N', 'M', 'O' appear only where 공급유형 is 행복주택, so grouping them on their own seems best.

행복주택    203
Name: 공급유형, dtype: int64

In [77]:

# Collapse the six 행복주택-only qualification codes into one category.
happy_housing_codes = ['J', 'L', 'K', 'N', 'M', 'O']
for frame in (train, test):
    frame.loc[frame.자격유형.isin(happy_housing_codes), '자격유형'] = '행복주택_공급대상'

- 'H', 'B', 'E', 'G' 는 국민임대/장기전세 공급대상(E는 영구임대인 경우도 있긴 하지만 국민임대/장기전세인 경우가 더 많아서 여기에 포함시킴)
- 'C', 'I', 'F' 는 영구임대 공급대상으로 묶어 보겠음

In [79]:
# Collapse the remaining qualification codes into supply-target categories.
qualification_groups = {
    '국민임대/장기전세_공급대상': ['H', 'B', 'E', 'G'],
    '영구임대_공급대상': ['C', 'I', 'F'],
}

for group_name, codes in qualification_groups.items():
    for frame in (train, test):
        frame.loc[frame.자격유형.isin(codes), '자격유형'] = group_name

In [80]:
# Side-by-side row counts of each grouped 자격유형 value in train and test.
pd.concat([train.자격유형.value_counts(), 
           test.자격유형.value_counts()], axis=1)

Unnamed: 0,자격유형,자격유형.1
A,1775,569
D,292,114
국민임대/장기전세_공급대상,218,103
행복주택_공급대상,203,121
영구임대_공급대상,144,42


In [81]:
# Preview the per-complex indicator columns for 자격유형 (train).
reshape_cat_features(data=train, cast_col='자격유형', value_col='counter')

Unnamed: 0_level_0,자격유형_A,자격유형_D,자격유형_국민임대/장기전세_공급대상,자격유형_영구임대_공급대상,자격유형_행복주택_공급대상
단지코드,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
C1000,1.0,0.0,0.0,0.0,0.0
C1004,0.0,1.0,0.0,1.0,0.0
C1005,1.0,0.0,0.0,0.0,0.0
C1013,1.0,0.0,0.0,0.0,0.0
C1014,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...
C2663,0.0,0.0,1.0,0.0,0.0
C2666,1.0,0.0,0.0,0.0,0.0
C2670,1.0,0.0,0.0,0.0,0.0
C2680,1.0,0.0,0.0,0.0,0.0


In [82]:
# Preview the per-complex indicator columns for 자격유형 (test).
reshape_cat_features(data=test, cast_col='자격유형', value_col='counter')

Unnamed: 0_level_0,자격유형_A,자격유형_D,자격유형_국민임대/장기전세_공급대상,자격유형_영구임대_공급대상,자격유형_행복주택_공급대상
단지코드,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
C1003,0.0,0.0,0.0,0.0,1.0
C1006,0.0,1.0,0.0,1.0,0.0
C1016,1.0,0.0,0.0,0.0,0.0
C1019,1.0,0.0,0.0,0.0,0.0
C1030,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...
C2653,1.0,0.0,0.0,0.0,0.0
C2675,1.0,0.0,0.0,0.0,0.0
C2676,0.0,1.0,0.0,1.0,0.0
C2688,0.0,0.0,1.0,1.0,0.0


In [83]:
# Append the indicator columns of each categorical feature to the per-complex tables.
cat_feature_cols = ['임대건물구분', '공급유형', '자격유형']

train_parts = [train_agg]
test_parts = [test_agg]
for cat_col in cat_feature_cols:
    train_parts.append(reshape_cat_features(data=train, cast_col=cat_col, value_col='counter'))
    test_parts.append(reshape_cat_features(data=test, cast_col=cat_col, value_col='counter'))

train_agg = pd.concat(train_parts, axis=1)
test_agg = pd.concat(test_parts, axis=1)

In [84]:
# Show the (rows, columns) of both per-complex tables.
train_agg.shape, test_agg.shape

((423, 20), (150, 19))

# BaseLine Model Test

## 지역명 숫자로 매핑

In [86]:
# Number the regions in order of first appearance in train, then encode both tables.
local_map = {region: code for code, region in enumerate(train['지역'].unique())}

train_agg['지역'] = train_agg['지역'].map(local_map)
test_agg['지역'] = test_agg['지역'].map(local_map)

In [90]:
# List the distinct column names of the per-complex train table.
list(train_agg.columns.unique())

['총세대수',
 '지역',
 '공가수',
 '도보 10분거리 내 지하철역 수(환승노선 수 반영)',
 '도보 10분거리 내 버스정류장 수',
 '단지내주차면수',
 '등록차량수',
 '임대건물구분_상가',
 '임대건물구분_아파트',
 '공급유형_공공임대(50년)',
 '공급유형_공공임대(5년/10년/분납/분양)',
 '공급유형_국민임대/장기전세',
 '공급유형_영구임대',
 '공급유형_임대상가',
 '공급유형_행복주택',
 '자격유형_A',
 '자격유형_D',
 '자격유형_국민임대/장기전세_공급대상',
 '자격유형_영구임대_공급대상',
 '자격유형_행복주택_공급대상']

## Train

In [101]:
# Features exclude the target (등록차량수) and the region code (지역).
X_train = train_agg.drop(['등록차량수', '지역'], axis=1)
y_train = train_agg['등록차량수']
# Align the test features to the training columns (same set, same order).
# An indicator column whose category never occurs in test is added as 0,
# and a test-only column is dropped, so the model always sees the layout
# it was fitted on.
X_test = test_agg.drop(['지역'], axis=1).reindex(columns=X_train.columns, fill_value=0)

In [97]:
from sklearn.ensemble import RandomForestRegressor

# Baseline regressor with a fixed random_state and n_jobs=-1
# (all available cores, per scikit-learn's convention).
model = RandomForestRegressor(n_jobs=-1, random_state=42)

In [98]:
# Fit the baseline model on the per-complex training features.
model.fit(X_train, y_train)

RandomForestRegressor(n_jobs=-1, random_state=42)

## Inference & Submission

In [102]:
# Predict the target for every row of X_test.
pred = model.predict(X_test)

In [104]:
# Load the sample submission file from the competition data directory.
SUBMISSION_ROOT = os.path.join(DATA_ROOT, "sample_submission.csv")

submission = pd.read_csv(SUBMISSION_ROOT)
print("Data Loaded!")

Data Loaded!


In [105]:
# Display the submission template.
submission

Unnamed: 0,code,num
0,C1072,0
1,C1128,0
2,C1456,0
3,C1840,0
4,C1332,0
...,...,...
145,C2456,0
146,C1266,0
147,C2152,0
148,C1267,0


In [106]:
# Attach each prediction to its complex code, then look it up by the
# submission's `code` column. Plain positional assignment is only correct
# when X_test's row order matches the submission file exactly.
pred_by_code = pd.Series(pred, index=X_test.index)
submission['num'] = submission['code'].map(pred_by_code)

In [108]:
# Write the submission to basicEDA.csv without the index column.
submission.to_csv('basicEDA.csv', index=False)