# Version 6

## Test Coverage

1. 임대보증금, 임대료, 평균임대료(자격유형) -> 금액 구간 나누기
    - 임대보증금 : **25구간**
    - 임대료 : **15구간**
    - 평균임대료(자격유형) : **10구간**

## Import Module

In [1]:
import pandas as pd
import numpy as np
from os.path import join as Join
from tqdm.notebook import tqdm

from sklearn.preprocessing import LabelEncoder, StandardScaler

## Data Load

In [2]:
# Directory holding the competition CSVs, relative to this notebook.
DATA_ROOT = Join('', '../../../../competition_data/parking_data/')

# Version-5 inputs that this notebook reads.
TRAIN_ROOT, TEST_ROOT = (
    Join(DATA_ROOT, file_name)
    for file_name in ('train_version_5.csv', 'test_version_5.csv')
)

# Echo the resolved paths so a wrong working directory is easy to spot.
print(f"DATA_ROOT : {DATA_ROOT}")
print(f"TRAIN_ROOT : {TRAIN_ROOT}")
print(f"TEST_ROOT : {TEST_ROOT}")

DATA_ROOT : ../../../../competition_data/parking_data/
TRAIN_ROOT : ../../../../competition_data/parking_data/train_version_5.csv
TEST_ROOT : ../../../../competition_data/parking_data/test_version_5.csv


In [3]:
# Read the version-5 train and test CSV files into DataFrames.
train, test = (pd.read_csv(csv_path) for csv_path in (TRAIN_ROOT, TEST_ROOT))

## 구간화 함수 정의

In [4]:
# LabelEncoder instance; not referenced anywhere else in the visible part of this notebook.
label_encoder = LabelEncoder()

def make_bin(data1, data2, variable, n, *, clip=False):
    """Replace a numeric column with equal-width bin labels fitted on ``data1``.

    The bin edges are computed from the non-null values of ``data1[variable]``
    only (the training split) and the same edges are then applied to both
    frames, so a given label means the same value range in each of them.

    Both frames are modified in place: ``variable`` is overwritten with a
    categorical column whose labels are the strings ``'0'`` .. ``str(n - 1)``.

    Args:
        data1: DataFrame used to derive the bin edges (train).
        data2: DataFrame binned with the edges derived from ``data1`` (test).
        variable: Name of the numeric column to bin; must exist in both frames.
        n: Number of equal-width bins.
        clip: When True, values outside the ``data1`` range are clamped to the
            outermost edges and therefore land in the first/last bin. When
            False (default), such values become NaN, as do missing values.

    Returns:
        The same ``(data1, data2)`` objects that were passed in.
    """
    # Fit the edges on train only. Missing values are dropped first because
    # NumPy refuses to auto-detect a histogram range that contains NaN.
    bin_dividers = np.histogram_bin_edges(data1[variable].dropna(), bins=n)
    bin_names = [str(i) for i in range(n)]

    # The edges are already fixed, so overwriting the column frame by frame
    # cannot influence the binning of the other frame.
    for frame in (data1, data2):
        values = frame[variable]
        if clip:
            # Clamp out-of-range values to the outer edges (NaN stays NaN).
            values = values.clip(lower=bin_dividers[0], upper=bin_dividers[-1])
        frame[variable] = pd.cut(
            x=values,
            bins=bin_dividers,
            labels=bin_names,
            include_lowest=True,
        )

    return data1, data2

## 구간 수 최적값 찾아보기

In [5]:
# Work on copies so the bin-count experiments below leave train/test untouched.
tmp_train, tmp_test = train.copy(), test.copy()

### 임대보증금

In [6]:
# Bin count tried for 임대보증금 (rental deposit) in the preview cells that follow.
n = 25

In [7]:
# Preview on train: equal-width edges over the deposit column, then rows per bin.
counter, bin_dividers = np.histogram(tmp_train['임대보증금'], bins=n)
bin_names = list(map(str, range(n)))

pd.cut(
    tmp_train['임대보증금'],
    bins=bin_dividers,
    labels=bin_names,
    include_lowest=True,
).value_counts()

3     90
4     76
2     47
5     47
6     34
0     27
7     25
8     18
10    12
1     10
9      9
12     6
11     5
15     3
13     2
14     2
16     2
17     2
19     1
22     1
23     1
24     1
18     0
20     0
21     0
Name: 임대보증금, dtype: int64

In [8]:
# Preview on test: the edges here come from the test values themselves,
# whereas make_bin applies the train-derived edges to both splits.
counter, bin_dividers = np.histogram(tmp_test['임대보증금'], bins=n)
bin_names = list(map(str, range(n)))

pd.cut(
    tmp_test['임대보증금'],
    bins=bin_dividers,
    labels=bin_names,
    include_lowest=True,
).value_counts()

5     21
7     19
4     17
6     17
8     14
9     13
0      8
13     6
12     6
10     5
11     5
3      3
18     3
24     3
14     2
20     2
1      1
2      1
15     1
17     1
19     1
21     1
16     0
22     0
23     0
Name: 임대보증금, dtype: int64

### 임대료

In [9]:
# Bin count tried for 임대료 (rent) in the preview cells that follow.
n = 15

In [10]:
# Preview on train: equal-width edges over the rent column, then rows per bin.
counter, bin_dividers = np.histogram(tmp_train['임대료'], bins=n)
bin_names = list(map(str, range(n)))

pd.cut(
    tmp_train['임대료'],
    bins=bin_dividers,
    labels=bin_names,
    include_lowest=True,
).value_counts()

2     136
1     118
3      69
0      34
4      29
7      10
8      10
5       5
6       5
9       4
14      1
10      0
11      0
12      0
13      0
Name: 임대료, dtype: int64

In [11]:
# Preview on test: the edges here come from the test values themselves,
# whereas make_bin applies the train-derived edges to both splits.
counter, bin_dividers = np.histogram(tmp_test['임대료'], bins=n)
bin_names = list(map(str, range(n)))

pd.cut(
    tmp_test['임대료'],
    bins=bin_dividers,
    labels=bin_names,
    include_lowest=True,
).value_counts()

3     50
2     19
4     19
6     17
5     15
7      9
0      8
1      3
12     3
9      2
13     2
14     2
8      1
10     0
11     0
Name: 임대료, dtype: int64

### 평균임대료

In [12]:
# Bin count tried for 평균임대료(자격유형) (average rent by eligibility type) in the preview cells that follow.
n = 10

In [13]:
# Preview on train: equal-width edges over the average-rent column, then rows per bin.
avg_rent_train = tmp_train['평균임대료(자격유형)']
counter, bin_dividers = np.histogram(avg_rent_train, bins=n)
bin_names = list(map(str, range(n)))

pd.cut(
    avg_rent_train,
    bins=bin_dividers,
    labels=bin_names,
    include_lowest=True,
).value_counts()

9    313
5     32
8     29
0     17
1      9
6      8
7      7
2      4
3      1
4      1
Name: 평균임대료(자격유형), dtype: int64

In [14]:
# Preview on test: the edges here come from the test values themselves,
# whereas make_bin applies the train-derived edges to both splits.
avg_rent_test = tmp_test['평균임대료(자격유형)']
counter, bin_dividers = np.histogram(avg_rent_test, bins=n)
bin_names = list(map(str, range(n)))

pd.cut(
    avg_rent_test,
    bins=bin_dividers,
    labels=bin_names,
    include_lowest=True,
).value_counts()

9    120
4     15
0      6
7      5
3      2
1      1
6      1
2      0
5      0
8      0
Name: 평균임대료(자격유형), dtype: int64

## Preprocessing (Version 6)

In [15]:
# Bin each monetary column with the bin count chosen for it above;
# edges are fitted on train and reused for test inside make_bin.
for column, n_bins in (
    ('임대보증금', 25),
    ('임대료', 15),
    ('평균임대료(자격유형)', 10),
):
    train, test = make_bin(data1=train, data2=test, variable=column, n=n_bins)

In [16]:
# Test rows whose average-rent bin is null after binning (pd.cut yields NaN for
# values outside the supplied edges and for missing inputs) get the top label '9'.
avg_rent_col = '평균임대료(자격유형)'
missing_avg_rent = test[avg_rent_col].isna()
test.loc[missing_avg_rent, avg_rent_col] = '9'

## To CSV

In [17]:
# Output locations for the version-6 tables, next to the version-5 inputs.
TRAIN_VERSION_6_ROOT = Join(DATA_ROOT, 'train_version_6.csv')
TEST_VERSION_6_ROOT = Join(DATA_ROOT, 'test_version_6.csv')

# NOTE(review): to_csv is called with its defaults, so the DataFrame index is
# written as an extra unnamed first column — confirm downstream readers expect it.
for frame, csv_path in ((train, TRAIN_VERSION_6_ROOT), (test, TEST_VERSION_6_ROOT)):
    frame.to_csv(csv_path)