# Baseline Model

- Competition에서 제공하는 baseline을 구현한 내용입니다.

## 문제 내용

### [Goal]

- 유형별 임대주택 설계 시 `단지 내 적정 주차 수요 예측`

### [Evaluation]

- MAE(Mean Absolute Error)

## Import Module

In [2]:
import pandas as pd
import numpy as np
import os
from tqdm.notebook import tqdm

## Data Load

In [3]:
# Directory that holds the competition CSV files, relative to this notebook.
DATA_ROOT = os.path.join('', "../../../competition_data/parking_data/")

# Full paths of the training table, the test table and the sample submission.
TRAIN_ROOT, TEST_ROOT, SUBMISSION_ROOT = (
    os.path.join(DATA_ROOT, csv_name)
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)

# Echo every resolved location, one per line, so a wrong path is easy to spot.
print("\n".join(
    f"{label} : {location}"
    for label, location in (
        ("DATA_ROOT", DATA_ROOT),
        ("TRAIN_ROOT", TRAIN_ROOT),
        ("TEST_ROOT", TEST_ROOT),
        ("SUBMISSION_ROOT", SUBMISSION_ROOT),
    )
))

DATA_ROOT : ../../../competition_data/parking_data/
TRAIN_ROOT : ../../../competition_data/parking_data/train.csv
TEST_ROOT : ../../../competition_data/parking_data/test.csv
SUBMISSION_ROOT : ../../../competition_data/parking_data/sample_submission.csv


In [4]:
# Read the training table, the test table and the sample submission, in that
# order, from the paths resolved earlier in the notebook.
Raw_train, Raw_test, submission = map(
    pd.read_csv, (TRAIN_ROOT, TEST_ROOT, SUBMISSION_ROOT)
)

print("Data Loaded!")

Data Loaded!


## EDA

### Shape

In [5]:
# Report the (rows, columns) shape of each raw table, one line per table.
print("\n".join(
    f"{split} set : {frame.shape}"
    for split, frame in (("Train", Raw_train), ("Test", Raw_test))
))

Train set : (2952, 15)
Test set : (1022, 14)


### Data Heads

In [6]:
# Preview the first rows of the raw training table (one row per complex and
# floor-area type, as the repeated 단지코드 values in the output show).
Raw_train.head()

Unnamed: 0,단지코드,총세대수,임대건물구분,지역,공급유형,전용면적,전용면적별세대수,공가수,자격유형,임대보증금,임대료,도보 10분거리 내 지하철역 수(환승노선 수 반영),도보 10분거리 내 버스정류장 수,단지내주차면수,등록차량수
0,C2483,900,아파트,경상북도,국민임대,39.72,134,38.0,A,15667000,103680,0.0,3.0,1425.0,1015.0
1,C2483,900,아파트,경상북도,국민임대,39.72,15,38.0,A,15667000,103680,0.0,3.0,1425.0,1015.0
2,C2483,900,아파트,경상북도,국민임대,51.93,385,38.0,A,27304000,184330,0.0,3.0,1425.0,1015.0
3,C2483,900,아파트,경상북도,국민임대,51.93,15,38.0,A,27304000,184330,0.0,3.0,1425.0,1015.0
4,C2483,900,아파트,경상북도,국민임대,51.93,41,38.0,A,27304000,184330,0.0,3.0,1425.0,1015.0


In [7]:
# Preview the first rows of the raw test table; it has the same layout as the
# training table minus the 등록차량수 column.
Raw_test.head()

Unnamed: 0,단지코드,총세대수,임대건물구분,지역,공급유형,전용면적,전용면적별세대수,공가수,자격유형,임대보증금,임대료,도보 10분거리 내 지하철역 수(환승노선 수 반영),도보 10분거리 내 버스정류장 수,단지내주차면수
0,C1072,754,아파트,경기도,국민임대,39.79,116,14.0,H,22830000,189840,0.0,2.0,683.0
1,C1072,754,아파트,경기도,국민임대,46.81,30,14.0,A,36048000,249930,0.0,2.0,683.0
2,C1072,754,아파트,경기도,국민임대,46.9,112,14.0,H,36048000,249930,0.0,2.0,683.0
3,C1072,754,아파트,경기도,국민임대,46.9,120,14.0,H,36048000,249930,0.0,2.0,683.0
4,C1072,754,아파트,경기도,국민임대,51.46,60,14.0,H,43497000,296780,0.0,2.0,683.0


### column names

- train set에만 있고 test set에는 없는 `등록차량수`를 target으로 사용합니다.

In [8]:
# Capture the original CSV headers as plain lists; they are compared with the
# aggregated tables' columns later in the notebook.
train_cols = list(Raw_train.columns)
test_cols = list(Raw_test.columns)

print(f"[Train columns] : \n{train_cols} \n")
print(f"[Test columns] : \n{test_cols}")

[Train columns] : 
['단지코드', '총세대수', '임대건물구분', '지역', '공급유형', '전용면적', '전용면적별세대수', '공가수', '자격유형', '임대보증금', '임대료', '도보 10분거리 내 지하철역 수(환승노선 수 반영)', '도보 10분거리 내 버스정류장 수', '단지내주차면수', '등록차량수'] 

[Test columns] : 
['단지코드', '총세대수', '임대건물구분', '지역', '공급유형', '전용면적', '전용면적별세대수', '공가수', '자격유형', '임대보증금', '임대료', '도보 10분거리 내 지하철역 수(환승노선 수 반영)', '도보 10분거리 내 버스정류장 수', '단지내주차면수']


### 결측치 확인

In [9]:
# Count the missing values in each column of the raw training table.
print("[Train]")
Raw_train.isna().sum()

[Train]


단지코드                              0
총세대수                              0
임대건물구분                            0
지역                                0
공급유형                              0
전용면적                              0
전용면적별세대수                          0
공가수                               0
자격유형                              0
임대보증금                           569
임대료                             569
도보 10분거리 내 지하철역 수(환승노선 수 반영)    211
도보 10분거리 내 버스정류장 수                4
단지내주차면수                           0
등록차량수                             0
dtype: int64

In [10]:
# Count the missing values in each column of the raw test table.
print("[Test]")
Raw_test.isna().sum()

[Test]


단지코드                              0
총세대수                              0
임대건물구분                            0
지역                                0
공급유형                              0
전용면적                              0
전용면적별세대수                          0
공가수                               0
자격유형                              2
임대보증금                           180
임대료                             180
도보 10분거리 내 지하철역 수(환승노선 수 반영)     42
도보 10분거리 내 버스정류장 수                0
단지내주차면수                           0
dtype: int64

## Column name 변경

- `지하철역 수` -> 지하철
- `버스정류장 수` -> 버스
- `자격유형` -> 신분

In [11]:
# Shorten the three verbose CSV headers used by the rest of the notebook.
# Renaming by header name (rather than overwriting the whole header list by
# position) leaves every other column untouched and cannot mislabel a column
# if the CSV column order or count ever changes. Headers that are absent are
# skipped, so re-running this cell is harmless.
short_names = {
    '자격유형': '신분',
    '도보 10분거리 내 지하철역 수(환승노선 수 반영)': '지하철',
    '도보 10분거리 내 버스정류장 수': '버스',
}

# In-place so that existing references to the raw tables see the new headers.
Raw_train.rename(columns=short_names, inplace=True)
Raw_test.rename(columns=short_names, inplace=True)

In [12]:
# Work on copies so the raw tables stay untouched by the preprocessing below.
train, test = Raw_train.copy(), Raw_test.copy()

## 지역명 숫자로 매핑

In [13]:
# Assign each region name an integer id, numbered in order of first appearance
# in the training table.
local_map = {
    region: region_id
    for region_id, region in enumerate(train['지역'].unique())
}

In [14]:
# Replace region names with the integer ids stored in local_map.
# NOTE(review): local_map is built from the training regions only, so a region
# present only in the test table would become NaN here — confirm none exist.
train['지역'] = train['지역'].map(local_map)
test['지역'] = test['지역'].map(local_map)

## 전용 면적을 5의 배수로 내림(floor)하여 변경

In [15]:
# Floor every floor area to the nearest lower multiple of 5 (e.g. 39.72 -> 35.0)
# so that similar unit sizes fall into the same bin.
train['전용면적'] = train['전용면적'].floordiv(5).mul(5)
test['전용면적'] = test['전용면적'].floordiv(5).mul(5)

## 전용 면적 `상/하한` 적용

- `상한` : 100
- `하한` : 15

In [16]:
# Clamp the binned floor area into [15, 100]: anything above 100 becomes 100
# and anything below 15 becomes 15. A single clip call per table replaces the
# four hand-written index lookups and gives the same values.
train['전용면적'] = train['전용면적'].clip(lower=15, upper=100)
test['전용면적'] = test['전용면적'].clip(lower=15, upper=100)

In [17]:
# Show the distinct floor-area bins left in each table after binning and
# clamping.
print(f"[Train Unique Values] \n {train['전용면적'].unique()}\n")
print(f"[Test Unique Values] \n {test['전용면적'].unique()}")

[Train Unique Values] 
 [ 35.  50.  55.  30.  45.  40.  25.  70.  15.  20. 100.  60.  75.  80.
  65.]

[Test Unique Values] 
 [ 35.  45.  50.  30.  55.  25.  75. 100.  15.  20.  40.  60.  80.  70.]


## 단지별 데이터 1차원으로 취합

- `단지코드`가 같은 행들은 아래 `columns`에 해당하는 feature 값이 모두 같다고 가정하고, 각 단지의 첫 번째 행 값을 대표값으로 사용합니다.

In [18]:
# Per-complex features that are copied as-is into the aggregated tables.
columns = ['단지코드', '총세대수', '공가수', '지역', '단지내주차면수', '지하철', '버스']
# Name of the label column.
target = '등록차량수'
# One output column per floor-area bin seen in the training table, named
# "면적_<bin>", in order of first appearance.
area_columns = [f"면적_{area}" for area in train['전용면적'].unique()]

In [19]:
# Print the generated area column names; np.unique returns them sorted, while
# area_columns itself keeps its original order.
print(np.unique(area_columns))

['면적_100.0' '면적_15.0' '면적_20.0' '면적_25.0' '면적_30.0' '면적_35.0' '면적_40.0'
 '면적_45.0' '면적_50.0' '면적_55.0' '면적_60.0' '면적_65.0' '면적_70.0' '면적_75.0'
 '면적_80.0']


In [20]:
# Two separate, empty tables that will receive one row per housing complex.
new_train, new_test = pd.DataFrame(), pd.DataFrame()

In [21]:
def _aggregate_by_complex(frame, base_columns, area_columns, target=None):
    """Collapse per-unit-type rows into a single row per housing complex.

    Vectorised replacement for filling an empty DataFrame one cell at a time
    with ``.loc`` (which re-allocates on every new row/column), shared by the
    train and test tables.

    Args:
        frame: Table with one row per (complex, floor-area type). Must contain
            '단지코드', '전용면적', '전용면적별세대수', every name in
            ``base_columns`` and, when given, ``target``.
        base_columns: Columns copied from the first row of each complex;
            must include '단지코드'.
        area_columns: Output column names of the form ``"면적_<bin>"``; the
            text after the last underscore is parsed as the floor-area bin.
        target: Optional label column, taken from the first row of each
            complex and placed as the last output column.

    Returns:
        DataFrame indexed 0..n-1 with one row per complex in order of first
        appearance and columns ``base_columns + area_columns (+ target)``.
        All columns other than '단지코드' are float.
    """
    code_col = '단지코드'

    # keep='first' represents every complex by its first row (NaNs included)
    # and preserves the order in which the complexes first appear.
    first_rows = frame.drop_duplicates(subset=code_col, keep='first')
    codes = first_rows[code_col].to_numpy()

    base = first_rows[list(base_columns)].reset_index(drop=True)
    # Cast to float so dtypes match the previous cell-by-cell construction.
    base = base.astype({name: float for name in base.columns if name != code_col})

    # Households per (complex, floor-area bin); a missing combination counts 0.
    households = (
        frame.groupby([code_col, '전용면적'])['전용면적별세대수']
        .sum()
        .unstack(fill_value=0)
    )
    area_bins = [float(name.split('_')[-1]) for name in area_columns]
    # Align rows with `base` and columns with the requested bins; bins that
    # never occur in this table become all-zero columns.
    households = households.reindex(index=codes, columns=area_bins, fill_value=0)
    households.columns = list(area_columns)
    households = households.reset_index(drop=True).astype(float)

    parts = [base, households]
    if target is not None:
        # The label goes last, where the positional feature/label split
        # later in the notebook expects it.
        parts.append(first_rows[target].reset_index(drop=True).astype(float))
    return pd.concat(parts, axis=1)


new_train = _aggregate_by_complex(train, columns, area_columns, target='등록차량수')
new_test = _aggregate_by_complex(test, columns, area_columns)


0it [00:00, ?it/s]

In [None]:
# Display the aggregated training table (one row per complex).
new_train

Unnamed: 0,단지코드,총세대수,공가수,지역,단지내주차면수,지하철,버스,면적_35.0,면적_50.0,면적_55.0,...,면적_25.0,면적_70.0,면적_15.0,면적_20.0,면적_100.0,면적_60.0,면적_75.0,면적_80.0,면적_65.0,등록차량수
0,C2483,900.0,38.0,0.0,1425.0,0.0,3.0,149.0,665.0,86.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1015.0
1,C2515,545.0,17.0,1.0,624.0,0.0,3.0,80.0,132.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,205.0
2,C1407,1216.0,13.0,2.0,1285.0,1.0,1.0,0.0,124.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1064.0
3,C1945,755.0,6.0,3.0,734.0,1.0,3.0,240.0,303.0,212.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,730.0
4,C1470,696.0,14.0,4.0,645.0,0.0,2.0,254.0,246.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,553.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
418,C2586,90.0,7.0,9.0,66.0,0.0,3.0,36.0,0.0,0.0,...,42.0,0.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0,57.0
419,C2035,492.0,24.0,5.0,521.0,0.0,1.0,156.0,0.0,0.0,...,156.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,246.0
420,C2020,40.0,7.0,8.0,25.0,1.0,2.0,15.0,0.0,0.0,...,5.0,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,19.0
421,C2437,90.0,12.0,11.0,30.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,90.0,0.0,0.0,0.0,0.0,0.0,16.0


## Feature 변화

- 이름을 바꾼 Feature : 지하철역 수, 버스정류장 수
    - 지하철역 수 -> 지하철
    - 버스정류장 수 -> 버스
- 다르게 사용된 Feature : 전용면적, 전용면적별 세대수
    - 이 둘의 조합
- 사라진 Feature : `공급유형`, `임대료`, `임대건물구분`, `임대보증금`, `자격유형`

In [None]:
# Column names of the aggregated tables, as sets.
new_train_cols = set(new_train.columns)
new_test_cols = set(new_test.columns)

# Rebind the header lists captured from the raw tables (taken before the
# headers were shortened) as sets so they can be compared.
train_cols, test_cols = set(train_cols), set(test_cols)

# Headers only in the raw table, only in the aggregated table, and in both.
print(f"[Removed columns] : \n{train_cols.difference(new_train_cols)}")
print(f"\n[Added columns] : \n{new_train_cols.difference(train_cols)}")
print(f"\n[Maintained columns] : \n{train_cols.intersection(new_train_cols)}")

[Removed columns] : 
{'공급유형', '임대료', '임대건물구분', '임대보증금', '도보 10분거리 내 지하철역 수(환승노선 수 반영)', '전용면적', '전용면적별세대수', '자격유형', '도보 10분거리 내 버스정류장 수'}

[Added columns] : 
{'면적_45.0', '면적_65.0', '면적_35.0', '면적_70.0', '면적_60.0', '버스', '면적_15.0', '면적_20.0', '면적_55.0', '지하철', '면적_50.0', '면적_30.0', '면적_80.0', '면적_25.0', '면적_75.0', '면적_100.0', '면적_40.0'}

[Maintained columns] : 
{'지역', '공가수', '단지코드', '단지내주차면수', '등록차량수', '총세대수'}


## 결측치 처리

In [None]:
# Replace every remaining missing value in both aggregated tables with the
# sentinel -1.
new_train, new_test = (table.fillna(-1) for table in (new_train, new_test))

## Train

In [None]:
# Split features and label by column name rather than by position, so the
# split does not depend on where the id and label columns sit in the table.
x_train = new_train.drop(columns=['단지코드', '등록차량수'])
y_train = new_train['등록차량수']
# Reuse the training feature list so the model sees exactly the same columns
# in the same order at inference time; a missing feature raises a KeyError
# instead of silently shifting columns.
x_test = new_test[x_train.columns]

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Baseline regressor: only n_jobs and random_state are set explicitly, every
# other hyperparameter keeps the library default. The fixed random_state makes
# repeated runs comparable.
model = RandomForestRegressor(n_jobs=-1, random_state=42)

In [None]:
# Fit the regressor on the per-complex features and the 등록차량수 label.
model.fit(x_train, y_train)

RandomForestRegressor(n_jobs=-1, random_state=42)

## Inference & Submission

In [None]:
# Predict one value per row of x_test (i.e. per test complex).
pred = model.predict(x_test)

In [None]:
# Display the sample submission: one row per complex code with a placeholder
# `num` column.
submission

Unnamed: 0,code,num
0,C1072,0
1,C1128,0
2,C1456,0
3,C1840,0
4,C1332,0
...,...,...
145,C2456,0
146,C1266,0
147,C2152,0
148,C1267,0


In [None]:
# Attach each prediction to its complex code instead of relying on the
# submission rows being in the same order as new_test; a positional assignment
# would silently misalign predictions if the orders ever differed.
pred_by_code = dict(zip(new_test['단지코드'], pred))
submission['num'] = submission['code'].map(pred_by_code)

# Fail loudly if the submission asks for a complex that has no prediction.
missing_codes = submission.loc[submission['num'].isna(), 'code'].tolist()
if missing_codes:
    raise ValueError(f"No prediction for submission codes: {missing_codes}")

In [None]:
# Write the filled-in submission to baseline.csv in the working directory,
# without the DataFrame index column.
submission.to_csv('baseline.csv', index=False)