# Version 5

## Test Coverage

- [Test 1] NULL 처리 방안 변화
    - 지하철
        - 충청남도 -> -1
        - 대전광역시 -> -2
        - 경상남도 -> -3
    - 버스
        - 중위값(median)

## Import Module

In [1]:
import pandas as pd
import numpy as np
from os.path import join as Join
from tqdm.notebook import tqdm

from sklearn.linear_model import Lasso
from sklearn.model_selection import StratifiedKFold

## Data Load

In [2]:
DATA_ROOT = ''
DATA_ROOT = Join(DATA_ROOT, '../../../competition_data/parking_data/')

TRAIN_ROOT = Join(DATA_ROOT, 'train.csv')
TEST_ROOT = Join(DATA_ROOT, 'test.csv')
AGE_GENDER_INFO_ROOT = Join(DATA_ROOT, 'age_gender_info.csv')

print(f"DATA_ROOT : {DATA_ROOT}")
print(f"TRAIN_ROOT : {TRAIN_ROOT}")
print(f"TEST_ROOT : {TEST_ROOT}")
print(f"AGE_GENDER_INFO_ROOT : {AGE_GENDER_INFO_ROOT}")

DATA_ROOT : ../../../competition_data/parking_data/
TRAIN_ROOT : ../../../competition_data/parking_data/train.csv
TEST_ROOT : ../../../competition_data/parking_data/test.csv
AGE_GENDER_INFO_ROOT : ../../../competition_data/parking_data/age_gender_info.csv


In [3]:
train = pd.read_csv(TRAIN_ROOT)
test = pd.read_csv(TEST_ROOT)
age_gender_info = pd.read_csv(AGE_GENDER_INFO_ROOT)

print("Data Loaded!")

Data Loaded!


## Preprocessing (Version 1)

### 지역명 숫자로 매핑

In [4]:
local_map = {}

for i, loc in enumerate(train['지역'].unique()):
    local_map[loc] = i

train['지역'] = train['지역'].map(local_map)
test['지역'] = test['지역'].map(local_map)

### 전용면적 5의 배수로 변경

In [5]:
train['전용면적'] = train['전용면적']//5*5
test['전용면적'] = test['전용면적']//5*5

### Column 명 간소화

In [6]:
train.columns = [
    '단지코드', '총세대수', '임대건물구분', '지역', '공급유형', '전용면적', '전용면적별세대수', '공가수', '자격유형',
    '임대보증금', '임대료', '지하철', '버스', '단지내주차면수', '등록차량수']

test.columns = [
    '단지코드', '총세대수', '임대건물구분', '지역', '공급유형', '전용면적', '전용면적별세대수', '공가수', '자격유형',
    '임대보증금', '임대료', '지하철', '버스','단지내주차면수']

## Preprocessing (Version 2 & Version 5)

### `'-'` -> NULL, dtype을 float으로 변경

In [7]:
columns = ['임대보증금', '임대료']

for col in columns:
    train.loc[train[col] == '-', col] = np.nan
    test.loc[test[col] == '-', col] = np.nan

    train[col] = train[col].astype(float)
    test[col] = test[col].astype(float)

### NULL 값 처리

#### 임대보증금, 임대료

In [8]:
train[['임대보증금', '임대료']] = train[['임대보증금', '임대료']].fillna(0)
test[['임대보증금', '임대료']] = test[['임대보증금', '임대료']].fillna(0)

#### 지하철, 버스

In [9]:
train.loc[(train.지하철.isnull()) & (train.지역 == local_map['충청남도']), '지하철'] = -1
train.loc[(train.지하철.isnull()) & (train.지역 == local_map['대전광역시']), '지하철'] = -2
train.loc[(train.지하철.isnull()) & (train.지역 == local_map['경상남도']), '지하철'] = -3

test.loc[(test.지하철.isnull()) & (test.지역 == local_map['충청남도']), '지하철'] = -1
test.loc[(test.지하철.isnull()) & (test.지역 == local_map['대전광역시']), '지하철'] = -2

In [10]:
train.loc[train.버스.isnull(), '버스'] = train.버스.median()

#### 자격유형

In [11]:
test.loc[test.단지코드.isin(['C2411']) & test.자격유형.isnull(), '자격유형'] = 'A'
test.loc[test.단지코드.isin(['C2253']) & test.자격유형.isnull(), '자격유형'] = 'C'

### 중복 example 제거

In [12]:
train = train.drop_duplicates()
test = test.drop_duplicates()

### 자격유형 병합

In [13]:
train.loc[train.자격유형.isin(['J', 'L', 'K', 'N', 'M', 'O']), '자격유형'] = '행복주택_공급대상'
test.loc[test.자격유형.isin(['J', 'L', 'K', 'N', 'M', 'O']), '자격유형'] = '행복주택_공급대상'

train.loc[train.자격유형.isin(['H', 'B', 'E', 'G']), '자격유형'] = '국민임대_공급대상'
test.loc[test.자격유형.isin(['H', 'B', 'E', 'G']), '자격유형'] = '국민임대_공급대상'

train.loc[train.자격유형.isin(['C', 'I', 'F']), '자격유형'] = '영구임대_공급대상'
test.loc[test.자격유형.isin(['C', 'I', 'F']), '자격유형'] = '영구임대_공급대상'

### 공급유형 병합

In [14]:
train.loc[train.공급유형.isin(['공공임대(10년)', '공공임대(분납)']), '공급유형'] = '공공임대(10년/분납)'
test.loc[test.공급유형.isin(['공공임대(10년)', '공공임대(분납)']), '공급유형'] = '공공임대(10년/분납)'

## Preprocessing (Version 3)

### Train -> 공급유형이 장기전세, 공공분양, 공공임대(5년)인 example Drop

In [15]:
idx = train[(train.공급유형 == '장기전세') | (train.공급유형 == '공공분양') | (train.공급유형 == '공공임대(5년)')].index
train = train.drop(idx)

train.공급유형.unique().tolist()

['국민임대', '공공임대(50년)', '영구임대', '임대상가', '공공임대(10년/분납)', '행복주택']

### 각 단지코드 내에 임대건물구분 -> '아파트'이면 1, '상가&아파트'이면 0 으로 매핑

In [16]:
codes = train.단지코드.unique().tolist()

for code in tqdm(codes):
    values = train.loc[train.단지코드 == code, '임대건물구분'].unique().tolist()

    if '상가' in values:
        train.loc[train.단지코드 == code, '임대건물구분'] = 0
    else:
        train.loc[train.단지코드 == code, '임대건물구분'] = 1

  0%|          | 0/421 [00:00<?, ?it/s]

In [17]:
codes = test.단지코드.unique().tolist()

for code in tqdm(codes):
    values = test.loc[test.단지코드 == code, '임대건물구분'].unique().tolist()

    if '상가' in values:
        test.loc[test.단지코드 == code, '임대건물구분'] = 0
    else:
        test.loc[test.단지코드 == code, '임대건물구분'] = 1

  0%|          | 0/150 [00:00<?, ?it/s]

### '자격유형별 평균 임대료' feature 추가

In [18]:
qualifies = train.자격유형.unique().tolist()

for qualify in tqdm(qualifies):
    train.loc[train.자격유형 == qualify, '평균임대료(자격유형)'] = train.loc[train.자격유형 == qualify, '임대료'].mean()

  0%|          | 0/5 [00:00<?, ?it/s]

In [19]:
qualifies = test.자격유형.unique().tolist()

for qualify in tqdm(qualifies):
    test.loc[test.자격유형 == qualify, '평균임대료(자격유형)'] = test.loc[test.자격유형 == qualify, '임대료'].mean()

  0%|          | 0/5 [00:00<?, ?it/s]

## Aggregation

### 단지코드 별로 모두 같은 값을 가지는 feature

In [20]:
unique_cols = ['총세대수', '지역', '공가수', '지하철', '버스', '단지내주차면수', '임대건물구분', '등록차량수']

train_eq = train.set_index('단지코드')[unique_cols].drop_duplicates()
test_eq = test.set_index('단지코드')[[col for col in unique_cols if col != '등록차량수']].drop_duplicates()

### 단지코드 별로 다양한 값을 가지는 feature

In [21]:
train_neq = train.drop(unique_cols, axis=1)
test_neq = test.drop([col for col in unique_cols if col != '등록차량수'], axis=1)

#### 단지 별 평균 값으로 처리할 feature 들

- 전용면적, 전용면적별세대수, 임대보증금, 임대료, 평균임대료(자격유형)

In [23]:
mean_cols = ['단지코드', '전용면적', '전용면적별세대수', '임대보증금', '임대료', '평균임대료(자격유형)']
mean_vals_train = train_neq[mean_cols].copy()

codes = mean_vals_train.단지코드.unique().tolist()

for code in tqdm(codes):
    mean_vals_train.loc[mean_vals_train.단지코드 == code, '전용면적'] = mean_vals_train.loc[mean_vals_train.단지코드 == code, '전용면적'].mean()
    mean_vals_train.loc[mean_vals_train.단지코드 == code, '전용면적별세대수'] = mean_vals_train.loc[mean_vals_train.단지코드 == code, '전용면적별세대수'].mean()
    mean_vals_train.loc[mean_vals_train.단지코드 == code, '임대보증금'] = mean_vals_train.loc[mean_vals_train.단지코드 == code, '임대보증금'].mean()
    mean_vals_train.loc[mean_vals_train.단지코드 == code, '임대료'] = mean_vals_train.loc[mean_vals_train.단지코드 == code, '임대료'].mean()
    mean_vals_train.loc[mean_vals_train.단지코드 == code, '평균임대료(자격유형)'] = mean_vals_train.loc[mean_vals_train.단지코드 == code, '평균임대료(자격유형)'].mean()

mean_vals_train = mean_vals_train.drop_duplicates().set_index('단지코드')

  0%|          | 0/421 [00:00<?, ?it/s]

In [24]:
mean_cols = ['단지코드', '전용면적', '전용면적별세대수', '임대보증금', '임대료', '평균임대료(자격유형)']
mean_vals_test = test_neq[mean_cols].copy()

codes = mean_vals_test.단지코드.unique().tolist()

for code in tqdm(codes):
    mean_vals_test.loc[mean_vals_test.단지코드 == code, '전용면적'] = mean_vals_test.loc[mean_vals_test.단지코드 == code, '전용면적'].mean()
    mean_vals_test.loc[mean_vals_test.단지코드 == code, '전용면적별세대수'] = mean_vals_test.loc[mean_vals_test.단지코드 == code, '전용면적별세대수'].mean()
    mean_vals_test.loc[mean_vals_test.단지코드 == code, '임대보증금'] = mean_vals_test.loc[mean_vals_test.단지코드 == code, '임대보증금'].mean()
    mean_vals_test.loc[mean_vals_test.단지코드 == code, '임대료'] = mean_vals_test.loc[mean_vals_test.단지코드 == code, '임대료'].mean()
    mean_vals_test.loc[mean_vals_test.단지코드 == code, '평균임대료(자격유형)'] = mean_vals_test.loc[mean_vals_test.단지코드 == code, '평균임대료(자격유형)'].mean()

mean_vals_test = mean_vals_test.drop_duplicates().set_index('단지코드')

  0%|          | 0/150 [00:00<?, ?it/s]

#### feature reshape

- 공급유형, 자격유형

In [25]:
def reshape_cat_features(data, cast_col, value_col):
    res = data.drop_duplicates(['단지코드', cast_col]).assign(counter=1).pivot(index='단지코드', columns=cast_col, values=value_col).fillna(0)
    res.columns.name = None
    res = res.rename(columns={col:cast_col+'_'+col for col in res.columns})
    return res

In [26]:
pattern_vals_train = pd.concat([reshape_cat_features(data=train_neq, cast_col='공급유형', value_col='counter'), \
    reshape_cat_features(data=train_neq, cast_col='자격유형', value_col='counter')], axis=1)

pattern_vals_test = pd.concat([reshape_cat_features(data=test_neq, cast_col='공급유형', value_col='counter'), \
    reshape_cat_features(data=test_neq, cast_col='자격유형', value_col='counter')], axis=1)

### Final Aggregation

In [29]:
train = pd.concat([train_eq, mean_vals_train, pattern_vals_train], axis=1)
test = pd.concat([test_eq, mean_vals_test, pattern_vals_test], axis=1)

## Preprocessing (Version 4)

In [31]:
age_gender_info = age_gender_info.set_index('지역')

In [32]:
#열 이름 변경
age_columns=[]
for i in range(0,11):
    for j in range(0,2):
        temp=str(i)+"_"+str(j)
        age_columns.append(temp)

print(len(age_columns))
print(age_columns)

age_gender_info.columns = age_columns

22
['0_0', '0_1', '1_0', '1_1', '2_0', '2_1', '3_0', '3_1', '4_0', '4_1', '5_0', '5_1', '6_0', '6_1', '7_0', '7_1', '8_0', '8_1', '9_0', '9_1', '10_0', '10_1']


In [33]:
#열 추가
age_gender_info['1st']=0
age_gender_info['1gender']=0
age_gender_info['2nd']=0
age_gender_info['2gender']=0
age_gender_info['3rd']=0
age_gender_info['3gender']=0

In [34]:
for id in age_gender_info.index:
    temp=pd.DataFrame(age_gender_info.loc[id]).T
    temp=temp.sort_values(axis=1,by=[id],ascending=False)
    temp_ascend=temp.columns
    
    age_gender_info['1st'].loc[id]=temp_ascend[0].split('_')[0]
    age_gender_info['1gender'].loc[id]=temp_ascend[0].split('_')[1]
    age_gender_info['2nd'].loc[id]=temp_ascend[1].split('_')[0]
    age_gender_info['2gender'].loc[id]=temp_ascend[1].split('_')[1]
    age_gender_info['3rd'].loc[id]=temp_ascend[2].split('_')[0]
    age_gender_info['3gender'].loc[id]=temp_ascend[2].split('_')[1]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


### Train/Test 와 age_gender_info 병합

In [35]:
age_gender_info=age_gender_info.drop(columns=age_columns,axis=1)
age_gender_info.head(2)

Unnamed: 0_level_0,1st,1gender,2nd,2gender,3rd,3gender
지역,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
경상북도,5,0,4,0,6,0
경상남도,6,0,5,0,4,0


In [36]:
local_map = {'경상북도': 0, \
    '경상남도': 1,
    '대전광역시': 2,
    '경기도': 3,
    '전라북도': 4,
    '강원도': 5,
    '광주광역시': 6,
    '충청남도': 7,
    '부산광역시': 8,
    '제주특별자치도': 9,
    '울산광역시': 10,
    '충청북도': 11,
    '전라남도': 12,
    '대구광역시': 13,
    '서울특별시': 14,
    '세종특별자치시': 15}

regions = age_gender_info.index

for region in tqdm(regions):
    train.loc[train.지역 == local_map[region], '1st'] = age_gender_info.loc[region, '1st']
    train.loc[train.지역 == local_map[region], '1gender'] = age_gender_info.loc[region, '1gender']
    train.loc[train.지역 == local_map[region], '2nd'] = age_gender_info.loc[region, '2nd']
    train.loc[train.지역 == local_map[region], '2gender'] = age_gender_info.loc[region, '2gender']
    train.loc[train.지역 == local_map[region], '3rd'] = age_gender_info.loc[region, '3rd']
    train.loc[train.지역 == local_map[region], '3gender'] = age_gender_info.loc[region, '3gender']

for region in tqdm(regions):
    test.loc[test.지역 == local_map[region], '1st'] = age_gender_info.loc[region, '1st']
    test.loc[test.지역 == local_map[region], '1gender'] = age_gender_info.loc[region, '1gender']
    test.loc[test.지역 == local_map[region], '2nd'] = age_gender_info.loc[region, '2nd']
    test.loc[test.지역 == local_map[region], '2gender'] = age_gender_info.loc[region, '2gender']
    test.loc[test.지역 == local_map[region], '3rd'] = age_gender_info.loc[region, '3rd']
    test.loc[test.지역 == local_map[region], '3gender'] = age_gender_info.loc[region, '3gender']

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

## To CSV

In [40]:
TRAIN_VERSION_5_ROOT = Join(DATA_ROOT, 'train_version_5.csv')
TEST_VERSION_5_ROOT = Join(DATA_ROOT, 'test_version_5.csv')

train.to_csv(TRAIN_VERSION_5_ROOT)
test.to_csv(TEST_VERSION_5_ROOT)