In [5]:
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor as rf
import xgboost as xgb
import optuna
import random
# Seed both NumPy's global RNG and Python's built-in `random` module with the
# same fixed value so that code drawing from either generator is repeatable.
np.random.seed(1234)
random.seed(1234)
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation/FutureWarnings emitted by pandas
# and the modelling libraries, so behavior changes can go unnoticed.
warnings.filterwarnings('ignore')

# Data load: read the training set, the test set and the sample submission
# (in that order) from the local ./data directory.
train_df, test_df, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        './data/train.csv',
        './data/test.csv',
        './data/sample_submission.csv',
    )
)

I read data_descript.txt and looked up the meaning of each column. 
There are two types of variables, numeric and categorical, so the categorical ones need preprocessing before modeling. 
Common sense also points to some noteworthy variables, such as the square footage of each area or the condition ratings. 

Prep: Categorical variables

ref: [카테고리 변수들을 다루는 방법들](https://www.dacon.io/codeshare/2510)

4 Kinds of Categorical Variables

| Variable Name | 변수명 | 해석 | 예시 | 
|---|---|---|---|
| Nominal | 명목변수 | 2개 이상의 종류가 있는 순서가 무의미한 변수 | gender | 
| Ordinal | 순위변수 | "순서"의 개념이 존재하는 변수 | order |
| Cyclical | 주기변수 | "주기"의 개념이 존재하는 변수 | 요일 | 
| Binary | 이진변수 | 2가지 종류밖에 존재하지 않는 변수 | gender?? | 


1. ENCODING : 순위변수는 mapping을 통해, 나머지는 LabelEncoder를 통해 숫자로 변환
2. Binarize : 이진화
3. ONE-HOT : 원-핫 인코딩
4. Categorical 변수를 대처하는 다른 방법 - GROUPBY

In [10]:
# Stack train and test rows into a single frame with a fresh 0..n-1 index.
# sort=False: do not sort the columns alphabetically when aligning the frames.
all_df = pd.concat([train_df, test_df], sort=False, ignore_index=True)
all_df

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500.0
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500.0
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500.0
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000.0
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2914,2915,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,...,0,,,,0,6,2006,WD,Normal,
2915,2916,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2006,WD,Abnorml,
2916,2917,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,...,0,,,,0,9,2006,WD,Abnorml,
2917,2918,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,Shed,700,7,2006,WD,Normal,


In [11]:
# Show the target column of the combined frame; the trailing rows display NaN.
all_df.loc[:, "SalePrice"]

0       208500.0
1       181500.0
2       223500.0
3       140000.0
4       250000.0
          ...   
2914         NaN
2915         NaN
2916         NaN
2917         NaN
2918         NaN
Name: SalePrice, Length: 2919, dtype: float64

LabelEncoder는 카테고리를 연속적인 정수로 변환. 
학습 데이터만으로 fit하면 테스트 데이터에만 존재하는 범주(값)를 만났을 때 오류가 발생하므로, 학습/테스트 데이터를 통합하여 변환. 
결측치(NaN)는 임의의 문자열(ex. missing)로 변환하거나 삭제 - NaN을 읽게 하면 되지 않나? 

In [12]:
from sklearn.preprocessing import LabelEncoder

# Names of the columns whose dtype is `object`; these are the columns that
# get label-encoded afterwards.
categories = all_df.dtypes.loc[lambda col_dtypes: col_dtypes == 'object'].index
print(categories)

Index(['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
       'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
       'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature',
       'SaleType', 'SaleCondition'],
      dtype='object')


In [13]:
# Label-encode every column listed in `categories`, one LabelEncoder per column.
# The fitted encoders are kept in `label_encoders` so the label -> integer
# mapping can be inspected later, e.g. label_encoders['MSZoning'].classes_
# (the position of a label in `classes_` is its encoded integer).
label_encoders = {}
for cat in categories:
    print(cat)
    # Replace NaN with an explicit 'missing' label. The result is assigned
    # back instead of calling fillna(..., inplace=True) on `all_df[cat]`:
    # that chained in-place form only modifies a temporary under pandas
    # Copy-on-Write, which would leave the NaNs in `all_df`.
    filled = all_df[cat].fillna('missing')
    le = LabelEncoder()
    # fit_transform learns the labels of this column and returns their codes.
    all_df[cat] = le.fit_transform(filled)
    # Mark the encoded column as categorical rather than plain integer.
    all_df[cat] = all_df[cat].astype('category')
    label_encoders[cat] = le

MSZoning
Street
Alley
LotShape
LandContour
Utilities
LotConfig
LandSlope
Neighborhood
Condition1
Condition2
BldgType
HouseStyle
RoofStyle
RoofMatl
Exterior1st
Exterior2nd
MasVnrType
ExterQual
ExterCond
Foundation
BsmtQual
BsmtCond
BsmtExposure
BsmtFinType1
BsmtFinType2
Heating
HeatingQC
CentralAir
Electrical
KitchenQual
Functional
FireplaceQu
GarageType
GarageFinish
GarageQual
GarageCond
PavedDrive
PoolQC
Fence
MiscFeature
SaleType
SaleCondition


In [14]:
all_df.dtypes

Id                  int64
MSSubClass          int64
MSZoning         category
LotFrontage       float64
LotArea             int64
                   ...   
MoSold              int64
YrSold              int64
SaleType         category
SaleCondition    category
SalePrice         float64
Length: 81, dtype: object

다만 각 범주가 어떤 숫자로 변환됐는지는 dtype만 봐서는 알 수 없다. 학습된 LabelEncoder의 `classes_` 속성을 보면 확인할 수 있다 (정렬된 범주 목록에서의 위치가 곧 변환된 숫자). 