In [1]:
import os
import numpy as np
import pandas as pd

In [2]:
# Set the data directory, then read the training table into app_train
dir_data = './data/'
f_app_train = os.path.join(dir_data, 'application_train.csv')
app_train = pd.read_csv(f_app_train)

## 在初步 EDA 的過程，我們無可避免會想問的問題 ##

- 不同資料類型各有多少個欄位？
> df.get_dtype_counts() (deprecated in pandas 0.25, removed in 1.0) <br>
>  or  <br>
> df.dtypes.value_counts()  <br>
>  or <br>
> group by df.dtypes 
  
- 類別型欄位 (pandas 中的 object) 的類別數量?
> select by : df.select_dtypes()
- 模型怎麼處理類別型？有什麼表示方法？
>  Label encoding / One Hot encoding


[Pandas dtypes](https://blog.csdn.net/claroja/article/details/72622375) <br>
[Label Encoder vs. One Hot Encoder in Machine Learning](https://medium.com/@contactsunny/label-encoder-vs-one-hot-encoder-in-machine-learning-3fc273365621)

In [49]:
# Overview: total number of columns, followed by every column name.
print('--- All columns ---')
print(len(app_train.columns))
print(f"[{','.join(app_train.columns)}]")
print()

# How many columns are there of each dtype?
# Group the column names by their dtype; the size of each group is the count.
g = app_train.columns.to_series().groupby(app_train.dtypes)
print('--- 不同資料類型各有多少個欄位 ---')
print(f"不同資料類型各有多少個欄位: \n{g.size()}")
print()

# Categorical columns (dtype `object` in pandas): how many, and which ones?
print('--- Object columns ---')
objtype = app_train.select_dtypes(include=['object'])
print(len(objtype.columns))
print(f"[{','.join(objtype.columns)}]")
print()

# Number of distinct values in each object column (reuses objtype from above).
print('--- column value catalog count (Object columns) ---')
print(objtype.nunique())

# Mapping from each dtype to the column names that have it
# (reuses the grouping built above instead of grouping again).
print('----------------------')
g1 = g.groups
print(g1)


--- All columns ---
122
[SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_TYPE_SUITE,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,OWN_CAR_AGE,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_CONT_MOBILE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,LIVE_REGION_NOT_WORK_REGION,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,LIVE_CITY_NOT_WORK_CITY,ORGANIZATION_TYPE,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,APARTMENTS_AVG,BASEMENTAREA_AVG,YEARS_BEGINEXPLUATATION_AVG,YEARS_BUILD_AVG,COMMONAREA_AVG,ELEVATORS_AVG,ENTRANCES_AVG,FLOORSMAX_AVG,FLOORSMIN_AVG,LANDAREA_AVG,LIVINGAPARTMENTS_AVG,LIVINGAREA_AVG,NONLIVINGAPARTMENTS_AV

## 作業
將下列部分資料片段 sub_train 使用 One Hot encoding, 並觀察轉換前後的欄位數量 (使用 shape) 與欄位名稱 (使用 head) 變化

In [69]:
# Keep only the weekday column, as a one-column DataFrame, for the encoding exercise.
sub_train = app_train['WEEKDAY_APPR_PROCESS_START'].to_frame()
print(sub_train.shape)
sub_train.head()

(307511, 1)


Unnamed: 0,WEEKDAY_APPR_PROCESS_START
0,WEDNESDAY
1,MONDAY
2,MONDAY
3,WEDNESDAY
4,THURSDAY


In [70]:
# Apply one-hot encoding to sub_train.
# NOTE(review): OneHotEncoder is imported here but not used in this cell; the
# import is kept in place in case a later cell relies on it.
from sklearn.preprocessing import OneHotEncoder

# Background: before modelling, categorical features have to be converted to
# numbers. Because the categories have no inherent order, they must not be
# mapped to values that imply a magnitude; instead each category becomes its
# own dummy (indicator) variable. That conversion is called one-hot encoding.
#
# dtype=np.uint8 pins the indicator columns to 0/1 integers. pandas >= 2.0
# defaults to bool (True/False) while earlier versions defaulted to uint8, so
# passing it explicitly keeps the result the same on both.
df = pd.get_dummies(sub_train, dtype=np.uint8)
print(df.shape)
df.head()

(307511, 7)


Unnamed: 0,WEEKDAY_APPR_PROCESS_START_FRIDAY,WEEKDAY_APPR_PROCESS_START_MONDAY,WEEKDAY_APPR_PROCESS_START_SATURDAY,WEEKDAY_APPR_PROCESS_START_SUNDAY,WEEKDAY_APPR_PROCESS_START_THURSDAY,WEEKDAY_APPR_PROCESS_START_TUESDAY,WEEKDAY_APPR_PROCESS_START_WEDNESDAY
0,0,0,0,0,0,0,1
1,0,1,0,0,0,0,0
2,0,1,0,0,0,0,0
3,0,0,0,0,0,0,1
4,0,0,0,0,1,0,0


#### API
[OneHotEncoder](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html) <br>
[pandas.get_dummies()](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.get_dummies.html)
> Convert categorical variable into dummy/indicator variables