# [作業目標]
- 使用 Day 17 剛學到的方法, 對較完整的資料生成離散化特徵
- 觀察上述離散化特徵, 對於目標值的預測有沒有幫助

# [作業重點]
- 仿照 Day 17 的語法, 將年齡資料 ('DAYS_BIRTH' 除以 365) 離散化
- 繪製上述的 "離散化標籤" 與目標值 ('TARGET') 的長條圖

In [1]:
# 載入需要的套件
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# 設定 data_path
dir_data = 'C:\\Users\SamuelWork\Downloads\data'

### 之前做過的處理

In [2]:
# 讀取資料檔
f_app_train = os.path.join(dir_data, 'application_train.csv')
app_train = pd.read_csv(f_app_train)
app_train.shape

(307511, 122)

In [3]:
# 將只有兩種值的類別型欄位, 做 Label Encoder, 計算相關係數時讓這些欄位可以被包含在內
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

# 檢查每一個 column
for col in app_train:
    if app_train[col].dtype == 'object':
        # 如果只有兩種值的類別型欄位
        if len(list(app_train[col].unique())) <= 2:
            # 就做 Label Encoder, 以加入相關係數檢查
            app_train[col] = le.fit_transform(app_train[col])            
print(app_train.shape)
app_train.head()

(307511, 122)


Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,0,M,0,1,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,0,F,0,0,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,1,M,1,1,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,0,F,0,1,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,0,M,0,1,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
# 受雇日數為異常值的資料, 另外設一個欄位記錄, 並將異常的日數轉成空值 (np.nan)
app_train['DAYS_EMPLOYED_ANOM'] = app_train["DAYS_EMPLOYED"] == 365243
app_train['DAYS_EMPLOYED'].replace({365243: np.nan}, inplace = True)

# 出生日數 (DAYS_BIRTH) 取絕對值 
app_train['DAYS_BIRTH'] = abs(app_train['DAYS_BIRTH'])

## 練習時間
參考 Day 17 範例程式，離散化你覺得有興趣的欄位，並嘗試找出有趣的訊息

In [7]:
app_train.columns.values

array(['SK_ID_CURR', 'TARGET', 'NAME_CONTRACT_TYPE', 'CODE_GENDER',
       'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'CNT_CHILDREN',
       'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY', 'AMT_GOODS_PRICE',
       'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE',
       'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE',
       'REGION_POPULATION_RELATIVE', 'DAYS_BIRTH', 'DAYS_EMPLOYED',
       'DAYS_REGISTRATION', 'DAYS_ID_PUBLISH', 'OWN_CAR_AGE',
       'FLAG_MOBIL', 'FLAG_EMP_PHONE', 'FLAG_WORK_PHONE',
       'FLAG_CONT_MOBILE', 'FLAG_PHONE', 'FLAG_EMAIL', 'OCCUPATION_TYPE',
       'CNT_FAM_MEMBERS', 'REGION_RATING_CLIENT',
       'REGION_RATING_CLIENT_W_CITY', 'WEEKDAY_APPR_PROCESS_START',
       'HOUR_APPR_PROCESS_START', 'REG_REGION_NOT_LIVE_REGION',
       'REG_REGION_NOT_WORK_REGION', 'LIVE_REGION_NOT_WORK_REGION',
       'REG_CITY_NOT_LIVE_CITY', 'REG_CITY_NOT_WORK_CITY',
       'LIVE_CITY_NOT_WORK_CITY', 'ORGANIZATION_TYPE', 'EXT_SOURCE_1',
       'EXT_SOURCE_2', 'EXT_SOURCE_3',

In [13]:
temp = app_train[['TARGET', 'DAYS_EMPLOYED', 'DAYS_BIRTH']]

In [14]:
temp

Unnamed: 0,TARGET,DAYS_EMPLOYED,DAYS_BIRTH
0,1,-637.0,9461
1,0,-1188.0,16765
2,0,-225.0,19046
3,0,-3039.0,19005
4,0,-3038.0,19932
...,...,...,...
307506,0,-236.0,9327
307507,0,,20775
307508,0,-7921.0,14966
307509,1,-4786.0,11961


In [15]:
temp['DAYS_EMPLOYED'] = abs(temp['DAYS_EMPLOYED'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [16]:
temp

Unnamed: 0,TARGET,DAYS_EMPLOYED,DAYS_BIRTH
0,1,637.0,9461
1,0,1188.0,16765
2,0,225.0,19046
3,0,3039.0,19005
4,0,3038.0,19932
...,...,...,...
307506,0,236.0,9327
307507,0,,20775
307508,0,7921.0,14966
307509,1,4786.0,11961


In [20]:
temp = temp.dropna()
temp

Unnamed: 0,TARGET,DAYS_EMPLOYED,DAYS_BIRTH
0,1,637.0,9461
1,0,1188.0,16765
2,0,225.0,19046
3,0,3039.0,19005
4,0,3038.0,19932
...,...,...,...
307504,0,7258.0,16705
307506,0,236.0,9327
307508,0,7921.0,14966
307509,1,4786.0,11961


In [22]:
temp['DAYS_EMPLOYED'] = pd.cut(temp['DAYS_EMPLOYED'], 10)
temp['DAYS_BIRTH'] = pd.cut(temp['DAYS_BIRTH'], 10)
temp

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,TARGET,DAYS_EMPLOYED,DAYS_BIRTH
0,1,"(-17.912, 1791.2]","(9260.1, 11031.2]"
1,0,"(-17.912, 1791.2]","(16344.5, 18115.6]"
2,0,"(-17.912, 1791.2]","(18115.6, 19886.7]"
3,0,"(1791.2, 3582.4]","(18115.6, 19886.7]"
4,0,"(1791.2, 3582.4]","(19886.7, 21657.8]"
...,...,...,...
307504,0,"(7164.8, 8956.0]","(16344.5, 18115.6]"
307506,0,"(-17.912, 1791.2]","(9260.1, 11031.2]"
307508,0,"(7164.8, 8956.0]","(14573.4, 16344.5]"
307509,1,"(3582.4, 5373.6]","(11031.2, 12802.3]"


In [24]:
temp['DAYS_EMPLOYED'].value_counts().sort_index()

(-17.912, 1791.2]     134392
(1791.2, 3582.4]       65188
(3582.4, 5373.6]       27932
(5373.6, 7164.8]       11382
(7164.8, 8956.0]        6563
(8956.0, 10747.2]       3387
(10747.2, 12538.4]      2027
(12538.4, 14329.6]       955
(14329.6, 16120.8]       261
(16120.8, 17912.0]        50
Name: DAYS_EMPLOYED, dtype: int64

In [25]:
temp['DAYS_BIRTH'].value_counts().sort_index()

(7471.289, 9260.1]    13627
(9260.1, 11031.2]     32897
(11031.2, 12802.3]    38209
(12802.3, 14573.4]    41348
(14573.4, 16344.5]    39707
(16344.5, 18115.6]    33204
(18115.6, 19886.7]    29720
(19886.7, 21657.8]    15889
(21657.8, 23428.9]     6120
(23428.9, 25200.0]     1416
Name: DAYS_BIRTH, dtype: int64

In [28]:
temp = temp.loc[temp['TARGET'] == 1, :]
temp

Unnamed: 0,TARGET,DAYS_EMPLOYED,DAYS_BIRTH
0,1,"(-17.912, 1791.2]","(9260.1, 11031.2]"
26,1,"(1791.2, 3582.4]","(18115.6, 19886.7]"
40,1,"(-17.912, 1791.2]","(16344.5, 18115.6]"
42,1,"(3582.4, 5373.6]","(12802.3, 14573.4]"
94,1,"(1791.2, 3582.4]","(9260.1, 11031.2]"
...,...,...,...
307448,1,"(1791.2, 3582.4]","(9260.1, 11031.2]"
307475,1,"(1791.2, 3582.4]","(12802.3, 14573.4]"
307481,1,"(1791.2, 3582.4]","(19886.7, 21657.8]"
307489,1,"(-17.912, 1791.2]","(16344.5, 18115.6]"
