# Week4授業課題 信用情報の学習

In [78]:
import os
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix, roc_curve, auc

## 【問題1】コンペティション内容の確認
コンペティションのOverviewページ読み、「Home Credit Default Risk」について以下の観点について確認してください。

何を学習し、何を予測するのか
どのようなファイルを作りKaggleに提出するか
提出されたものはどういった指標値で評価されるのか

<pre>
122個ある特徴量からその人が返済する確率を予測する

IDとTARGETがカラムとして入っているcsvファイルを提出する
TARGETにはその人が返済する確率が入っている

提出された課題はAUCで評価される

## 【問題2】学習と検証
<pre>
データを簡単に分析、前処理し、学習、検証するまでの一連の流れを作成・実行してください。

検証にはこのコンペティションで使用される評価指標を用いるようにしてください。学習に用いる手法は指定しません。

In [3]:
os.getcwd()

'/Users/morishuuya/Desktop/DIC/diveintocode-ml'

In [4]:
test_data_raw = pd.read_csv("/Users/morishuuya/Desktop/dataset/from kaggle/Home Credit Default/application_test.csv")
train_data_raw=pd.read_csv("/Users/morishuuya/Desktop/dataset/from kaggle/Home Credit Default/application_train.csv")

In [71]:
train_data = train_data_raw.copy(deep=True)
test_data = test_data_raw.copy(deep=True)

In [6]:
#欠損値がある列を削除
droped_train = train_data.dropna(axis=1)
droped_test = test_data.dropna(axis=1)

#train, testのデータで共通する列を抜きだす
drop_column_intersection = droped_train.columns & droped_test.columns

droped_train_inter = train_data[drop_column_intersection]
droped_train_inter.insert(1, "TARGET", train_data.loc[:, "TARGET"])
droped_test_inter = test_data[drop_column_intersection]

#機械学習させる上で文字列を使えないので、objectタイプを削除する
on_train = droped_train_inter.select_dtypes(include="number")
on_test = droped_test_inter.select_dtypes(include="number")

#機械学習用のtrain_test_splitのために説明変数と目的変数を分ける
X = on_train.drop(columns="TARGET")
y = on_train["TARGET"]

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

In [8]:
RFC = RandomForestClassifier(n_estimators=10)
RFC.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

## 【問題3】テストデータに対する推定
テストデータ（application_test.csv）に対して推定を行い、Kaggleに提出を行ってください。

正しく提出が行えていれば、精度は低くても構いません。

In [9]:
y_pred_1 = RFC.predict(X_test)
fpr_1, tpr_1, thresholds_1 = roc_curve(y_test, y_pred_1)
auc_1 = auc(fpr_1, tpr_1)
print("accuracy：", accuracy_score(y_test, y_pred_1))
print("recall:", recall_score(y_test, y_pred_1))
print("precision:", precision_score(y_test, y_pred_1))
print("f1:",f1_score(y_test, y_pred_1))
print("matrix:\n", confusion_matrix(y_test, y_pred_1))
print("-"*20)
print("auc:",auc_1)

accuracy： 0.9200291370743255
recall: 0.002134296503037268
precision: 0.1566265060240964
f1: 0.004211208292840946
matrix:
 [[70717    70]
 [ 6078    13]]
--------------------
auc: 0.5005727071818307


### kaggle提出　　public_score:0.49024

## 【問題4】特徴量エンジニアリング
<pre>
精度を上げるために以下のような観点で 特徴量エンジニアリング（Feature Engineering） を行ってください。

どの特徴量を使うか
どう前処理をするか
何をした時に検証データに対する評価指標がどのようになったかをまとめてください。最低5パターンの学習・検証を行ってください。

精度が高かったものに関してはテストデータに対しても推定を行い、Kaggleに提出を行ってください。

### パターン１：RandomForestの重要な特徴量でソートし、高い方から５つ選んで学習する

In [10]:
features = X_train.columns
importances = RFC.feature_importances_

print("重要度が高い特徴量順:")
print(sorted(zip(map(lambda x: round(x, 2), RFC.feature_importances_), features), 
             reverse=True))

重要度が高い特徴量順:
[(0.12, 'DAYS_REGISTRATION'), (0.12, 'DAYS_ID_PUBLISH'), (0.12, 'DAYS_BIRTH'), (0.11, 'SK_ID_CURR'), (0.11, 'AMT_CREDIT'), (0.1, 'DAYS_EMPLOYED'), (0.08, 'REGION_POPULATION_RELATIVE'), (0.08, 'AMT_INCOME_TOTAL'), (0.07, 'HOUR_APPR_PROCESS_START'), (0.02, 'CNT_CHILDREN'), (0.01, 'REGION_RATING_CLIENT_W_CITY'), (0.01, 'REGION_RATING_CLIENT'), (0.01, 'FLAG_WORK_PHONE'), (0.01, 'FLAG_PHONE'), (0.01, 'FLAG_EMAIL'), (0.01, 'FLAG_DOCUMENT_3'), (0.0, 'REG_REGION_NOT_WORK_REGION'), (0.0, 'REG_REGION_NOT_LIVE_REGION'), (0.0, 'REG_CITY_NOT_WORK_CITY'), (0.0, 'REG_CITY_NOT_LIVE_CITY'), (0.0, 'LIVE_REGION_NOT_WORK_REGION'), (0.0, 'LIVE_CITY_NOT_WORK_CITY'), (0.0, 'FLAG_MOBIL'), (0.0, 'FLAG_EMP_PHONE'), (0.0, 'FLAG_DOCUMENT_9'), (0.0, 'FLAG_DOCUMENT_8'), (0.0, 'FLAG_DOCUMENT_7'), (0.0, 'FLAG_DOCUMENT_6'), (0.0, 'FLAG_DOCUMENT_5'), (0.0, 'FLAG_DOCUMENT_4'), (0.0, 'FLAG_DOCUMENT_21'), (0.0, 'FLAG_DOCUMENT_20'), (0.0, 'FLAG_DOCUMENT_2'), (0.0, 'FLAG_DOCUMENT_19'), (0.0, 'FLAG_DOCUMENT_18'),

<pre>
使う特徴量を5つにする
'DAYS_REGISTRATION'
'DAYS_BIRTH'
'DAYS_ID_PUBLISH'
'DAYS_EMPLOYED'
'AMT_CREDIT'

In [11]:
importance_feature = ["DAYS_REGISTRATION", "DAYS_BIRTH", "DAYS_ID_PUBLISH", "DAYS_EMPLOYED", "AMT_CREDIT"]
train_data_pattern1 = on_train[importance_feature]
test_data_pattern1 = on_test[importance_feature]
TARGET = on_train["TARGET"]

In [12]:
X_pattern1 = train_data_pattern1
y_pattern1 = TARGET

In [13]:
X_train_pattern1, X_test_pattern1, y_train_pattern1, y_test_pattern1 = train_test_split(X_pattern1, y_pattern1, test_size=0.25,
                                                                                       random_state=42)

In [103]:
RFC_pattern1 = RandomForestClassifier(n_estimators=100)

In [104]:
RFC_pattern1.fit(X_train_pattern1, y_train_pattern1)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [105]:
y_pred_pattern1 = RFC_pattern1.predict(X_test_pattern1)

In [106]:
fpr_pattern1, tpr_pattern1, thresholds_pattern1 = roc_curve(y_test_pattern1, y_pred_pattern1)
auc_pattern1 = auc(fpr_pattern1, tpr_pattern1)
print("accuracy：", accuracy_score(y_test, y_pred_pattern1))
print("recall:", recall_score(y_test, y_pred_pattern1))
print("precision:", precision_score(y_test, y_pred_pattern1))
print("f1:",f1_score(y_test, y_pred_pattern1))
print("matrix:\n", confusion_matrix(y_test, y_pred_pattern1))
print("-"*20)
print("auc:",auc_pattern1)

accuracy： 0.9205364343505295
recall: 0.0
precision: 0.0
f1: 0.0
matrix:
 [[70769    18]
 [ 6091     0]]
--------------------
auc: 0.5004875289786099


### パターン2：勘で３つ抜き出して　学習させる

In [108]:
pattern2_selected_columns = ["AMT_INCOME_TOTAL", "NAME_HOUSING_TYPE", "NAME_EDUCATION_TYPE"]

In [109]:
train_data_pattern2 = train_data[pattern2_selected_columns].copy(deep=True)

In [111]:
train_data_pattern2 = pd.get_dummies(train_data_pattern2, drop_first=True)

In [112]:
X_train_pattern2, X_test_pattern2, y_train_pattern2, y_test_pattern2 = train_test_split(train_data_pattern2, TARGET, test_size=0.25,
                                                                                       random_state=42)

In [113]:
RFC_pattern2 = RandomForestClassifier(n_estimators=100)

In [114]:
RFC_pattern2.fit(X_train_pattern2, y_train_pattern2)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [115]:
y_pred_pattern2 = RFC_pattern2.predict(X_test_pattern2)

In [184]:
fpr_pattern2, tpr_pattern2, thresholds_pattern2 = roc_curve(y_test_pattern2, y_pred_pattern2)
auc_pattern2 = auc(fpr_pattern2, tpr_pattern2)
print("accuracy：", accuracy_score(y_test_pattern2, y_pred_pattern2))
print("recall:", recall_score(y_test_pattern2, y_pred_pattern2))
print("precision:", precision_score(y_test_pattern2, y_pred_pattern2))
print("f1:",f1_score(y_test_pattern2, y_pred_pattern2))
print("matrix:\n", confusion_matrix(y_test_pattern2, y_pred_pattern2))
print("-"*20)
print("auc:",auc_pattern2)

accuracy： 0.918780405317516
recall: 0.0006460991762235503
precision: 0.06557377049180328
f1: 0.0012795905310300703
matrix:
 [[70630    57]
 [ 6187     4]]
--------------------
auc: 0.49991986371234964


### パターン3：上記の特徴量をlabel encordingで試してみる

In [117]:
pattern3_selected_columns = ["AMT_INCOME_TOTAL", "NAME_HOUSING_TYPE", "NAME_EDUCATION_TYPE"]

In [118]:
train_data_pattern3 = train_data[pattern3_selected_columns].copy(deep=True)

In [119]:
le = LabelEncoder()

In [120]:
for col in train_data_pattern3:
    if train_data_pattern3[col].dtype == "object":
        train_data_pattern3[col] = le.fit_transform(train_data_pattern3[col])

In [121]:
X_train_pattern3, X_test_pattern3, y_train_pattern3, y_test_pattern3 = train_test_split(train_data_pattern3, TARGET, test_size=0.25,
                                                                                       random_state=42)

In [122]:
RFC_pattern3 = RandomForestClassifier(n_estimators=100)

In [123]:
RFC_pattern3.fit(X_train_pattern3, y_train_pattern3)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [124]:
y_pred_pattern3 = RFC_pattern3.predict(X_test_pattern3)

In [183]:
fpr_pattern3 ,tpr_pattern3, thresholds_pattern3 = roc_curve(y_test_pattern3, y_pred_pattern3)
auc_pattern3 = auc(fpr_pattern3, tpr_pattern3)
print("accuracy：", accuracy_score(y_test_pattern3, y_pred_pattern3))
print("recall:", recall_score(y_test_pattern3, y_pred_pattern3))
print("precision:", precision_score(y_test_pattern3, y_pred_pattern3))
print("f1:",f1_score(y_test_pattern3, y_pred_pattern3))
print("matrix:\n", confusion_matrix(y_test_pattern3, y_pred_pattern3))
print("-"*20)
print("auc:",auc_pattern3)

accuracy： 0.918780405317516
recall: 0.0006460991762235503
precision: 0.06557377049180328
f1: 0.0012795905310300703
matrix:
 [[70630    57]
 [ 6187     4]]
--------------------
auc: 0.49991986371234964


### パターン4：上記の特徴量を標準化してみる

In [138]:
pattern4_selected_columns = ["AMT_INCOME_TOTAL", "NAME_HOUSING_TYPE", "NAME_EDUCATION_TYPE"]

In [127]:
sc = StandardScaler()

In [141]:
train_data_pattern4 = train_data[pattern4_selected_columns].copy(deep=True)

In [129]:
train_data_pattern4["AMT_INCOME_TOTAL"].ravel()

array([202500., 270000.,  67500., ..., 153000., 171000., 157500.])

In [130]:
train_data_pattern4["AMT_INCOME_TOTAL"] = sc.fit_transform(train_data_pattern4["AMT_INCOME_TOTAL"].values.reshape(-1, 1))

In [131]:
train_data_pattern4 = pd.get_dummies(train_data_pattern4, drop_first=True)

In [132]:
X_train_pattern4, X_test_pattern4, y_train_pattern4, y_test_pattern4 = train_test_split(train_data_pattern4, TARGET, test_size=0.25,
                                                                                       random_state=42)

In [133]:
RFC_pattern4 = RandomForestClassifier(n_estimators=100)

In [135]:
RFC_pattern4.fit(X_train_pattern4, y_train_pattern4)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [136]:
y_pred_pattern4 = RFC_pattern4.predict(X_test_pattern4)

In [181]:
fpr_pattern4 ,tpr_pattern4, thresholds_pattern4 = roc_curve(y_test_pattern4, y_pred_pattern4)
auc_pattern4 = auc(fpr_pattern4, tpr_pattern4)
print("accuracy：", accuracy_score(y_test_pattern4, y_pred_pattern4))
print("recall:", recall_score(y_test_pattern4, y_pred_pattern4))
print("precision:", precision_score(y_test_pattern4, y_pred_pattern4))
print("f1:",f1_score(y_test_pattern4, y_pred_pattern4))
print("matrix:\n", confusion_matrix(y_test_pattern4, y_pred_pattern4))
print("-"*20)
print("auc:",auc_pattern4)

accuracy： 0.918780405317516
recall: 0.00048457438216766274
precision: 0.05084745762711865
f1: 0.00096
matrix:
 [[70631    56]
 [ 6188     3]]
--------------------
auc: 0.4998461747517386


### パターン5：欠損値をデータとして扱ってみる

In [149]:
pattern5_selected_columns = ["AMT_ANNUITY", "AMT_GOODS_PRICE" , "NAME_TYPE_SUITE"]

In [150]:
train_data_pattern5 = train_data[pattern5_selected_columns].copy(deep=True)

In [153]:
train_data_pattern5.isnull().sum()

AMT_ANNUITY          12
AMT_GOODS_PRICE     278
NAME_TYPE_SUITE    1292
dtype: int64

In [164]:
train_data_pattern5 = train_data_pattern5.isnull()

In [172]:
train_data_pattern5 = pd.get_dummies(train_data_pattern5, columns=["AMT_ANNUITY", "AMT_GOODS_PRICE" , "NAME_TYPE_SUITE"],
                                     drop_first=True)

In [175]:
X_train_pattern5, X_test_pattern5, y_train_pattern5, y_test_pattern5 = train_test_split(train_data_pattern5, TARGET, test_size=0.25,
                                                                                       random_state=42)

In [176]:
RFC_pattern5 = RandomForestClassifier(n_estimators=100)

In [177]:
RFC_pattern5.fit(X_train_pattern5, y_train_pattern5)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [178]:
y_pred_pattern5 = RFC_pattern5.predict(X_test_pattern5)

In [180]:
fpr_pattern5 ,tpr_pattern5, thresholds_pattern5 = roc_curve(y_test_pattern5, y_pred_pattern5)
auc_pattern5 = auc(fpr_pattern5, tpr_pattern5)
print("accuracy：", accuracy_score(y_test_pattern5, y_pred_pattern5))
print("recall:", recall_score(y_test_pattern5, y_pred_pattern5))
print("precision:", precision_score(y_test_pattern5, y_pred_pattern5))
print("f1:",f1_score(y_test_pattern5, y_pred_pattern5))
print("matrix:\n", confusion_matrix(y_test_pattern5, y_pred_pattern5))
print("-"*20)
print("auc:",auc_pattern5)

accuracy： 0.9194698093082546
recall: 0.0
precision: 0.0
f1: 0.0
matrix:
 [[70687     0]
 [ 6191     0]]
--------------------
auc: 0.5
