In [162]:
## 必要なライブラリのセットアップ
### 基礎ライブラリ
import pandas as pd
### SKlearn実行用ライブラリ
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
### 前処理
from sklearn.preprocessing import Imputer
from sklearn.feature_selection import RFE
### モデル検証
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

In [163]:
## テストデータの読み込み

In [164]:
df = pd.read_csv('./data/av_loan_u6lujuX_CVtuZ9i.csv', header=0)
X = df.iloc[:, :-1] # 最終列が審査結果のため最終列以前を特徴量Xとして読込
X = X.drop("Loan_ID",axis=1) # 1列目はIDで不要のため削除
y_pre = df.iloc[:, [-1]]    # 最終列を正解データとして読込
# Seriesのmap functionで値を置換するんやね
class_mapping = {'N':1, 'Y':0}
y = y_pre.copy()
y.loc[:,'Loan_Status'] = y_pre['Loan_Status'].map(class_mapping)
class_mapping = {'N':1, 'Y':0}
y = y_pre.copy()
y.loc[:,'Loan_Status'] = y_pre['Loan_Status'].map(class_mapping)
print('*****************************************')
print("特徴量テーブルのカラム構成")
print(X.columns)
print("正解スコアのカラム構成")
print(y.columns)

print('*****************************************')

*****************************************
特徴量テーブルのカラム構成
Index(['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
       'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area'],
      dtype='object')
正解スコアのカラム構成
Index(['Loan_Status'], dtype='object')
*****************************************


In [165]:
## テストデータの前処理    

In [178]:
### One-hot encoding
#### object型カラムの2値配列を生成
dummy_columns_indexes = []
for i in range(0,X.shape[1]):    
 if X.dtypes[i].str == '|O':
    dummy_columns_indexes.append(True)
 else:
    dummy_columns_indexes.append(False)
    
#### 訓練用データの全カラム名配列
all_columns_name = X.columns.values
#### 訓練用データのobjectカラム名配列
objects_columns_name = X.loc[:,dummy_columns_indexes].columns

#### get_dummy function execute!
X_dummy = pd.get_dummies(X
                   ,dummy_na =True
                   ,columns=objects_columns_name)

X_dummy.shape

array(['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Gender_Female',
       'Gender_Male', 'Gender_nan', 'Married_No', 'Married_Yes',
       'Married_nan', 'Dependents_0', 'Dependents_1', 'Dependents_2',
       'Dependents_3+', 'Dependents_nan', 'Education_Graduate',
       'Education_Not Graduate', 'Education_nan', 'Self_Employed_No',
       'Self_Employed_Yes', 'Self_Employed_nan', 'Property_Area_Rural',
       'Property_Area_Semiurban', 'Property_Area_Urban',
       'Property_Area_nan'], dtype=object)

In [181]:
##  Imputerで欠損値処理
from sklearn.preprocessing import Imputer

imp = Imputer(missing_values = "NaN",strategy = 'mean', axis = 0)
imp.fit(X_dummy)

# 学習済みImputerを適用しX_newの欠損値を置換
X_dummy_impute = pd.DataFrame(imp.transform(X_dummy), columns=X_dummy.columns.values)
X_dummy_impute.head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Gender_Female,Gender_Male,Gender_nan,Married_No,Married_Yes,...,Education_Graduate,Education_Not Graduate,Education_nan,Self_Employed_No,Self_Employed_Yes,Self_Employed_nan,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban,Property_Area_nan
0,5849.0,0.0,146.412162,360.0,1.0,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,4583.0,1508.0,128.0,360.0,1.0,0.0,1.0,0.0,0.0,1.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
2,3000.0,0.0,66.0,360.0,1.0,0.0,1.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,2583.0,2358.0,120.0,360.0,1.0,0.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,6000.0,0.0,141.0,360.0,1.0,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [183]:
### RFE
from sklearn.feature_selection import RFE
from sklearn.ensemble import GradientBoostingClassifier

selector = RFE(GradientBoostingClassifier(random_state=1), n_features_to_select=10, step=.05)
selector.fit(X_dummy_impute, y.as_matrix().ravel())

X_fin = pd.DataFrame(selector.transform(X_dummy_impute), columns=X_dummy_impute.columns[selector.support_])
print('X_fin shape:(%i,%i)' % X_fin.shape)

X_fin.head()

X_fin shape:(614,10)


Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Married_No,Married_nan,Dependents_1,Self_Employed_nan,Property_Area_Semiurban
0,5849.0,0.0,146.412162,360.0,1.0,1.0,0.0,0.0,0.0,0.0
1,4583.0,1508.0,128.0,360.0,1.0,0.0,0.0,1.0,0.0,0.0
2,3000.0,0.0,66.0,360.0,1.0,0.0,0.0,0.0,0.0,0.0
3,2583.0,2358.0,120.0,360.0,1.0,0.0,0.0,0.0,0.0,0.0
4,6000.0,0.0,141.0,360.0,1.0,1.0,0.0,0.0,0.0,0.0


In [186]:
## スコアデータの読み込み
#### dtypeに引き渡す用の配列を生成する
####  objcectをfor roopで生成する
obj_dict = []
for i in range (0,objects_columns_name.values.size):
    obj_dict.append(object)
    
score_dtype_dict = dict(zip(objects_columns_name.values,obj_dict))
## スコアデータを型指定して読み込み
df_score = pd.read_csv('./data/av_loan_test_Y3wMUE5_7gLdaTN.csv'
                       , header=0
                      ,  dtype = score_dtype_dict)

X_score  = df_score.iloc[:, :-1]            # 最終列が審査結果のため最終列以前を特徴量Xとして読込
ID = X_score.iloc[:, [0]]             # 最初列がPK（Loan_ID）なのでID情報としてセット
X_score  = X_score.drop('Loan_ID', axis=1)  # 1列目(Loan_ID)は特徴量ベクトルから削除(drop関数におけるaxis=1は列方向)

print('*****************************************')
print("特徴量テーブルのカラム構成")
print(X_score.columns)
print(X_score.shape)
print('*****************************************')

*****************************************
特徴量テーブルのカラム構成
Index(['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
       'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History'],
      dtype='object')
(333, 10)
*****************************************


In [180]:
### One-hot encoding
#### object型カラムの2値配列を生成
dummy_columns_indexes_s= []
for i in range(0,X.shape[1]):    
 if X.dtypes[i].str == '|O':
    dummy_columns_indexes.append(True)
 else:
    dummy_columns_indexes.append(False)
    
#### 訓練用データの全カラム名配列
all_columns_name = X.columns.values
#### 訓練用データのobjectカラム名配列
objects_columns_name = X.loc[:,dummy_columns_indexes].columns

#### get_dummy function execute!
X_dummy = pd.get_dummies(X
                   ,dummy_na =True
                   ,columns=objects_columns_name)

X_dummy.shape

array(['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Married_No', 'Married_nan',
       'Dependents_1', 'Self_Employed_nan', 'Property_Area_Semiurban'],
      dtype=object)

In [157]:

### 学習データとカラム構成を統一する

In [158]:
### imputer

In [8]:
### RFEでカラム生成

In [9]:
## 学習実行

In [10]:
### Pipe-Line生成

In [11]:
### fitで学習

In [12]:
## 評価プロセス 

In [13]:
### 各評価指標でDF生成

In [14]:
### ビジュアライズ

In [None]:
### ベストモデルの保存