## ランダムフォレスト　デフォルト

In [1]:
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error, confusion_matrix 
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split

from sklearn import tree


In [2]:
# Load the training data from a pickle file.
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
train_pkl = pd.read_pickle('./train.pk3')

In [3]:
# Rows and columns of the training frame as loaded.
train_pkl.shape

(92305, 10)

In [4]:
# Summary statistics of the numeric columns.
train_pkl.describe()

Unnamed: 0,id,loan_amnt,term,interest_rate,employment_length,credit_score,application_type,loan_status
count,92305.0,92305.0,92305.0,92305.0,92305.0,92305.0,92305.0,92305.0
mean,121124.538606,1572.540658,3.532149,14.725193,6.698435,680.740141,0.007042,0.45887
std,69873.46366,843.143393,0.883812,4.839855,3.688196,27.342747,0.08362,0.498308
min,0.0,323.797279,3.0,5.704849,0.0,655.516957,0.0,0.0
25%,60491.0,1016.508908,3.0,11.25632,3.0,659.327071,0.0,0.0
50%,121097.0,1249.990558,3.0,14.026507,9.0,677.793438,0.0,0.0
75%,181620.0,2193.009346,5.0,17.965103,10.0,697.904809,0.0,1.0
max,242149.0,3851.867974,5.0,27.84603,10.0,807.293196,1.0,1.0


In [5]:
# Drop the object-dtype columns.
# Reassigning instead of using inplace=True, together with errors='ignore',
# keeps this cell idempotent: re-running it no longer raises KeyError once the
# columns have already been removed.
train_pkl = train_pkl.drop(columns=['grade', 'purpose'], errors='ignore')

In [6]:
# Check the remaining columns, their dtypes and non-null counts.
train_pkl.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 92305 entries, 21860 to 236408
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 92305 non-null  int64  
 1   loan_amnt          92305 non-null  float64
 2   term               92305 non-null  int16  
 3   interest_rate      92305 non-null  float64
 4   employment_length  92305 non-null  int16  
 5   credit_score       92305 non-null  float64
 6   application_type   92305 non-null  int8   
 7   loan_status        92305 non-null  int8   
dtypes: float64(3), int16(2), int64(1), int8(2)
memory usage: 4.0 MB


## 訓練データとテストデータに分割

In [7]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split
# reproducible.
train, test = train_test_split(
    train_pkl,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Split the training rows into features and target.
# Columns are selected by name rather than by position so the selection cannot
# silently shift (or leak the target into the features) if the column order
# ever changes. With the current schema this yields the same six feature
# columns, in the same order, as the positional slice 1:-1.
train_X = train.drop(columns=['id', 'loan_status'])
train_y = train['loan_status']

In [9]:
# Feature matrix and target must have the same number of rows.
train_X.shape, train_y.shape

((73844, 6), (73844,))

## 訓練データで訓練

In [10]:
# Random forest with default hyperparameters; random_state fixes the seed so
# repeated runs build the same forest.
RF = RandomForestClassifier(random_state=42)

In [11]:
# Fit the forest on the training features and labels.
RF = RF.fit(train_X, train_y)

In [12]:
# Per-feature importances of the fitted forest, one value per column of train_X.
RF.feature_importances_

array([0.27783995, 0.02362444, 0.3591572 , 0.04618476, 0.29086303,
       0.00233062])

In [13]:
# Pair each importance (rounded to 3 decimals) with its feature name and list
# the pairs from most to least important.
sorted(
    ((round(importance, 3), column)
     for importance, column in zip(RF.feature_importances_, train_X.columns)),
    reverse=True)

[(0.359, 'interest_rate'),
 (0.291, 'credit_score'),
 (0.278, 'loan_amnt'),
 (0.046, 'employment_length'),
 (0.024, 'term'),
 (0.002, 'application_type')]

In [14]:
# Visualize a fitted decision tree (currently disabled).
# NOTE(review): this snippet looks carried over from another notebook -- it
# references `train.Survived`, which is not a column of this dataset, and it
# passes the whole forest `RF`, whereas `tree.export_graphviz` is documented as
# taking a single decision tree (presumably one of `RF.estimators_`).
# TODO confirm and adapt before enabling.
# dot_data = tree.export_graphviz(RF, out_file=None,
#                                feature_names=train.iloc[:, 1:].columns,
#                                class_names=train.Survived.name,
#                                rounded=True,
#                                filled=True,
#                                special_characters=True)

## テストデータで実行

In [15]:
# Split the hold-out rows into features and target.
# Columns are selected by name rather than by position so the selection cannot
# silently shift (or leak the target into the features) if the column order
# ever changes. With the current schema this yields the same six feature
# columns, in the same order, as the positional slice 1:-1.
test_x = test.drop(columns=['id', 'loan_status'])
test_y = test['loan_status']

In [16]:
# Feature matrix and target of the hold-out set must have the same number of rows.
test_x.shape, test_y.shape

((18461, 6), (18461,))

In [17]:
# Share of positive labels (loan_status == 1) in the training and hold-out
# sets, to check that the random split kept the class balance similar.
train_y.sum() / train_y.shape[0], test_y.sum() / test_y.shape[0]

(0.45804669302854667, 0.4621634797681599)

In [18]:
# Predict class labels for the hold-out features.
pred_y = RF.predict(test_x)

In [19]:
# One prediction per hold-out row.
pred_y.shape

(18461,)

In [20]:
# Number of actual positives vs. number of predicted positives.
test_y.sum(), pred_y.sum()

(8532, 7780)

In [21]:
# Confusion matrix: rows are the true class, columns the predicted class.
confusion_matrix(test_y, pred_y)

array([[6762, 3167],
       [3919, 4613]], dtype=int64)

In [22]:
# Fraction of hold-out rows classified correctly.
accuracy_score(test_y, pred_y)

0.6161638047776393

In [23]:
# F1 score for the positive class (harmonic mean of precision and recall).
f1_score(test_y, pred_y)

0.5655958803334967

In [24]:
# Recall for the positive class: TP / (TP + FN).
recall_score(test_y, pred_y)

0.5406704172526957

In [25]:
# Recall recomputed by hand as a cross-check of recall_score: TP / (TP + FN).
# The confusion matrix is built once and unpacked (row-major order of the 2x2
# matrix: TN, FP, FN, TP) instead of being recomputed three times.
tn, fp, fn, tp = confusion_matrix(test_y, pred_y).ravel()
tp / (tp + fn)

0.5406704172526957

In [26]:
# Precision for the positive class: TP / (TP + FP).
precision_score(test_y, pred_y)

0.5929305912596401

In [27]:
# Precision recomputed by hand as a cross-check of precision_score:
# TP / (TP + FP). The confusion matrix is built once and unpacked (row-major
# order of the 2x2 matrix: TN, FP, FN, TP) instead of being recomputed three
# times.
tn, fp, fn, tp = confusion_matrix(test_y, pred_y).ravel()
tp / (tp + fp)

0.5929305912596401

In [28]:
# F1 recomputed by hand as the harmonic mean of precision and recall, as a
# cross-check of f1_score.
precision = precision_score(test_y, pred_y)
recall = recall_score(test_y, pred_y)
2 / ((1 / precision) + (1 / recall))

0.5655958803334967

## 検証データで実行

In [29]:
# Load the validation data (stored in ./test.pk3).
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
valid = pd.read_pickle('./test.pk3')

In [30]:
# Rows and columns of the validation frame as loaded.
valid.shape

(26900, 9)

In [31]:
# Keep the ids as an array so they can be attached to the predictions later.
valid_pass = valid.id.values

In [32]:
# Work on a copy so that `valid` itself stays unmodified.
valid_X = valid.copy()

In [33]:
# Build the feature frame from `valid` directly instead of dropping in place:
# the cell becomes idempotent (re-running it does not raise KeyError on the
# already-removed columns) and `valid` itself is left untouched.
valid_X = valid.drop(columns=['id', 'grade', 'purpose'])

In [34]:
# Summary statistics of the validation features.
valid_X.describe()

Unnamed: 0,loan_amnt,term,interest_rate,employment_length,credit_score,application_type
count,26900.0,26900.0,26900.0,26900.0,26900.0,26900.0
mean,1515.095458,3.433309,13.778459,6.700669,683.840319,0.007175
std,827.710449,0.823945,4.587711,3.637777,29.685781,0.084401
min,353.090088,3.0,5.745858,0.0,655.435675,0.0
25%,753.766957,3.0,10.846102,3.0,659.589762,0.0
50%,1213.75448,3.0,13.551492,9.0,678.70787,0.0
75%,2147.699462,3.0,17.128929,10.0,698.684676,0.0
max,3812.969623,5.0,27.503147,10.0,811.676356,1.0


In [35]:
# The validation and training feature frames must have the same number of columns.
valid_X.shape, train_X.shape

((26900, 6), (73844, 6))

In [36]:
# Predict on the validation features. The columns are selected by name in the
# training order, so the model always receives features in the layout it was
# fitted on; a missing column fails loudly with KeyError instead of producing
# predictions from misaligned features.
pred_valid_y = RF.predict(valid_X[train_X.columns])

In [37]:
# Sanity check: one prediction per saved id.
valid_pass.shape, pred_valid_y.shape

((26900,), (26900,))

In [38]:
# Check the container types of the ids and the predictions.
type(valid_pass), type(pred_valid_y)

(numpy.ndarray, numpy.ndarray)

In [39]:
# Assemble the submission frame: predictions in a single 'result' column,
# indexed by the saved ids.
result_df = pd.DataFrame(
    data=pred_valid_y,
    index=valid_pass,
    columns=['result'],
)

In [40]:
# Preview the first rows of the submission frame.
result_df.head()

Unnamed: 0,result
242150,0
242151,1
242152,0
242153,0
242154,0


In [41]:
# Write the predictions to CSV. header=False omits the column-name row; the
# index (the ids) is still written as the first column.
result_df.to_csv("./RF_3_1.csv", header=False)