In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the training data and the test data from the local data directory.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/train.csv', './data/test.csv')
)
# The sample submission is read without a header row; its first column
# becomes the index.
sample_submit = pd.read_csv(
    './data/submit.csv',
    header=None,
    index_col=0,
)

In [3]:
# Preview the first rows of the sample submission to see its layout.
sample_submit.head()

Unnamed: 0_level_0,1
0,Unnamed: 1_level_1
1496754,0
84909594,0
1165403,0
91354446,0
85636932,0


In [4]:
# Show both shapes side by side to compare the row counts of the test set
# and the sample submission.
test_df.shape,sample_submit.shape

((26906, 9), (26906, 1))

In [5]:
# Work on a copy of the training data without the `id` column.
df = train_df.drop('id', axis=1)

In [6]:
# Discard every row that has a missing value in any column.
df = df.dropna(axis=0, how='any')

In [7]:
# List the columns available in the cleaned training frame.
df.columns

Index(['loan_amnt', 'term', 'interest_rate', 'grade', 'employment_length',
       'purpose', 'credit_score', 'application_type', 'loan_status'],
      dtype='object')

In [8]:
# Explanatory variables used by the model; `loan_amnt` and
# `employment_length` are left out.
feature_cols = [
    'term',
    'interest_rate',
    'grade',
    'purpose',
    'credit_score',
    'application_type',
]
X = df[feature_cols]
# Target variable.
y = df['loan_status']

In [9]:
# One-hot encode the categorical feature columns; numeric columns pass
# through unchanged.
X = pd.get_dummies(data=X)
# Expand the target into one indicator column per loan status.
y = pd.get_dummies(data=y)

In [10]:
# Inspect the one-hot encoded feature matrix.
X.head()

Unnamed: 0,interest_rate,credit_score,term_3 years,term_5 years,grade_A1,grade_A2,grade_A3,grade_A4,grade_A5,grade_B1,...,purpose_major_purchase,purpose_medical,purpose_moving,purpose_other,purpose_renewable_energy,purpose_small_business,purpose_vacation,purpose_wedding,application_type_Individual,application_type_Joint App
1,16.29,700.0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,21.98,670.0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,8.59,710.0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
4,13.99,680.0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
5,7.35,790.0,1,0,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,1,0


In [11]:
# Keep only the positive-class indicator (1 = charged off, 0 = otherwise).
# Selecting the column explicitly, rather than dropping 'FullyPaid', makes
# this cell safe to re-run (a second drop would raise KeyError) and
# guarantees a single-column target.
y = y[['ChargedOff']]

In [12]:
# Inspect the target frame, which now holds only the ChargedOff indicator.
y.head()

Unnamed: 0,ChargedOff
1,1
2,0
3,0
4,0
5,0


In [13]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. The split is stratified on the
# label and uses a fixed seed so that it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y['ChargedOff'],
    random_state=0,
)

In [14]:
from sklearn.linear_model import LogisticRegression

# Baseline: logistic regression with default settings on the imbalanced
# training split. The target is flattened to 1-D before fitting.
lr = LogisticRegression()
lr.fit(X_train, y_train.to_numpy().ravel())

LogisticRegression()

In [15]:
# Predict on the hold-out split with the baseline model.
pred = lr.predict(X_test)

# Compare how many positives were predicted with how many actually exist.
predicted_positive_count = pred.sum()
actual_positive_count = y_test.sum()
print(predicted_positive_count, actual_positive_count)

294 ChargedOff    8909
dtype: int64


In [16]:
# Import f1_score
from sklearn.metrics import f1_score

# F1 of the baseline predictions `pred` against the hold-out labels `y_test`
f_one = f1_score(y_true=y_test, y_pred=pred)

# Show the score
print(f_one)

0.036509833749864176


In [17]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the baseline predictions `pred` against the
# hold-out labels `y_test`
cm = confusion_matrix(y_true=y_test, y_pred=pred)

# Show the confusion matrix
print(cm)

[[36760   126]
 [ 8741   168]]


### ダウンサンプリング

クエストで紹介されている方法：多数派クラス（完済）を、少数派クラス（貸し倒れ）と同じ件数になるようにランダムに間引く。

In [18]:
# Re-join the training features and labels so rows can be filtered by class
train = pd.concat([X_train, y_train], axis=1)

# Charged-off rows are all kept
co = train[train['ChargedOff'] == 1]

# From the fully-paid rows, randomly draw as many rows as there are
# charged-off rows (fixed seed for reproducibility)
fp = train[train['ChargedOff'] == 0].sample(n=len(co), random_state=0)

# Stack the two equally sized classes into the balanced training set
train = pd.concat([fp, co])

# Split the balanced set back into features and label.
# Note: this overwrites X_train / y_train from the original split.
X_train = train.drop(columns=['ChargedOff'])
print(X_train.shape)

y_train = train['ChargedOff']
print(y_train.shape)

(71272, 54)
(71272,)


In [19]:
# Second model: same default logistic regression, fitted on the
# downsampled (class-balanced) training data.
lr2 = LogisticRegression()
lr2.fit(X_train, y_train.to_numpy().ravel())

LogisticRegression()

In [20]:
# F1 of lr2 on the data it was trained on
pred_train = lr2.predict(X_train)
f1_2_train = f1_score(y_true=y_train, y_pred=pred_train)

# Show the score
print("F1_train", f1_2_train)

F1_train 0.6422277332207683


In [21]:
# F1 of lr2 on the hold-out split
pred_test = lr2.predict(X_test)
f1_2_test = f1_score(y_true=y_test, y_pred=pred_test)

# Show the score
print("F1_test", f1_2_test)

F1_test 0.38744788517233697


In [22]:
# Confusion matrix of lr2 on the training data
cm2_train = confusion_matrix(y_true=y_train, y_pred=lr2.predict(X_train))

# Show the confusion matrix
print(cm2_train)

[[19807 15829]
 [11293 24343]]


In [23]:
# Confusion matrix of lr2 on the hold-out split
cm2_test = confusion_matrix(y_true=y_test, y_pred=lr2.predict(X_test))

# Show the confusion matrix
print(cm2_test)

[[20461 16425]
 [ 2822  6087]]


imbalanced-learn の RandomUnderSampler を使ってアンダーサンプリングするやり方もあるらしい。

## 提出

In [24]:
# Check the size of the raw test set before preprocessing.
test_df.shape

(26906, 9)

In [25]:
# Build the feature matrix for the submission data using the same columns
# that were selected for training.
test = test_df.drop(columns=['id'])

test_X = test[['term', 'interest_rate', 'grade', 'purpose', 'credit_score', 'application_type']]

# One-hot encode, then align to the training feature columns. Encoding the
# test set on its own only yields matching columns if it contains exactly
# the same categories as the training data. Reindexing adds any missing
# dummy column as all zeros, drops categories the model never saw, and
# fixes the column order the model was fitted with.
# NOTE(review): rows with missing values were dropped from the training
# data but are left untouched here - confirm the selected test columns
# contain no NaN.
test_X = pd.get_dummies(test_X).reindex(columns=X_train.columns, fill_value=0)
test_X.shape

(26906, 54)

In [26]:
# Overwrite the placeholder column of the sample submission with lr2's
# predictions. The assignment is positional - assumes sample_submit rows
# are in the same order as test_df; TODO confirm.
submit_pred = lr2.predict(test_X)
sample_submit[1] = submit_pred

# Write the submission file without a header row, then preview it.
sample_submit.to_csv('submit.csv', header=None)
sample_submit.head()

Unnamed: 0_level_0,1
0,Unnamed: 1_level_1
1496754,0
84909594,0
1165403,0
91354446,1
85636932,1
