In [35]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [36]:
# Read the labelled training rows and the unlabelled test rows.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/train.csv', './data/test.csv')
)

# The sample submission is read without a header row; its first column
# becomes the index, so the remaining column ends up labelled 1.
sample_submit = pd.read_csv(
    './data/sample_submit.csv',
    header=None,
    index_col=0,
)

In [37]:
# Remove the 'id' column so it is not used as a feature.
# errors='ignore' keeps this cell re-runnable: train_df is reassigned here, so
# a second execution would otherwise raise KeyError because 'id' is already gone.
train_df = train_df.drop(columns=['id'], errors='ignore')

# Work on the rows that have no missing values.
df = train_df.dropna()

In [38]:
# Row/column count left after dropping rows with missing values.
df.shape

(1600, 7)

In [39]:
# Number of rows per value of the target column 'charges' (shows the class balance).
df['charges'].value_counts()

0    1256
1     198
2     146
Name: charges, dtype: int64

In [40]:
# List the column names available after cleaning.
df.columns

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges'], dtype='object')

In [41]:
# Target vector first, then every remaining column as a feature.
y = df['charges']
X = df.drop('charges', axis=1)

In [42]:
# One-hot encode the non-numeric feature columns; numeric columns pass through unchanged.
X = pd.get_dummies(X)

In [43]:
# Check that the encoded feature matrix and the target have the same number of rows.
X.shape, y.shape

((1600, 11), (1600,))

In [44]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. Stratifying on y keeps the class
# proportions the same in both partitions; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=0,
)

In [45]:
# Class counts in the training split (the split above was stratified on y).
y_train.value_counts()

0    1005
1     158
2     117
Name: charges, dtype: int64

# ロジスティック回帰
何も考えずにすべての変数を突っ込んだ

In [46]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Standardise the features, then fit a logistic regression on the scaled values.
pipe = make_pipeline(
    StandardScaler(),
    LogisticRegression(max_iter=500, C=1),
)

# fit() stays the last expression so the notebook renders the fitted pipeline.
pipe.fit(X_train, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregression', LogisticRegression(C=1, max_iter=500))])

In [47]:
from sklearn.metrics import accuracy_score

# Accuracy on the training split and on the held-out split.
acc_train, acc_test = (
    accuracy_score(y_true, pipe.predict(features))
    for features, y_true in ((X_train, y_train), (X_test, y_test))
)

print(f"acc_train: {acc_train}   acc_test: {acc_test}")

acc_train: 0.878125   acc_test: 0.859375


In [48]:
from sklearn.metrics import f1_score

# Macro-averaged F1 on the training split and on the held-out split.
f1_train, f1_test = (
    f1_score(y_true, pipe.predict(features), average='macro')
    for features, y_true in ((X_train, y_train), (X_test, y_test))
)

# Show the evaluation result.
print("f1_train:", f1_train, "f1_test", f1_test)

f1_train: 0.7437793556820361 f1_test 0.672745732597246


In [49]:
from sklearn.metrics import confusion_matrix

# One matrix per split; rows are the true classes, columns the predicted ones.
cm_train, cm_test = (
    confusion_matrix(actual, pipe.predict(features))
    for actual, features in ((y_train, X_train), (y_test, X_test))
)

# Print the confusion matrices: training split first, then the held-out split.
print(cm_train, cm_test, sep='\n')

[[960  39   6]
 [ 66  75  17]
 [  7  21  89]]
[[242   7   2]
 [ 19  14   7]
 [  3   7  19]]


## 提出

In [50]:
# Remove the 'id' column, as was done for the training data.
test = test_df.drop(columns=['id'])

# The test set is one-hot encoded on its own, so its columns depend on which
# categories happen to occur in the test file. Reindexing onto X_train.columns
# forces exactly the columns (and column order) the pipeline was fitted on;
# a dummy column that is absent from the test set is created and filled with 0.
test_X = pd.get_dummies(test).reindex(columns=X_train.columns, fill_value=0)
test_X.shape

(400, 11)

In [51]:
# Fill the submission frame's data column (labelled 1) with the predicted classes.
sample_submit[1] = pipe.predict(test_X)

# Written without a header row, matching how the sample file was read.
sample_submit.to_csv(
    'submit_lr.csv',
    header=None,
)

# Preview the first rows of what was written.
sample_submit.head()

Unnamed: 0_level_0,1
0,Unnamed: 1_level_1
13,0
23,2
27,0
28,0
29,0


In [53]:
# Inspect the one-hot encoded feature columns before reducing them.
X.head()

Unnamed: 0,age,bmi,children,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,26,32.665465,3,0,1,1,0,0,0,1,0
1,41,29.798725,1,0,1,1,0,0,0,0,1
2,28,32.722029,0,0,1,0,1,0,1,0,0
3,20,38.429831,2,1,0,1,0,0,0,1,0
4,45,29.641854,1,1,0,1,0,0,1,0,0


In [54]:
# Reduce the feature set: the region dummies are removed entirely and only one
# dummy is kept for each of the two-level variables sex and smoker.
# Selecting the columns to keep (instead of dropping from X in place) leaves
# the same five columns in the same order, and makes this cell safe to re-run:
# a second X.drop(...) would raise KeyError because the columns are already gone.
X = X[['age', 'bmi', 'children', 'sex_male', 'smoker_yes']]

In [55]:

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Split the reduced feature matrix with the same settings as the earlier split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=0,
)

# Same scaler + logistic regression as before, refitted on the reduced features.
# fit() returns the pipeline itself, so it can be chained onto construction.
pipe = make_pipeline(
    StandardScaler(),
    LogisticRegression(C=1, max_iter=500),
).fit(X_train, y_train)

# Render the fitted pipeline as the cell output.
pipe

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregression', LogisticRegression(C=1, max_iter=500))])

In [56]:
# Macro-averaged F1 of the refitted pipeline on each split.
f1_train = f1_score(
    y_true=y_train,
    y_pred=pipe.predict(X_train),
    average='macro',
)
f1_test = f1_score(
    y_true=y_test,
    y_pred=pipe.predict(X_test),
    average='macro',
)

# Show the evaluation result.
print("f1_train:", f1_train, "f1_test", f1_test)

f1_train: 0.7441800794941712 f1_test 0.6795692936043812


In [61]:
# Remove the 'id' column, as was done for the training data.
test = test_df.drop(columns=['id'])

# Keep exactly the columns the current pipeline was fitted on, in the same
# order. This replaces a hard-coded drop list that duplicated the one used for
# the training features and would raise KeyError if one of those dummy columns
# did not exist in the test set; a missing kept column is created and filled with 0.
test_X = pd.get_dummies(test).reindex(columns=X_train.columns, fill_value=0)

test_X.shape

(400, 5)

In [63]:
# Inspect the reduced test feature columns that are passed to the pipeline.
test_X.head()

Unnamed: 0,age,bmi,children,sex_male,smoker_yes
0,19,24.365178,1,1,0
1,59,33.997763,0,1,1
2,42,29.28345,0,0,0
3,30,24.903725,1,1,0
4,41,29.644536,0,0,0


In [62]:
# Overwrite the submission frame's data column (labelled 1) with the
# predictions of the pipeline trained on the reduced feature set.
sample_submit[1] = pipe.predict(test_X)

# Written without a header row, matching how the sample file was read.
sample_submit.to_csv(
    'submit_lr_downfeatures.csv',
    header=None,
)

# Preview the first rows of what was written.
sample_submit.head()

Unnamed: 0_level_0,1
0,Unnamed: 1_level_1
13,0
23,2
27,0
28,0
29,0


- ダウンサンプリングしてみる
- 少数クラスは全件をそのまま使い、多数クラスはランダム抽出を何度も繰り返す
- アンサンブル

## ダウンサンプリング

In [30]:
# 学習用データの説明変数と目的変数を結合
train = pd.concat([X_train,y_train],axis=1)

# 完済のデータと貸し倒れのデータを別々の変数に代入
fp = train[train['ChargedOff'] == 0]
co = train[train['ChargedOff'] == 1]

# 貸し倒れのデータ数と同じ数だけ完済のデータをランダムに取り出し
fp = fp.sample(n=co.shape[0], random_state=0)

# 完済のデータと貸し倒れのデータを縦結合
train = pd.concat([fp, co])

# 説明変数と目的変数をそれぞれ別の変数に代入
X_train = train.drop(columns=['ChargedOff'])
print(X_train.shape)

y_train = train['ChargedOff']
print(y_train.shape)

KeyError: 'ChargedOff'