In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Read the training set, the test set and the submission template from ./data.
train_df = pd.read_csv('./data/train.csv')
test_df = pd.read_csv('./data/test.csv')

# The template is read without a header row; its first column is used as the index.
sample_submit = pd.read_csv(
    './data/sample_submit.csv',
    index_col=0,
    header=None,
)

In [3]:
# Remove the `id` column, then discard every row that contains a missing value.
# `train_df` is reassigned in place, so on a second execution of this cell the
# column is already gone; errors='ignore' keeps the cell safe to re-run instead
# of raising KeyError.
train_df = train_df.drop(columns=['id'], errors='ignore')
df = train_df.dropna()

In [4]:
# Show how many rows fall into each value of the target column `charges`.
df.loc[:, 'charges'].value_counts()

0    1256
1     198
2     146
Name: charges, dtype: int64

In [5]:
# Target: the `charges` column. Features: every other column of the cleaned frame.
y = df['charges']
X = df.drop(columns='charges')

In [6]:
# One-hot encode the categorical feature columns; numeric columns pass through unchanged.
X = pd.get_dummies(data=X)

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. The split is stratified on the target
# and seeded so that it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=0,
)

# Display the shape of the training features and of the held-out target.
(X_train.shape, y_test.shape)

((1280, 11), (320,))

# ダウンサンプリング
目的変数`1`のサンプル数に`0`のサンプル数をそろえた

In [8]:
# Put the training features and the target back into a single frame.
train = pd.concat([X_train, y_train], axis=1)

# Separate the training rows by target class (`charges` is 0, 1 or 2).
low = train.loc[train['charges'] == 0]
mid = train.loc[train['charges'] == 1]
high = train.loc[train['charges'] == 2]

# Downsample class 0 to as many rows as class 1 has (seeded for repeatability).
# Class 2 is left untouched.
low = low.sample(n=len(mid), random_state=0)

# Stack the three classes back into one training frame.
train = pd.concat([low, mid, high], axis=0)

# Split the rebalanced frame into features and target again and show both shapes.
X_train = train.drop(columns='charges')
print(X_train.shape)

y_train = train['charges']
print(y_train.shape)

(433, 11)
(433,)


In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Standardize the features, then classify with logistic regression.
pipe = make_pipeline(
    StandardScaler(),
    LogisticRegression(C=1, max_iter=500),
)

# Fit on the current training data; the fitted pipeline is shown as the cell output.
pipe.fit(X_train, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregression', LogisticRegression(C=1, max_iter=500))])

In [10]:
from sklearn.metrics import accuracy_score

# Accuracy of the fitted pipeline on the current training data and on the held-out split.
acc_train, acc_test = (
    accuracy_score(target, pipe.predict(features))
    for features, target in ((X_train, y_train), (X_test, y_test))
)

print("".join(["acc_train: ", str(acc_train), "   acc_test: ", str(acc_test)]))

acc_train: 0.7274826789838337   acc_test: 0.8


In [11]:
from sklearn.metrics import f1_score

# Macro-averaged F1 (unweighted mean of the per-class F1 scores) on the current
# training data and on the held-out split.
f1_train = f1_score(y_true=y_train, y_pred=pipe.predict(X_train), average='macro')
f1_test = f1_score(y_true=y_test, y_pred=pipe.predict(X_test), average='macro')

# Report both scores.
print("f1_train:", f1_train, "f1_test", f1_test)

f1_train: 0.7302492914569957 f1_test 0.6511111111111111


In [86]:
from sklearn.metrics import confusion_matrix

# Confusion matrices for the current training data and the held-out split
# (scikit-learn convention: rows are true classes, columns are predicted classes).
cm_train, cm_test = (
    confusion_matrix(target, pipe.predict(features))
    for features, target in ((X_train, y_train), (X_test, y_test))
)

# Print the training matrix first, then the held-out matrix.
print(cm_train, cm_test, sep="\n")

[[142  14   2]
 [ 51  91  16]
 [  5  23  89]]
[[222  25   4]
 [ 12  21   7]
 [  1   9  19]]


## 提出

In [12]:
# Prepare the test features the same way as the training features.
test = test_df.drop(columns=['id'])

# One-hot encode, then align to the columns the model was trained on.
# get_dummies only creates columns for categories that occur in the frame it is
# given, so encoding the test set on its own can yield missing, extra or
# differently ordered columns. Reindexing to X.columns (the encoded training
# features) fixes the order, adds absent dummy columns as 0 and drops any
# column the model has never seen.
test_X = pd.get_dummies(test).reindex(columns=X.columns, fill_value=0)
test_X.shape

(400, 11)

In [14]:
# Store the pipeline's class predictions for the test features in column 1 of the
# submission template (assigned positionally, so the row counts must match).
sample_submit[1] = pipe.predict(test_X)
# Write the submission file without a header row; the index column is still written.
sample_submit.to_csv('submit_ds.csv', header=None)
# Preview the first rows of what was written.
sample_submit.head()

Unnamed: 0_level_0,1
0,Unnamed: 1_level_1
13,0
23,2
27,0
28,0
29,0


## アンサンブル

In [15]:
# Redo the seeded, stratified 80/20 split so that X_train / y_train hold the full
# training portion again rather than a downsampled subset.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=0,
)

# Features and target side by side; show the combined shape.
train = pd.concat(objs=[X_train, y_train], axis=1)
train.shape

(1280, 12)

In [85]:
# Ensemble of logistic-regression pipelines, each fitted on a different random
# downsample of class 0, all predicting the same held-out split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0, stratify=y)

# Full training portion (features + target). It is kept under its own name so the
# loop below never overwrites it: the original reassigned `train` inside the loop,
# so from the second iteration on class 0 was "sampled" from the already
# downsampled rows and every model saw exactly the same row set.
train_full = pd.concat([X_train, y_train], axis=1)

# The per-class split does not change between iterations, so do it once.
low_all = train_full[train_full['charges'] == 0]
mid = train_full[train_full['charges'] == 1]
high = train_full[train_full['charges'] == 2]

N_MODELS = 5
preds = []
for i in range(N_MODELS):
    # Draw a fresh class-0 subset the size of class 1. Seeding with the loop
    # index gives each model a different subset while keeping the run reproducible.
    low = low_all.sample(n=mid.shape[0], random_state=i)

    # Rebalanced training frame for this model.
    train = pd.concat([low, mid, high])

    # Split it into features and target.
    X_train = train.drop(columns=['charges'])
    y_train = train['charges']

    pipe = make_pipeline(StandardScaler(), LogisticRegression(C=1, max_iter=500))
    pipe.fit(X_train, y_train)

    pred = pipe.predict(X_test)
    preds.append(pred)

# One row per model, one column per held-out sample.
pred_list = np.vstack(preds)
