### 電車遅延分数による定時 or 遅刻を、ナイーブベイズで分類

In [22]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [3]:
# Class labels for the 100 samples: 0 = on time, 1 = late.
# The first 60 samples are on-time cases, the remaining 40 are late cases.
LateTarget = [0] * 60 + [1] * 40

In [4]:
# Feature: train delay in minutes, one value per sample, listed in the same
# order as the class labels (60 on-time samples first, then 40 late samples).
# Written as runs of "[minutes] * count".
MinutesLate = (
    # on-time samples (60 values)
    [0] * 15 + [1] * 12 + [2] * 10 + [3] * 8 + [4] * 7 + [5] * 6 + [6] * 2
    # late samples (40 values)
    + [3] * 2 + [4] * 3 + [5] * 4 + [6] * 5 + [7] * 7 + [8] * 8 + [9] * 7 + [10] * 4
)

In [8]:
# Build the data frame: one row per sample, with the class label and the
# delay in minutes as its two columns.
ml_df = pd.DataFrame(
    data={'LateTarget': LateTarget, 'MinutesLate': MinutesLate},
    columns=['LateTarget', 'MinutesLate'],
)

In [9]:
# Show rows 55-64 (by position), the stretch where the label switches from 0 to 1.
ml_df.iloc[55:65]

Unnamed: 0,LateTarget,MinutesLate
55,0,5
56,0,5
57,0,5
58,0,6
59,0,6
60,1,3
61,1,3
62,1,4
63,1,4
64,1,4


In [11]:
# Use the Gaussian Naive Bayes classifier, created with its default parameters.
clf = GaussianNB()

In [13]:
# Train on the single feature (delay minutes) against the on-time/late label.
# Selecting with a list of column names keeps the feature two-dimensional,
# shape (n_samples, 1), which is the layout scikit-learn estimators expect.
clf = clf.fit(ml_df[['MinutesLate']].values, ml_df['LateTarget'])

In [14]:
# Class-membership probabilities for every sample, computed from its
# MinutesLate value (feature passed as an (n_samples, 1) array).
ypredprob = clf.predict_proba(ml_df[['MinutesLate']].values)

In [15]:
# Keep column 1 of the probability array. predict_proba orders its columns
# like clf.classes_, so with labels 0/1 this is the probability of class 1 (late).
ml_df['prob'] = ypredprob[:, 1]

In [16]:
# Show rows 55-64 (by position) again, now including the 'prob' column.
ml_df.iloc[55:65]

Unnamed: 0,LateTarget,MinutesLate,prob
55,0,5,0.558806
56,0,5,0.558806
57,0,5,0.558806
58,0,6,0.847262
59,0,6,0.847262
60,1,3,0.070537
61,1,3,0.070537
62,1,4,0.232496
63,1,4,0.232496
64,1,4,0.232496


In [18]:
# Distinct late-probabilities in order of first appearance; each distinct
# delay value (in minutes) maps to one probability.
list(pd.unique(ml_df['prob']))

[0.0015736833310562891,
 0.005444352749944612,
 0.01952603620790533,
 0.07053685129948228,
 0.2324959743345932,
 0.5588055497774197,
 0.8472622824652528,
 0.9621902793943331,
 0.9918889954984073,
 0.9983780754022074,
 0.9996919786114439]

In [20]:
# Predict the class (on time / late) of each sample from its delay minutes,
# store the result as a new column, and show the first five rows.
y_pred = clf.predict(ml_df[['MinutesLate']].values)
ml_df['prediction'] = y_pred
ml_df.head(n=5)

Unnamed: 0,LateTarget,MinutesLate,prob,prediction
0,0,0,0.001574,0
1,0,0,0.001574,0
2,0,0,0.001574,0
3,0,0,0.001574,0
4,0,0,0.001574,0


In [21]:
# Show the last five rows.
ml_df.tail(n=5)

Unnamed: 0,LateTarget,MinutesLate,prob,prediction
95,1,9,0.998378,1
96,1,10,0.999692,1
97,1,10,0.999692,1
98,1,10,0.999692,1
99,1,10,0.999692,1


In [27]:
# Evaluate with a confusion matrix (rows: actual class, columns: predicted
# class, both in the order 0 = on time, 1 = late).
confusion_matrix(y_true=ml_df['LateTarget'], y_pred=ml_df['prediction'], labels=[0, 1])

array([[52,  8],
       [ 5, 35]], dtype=int64)

In [25]:
# Accuracy: the fraction of samples whose predicted class equals the label.
accuracy_score(y_true=ml_df['LateTarget'], y_pred=ml_df['prediction'])

0.87