### 電車遅延分数による定時 or 遅刻を、ナイーブベイズで分類
条件付き確率の時のデータから学習データを作成してやってみる。

In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [2]:
# Observation tables for on-time and late arrivals.
# Each entry is a (train_delay_minutes, head_count) pair; the delay runs from
# 0 to 10 minutes, so it is generated as the position within the count tuple.

# People who still arrived on time at each delay.
in_time = list(enumerate((15, 12, 10, 8, 7, 6, 2, 0, 0, 0, 0)))

# People who were late at each delay.
too_late = list(enumerate((0, 0, 0, 2, 3, 4, 5, 7, 8, 7, 4)))

In [5]:
# Total head count: everyone who arrived on time plus everyone who was late.
# Both tables are joined into one list of (delay, count) pairs, summed
# column-wise, and the count column (index 1) is taken.
num = np.sum(in_time + too_late, axis=0)[1]

In [6]:
# Pre-allocate one zero-filled row per person with two columns:
# column 0 receives the delay in minutes, column 1 the late/on-time label.
ml_data_np = np.zeros(shape=(num, 2))

In [7]:
# Number of (delay, count) entries in the on-time table.
len(in_time)

11

In [8]:
# Expand the on-time table into one row per person, starting at row 0.
# cnt is the index of the next free row; after the loop it points just past
# the last on-time row.
cnt = 0
for delay_minutes, head_count in in_time:
    for _ in range(head_count):
        # Row layout is (delay, label); label 0 means "not late".
        ml_data_np[cnt] = (delay_minutes, 0)
        cnt += 1

In [9]:
# Expand the late table into one row per person, placed after the on-time rows.
# The starting row is derived from in_time rather than taken from whatever
# value cnt currently holds, so running this cell a second time rewrites the
# same rows instead of indexing past the end of ml_data_np.
cnt = sum(on_time_count for _, on_time_count in in_time)
for delay_minutes, late_count in too_late:
    for _ in range(late_count):
        # Row layout is (delay, label); label 1 means "late".
        ml_data_np[cnt] = (delay_minutes, 1)
        cnt += 1

In [10]:
# Wrap the raw array in a DataFrame with named columns:
# column 0 is the delay in minutes, column 1 the 0/1 late label.
column_names = ['MinutesLate', 'LateTarget']
ml_data_pd = pd.DataFrame(data=ml_data_np, columns=column_names)

In [11]:
# Gaussian Naive Bayes classifier with scikit-learn's default settings.
clf = GaussianNB()

In [12]:
# Train on a single feature (minutes of delay) against the 0/1 late label.
# Selecting with a list of column names yields a 2-D (n_samples, 1) array,
# which is the feature-matrix shape scikit-learn estimators expect.
X_train = ml_data_pd[['MinutesLate']].values
y_train = ml_data_pd['LateTarget']
clf = clf.fit(X_train, y_train)

In [13]:
# Class-membership probabilities for every training row (one column per class),
# i.e. the predicted probabilities for each delay value.
delay_feature = ml_data_pd[['MinutesLate']].values
y_train_prob = clf.predict_proba(delay_feature)

In [14]:
# predict_proba orders its columns by class label, so column 1 belongs to
# label 1 (late); store it as the predicted probability of being late.
ml_data_pd['prob'] = y_train_prob[:, 1]

In [15]:
# Preview the first rows, now including the predicted probability of being late.
ml_data_pd.head()

Unnamed: 0,MinutesLate,LateTarget,prob
0,0.0,0.0,0.001574
1,0.0,0.0,0.001574
2,0.0,0.0,0.001574
3,0.0,0.0,0.001574
4,0.0,0.0,0.001574


In [16]:
# Distinct probability values produced by the model, in order of first appearance.
list(ml_data_pd['prob'].unique())

[0.0015736833310562891,
 0.005444352749944612,
 0.01952603620790533,
 0.07053685129948228,
 0.2324959743345932,
 0.5588055497774197,
 0.8472622824652528,
 0.9621902793943331,
 0.9918889954984073,
 0.9983780754022074,
 0.9996919786114439]

In [17]:
# Predict a hard 0/1 label for every training row and store it alongside
# the true label for comparison.
delay_feature = ml_data_pd[['MinutesLate']].values
y_pred = clf.predict(delay_feature)

ml_data_pd['prediction'] = y_pred

In [18]:
# Preview the first rows, now including the predicted label.
ml_data_pd.head()

Unnamed: 0,MinutesLate,LateTarget,prob,prediction
0,0.0,0.0,0.001574,0.0
1,0.0,0.0,0.001574,0.0
2,0.0,0.0,0.001574,0.0
3,0.0,0.0,0.001574,0.0
4,0.0,0.0,0.001574,0.0


In [19]:
# Preview the last rows (the late arrivals at the longest delays).
ml_data_pd.tail()

Unnamed: 0,MinutesLate,LateTarget,prob,prediction
95,9.0,1.0,0.998378,1.0
96,10.0,1.0,0.999692,1.0
97,10.0,1.0,0.999692,1.0
98,10.0,1.0,0.999692,1.0
99,10.0,1.0,0.999692,1.0


In [20]:
# Confusion matrix on the training data: rows are the true labels
# (0 = on time, 1 = late), columns are the predicted labels.
confusion_matrix(ml_data_pd['LateTarget'], ml_data_pd['prediction'])

array([[52,  8],
       [ 5, 35]], dtype=int64)

In [21]:
# Fraction of training rows whose predicted label matches the true label.
accuracy_score(ml_data_pd['LateTarget'], ml_data_pd['prediction'])

0.87