In [2]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
%matplotlib inline

### 베이즈 정리
- $P(H|E) = \frac{P(E|H) \times P(H)}{P(E)}$

In [3]:
# Purchase flags ('y'/'n') per product category, plus gender ('m'/'f'), for 12 customers.
# Position k in every list refers to the same customer.
apparel = list('ynyyyyynyyyy')
leisure = list('nynnyynyynny')
furniture = list('nnnyyyynnnny')
kitchenware = list('nnyyyyyynynn')
gender = list('mmffmfffmfmf')

In [8]:
# One row per customer, one column per attribute.
buy = pd.DataFrame({
    'apparel': apparel,
    'leisure': leisure,
    'furniture': furniture,
    'kitchenware': kitchenware,
    'gender': gender,
})
buy

Unnamed: 0,apparel,leisure,furniture,kitchenware,gender
0,y,n,n,n,m
1,n,y,n,n,m
2,y,n,n,y,f
3,y,n,y,y,f
4,y,y,y,y,m
5,y,y,y,y,f
6,y,n,y,y,f
7,n,y,n,y,f
8,y,y,n,n,m
9,y,n,n,y,f


In [14]:
# Customer Q: purchases are known, gender is the unknown to be inferred.
q_profile = {
    'apparel': 'y',
    'leisure': 'y',
    'furniture': 'n',
    'kitchenware': 'y',
    'gender': None,
}
cust_Q = pd.DataFrame([list(q_profile.values())], columns=list(q_profile))
cust_Q

Unnamed: 0,apparel,leisure,furniture,kitchenware,gender
0,y,y,n,y,


### 고객 Q가 상품을 구매한 사건 $E_Q : (A=y) \land (L=y) \land (F=n) \land (K=y)$
* A:  apparel, L:  leisure, F:  furniture, K:  kitchenware

### 고객 Q가 남자일 확률 $P(G=m | E_Q)$
$P(G=m | E_Q) = \frac{P(E_Q|G=m)\times P(G=m)}{P(E_Q)}$

In [146]:
# Count, per category and gender, how many customers bought ('y') or did not buy ('n').
temp = (
    buy
    .melt(id_vars='gender', var_name='category')
    .assign(c=1)  # one unit per (customer, category) so that 'sum' counts rows
    .pivot_table(index='value', columns=['category', 'gender'], aggfunc='sum')
    .sort_index(ascending=False)  # 'y' row above 'n' row
)
temp

Unnamed: 0_level_0,c,c,c,c,c,c,c,c
category,apparel,apparel,furniture,furniture,kitchenware,kitchenware,leisure,leisure
gender,f,m,f,m,f,m,f,m
value,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3
y,6,4,4,1,6,1,3,3
n,1,1,3,4,1,4,4,2


In [147]:
# Drop the outermost column level (the 'c' helper) by rebuilding the MultiIndex from the
# remaining (category, gender) pairs. The rebuilt index carries no level names.
temp.columns =  pd.MultiIndex.from_tuples([col[1:] for col in temp.columns])
temp

Unnamed: 0_level_0,apparel,apparel,furniture,furniture,kitchenware,kitchenware,leisure,leisure
Unnamed: 0_level_1,f,m,f,m,f,m,f,m
value,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
y,6,4,4,1,6,1,3,3
n,1,1,3,4,1,4,4,2


In [148]:
# Rearrange the columns to (gender, category) with 'm' before 'f' inside each category.
# swaplevel moves gender to the outer level; sort_index orders by category ascending and
# gender descending; the final iloc uses hard-coded positions to move the leisure pair
# directly after apparel, so it relies on exactly these 8 columns being present.
temp = temp.swaplevel(axis=1).sort_index(level=[1, 0], axis=1, ascending=[True, False]).iloc[:,[0,1,6,7,2,3,4,5]]
temp

Unnamed: 0_level_0,m,f,m,f,m,f,m,f
Unnamed: 0_level_1,apparel,apparel,leisure,leisure,furniture,furniture,kitchenware,kitchenware
value,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
y,4,6,3,3,1,4,1,6
n,1,1,2,4,4,3,4,1


In [156]:
# Append "count/total" strings giving each cell's share of its (gender, category) column.
# Only the 'y'/'n' count rows are used, so re-running the cell neither stacks extra rate
# rows nor tries to sum the rate strings.
counts = temp.loc[['y', 'n']]
col_totals = counts.sum()  # computed once instead of once per formatted cell
rates = pd.DataFrame(
    [
        # .iloc makes the positional lookup explicit; plain [j] on this label-indexed
        # Series relies on a deprecated positional fallback.
        [f'{counts.iloc[i, j]}/{col_totals.iloc[j]}' for j in range(counts.shape[1])]
        for i in range(counts.shape[0])
    ],
    columns=counts.columns,
    index=['y_rate', 'n_rate'],
)
temp = pd.concat([counts, rates])
temp

Unnamed: 0_level_0,m,f,m,f,m,f,m,f
Unnamed: 0_level_1,apparel,apparel,leisure,leisure,furniture,furniture,kitchenware,kitchenware
y,4,6,3,3,1,4,1,6
n,1,1,2,4,4,3,4,1
y_rate,4/5,6/7,3/5,3/7,1/5,4/7,1/5,6/7
n_rate,1/5,1/7,2/5,4/7,4/5,3/7,4/5,1/7


$P(E_Q|G=m)=P(A=y|G=m)\times P(L=y|G=m)\times P(F=n|G=m)\times P(K=y|G=m)$ (나이브 베이즈 가정: 성별이 주어졌을 때 각 상품의 구매 여부는 서로 조건부 독립)

$P(E_Q|G=m)=\frac{4}{5}\times\frac{3}{5}\times\frac{4}{5}\times\frac{1}{5} = \frac{48}{625} = 0.0768$

$P(G=m|E_Q) = \frac{0.0768 \times 0.4167}{P(E_Q)} = \frac{0.0320}{P(E_Q)}$

### 고객 Q가 여자일 확률 $P(G=f | E_Q)$

$P(G=f|E_Q) = \frac{P(E_Q|G=f) \times P(G=f)}{P(E_Q)}$

$P(E_Q|G=f)=P(A=y|G=f)\times P(L=y|G=f)\times P(F=n|G=f)\times P(K=y|G=f) = \frac{6}{7} \times \frac{3}{7}\times \frac{3}{7}\times \frac{6}{7} = \frac{324}{2401} = 0.1349$

$P(G=f|E_Q) = \frac{0.1349 \times 0.5833}{P(E_Q)} = \frac{0.0787}{P(E_Q)}$

### $P(E_Q)$

$P(E_Q) = P(E_Q|G=m)\times P(G=m) + P(E_Q|G=f)\times P(G=f)$

$P(E_Q) = 0.0320+0.0787 = 0.1107$

$P(G=m|E_Q) = \frac{0.0320}{0.1107} = 0.2891$

$P(G=f|E_Q) = \frac{0.0787}{0.1107} = 0.7109$

$P(G=f|E_Q) > P(G=m|E_Q)$

In [166]:
# Add each customer's age as a new column. The list is matched by position to the
# 12 existing rows, and `buy` is modified in place.
buy['age'] = [40,39,35,30,45,31,38,29,28,34,46,42]
buy

Unnamed: 0,apparel,leisure,furniture,kitchenware,gender,age
0,y,n,n,n,m,40
1,n,y,n,n,m,39
2,y,n,n,y,f,35
3,y,n,y,y,f,30
4,y,y,y,y,m,45
5,y,y,y,y,f,31
6,y,n,y,y,f,38
7,n,y,n,y,f,29
8,y,y,n,n,m,28
9,y,n,n,y,f,34


### 고객 T가 상품을 구매한 사건 $E_T : (Age=44) \land (A=y) \land (L=y) \land (F=n) \land (K=y)$

In [167]:
# Per-gender mean and standard deviation of age (pandas' `std` is the sample standard
# deviation). These are the parameters of the normal models fitted in the cells below.
buy.groupby('gender').agg({'age':['mean', 'std']})

Unnamed: 0_level_0,age,age
Unnamed: 0_level_1,mean,std
gender,Unnamed: 1_level_2,Unnamed: 2_level_2
f,34.142857,4.670067
m,39.6,7.162402


### 고객 T가 남성인 경우

In [174]:
import scipy as sp
# Load the submodule explicitly: `import scipy` alone is not guaranteed to expose
# `sp.stats` unless some other import has already loaded scipy.stats.
import scipy.stats

# Normal model of male age; mean and std are the rounded values from the groupby table.
rv = sp.stats.norm(39.60, 7.16)
# Density at Age = 44, used as the likelihood term for the continuous feature.
rv.pdf(44)

0.04613104686742316

In [182]:
# Probability that age falls in (43, 44] under the same normal model; shown for
# comparison with the density value rv.pdf(44) above.
rv.cdf(44) - rv.cdf(43)

0.04800907948339117

$P(G=m|E_T) = \frac{0.0768 \times 0.4167}{P(E_T)} \times 0.0461 = \frac{0.0015}{P(E_T)}$

### 고객 T가 여성인 경우

In [183]:
# Normal model of female age (rounded mean/std from the groupby table) and its density
# at Age = 44. NOTE: this rebinds `rv`, replacing the male distribution defined earlier.
rv = sp.stats.norm(34.14, 4.67)
rv.pdf(44)

0.00919592358557332

$P(G=f|E_T) = \frac{0.1349 \times 0.5833}{P(E_T)} \times 0.0092 = \frac{0.0007}{P(E_T)}$

$P(G=m|E_T) > P(G=f|E_T)$

# 베이즈 분류기를 이용한 스팸 메일 판정

In [211]:
# Build the column names from the attribute section of spambase.names.
with open('spambase.names', 'r') as names_file:
    # Hard-coded slice: skips the first 33 lines and the last 3 lines of the file.
    attribute_lines = names_file.readlines()[33:-3]

colnames = []
for attribute_line in attribute_lines:
    attribute_name = attribute_line.split(': ')[0]  # text before ': '
    # Keep what follows 'freq_' and prefix it with 'word_'.
    colnames.append('word_' + attribute_name.split('freq_')[1])
colnames += ['acap', 'lcap', 'tcap', 'spam']
colnames

['word_make',
 'word_address',
 'word_all',
 'word_3d',
 'word_our',
 'word_over',
 'word_remove',
 'word_internet',
 'word_order',
 'word_mail',
 'word_receive',
 'word_will',
 'word_people',
 'word_report',
 'word_addresses',
 'word_free',
 'word_business',
 'word_email',
 'word_you',
 'word_credit',
 'word_your',
 'word_font',
 'word_000',
 'word_money',
 'word_hp',
 'word_hpl',
 'word_george',
 'word_650',
 'word_lab',
 'word_labs',
 'word_telnet',
 'word_857',
 'word_data',
 'word_415',
 'word_85',
 'word_technology',
 'word_1999',
 'word_parts',
 'word_pm',
 'word_direct',
 'word_cs',
 'word_meeting',
 'word_original',
 'word_project',
 'word_re',
 'word_edu',
 'word_table',
 'word_conference',
 'word_;',
 'word_(',
 'word_[',
 'word_!',
 'word_$',
 'word_#',
 'acap',
 'lcap',
 'tcap',
 'spam']

In [208]:
# Load the data without a header row; columns are numbered 0..57 until they are renamed.
raw = pd.read_csv('spambase.data', header=None)
raw

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,48,49,50,51,52,53,54,55,56,57
0,0.00,0.64,0.64,0.0,0.32,0.00,0.00,0.00,0.00,0.00,...,0.000,0.000,0.0,0.778,0.000,0.000,3.756,61,278,1
1,0.21,0.28,0.50,0.0,0.14,0.28,0.21,0.07,0.00,0.94,...,0.000,0.132,0.0,0.372,0.180,0.048,5.114,101,1028,1
2,0.06,0.00,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.010,0.143,0.0,0.276,0.184,0.010,9.821,485,2259,1
3,0.00,0.00,0.00,0.0,0.63,0.00,0.31,0.63,0.31,0.63,...,0.000,0.137,0.0,0.137,0.000,0.000,3.537,40,191,1
4,0.00,0.00,0.00,0.0,0.63,0.00,0.31,0.63,0.31,0.63,...,0.000,0.135,0.0,0.135,0.000,0.000,3.537,40,191,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4596,0.31,0.00,0.62,0.0,0.00,0.31,0.00,0.00,0.00,0.00,...,0.000,0.232,0.0,0.000,0.000,0.000,1.142,3,88,0
4597,0.00,0.00,0.00,0.0,0.00,0.00,0.00,0.00,0.00,0.00,...,0.000,0.000,0.0,0.353,0.000,0.000,1.555,4,14,0
4598,0.30,0.00,0.30,0.0,0.00,0.00,0.00,0.00,0.00,0.00,...,0.102,0.718,0.0,0.000,0.000,0.000,1.404,6,118,0
4599,0.96,0.00,0.00,0.0,0.32,0.00,0.00,0.00,0.00,0.00,...,0.000,0.057,0.0,0.000,0.000,0.000,1.147,5,78,0


In [212]:
# Attach the parsed attribute names to `raw` in place (the last column is the 'spam' label).
raw.columns = colnames
raw

Unnamed: 0,word_make,word_address,word_all,word_3d,word_our,word_over,word_remove,word_internet,word_order,word_mail,...,word_;,word_(,word_[,word_!,word_$,word_#,acap,lcap,tcap,spam
0,0.00,0.64,0.64,0.0,0.32,0.00,0.00,0.00,0.00,0.00,...,0.000,0.000,0.0,0.778,0.000,0.000,3.756,61,278,1
1,0.21,0.28,0.50,0.0,0.14,0.28,0.21,0.07,0.00,0.94,...,0.000,0.132,0.0,0.372,0.180,0.048,5.114,101,1028,1
2,0.06,0.00,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.010,0.143,0.0,0.276,0.184,0.010,9.821,485,2259,1
3,0.00,0.00,0.00,0.0,0.63,0.00,0.31,0.63,0.31,0.63,...,0.000,0.137,0.0,0.137,0.000,0.000,3.537,40,191,1
4,0.00,0.00,0.00,0.0,0.63,0.00,0.31,0.63,0.31,0.63,...,0.000,0.135,0.0,0.135,0.000,0.000,3.537,40,191,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4596,0.31,0.00,0.62,0.0,0.00,0.31,0.00,0.00,0.00,0.00,...,0.000,0.232,0.0,0.000,0.000,0.000,1.142,3,88,0
4597,0.00,0.00,0.00,0.0,0.00,0.00,0.00,0.00,0.00,0.00,...,0.000,0.000,0.0,0.353,0.000,0.000,1.555,4,14,0
4598,0.30,0.00,0.30,0.0,0.00,0.00,0.00,0.00,0.00,0.00,...,0.102,0.718,0.0,0.000,0.000,0.000,1.404,6,118,0
4599,0.96,0.00,0.00,0.0,0.32,0.00,0.00,0.00,0.00,0.00,...,0.000,0.057,0.0,0.000,0.000,0.000,1.147,5,78,0


In [238]:
from sklearn.model_selection import train_test_split

In [248]:
# Roughly 60/20/20 train/validation/test split, seeded with i.
i = 0
X = raw.drop(columns='spam')
y = raw['spam']
# First cut: about 60% for training; the remainder is held in X_test / y_test for now.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.6001, random_state=i
)
# Second cut: split the remainder in half into validation and final test sets.
X_val, X_test, y_val, y_test = train_test_split(
    X_test, y_test, train_size=0.5, random_state=i
)

### LDA 모델

In [253]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.metrics import confusion_matrix

In [254]:
# Fit linear discriminant analysis with default settings on the training split.
lda_model = LDA().fit(X_train, y_train)

In [259]:
# Class labels in the order the model uses them (0 and 1 in the output below).
lda_model.classes_

array([0, 1], dtype=int64)

In [256]:
# Confusion matrix on the training data: rows are true labels, columns are predictions.
cm_lda = confusion_matrix(y_train, lda_model.predict(X_train))
cm_lda

array([[1614,   78],
       [ 234,  835]], dtype=int64)

In [266]:
# Training accuracy in percent: correctly classified samples (the diagonal) over all samples.
round(cm_lda.trace() / cm_lda.sum() * 100, 2)

88.7

### Naive Bayes 모델

In [283]:
from sklearn.naive_bayes import *

In [261]:
# Fit a Bernoulli naive Bayes classifier with default settings on the training split.
nb_model = BernoulliNB().fit(X_train, y_train)

In [262]:
# Class labels in the order the model uses them (0 and 1 in the output below).
nb_model.classes_

array([0, 1], dtype=int64)

In [263]:
# Confusion matrix on the training data: rows are true labels, columns are predictions.
cm_nb = confusion_matrix(y_train, nb_model.predict(X_train))
cm_nb

array([[1582,  110],
       [ 199,  870]], dtype=int64)

In [265]:
# Training accuracy in percent: correctly classified samples (the diagonal) over all samples.
round(cm_nb.trace() / cm_nb.sum() * 100, 2)

88.81

In [267]:
def get_acc(x,y,model):
    """Return the accuracy of `model` on (x, y) as a percentage rounded to 2 decimals.

    Parameters
    ----------
    x : features passed to ``model.predict``.
    y : true labels that the predictions are compared against.
    model : fitted object exposing ``predict(x)``.

    Returns
    -------
    Percentage of correctly classified samples, rounded to two decimals.
    """
    cm = confusion_matrix(y, model.predict(x))
    # The diagonal holds the correct predictions of every class, so the trace works for
    # any number of classes; fixed [0][0] + [1][1] indexing breaks when the matrix is
    # not 2x2 (e.g. only one class present in the evaluated split).
    acc = round(cm.trace() / cm.sum() * 100, 2)
    return acc

In [334]:
# Bernoulli
# Compare BernoulliNB against LDA over 10 random splits (train / validation accuracy).
# X and y do not depend on the seed, so they are built once outside the loop.
X = raw.drop('spam', axis=1)
y = raw.spam
records = []  # one dict of accuracies per seed
for i in range(10):
    # About 60% train; the remainder is split evenly into validation and test.
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.6001, random_state=i)
    X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, train_size=0.5, random_state=i)

    lda_model = LDA().fit(X_train, y_train)
    lda_train_acc = get_acc(X_train, y_train, lda_model)
    lda_val_acc = get_acc(X_val, y_val, lda_model)

    nb_model = BernoulliNB().fit(X_train, y_train)
    nb_train_acc = get_acc(X_train, y_train, nb_model)
    nb_val_acc = get_acc(X_val, y_val, nb_model)
    records.append({'nb_train':nb_train_acc, 'lda_train':lda_train_acc, 'nb_val':nb_val_acc, 'lda_val':lda_val_acc})
# DataFrame.append was removed in pandas 2.0: build the frame once from the collected
# records and attach the mean row with pd.concat.
result = pd.DataFrame(records)
result_avg = result.mean()
result_avg.name = '평균'  # row label of the mean row ("average")
result = pd.concat([result, result_avg.to_frame().T])
result

Unnamed: 0,nb_train,lda_train,nb_val,lda_val
0,88.81,88.7,88.48,88.48
1,88.77,88.59,88.91,88.91
2,88.7,88.92,88.8,87.5
3,88.34,88.7,89.24,87.5
4,88.34,89.97,89.57,89.24
5,88.66,89.28,90.11,89.57
6,89.53,88.92,87.39,88.26
7,89.21,89.5,88.7,90.65
8,88.92,88.7,88.48,89.35
9,88.84,88.77,90.11,90.11


In [335]:
# Gaussian
# Compare GaussianNB against LDA over 10 random splits (train / validation accuracy).
# X and y do not depend on the seed, so they are built once outside the loop.
X = raw.drop('spam', axis=1)
y = raw.spam
records = []  # one dict of accuracies per seed
for i in range(10):
    # About 60% train; the remainder is split evenly into validation and test.
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.6001, random_state=i)
    X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, train_size=0.5, random_state=i)

    lda_model = LDA().fit(X_train, y_train)
    lda_train_acc = get_acc(X_train, y_train, lda_model)
    lda_val_acc = get_acc(X_val, y_val, lda_model)

    nb_model = GaussianNB().fit(X_train, y_train)
    nb_train_acc = get_acc(X_train, y_train, nb_model)
    nb_val_acc = get_acc(X_val, y_val, nb_model)
    records.append({'nb_train':nb_train_acc, 'lda_train':lda_train_acc, 'nb_val':nb_val_acc, 'lda_val':lda_val_acc})
# DataFrame.append was removed in pandas 2.0: build the frame once from the collected
# records and attach the mean row with pd.concat.
result = pd.DataFrame(records)
result_avg = result.mean()
result_avg.name = '평균'  # row label of the mean row ("average")
result = pd.concat([result, result_avg.to_frame().T])
result

Unnamed: 0,nb_train,lda_train,nb_val,lda_val
0,83.16,88.7,83.48,88.48
1,82.04,88.59,83.48,88.91
2,81.96,88.92,82.39,87.5
3,82.87,88.7,81.74,87.5
4,82.65,89.97,82.61,89.24
5,81.82,89.28,80.65,89.57
6,81.75,88.92,84.24,88.26
7,82.54,89.5,82.07,90.65
8,82.9,88.7,82.39,89.35
9,82.76,88.77,82.17,90.11


In [336]:
# Multinomial
# Compare MultinomialNB against LDA over 10 random splits (train / validation accuracy).
# X and y do not depend on the seed, so they are built once outside the loop.
X = raw.drop('spam', axis=1)
y = raw.spam
records = []  # one dict of accuracies per seed
for i in range(10):
    # About 60% train; the remainder is split evenly into validation and test.
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.6001, random_state=i)
    X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, train_size=0.5, random_state=i)

    lda_model = LDA().fit(X_train, y_train)
    lda_train_acc = get_acc(X_train, y_train, lda_model)
    lda_val_acc = get_acc(X_val, y_val, lda_model)

    nb_model = MultinomialNB().fit(X_train, y_train)
    nb_train_acc = get_acc(X_train, y_train, nb_model)
    nb_val_acc = get_acc(X_val, y_val, nb_model)
    records.append({'nb_train':nb_train_acc, 'lda_train':lda_train_acc, 'nb_val':nb_val_acc, 'lda_val':lda_val_acc})
# DataFrame.append was removed in pandas 2.0: build the frame once from the collected
# records and attach the mean row with pd.concat.
result = pd.DataFrame(records)
result_avg = result.mean()
result_avg.name = '평균'  # row label of the mean row ("average")
result = pd.concat([result, result_avg.to_frame().T])
result

Unnamed: 0,nb_train,lda_train,nb_val,lda_val
0,80.37,88.7,80.54,88.48
1,79.32,88.59,76.74,88.91
2,78.23,88.92,79.13,87.5
3,77.8,88.7,79.89,87.5
4,78.3,89.97,79.13,89.24
5,79.65,89.28,78.04,89.57
6,80.08,88.92,83.15,88.26
7,80.08,89.5,79.78,90.65
8,79.25,88.7,79.35,89.35
9,79.03,88.77,77.07,90.11


In [337]:
# Complement
# Compare ComplementNB against LDA over 10 random splits (train / validation accuracy).
# X and y do not depend on the seed, so they are built once outside the loop.
X = raw.drop('spam', axis=1)
y = raw.spam
records = []  # one dict of accuracies per seed
for i in range(10):
    # About 60% train; the remainder is split evenly into validation and test.
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.6001, random_state=i)
    X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, train_size=0.5, random_state=i)

    lda_model = LDA().fit(X_train, y_train)
    lda_train_acc = get_acc(X_train, y_train, lda_model)
    lda_val_acc = get_acc(X_val, y_val, lda_model)

    nb_model = ComplementNB().fit(X_train, y_train)
    nb_train_acc = get_acc(X_train, y_train, nb_model)
    nb_val_acc = get_acc(X_val, y_val, nb_model)
    records.append({'nb_train':nb_train_acc, 'lda_train':lda_train_acc, 'nb_val':nb_val_acc, 'lda_val':lda_val_acc})
# DataFrame.append was removed in pandas 2.0: build the frame once from the collected
# records and attach the mean row with pd.concat.
result = pd.DataFrame(records)
result_avg = result.mean()
result_avg.name = '평균'  # row label of the mean row ("average")
result = pd.concat([result, result_avg.to_frame().T])
result

Unnamed: 0,nb_train,lda_train,nb_val,lda_val
0,79.36,88.7,80.65,88.48
1,79.07,88.59,76.74,88.91
2,78.49,88.92,78.48,87.5
3,77.91,88.7,80.11,87.5
4,78.34,89.97,78.48,89.24
5,79.75,89.28,77.83,89.57
6,80.26,88.92,83.37,88.26
7,79.86,89.5,78.91,90.65
8,78.88,88.7,79.57,89.35
9,79.03,88.77,77.28,90.11


In [339]:
# Bernoulli
# Compare BernoulliNB against LDA over 10 random splits, this time also reporting
# accuracy on the held-out test split.
# X and y do not depend on the seed, so they are built once outside the loop.
X = raw.drop('spam', axis=1)
y = raw.spam
records = []  # one dict of accuracies per seed
for i in range(10):
    # About 60% train; the remainder is split evenly into validation and test.
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.6001, random_state=i)
    X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, train_size=0.5, random_state=i)

    lda_model = LDA().fit(X_train, y_train)
    lda_train_acc = get_acc(X_train, y_train, lda_model)
    lda_val_acc = get_acc(X_val, y_val, lda_model)
    lda_test_acc = get_acc(X_test, y_test, lda_model)

    nb_model = BernoulliNB().fit(X_train, y_train)
    nb_train_acc = get_acc(X_train, y_train, nb_model)
    nb_val_acc = get_acc(X_val, y_val, nb_model)
    nb_test_acc = get_acc(X_test, y_test, nb_model)
    records.append({'nb_train':nb_train_acc, 'lda_train':lda_train_acc, 'nb_val':nb_val_acc, 'lda_val':lda_val_acc, 'nb_test':nb_test_acc, 'lda_test': lda_test_acc})
# DataFrame.append was removed in pandas 2.0: build the frame once from the collected
# records and attach the mean row with pd.concat.
result = pd.DataFrame(records)
result_avg = result.mean()
result_avg.name = '평균'  # row label of the mean row ("average")
result = pd.concat([result, result_avg.to_frame().T])
result

Unnamed: 0,nb_train,lda_train,nb_val,lda_val,nb_test,lda_test
0,88.81,88.7,88.48,88.48,86.96,87.28
1,88.77,88.59,88.91,88.91,87.83,88.04
2,88.7,88.92,88.8,87.5,87.5,88.59
3,88.34,88.7,89.24,87.5,88.04,87.83
4,88.34,89.97,89.57,89.24,88.48,89.46
5,88.66,89.28,90.11,89.57,86.85,86.85
6,89.53,88.92,87.39,88.26,86.41,89.13
7,89.21,89.5,88.7,90.65,86.63,86.2
8,88.92,88.7,88.48,89.35,87.5,88.26
9,88.84,88.77,90.11,90.11,88.15,88.59
