In [2]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
%matplotlib inline

### 베이즈 정리
- $P(H|E) = \frac{P(E|H) \times P(H)}{P(E)}$

In [3]:
# Purchase flags ('y' = bought, 'n' = not bought) per product category and the
# gender code ('m'/'f') of 12 customers. Position i in every list refers to the
# same customer; each string below is split into one-character list items.
apparel = list('ynyyyyynyyyy')
leisure = list('nynnyynyynny')
furniture = list('nnnyyyynnnny')
kitchenware = list('nnyyyyyynynn')
gender = list('mmffmfffmfmf')

In [8]:
# One row per customer, one column per purchase flag plus the gender code.
# Passing a column -> values mapping yields the customers-as-rows layout
# directly, so no transpose is needed.
buy = pd.DataFrame(
    {
        'apparel': apparel,
        'leisure': leisure,
        'furniture': furniture,
        'kitchenware': kitchenware,
        'gender': gender,
    }
)
buy

Unnamed: 0,apparel,leisure,furniture,kitchenware,gender
0,y,n,n,n,m
1,n,y,n,n,m
2,y,n,n,y,f
3,y,n,y,y,f
4,y,y,y,y,m
5,y,y,y,y,f
6,y,n,y,y,f
7,n,y,n,y,f
8,y,y,n,n,m
9,y,n,n,y,f


In [14]:
# Customer Q: purchase flags are known, gender is the unknown to be inferred,
# so it is stored as a missing value.
cust_Q = pd.DataFrame(
    [
        {
            'apparel': 'y',
            'leisure': 'y',
            'furniture': 'n',
            'kitchenware': 'y',
            'gender': None,
        }
    ]
)
cust_Q

Unnamed: 0,apparel,leisure,furniture,kitchenware,gender
0,y,y,n,y,


### 고객 Q가 상품을 구매한 사건 $E_Q : (A=y) \land (L=y) \land (F=n) \land (K=y)$
* A:  apparel, L:  leisure, F:  furniture, K:  kitchenware

### 고객 Q가 남자일 확률 $P(G=m | E_Q)$
$P(G=m | E_Q) = \frac{P(E_Q|G=m)\times P(G=m)}{P(E_Q)}$

In [146]:
# Reshape `buy` to long form (one row per customer/category pair), then count
# the 'y'/'n' answers for every (category, gender) combination. The helper
# column `c` holds 1 on every row so that summing it produces the counts; it
# also becomes the outermost column level of the pivoted table.
temp = (
    buy
    .melt(id_vars='gender', var_name='category')
    .assign(c=1)
    .pivot_table(index='value', columns=['category', 'gender'], aggfunc='sum')
    .sort_index(ascending=False)  # descending order puts the 'y' row before 'n'
)
temp

Unnamed: 0_level_0,c,c,c,c,c,c,c,c
category,apparel,apparel,furniture,furniture,kitchenware,kitchenware,leisure,leisure
gender,f,m,f,m,f,m,f,m
value,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3
y,6,4,4,1,6,1,3,3
n,1,1,3,4,1,4,4,2


In [147]:
# Strip the first element of every column key (the helper level 'c'), leaving
# (category, gender) pairs. Rebuilding the index from plain tuples also means
# the resulting levels carry no names.
temp.columns = pd.MultiIndex.from_tuples(
    list(map(lambda key: key[1:], temp.columns))
)
temp

Unnamed: 0_level_0,apparel,apparel,furniture,furniture,kitchenware,kitchenware,leisure,leisure
Unnamed: 0_level_1,f,m,f,m,f,m,f,m
value,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
y,6,4,4,1,6,1,3,3
n,1,1,3,4,1,4,4,2


In [148]:
# Put gender on the outer column level, then lay the columns out explicitly as
# apparel, leisure, furniture, kitchenware, each split into m / f.
# Selecting by label (instead of sorting and then picking hard-coded positions)
# states the intended layout directly and raises a KeyError if an expected
# (gender, category) column is absent rather than silently picking a wrong one.
temp = temp.swaplevel(axis=1).loc[
    :,
    [
        (gender_code, category)
        for category in ['apparel', 'leisure', 'furniture', 'kitchenware']
        for gender_code in ['m', 'f']
    ],
]
temp

Unnamed: 0_level_0,m,f,m,f,m,f,m,f
Unnamed: 0_level_1,apparel,apparel,leisure,leisure,furniture,furniture,kitchenware,kitchenware
value,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
y,4,6,3,3,1,4,1,6
n,1,1,2,4,4,3,4,1


In [156]:
# Append two display rows, 'y_rate' and 'n_rate', that show each count as a
# "count/column total" string (e.g. '4/5').
#
# The column totals are computed once instead of once per table cell, and are
# read with `.iloc[j]`: a bare integer key on this label-indexed Series is
# treated as a position only by deprecated pandas behaviour.
#
# NOTE: this rebinds `temp` to the extended table, so the cell must not be
# re-run on its own; re-run the cells above first.
column_totals = temp.sum()
rate_rows = pd.DataFrame(
    [
        [f'{temp.iloc[i, j]}/{column_totals.iloc[j]}' for j in range(temp.shape[1])]
        for i in range(2)  # row 0 is 'y', row 1 is 'n'
    ],
    columns=temp.columns,
    index=['y_rate', 'n_rate'],
)
temp = pd.concat([temp, rate_rows])
temp

Unnamed: 0_level_0,m,f,m,f,m,f,m,f
Unnamed: 0_level_1,apparel,apparel,leisure,leisure,furniture,furniture,kitchenware,kitchenware
y,4,6,3,3,1,4,1,6
n,1,1,2,4,4,3,4,1
y_rate,4/5,6/7,3/5,3/7,1/5,4/7,1/5,6/7
n_rate,1/5,1/7,2/5,4/7,4/5,3/7,4/5,1/7


성별이 주어졌을 때 각 상품의 구매 여부가 서로 조건부 독립이라고 가정하면(나이브 베이즈 가정) 다음과 같이 곱으로 나타낼 수 있다.

$P(E_Q|G=m)=P(A=y|G=m)\times P(L=y|G=m)\times P(F=n|G=m)\times P(K=y|G=m)$

$P(E_Q|G=m)=\frac{4}{5}\times\frac{3}{5}\times\frac{4}{5}\times\frac{1}{5} = \frac{48}{625} = 0.0768$

$P(G=m) = \frac{5}{12} \approx 0.4167$

$P(G=m|E_Q) = \frac{0.0768 \times 0.4167}{P(E_Q)} = \frac{0.0320}{P(E_Q)}$

### 고객 Q가 여자일 확률 $P(G=f | E_Q)$

$P(G=f|E_Q) = \frac{P(E_Q|G=f) \times P(G=f)}{P(E_Q)}$

$P(E_Q|G=f)=P(A=y|G=f)\times P(L=y|G=f)\times P(F=n|G=f)\times P(K=y|G=f) = \frac{6}{7} \times \frac{3}{7}\times \frac{3}{7}\times \frac{6}{7} = \frac{324}{2401} = 0.1349$

$P(G=f) = \frac{7}{12} \approx 0.5833$

$P(G=f|E_Q) = \frac{0.1349 \times 0.5833}{P(E_Q)} = \frac{0.0787}{P(E_Q)}$

### $P(E_Q)$

$P(E_Q) = P(E_Q|G=m)\times P(G=m) + P(E_Q|G=f)\times P(G=f)$

$P(E_Q) = 0.0320+0.0787 = 0.1107$

$P(G=m|E_Q) = \frac{0.0320}{0.1107} = 0.2891$

$P(G=f|E_Q) = \frac{0.0787}{0.1107} = 0.7109$