## 라이브러리 불러오기

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

## 데이터셋 만들기

In [2]:
# Column labels: the genre label first, then the seven vocabulary words.
col = ['장르', '재미있는', '연인', '사랑', '맹렬한', '빠른', '총격', '날으는']

# Genre of each of the five documents.
genre = ['코미디', '액션', '코미디', '액션', '액션']

# Occurrence counts per document, one list per word (V1..V7 line up with col[1:]).
V1, V2, V3, V4, V5, V6, V7 = (
    [1, 0, 2, 1, 0],
    [1, 0, 1, 0, 0],
    [2, 0, 0, 0, 1],
    [0, 1, 0, 1, 0],
    [0, 1, 1, 0, 1],
    [0, 1, 0, 2, 1],
    [0, 0, 1, 0, 1],
)

# Raw table keyed by placeholder names, genre first and then V1..V7 in order.
dic = {'장르': genre}
dic.update(zip(
    ('V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7'),
    (V1, V2, V3, V4, V5, V6, V7),
))

In [3]:
# Build the DataFrame from the dict; its columns start out as '장르', 'V1'..'V7'.
data = pd.DataFrame(dic)

In [4]:
# Replace the placeholder V1..V7 labels with the actual words listed in `col`.
data.columns=col

In [5]:
data

Unnamed: 0,장르,재미있는,연인,사랑,맹렬한,빠른,총격,날으는
0,코미디,1,1,2,0,0,0,0
1,액션,0,0,0,1,1,1,0
2,코미디,2,1,0,0,1,0,1
3,액션,1,0,0,1,0,2,0
4,액션,0,0,1,0,1,1,1


## 0보다 큰 경우 1, 그렇지 않으면 0을 반환하는 사용자 정의 함수를 생성

In [60]:
def convert(data):
    """Binarize a count: return 1 when ``data`` is greater than 0, otherwise 0."""
    return 1 if data > 0 else 0

In [61]:
# Feature (word) columns: every column label except the '장르' genre label.
# Index.difference attempts to sort its result, so the order here may differ
# from the DataFrame's own column order; that does not matter for the
# per-column loop that consumes it.
columns = data.columns.difference(['장르'])

In [62]:
# Binarize each word column in place: any positive count becomes 1, the rest 0.
for word in columns:
    data[word] = data[word].map(convert)

In [63]:
data

Unnamed: 0,장르,재미있는,연인,사랑,맹렬한,빠른,총격,날으는
0,코미디,1,1,1,0,0,0,0
1,액션,0,0,0,1,1,1,0
2,코미디,1,1,0,0,1,0,1
3,액션,1,0,0,1,0,1,0
4,액션,0,0,1,0,1,1,1


In [22]:
# Add 1 to every word indicator and label the rows by genre.
# The column labels are taken from `data` itself (everything after the leading
# '장르' column). `sum_df` is only created further down in the notebook, so
# relying on it here would raise a NameError on a top-to-bottom run and would
# also depend on its trailing 'sum' column already being present.
# NOTE(review): adding 1 per row means a later group-by sum adds the number of
# rows in each genre (not 1) to every word count -- confirm this is intended.
data2 = pd.DataFrame(data.iloc[:, 1:].values + 1, index=data['장르'], columns=data.columns[1:])

In [23]:
# Row total across all columns of data2, stored as a new 'sum' column.
data2['sum'] = data2.sum(axis=1)

In [25]:
# Sum per genre; "장르" is the name of data2's index (it was built from data['장르']).
data2.groupby("장르").sum()

Unnamed: 0_level_0,재미있는,연인,사랑,맹렬한,빠른,총격,날으는,sum
장르,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
액션,4,3,4,5,5,7,4,32
코미디,5,4,4,2,3,2,3,23


## 나이브베이즈 모델 적용(Laplace 적용x)

In [10]:
from sklearn.naive_bayes import MultinomialNB

# alpha : additive (Laplace/Lidstone) smoothing parameter
# class_prior : explicitly specified prior probability of each class
# fit_prior : whether to learn the class priors from the training data

# alpha=0 requests no smoothing. Features are every column after '장르';
# the target is the '장르' column.
# NOTE(review): depending on the scikit-learn version, alpha=0 may be clipped
# to a tiny positive value instead of being used as-is -- confirm.
model = MultinomialNB(alpha=0)
model.fit(data.iloc[:,1:], data['장르'])

MultinomialNB(alpha=0)

In [11]:
# alpha : check the smoothing value the model was constructed with
model.alpha

0

In [7]:
# classes_ : class labels known to the fitted model
model.classes_

array(['액션', '코미디'], dtype='<U3')

In [8]:
# class_count_ : number of training samples seen for each class
model.class_count_

array([3., 2.])

In [9]:
# feature_count_ : for each class (rows), the summed count of each word (columns)
model.feature_count_

array([[1., 0., 1., 2., 2., 4., 1.],
       [3., 2., 2., 0., 1., 0., 1.]])

In [10]:
# class_log_prior_ : log prior probability of each class; exp() converts it
# back to a plain probability
np.exp(model.class_log_prior_)

array([0.6, 0.4])

In [11]:
# feature_log_prob_ : log of the conditional probability of each word given the
# class, P(word | class) -- a likelihood, not a posterior; exp() recovers the
# probability itself.
# NOTE(review): the recorded output assigns about 0.0556 (= 1/18) to a word
# whose recorded count is 0, which looks like add-one smoothing rather than
# alpha=0; the cells may have been run out of order -- re-run to confirm.
np.exp(model.feature_log_prob_)

array([[0.11111111, 0.05555556, 0.11111111, 0.16666667, 0.16666667,
        0.27777778, 0.11111111],
       [0.25      , 0.1875    , 0.1875    , 0.0625    , 0.125     ,
        0.0625    , 0.125     ]])

## '재미있는', '맹렬한', '빠른' 단어가 나올 경우

In [12]:
# Build a single-row test sample: 1 marks a word that is present
# (positions 0, 3 and 4 of the seven word features).
test = np.array([[1, 0, 0, 1, 1, 0, 0]])
test.shape

(1, 7)

In [13]:
# Predict the class label for the test sample
model.predict(test)

array(['액션'], dtype='<U3')

In [14]:
# Class membership probabilities for the test sample, ordered as in model.classes_
model.predict_proba(test)

array([[0.7032967, 0.2967033]])

## 나이브베이즈 모델 적용(Laplace 적용o)

In [6]:
from sklearn.naive_bayes import MultinomialNB
# Rebind `model` to a classifier with alpha=1 (add-one / Laplace smoothing),
# fitted on the same features (columns after '장르') and target ('장르').
model = MultinomialNB(alpha=1)
model.fit(data.iloc[:,1:], data['장르'])

MultinomialNB(alpha=1)

In [73]:
# Predict the class label for the test sample
model.predict(test)

array(['액션'], dtype='<U3')

In [74]:
model.feature_count_

array([[1., 0., 1., 2., 2., 3., 1.],
       [2., 2., 1., 0., 1., 0., 1.]])

In [15]:
# Class membership probabilities for the test sample, ordered as in model.classes_
model.predict_proba(test)

array([[0.7032967, 0.2967033]])

In [16]:
data.iloc[:, 1:]

Unnamed: 0,재미있는,연인,사랑,맹렬한,빠른,총격,날으는
0,1,1,2,0,0,0,0
1,0,0,0,1,1,1,0
2,2,1,0,0,1,0,1
3,1,0,0,1,0,2,0
4,0,0,1,0,1,1,1


In [17]:
# Per-genre totals: group the rows by the '장르' column and sum the remaining columns.
data1 = data.groupby("장르").sum()

In [18]:
# Add 1 to every per-genre word total (add-one smoothing); the element-wise
# arithmetic keeps data1's row and column labels.
sum_df = data1 + 1

In [19]:
# Row total per genre across all columns of sum_df, stored as a new 'sum' column.
sum_df['sum'] = sum_df.sum(axis=1)

In [20]:
sum_df

Unnamed: 0_level_0,재미있는,연인,사랑,맹렬한,빠른,총격,날으는,sum
장르,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
액션,2,1,2,3,3,5,2,18
코미디,4,3,3,1,2,1,2,16


## 데이터 불러오기

## 결측치 채우기

In [None]:
# 결측치가 있는 행만 불러오기


## n이면 0, y면 1 적용

## train test 분리

In [None]:
import random

# random seed 설정하기

# train, test set분리 


## 나이브베이즈 모델 적용

In [None]:
from sklearn.naive_bayes import MultinomialNB



In [None]:
# test 데이터로 예측


In [None]:
# 정확도 확인
