# 네이버 영화평 감성 분석 - TfidfVectorizer

In [2]:
import numpy as np 
import pandas as pd

In [3]:
# Load the tab-separated train and test splits; both paths are relative, so the
# files must be in the current working directory.
train_df = pd.read_csv('naver_movie_train.tsv', sep='\t')
test_df = pd.read_csv('naver_movie_test.tsv', sep='\t')
# Preview the first three rows to check the available columns.
train_df.head(3)

Unnamed: 0,id,document,label
0,9976970,아 더빙 진짜 짜증나네요 목소리,0
1,3819312,흠포스터보고 초딩영화줄오버연기조차 가볍지 않구나,1
2,10265843,너무재밓었다그래서보는것을추천한다,0


### Tokenizer 함수 정의

In [5]:
from konlpy.tag import Okt
# Single shared Okt analyzer instance; okt_tokenizer below calls its morphs().
okt = Okt()

In [7]:
# Tokens that okt_tokenizer removes from its output (common particles and
# chat-style filler such as 'ㅋㅋ').
stopwords = ['의','가','이','은','들','는','좀','잘','걍','과','도','를','으로','자','에','와','한','하다','을','ㅋㅋ','ㅠㅠ','ㅎㅎ']

In [8]:
def okt_tokenizer(text):
    """Split text into stemmed morphemes and drop stopwords.

    Args:
        text: Raw string to tokenize.

    Returns:
        A list of the tokens produced by ``okt.morphs(text, stem=True)``,
        in their original order, excluding every token that appears in the
        module-level ``stopwords`` collection.
    """
    # stem=True asks Okt to return stemmed forms of the tokens.
    morphemes = okt.morphs(text, stem=True)
    return [word for word in morphemes if word not in stopwords]

### TfidfVectorizer 로 변환

In [10]:
import warnings
# Silence every warning from this point on.
# NOTE(review): presumably meant to hide scikit-learn's notice that
# token_pattern is unused when a custom tokenizer is supplied — confirm; a
# blanket 'ignore' also hides unrelated warnings.
warnings.filterwarnings(action='ignore')

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer driven by the custom Okt tokenizer.
#   ngram_range=(1,2): build features from unigrams and bigrams.
#   max_df=0.9: ignore terms that occur in more than 90% of the documents.
tvect = TfidfVectorizer(
    tokenizer=okt_tokenizer, ngram_range=(1,2), max_df=0.9
)

In [12]:
# Learn the vocabulary and IDF weights from the training reviews (timed).
%time tvect.fit(train_df.document)

Wall time: 5min 5s


TfidfVectorizer(max_df=0.9, ngram_range=(1, 2),
                tokenizer=<function okt_tokenizer at 0x000001D0A2E7B4C0>)

In [13]:
# Convert the training reviews into a TF-IDF matrix with the fitted vectorizer.
# NOTE(review): the earlier fit() and this transform() each tokenize the whole
# training corpus; tvect.fit_transform(train_df.document) is documented as
# equivalent to fit followed by transform and would tokenize only once.
%time X_train_tv = tvect.transform(train_df.document)

Wall time: 3min 56s


In [14]:
# Vectorize the test reviews with the vocabulary learned from the training
# set; transform() only, so nothing is refit on test data.
X_test_tv = tvect.transform(test_df.document)

In [15]:
# Target labels taken from the 'label' column as NumPy arrays.
y_train = train_df.label.values
y_test = test_df.label.values

In [16]:
### Naive Bayes 분류기로 학습/예측/평가

In [17]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial Naive Bayes classifier with default hyperparameters.
nb = MultinomialNB()
# Display the hyperparameters currently in effect.
nb.get_params()

{'alpha': 1.0, 'class_prior': None, 'fit_prior': True}

In [18]:
# Train the classifier on the TF-IDF features of the training reviews.
nb.fit(X_train_tv, y_train)

MultinomialNB()

In [19]:
from sklearn.metrics import accuracy_score
# Predict labels for the test set, then report the fraction predicted correctly.
pred = nb.predict(X_test_tv)
accuracy_score(y_test, pred)

0.8608837636493519

In [20]:
### 실제 테스트

In [21]:
# Two hand-written reviews used as an ad-hoc check of the trained model.
reviews = ['아름다운 음악과 아름다운 풍광~ 그렇지 못한 현실이 찡하네요~',
           '메시지와 작위성의 불협화음!!!']

In [22]:
# Reuse the already-fitted vectorizer (transform only) so the features line up
# with the vocabulary the classifier was trained on.
reviews_tv = tvect.transform(reviews)
# Note: this rebinds `pred`, replacing the test-set predictions made earlier.
pred = nb.predict(reviews_tv)
# NOTE(review): presumably 1 = positive and 0 = negative — confirm against the
# dataset's label definition.
pred

array([1, 0], dtype=int64)