# 네이버 영화평 감성 분석 - TfidfVectorizer

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the train and test splits; both are tab-separated files.
train_df, test_df = (
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in (
        '../00. data/NaverMovie/train.tsv',
        '../00. data/NaverMovie/test.tsv',
    )
)

### Tokenizer 함수 정의

In [3]:
from konlpy.tag import Okt
okt = Okt()  # one shared tagger instance, reused by every tokenizer call


def tw_tokenizer(text):
    """Tokenize ``text`` into morphemes with the shared Okt tagger.

    Args:
        text: The string to split.

    Returns:
        The result of ``okt.morphs(text)``, i.e. the morpheme tokens.
    """
    return okt.morphs(text)

### TfidfVectorizer로 변환

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF over the tokens produced by tw_tokenizer, using unigrams and bigrams.
# max_df=0.9 (a float) ignores terms found in more than 90% of the documents;
# min_df=3 (an int) ignores terms found in fewer than 3 documents.
# token_pattern=None: the custom tokenizer replaces the default regex, and
# leaving the default pattern in place makes scikit-learn warn that
# 'token_pattern' will not be used.
tvector = TfidfVectorizer(
    tokenizer=tw_tokenizer,
    token_pattern=None,
    ngram_range=(1, 2),
    max_df=0.9,
    min_df=3,
)

In [5]:
# Learn the vocabulary and IDF weights from the training documents only.
# fit() returns the vectorizer itself, which is what this cell displays.
tvector.fit(train_df['document'])

TfidfVectorizer(max_df=0.9, min_df=3, ngram_range=(1, 2),
                tokenizer=<function tw_tokenizer at 0x000001AE1ECEF5E0>)

In [6]:
# Encode the training documents with the already-fitted vectorizer.
# NOTE(review): the earlier fit() call tokenized this same column once
# already; fit_transform() would build this matrix in a single pass.
X_train_tvect = tvector.transform(train_df.document)

In [7]:
# Encode the test documents with the vocabulary learned from the training set
# (transform only; the vectorizer is not refitted here).
X_test_tvect = tvector.transform(test_df.document)

In [8]:
# Targets: the `label` column of each split, taken as a NumPy array.
y_train, y_test = train_df['label'].values, test_df['label'].values

### LogisticRegression 학습/예측/평가

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [10]:
# Train logistic regression (C=3.5) on the TF-IDF training matrix;
# fit() returns the estimator, so construction and fitting are chained.
lr = LogisticRegression(C=3.5).fit(X_train_tvect, y_train)

# Predict the test set and show its accuracy as the cell output.
pred = lr.predict(X_test_tvect)
accuracy_score(y_true=y_test, y_pred=pred)

0.8584753546280233

### 실제 테스트

In [11]:
# Two hand-written sample reviews used to try the trained model.
review1, review2 = (
    '진짜 개노잼이다.. 1편이랑 같은 감독맞나?러닝타임도 길어서 개지루함 ㄹㅇ',
    '이런 사랑영화가 다시 나올 수 있을까?',
)

In [12]:
import re
# Keep only Hangul jamo, Hangul syllables and spaces; drop everything else
# (digits, punctuation, Latin letters, ...).
review1 = re.compile("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]").sub('', review1)
# Morphological analysis happens inside tvector, via its tokenizer function.
review1_tvect = tvector.transform([review1])
re_pred = lr.predict(review1_tvect)
re_pred[0]

0

In [13]:
# Strip every character that is not a Hangul jamo, a Hangul syllable or a space.
review2 = re.sub(pattern="[^ㄱ-ㅎㅏ-ㅣ가-힣 ]", repl='', string=review2)
# The vectorizer's tokenizer function takes care of morphological analysis.
review2_tvect = tvector.transform([review2])
re2_pred = lr.predict(review2_tvect)
re2_pred[0]

1

In [14]:
# Sample reviews gathered in one list so they can be predicted as a batch.
reviews = [
    '진짜 개노잼이다.. 1편이랑 같은 감독맞나?러닝타임도 길어서 개지루함 ㄹㅇ',
    '이런 사랑영화가 다시 나올 수 있을까?',
]

In [15]:
# Remove everything except Hangul jamo, Hangul syllables and spaces
# from each review in the list.
reviews = [re.sub("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]", '', review) for review in reviews]

In [16]:
# Vectorize all reviews in one call and classify them together.
review_tvect = tvector.transform(reviews)
# Store the result under its own name: the evaluation cell keeps the test-set
# predictions in `pred`, and rebinding that name here would silently replace
# them with these two ad-hoc predictions.
review_pred = lr.predict(review_tvect)
review_pred[0], review_pred[1]

(0, 1)

### 최적 하이퍼 파라미터 탐색

In [26]:
from sklearn.pipeline import Pipeline
# Stop-word list passed to the pipeline's vectorizer.
# NOTE(review): this vectorizer is created without tokenizer=tw_tokenizer, so
# it uses scikit-learn's default token pattern, which presumably keeps only
# tokens of two or more characters. Most entries below are single characters,
# so confirm the stop list has the intended effect.
stopwords = [
    '의', '가', '이', '은', '들', '는', '좀', '잘', '걍', '과',
    '도', '를', '으로', '자', '에', '와', '한', '하다', '을',
]

# Two-step pipeline: TF-IDF features ('tv') feeding logistic regression ('lr').
# The step names are the prefixes used in '<step>__<param>' grid keys.
pipeline = Pipeline(steps=[
    ('tv', TfidfVectorizer(stop_words=stopwords)),
    ('lr', LogisticRegression()),
])

In [27]:
# Search space: 2 x 2 x 2 = 8 candidate settings.
# Keys follow the '<pipeline step>__<parameter>' naming convention.
params = {
    # n-gram sizes: up to bigrams, or up to trigrams
    'tv__ngram_range': [(1, 2), (1, 3)],
    # integers here are absolute document counts, not proportions
    'tv__max_df': [700, 1000],
    # value of LogisticRegression's C parameter
    'lr__C': [10, 30],
}

In [37]:
from sklearn.model_selection import GridSearchCV
# Exhaustive search over `params` with 5-fold cross-validation, scored by
# accuracy; n_jobs=-1 runs the fits in parallel on all available cores.
grid_pipe = GridSearchCV(
    estimator=pipeline,
    param_grid=params,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
# The pipeline receives the raw documents and vectorizes them itself.
grid_pipe.fit(train_df['document'], y_train)
# Report the best parameter combination and its mean cross-validated accuracy.
print(grid_pipe.best_params_, grid_pipe.best_score_)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  9.1min finished
{'lr__C': 10, 'tv__max_df': 1000, 'tv__ngram_range': (1, 2)} 0.7957966988617069
