# 네이버 영화평 감성 분석 - TfidfVectorizer

In [1]:
import numpy as np 
import pandas as pd 


In [2]:
# Load the tab-separated train and test splits into DataFrames.
train_df, test_df = (
    pd.read_csv('../00.data/nsmc/train.tsv', sep='\t'),
    pd.read_csv('../00.data/nsmc/test.tsv', sep='\t'),
)

### Tokenizer 함수 정의

In [3]:
from konlpy.tag import Okt

# One shared tagger instance, reused by every tokenizer call.
okt = Okt()

def tw_tokenizer(text):
    """Split ``text`` into morphemes using the shared Okt tagger.

    Args:
        text: The string to tokenize.

    Returns:
        Whatever ``okt.morphs(text)`` yields for ``text``.
    """
    return okt.morphs(text)


### TfidfVectorizer로 학습 / 변환

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over Okt morphemes, using unigrams and bigrams. Per the scikit-learn
# docs, an integer min_df is an absolute document count and a float max_df is
# a proportion of documents.
tvecter = TfidfVectorizer(
    tokenizer=tw_tokenizer,
    ngram_range=(1, 2),
    min_df=3,
    max_df=0.9,
)

In [5]:
# Learn the vocabulary and IDF weights from the training documents only.
tvecter.fit(train_df['document'])

TfidfVectorizer(max_df=0.9, min_df=3, ngram_range=(1, 2),
                tokenizer=<function tw_tokenizer at 0x00000186458CE9D0>)

In [8]:
# Vectorize both splits with the vocabulary fitted on the training set.
X_train_tvect, X_test_tvect = (
    tvecter.transform(frame.document) for frame in (train_df, test_df)
)

In [14]:
# Pull the label column of each split out as a plain array.
y_train, y_test = train_df['label'].values, test_df['label'].values

### LogisticRegression으로 학습/예측/평가

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [15]:
# Logistic-regression classifier; C=3.5 overrides the library default
# regularization setting (presumably hand-tuned; no search for it is shown here).
lr_clf = LogisticRegression(C=3.5)

In [16]:
# Train on the TF-IDF training matrix, predict the test matrix, then display
# the test accuracy as the cell's output.
pred = lr_clf.fit(X_train_tvect, y_train).predict(X_test_tvect)
accuracy_score(y_test, pred)

0.8590672517603837

### 실제 테스트

In [17]:
# Two hand-written sample reviews used for the manual checks below.
review1, review2 = (
    '진짜 개노잼이다.. 1편이랑 같은 감독맞나?러닝타임도 길어서 개지루함 ㄹㅇ',
    '이런 사랑영화가 다시 나올 수 있을까?',
)

In [18]:
import re
# Delete every character that is not a Hangul jamo, a Hangul syllable or a
# space, then display the cleaned text.
review1 = re.compile("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]").sub("", review1)
review1

'진짜 개노잼이다 편이랑 같은 감독맞나러닝타임도 길어서 개지루함 ㄹㅇ'

In [23]:
# Tokens removed from a review before it is vectorized.
stopwords = ['의','가','이','은','들','는','좀','잘','걍','과','도','를','으로','자','에','와','한','하다','을']
# Tokenize WITHOUT stemming. The vectorizer vocabulary was fitted on the plain
# okt.morphs() output of tw_tokenizer, and the batch cell further down
# tokenizes the same way; stem=True here would produce surface forms that do
# not match what the model was trained on.
morphs = okt.morphs(review1)
# Re-join with spaces: tvecter.transform() takes raw strings and runs
# tw_tokenizer on them itself.
review = ' '.join(word for word in morphs if word not in stopwords)

In [24]:
# transform() is given a one-element list here, so the single review string is
# wrapped in a list before vectorizing.
review_tvect = tvecter.transform([review])

In [25]:
# Classify the vectorized review with the fitted logistic-regression model.
pred = lr_clf.predict(review_tvect)

In [26]:
# Display the predicted label for the single review
# (presumably 0 = negative, 1 = positive; confirm against the dataset).
pred[0]

0

In [27]:
# The same two raw sample reviews, collected for batch prediction.
reviews = [
    '진짜 개노잼이다.. 1편이랑 같은 감독맞나?러닝타임도 길어서 개지루함 ㄹㅇ',
    '이런 사랑영화가 다시 나올 수 있을까?',
]

In [28]:
def _clean_review(text):
    """Return ``text`` reduced to Hangul, morpheme-split and stopword-free.

    Every character other than a Hangul jamo, a Hangul syllable or a space is
    deleted, the remainder is split with ``okt.morphs`` and the tokens not
    listed in ``stopwords`` are re-joined with single spaces.

    Args:
        text: One raw review string.

    Returns:
        The cleaned, space-joined review string.
    """
    hangul_only = re.sub("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]", "", text)
    return ' '.join(token for token in okt.morphs(hangul_only) if token not in stopwords)


# Built through a helper and a comprehension so that no loop variable
# overwrites the notebook-level `review` / `morphs` created by the
# single-review cells above.
review_list = [_clean_review(raw_review) for raw_review in reviews]

In [29]:
# Vectorize all cleaned reviews at once, then classify them in one call.
review_tvect = tvecter.transform(review_list)
pred = lr_clf.predict(review_tvect)

In [30]:
# Display the predicted labels for the first and second review as a tuple.
pred[0],pred[1]

(0, 1)

In [35]:
from sklearn.pipeline import Pipeline

# Chain vectorizer and classifier so they can be tuned together; the step
# names become the prefixes of the grid-search parameter keys.
pipeline = Pipeline(
    steps=[
        ('tfid_vect', TfidfVectorizer(tokenizer=tw_tokenizer)),
        ('lr_clf', LogisticRegression()),
    ]
)

In [32]:
# List every tunable parameter name (prefixed "<step>__") exposed by the pipeline.
pipeline.get_params()

{'memory': None,
 'steps': [('tfid_vect', TfidfVectorizer()), ('lr_clf', LogisticRegression())],
 'verbose': False,
 'tfid_vect': TfidfVectorizer(),
 'lr_clf': LogisticRegression(),
 'tfid_vect__analyzer': 'word',
 'tfid_vect__binary': False,
 'tfid_vect__decode_error': 'strict',
 'tfid_vect__dtype': numpy.float64,
 'tfid_vect__encoding': 'utf-8',
 'tfid_vect__input': 'content',
 'tfid_vect__lowercase': True,
 'tfid_vect__max_df': 1.0,
 'tfid_vect__max_features': None,
 'tfid_vect__min_df': 1,
 'tfid_vect__ngram_range': (1, 1),
 'tfid_vect__norm': 'l2',
 'tfid_vect__preprocessor': None,
 'tfid_vect__smooth_idf': True,
 'tfid_vect__stop_words': None,
 'tfid_vect__strip_accents': None,
 'tfid_vect__sublinear_tf': False,
 'tfid_vect__token_pattern': '(?u)\\b\\w\\w+\\b',
 'tfid_vect__tokenizer': None,
 'tfid_vect__use_idf': True,
 'tfid_vect__vocabulary': None,
 'lr_clf__C': 1.0,
 'lr_clf__class_weight': None,
 'lr_clf__dual': False,
 'lr_clf__fit_intercept': True,
 'lr_clf__intercept_scal

In [38]:
# Search space, grouped by pipeline step and merged into one grid.
# Per the scikit-learn docs, an integer max_df is an absolute document count,
# not a ratio.
vectorizer_grid = {
    'tfid_vect__max_df': [30000],
    'tfid_vect__min_df': [1],
    'tfid_vect__ngram_range': [(1, 2), (1, 3)],
}
classifier_grid = {
    'lr_clf__C': [1],
}
params = {**vectorizer_grid, **classifier_grid}

In [39]:
from sklearn.model_selection import GridSearchCV
# 3-fold cross-validated accuracy search over the pipeline parameter grid.
grid_pipe = GridSearchCV(
    estimator=pipeline,
    param_grid=params,
    scoring='accuracy',
    cv=3,
    verbose=1,
)

# Raw documents are passed in; the pipeline's own vectorizer tokenizes them.
grid_pipe.fit(train_df['document'], y_train)
print(grid_pipe.best_params_, grid_pipe.best_score_)

Fitting 3 folds for each of 2 candidates, totalling 6 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed: 51.0min finished
{'lr_clf__C': 1, 'tfid_vect__max_df': 30000, 'tfid_vect__min_df': 1, 'tfid_vect__ngram_range': (1, 2)} 0.8448189531589742


In [43]:
# Predict the held-out test documents with the fitted grid search and report
# accuracy to four decimal places.
pred = grid_pipe.predict(test_df['document'])
acc = accuracy_score(y_true=y_test, y_pred=pred)
print(f'TfidfVectorizer + LogisticRegression 정확도:{acc:.4f}')

TfidfVectorizer + LogisticRegression 정확도:0.8482
