# IMDB 영화평 테스트셋 분리 , 모델 만들기

In [1]:
import numpy as np 
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
import joblib
from sklearn.model_selection import GridSearchCV

In [2]:
df = pd.read_csv('../static/data/IMDB.tsv', header=0, sep ='\t', quoting=3)
df.head(3)

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."


In [3]:
# 영어 이외의 문자는 공백으로 변환
import re

df['review'] = df.review.apply(lambda x : re.sub('[^a-zA-Z]', ' ',x))
# br태그를 공백으로 바꾸기
df['review'] = df.review.str.replace('<br />', ' ')

In [4]:
df = df[['review','sentiment']]
df.head(3)

Unnamed: 0,review,sentiment
0,With all this stuff going down at the moment ...,1
1,The Classic War of the Worlds by Timothy ...,1
2,The film starts with a manager Nicholas Bell...,0


In [5]:
X_train, X_test, y_train, y_test = train_test_split(
    df, df.sentiment, test_size=0.25, stratify=df.sentiment, random_state=2021
)

In [6]:
X_train.to_csv('../static/data/imdb_train.csv', index=False)
X_test.to_csv('../static/data/imdb_test.csv', index=False)

In [7]:
df_train = pd.read_csv('../static/data/imdb_train.csv')
df_train.head(3)

Unnamed: 0,review,sentiment
0,Still Crazy is without a doubt the greate...,1
1,For me the only reason for having a look at t...,0
2,This is one of the funniest movies I have see...,1


In [8]:
df_test = pd.read_csv('../static/data/imdb_test.csv')
df_test.head(3)

Unnamed: 0,review,sentiment
0,Johnny Dangerously is a sort of hit and m...,1
1,Along with Fernando Fragata Jo o M rio Grilo...,1
2,Okay I absolutely LOVE Ben Stiller although...,0


In [23]:
X_train = df_train.review
y_train = df_train.sentiment
X_test = df_test.review
y_test = df_test.sentiment

In [10]:
# TfidfVectorizer
tfidf_vect = TfidfVectorizer(stop_words='english')
tfidf_vect.fit(df.review)
X_train_tfidf = tfidf_vect.transform(X_train)
X_test_tfidf = tfidf_vect.transform(X_test)
# CountVectorizer
count_vect = CountVectorizer(stop_words='english')
count_vect.fit(df.review)
X_train_count = count_vect.transform(X_train)
X_test_count = count_vect.transform(X_test)

### 모델 저장

In [11]:
joblib.dump(tfidf_vect, '../static/model/imdb_tfidf.pkl')
joblib.dump(count_vect, '../static/model/imdb_count.pkl')

['../static/model/imdb_count.pkl']

### 불러오기

In [12]:
tfidf_vect = joblib.load('../static/model/imdb_tfidf.pkl')
counmt_vect = joblib.load('../static/model/imdb_count.pkl')

### 성능체크

In [13]:
pipeline1 = Pipeline([
    ('tfidf_vect', TfidfVectorizer(stop_words='english')),
    ('lr_clf', LogisticRegression())
])

In [14]:
pipeline2 = Pipeline([
    ('count_vect', CountVectorizer(stop_words='english')),
    ('lr_clf', LogisticRegression())
])

In [15]:
pipeline1.get_params()

{'memory': None,
 'steps': [('tfidf_vect', TfidfVectorizer(stop_words='english')),
  ('lr_clf', LogisticRegression())],
 'verbose': False,
 'tfidf_vect': TfidfVectorizer(stop_words='english'),
 'lr_clf': LogisticRegression(),
 'tfidf_vect__analyzer': 'word',
 'tfidf_vect__binary': False,
 'tfidf_vect__decode_error': 'strict',
 'tfidf_vect__dtype': numpy.float64,
 'tfidf_vect__encoding': 'utf-8',
 'tfidf_vect__input': 'content',
 'tfidf_vect__lowercase': True,
 'tfidf_vect__max_df': 1.0,
 'tfidf_vect__max_features': None,
 'tfidf_vect__min_df': 1,
 'tfidf_vect__ngram_range': (1, 1),
 'tfidf_vect__norm': 'l2',
 'tfidf_vect__preprocessor': None,
 'tfidf_vect__smooth_idf': True,
 'tfidf_vect__stop_words': 'english',
 'tfidf_vect__strip_accents': None,
 'tfidf_vect__sublinear_tf': False,
 'tfidf_vect__token_pattern': '(?u)\\b\\w\\w+\\b',
 'tfidf_vect__tokenizer': None,
 'tfidf_vect__use_idf': True,
 'tfidf_vect__vocabulary': None,
 'lr_clf__C': 1.0,
 'lr_clf__class_weight': None,
 'lr_clf__

In [16]:
pipeline2.get_params()

{'memory': None,
 'steps': [('count_vect', CountVectorizer(stop_words='english')),
  ('lr_clf', LogisticRegression())],
 'verbose': False,
 'count_vect': CountVectorizer(stop_words='english'),
 'lr_clf': LogisticRegression(),
 'count_vect__analyzer': 'word',
 'count_vect__binary': False,
 'count_vect__decode_error': 'strict',
 'count_vect__dtype': numpy.int64,
 'count_vect__encoding': 'utf-8',
 'count_vect__input': 'content',
 'count_vect__lowercase': True,
 'count_vect__max_df': 1.0,
 'count_vect__max_features': None,
 'count_vect__min_df': 1,
 'count_vect__ngram_range': (1, 1),
 'count_vect__preprocessor': None,
 'count_vect__stop_words': 'english',
 'count_vect__strip_accents': None,
 'count_vect__token_pattern': '(?u)\\b\\w\\w+\\b',
 'count_vect__tokenizer': None,
 'count_vect__vocabulary': None,
 'lr_clf__C': 1.0,
 'lr_clf__class_weight': None,
 'lr_clf__dual': False,
 'lr_clf__fit_intercept': True,
 'lr_clf__intercept_scaling': 1,
 'lr_clf__l1_ratio': None,
 'lr_clf__max_iter': 1

In [17]:
params = {
    'tfidf_vect__ngram_range' : [(1,2)],
    'tfidf_vect__max_df'  : [800, 900, 1000],
    'lr_clf__C': [8,10,20,30],
    'tfidf_vect__min_df': [2],
}

In [18]:
grid_pipe = GridSearchCV(pipeline1, param_grid= params, cv=5, scoring='accuracy', verbose=1)

grid_pipe.fit(X_train, y_train)
print(grid_pipe.best_params_, grid_pipe.best_score_)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed:  9.1min finished
{'lr_clf__C': 20, 'tfidf_vect__max_df': 1000, 'tfidf_vect__min_df': 2, 'tfidf_vect__ngram_range': (1, 2)} 0.8924266666666668


In [24]:
best = grid_pipe.best_estimator_
pred = grid_pipe.predict(X_test)
accuracy_score(y_test, pred)

0.88992

In [25]:
joblib.dump(best, '../static/model/tf_lr_imdb.pkl')

['../static/model/tf_lr_imdb.pkl']

In [26]:
params = {
    'count_vect__ngram_range' : [(1,1), (1,2), (1,3)],
    'count_vect__max_df'  : [700,800, 900, 1000],
    'lr_clf__C': [0.1, 1, 10, 50],
    'count_vect__min_df': [2,3,4,5],
}

In [27]:
grid_pipe = GridSearchCV(pipeline2, param_grid= params, cv=5, scoring='accuracy', verbose=1)

grid_pipe.fit(X_train, y_train)
print(grid_pipe.best_params_, grid_pipe.best_score_)

Fitting 5 folds for each of 192 candidates, totalling 960 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 960 out of 960 | elapsed: 114.1min finished
{'count_vect__max_df': 900, 'count_vect__min_df': 3, 'count_vect__ngram_range': (1, 3), 'lr_clf__C': 0.1} 0.8826666666666666


In [28]:
best = grid_pipe.best_estimator_
pred = grid_pipe.predict(X_test)
accuracy_score(y_test, pred)

0.87888

In [29]:
joblib.dump(best, '../static/model/co_lr_imdb.pkl')

['../static/model/co_lr_imdb.pkl']

### test

In [31]:
index = 100

In [32]:
y_test[100],pred[100]

(0, 0)

In [38]:
# 별점 1개, 7개,  10개 , 4개순
test_list = [''' Why is it that when a movie is boring and makes a pretense of being 'about something', we are supposed to take it more seriously than say, a comedy. This movie actually made me feel ill for movies in general. It took some time to watch another movie after this experience.

Whatever foreign-movie cliches this movie avoids, it substitutes stupidity in its place. The pouting, silent and pliable French lead is so seriously dull how can u watch her? The old man is supposed to be wise in the ways of the world and people, yet with that jaded disdain that gives him a sense of superiority. He really KNOWS people. The stuff this guy spits out is so outrageously phony and dull it's hard to keep a straight face. I suppose he is the sly fox who educates the innocent babe, never mind how false and boring it is. This is 'serious' filmmaking, remember.

This movie is like a boring sculpture sitting in a vacuum-it's hard to breathe in here. Each scene seems as if carved from lead. It violates what i think is one of the rules of effective dramaturgy: Not every single scene should be dramatic. These movies are supposed to move us and make us feel something, but they only put us in a trance with their emotional sameness.

Witness Woody Allen's Interiors, another narcoleptic masterpiece of dreadful seriousness. There are scenes in both these movies which go so far in trying to be heavy and dramatic they are comic. Further, they betray the labored effects of an unimaginative writer. Is the best way to show us a character is feeling something by having them push it in our face every other stupid scene?''',
'''
Krzysztof Kieslowski's third and last of the Three Colors trilgy' "Red" (1994), I found to be a more philosophical film than its predecessors, "Blue" (1993) and "White" (1994).

We follow Valentine, a young model living in Geneva. By literal accident, she meets a retired judge who spies his neighbours' phone calls, not for money but to feed his cynicism.

The film is primarily the story of the relationship between Valentine and the judge, and their relationship is evolved and portrayed in a very convincing manner, thematically centrering around love, loss, forgiveness and redemption.

Visually, once again it is a masterpiece, just as impressive a film as "Blue", and again drawing on symbolism in its visual storytelling. Also, Kieslowski's intelligent directional style, combined with an interesting plot, made me really like the end of the trilogy.

The ending is particularly intelligent and interesting in its execution. Wonder what the thoughts and intentions were for Kieslowski. Highly recommended to watch in concordance with its predecessors.
''',
'''
See Three Colors: Blue and Three Colors: White. They are both wonderful films and will give an added dimension to the finale Three Colors: Red. Red is a fantastic film. It can be enjoyed in a single viewing, and indeed, the climax of the film is very powerful in that first viewing. But, watch it again. Once you understand the use of symbolism and character parallels in this movie, you will see new things with each viewing. With the first viewing you understand that the film is the work of a brilliant mind. With each additional viewing, you find yourself discovering that it is, in fact, a work of genius. Red is meant to symbolize fraternity in the French flag. The story turns the theme of fraternity around to be viewed at angles one would never suspect. The facets of fraternity shared by the different characters is as deep as you care to peer. If you are used to the blatant "symbolism" in most mass films, you may find Red a bit slow. You may find yourself looking at a screen filled with intensity that you do not fathom... and yawning, wonder what all the excitement is about. This is not a mindless, vicarious experience. Everything is not explained to you. You must think as you watch. You must see... not simply look. Wonderful movie... one meant to be enjoyed by a wonderful moviegoer.
''',
'''
What's the fuss about? I am going to be the one who points out that the Emperor has no clothes here. Three Colours Red is nothing special at all. It starts slowly and doesn't get much more pacey as it goes along; it's moderately intriguing but that's it. It never hits any sort of heights.

The whole effect is so muted, so underplayed that nothing grabs the attention. Visually it may look good on occasions but dramatically speaking it is inert; there is nothing there. I thought that after the dog incident things would begin to happen and I would be drawn into a web of intrigue. Nope.

It's not that I dislike slow, arty films or French films (though I'm not keen), but this one just said very little to me. There are no answers to the meaning of life here. I am amazed that this vaguely pretentious movie has got such good reviews on IMDb and is awarded four stars in Halliwell's Film Guide. (The editor clearly had a bit of a dizzy spell when it came to the Three Colours films as he gives ALL of them maximum marks.)
''']

In [39]:
# 성능확인
grid_pipe.predict(test_list)

array([0, 1, 1, 0], dtype=int64)