In [41]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

 # 1. 문제정의
 - 긍정리뷰와 부정리뷸 구분하는 감성분석을 해보자
 - 긍정/부정에서 자주 사용되는 단어를 확인해 보자

# 2.데이터 수집
- Large movie dataset을 다운로드

In [42]:
from sklearn.datasets import load_files # 파일 읽어오기

#train 데이터 받아오기
train_data_url='aclImdb/train/'
reviews_train= load_files(train_data_url,shuffle=True)

#test 데이터 받아오기
test_data_url='aclImdb/train/'
reviews_test= load_files(test_data_url,shuffle=True)

In [43]:
reviews_train.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [44]:
reviews_train['data'][0]

b"Zero Day leads you to think, even re-think why two boys/young men would do what they did - commit mutual suicide via slaughtering their classmates. It captures what must be beyond a bizarre mode of being for two humans who have decided to withdraw from common civility in order to define their own/mutual world via coupled destruction.<br /><br />It is not a perfect movie but given what money/time the filmmaker and actors had - it is a remarkable product. In terms of explaining the motives and actions of the two young suicide/murderers it is better than 'Elephant' - in terms of being a film that gets under our 'rationalistic' skin it is a far, far better film than almost anything you are likely to see. <br /><br />Flawed but honest with a terrible honesty."

In [45]:
#0은 부정
#1은 긍정
reviews_train['target']

array([1, 0, 1, ..., 0, 0, 0])

In [46]:
# 전체 Train데이터 개수는 25000개
len(reviews_train['data'])

25000

In [47]:
# 정답데이터의 개수를 확인
np.bincount(reviews_train['target'])# 값의 개수를 세주는 함수

array([12500, 12500], dtype=int64)

# 3. 데이터 전처리
- 일반 데이터:결측치제거,스케일링,특성공학,이상치 제거,,,
- 텍스트 데이터 
 - 오탈자 제거
 - 띄어쓰기 교정
 - 이모티콘 수정
 - 불필요한 글자 제거 (불용어 제거,stop word)
 - 데이터가 정형화 되있다면 : 토큰화, 수치화

In [48]:
reviews_train['data'][0]

b"Zero Day leads you to think, even re-think why two boys/young men would do what they did - commit mutual suicide via slaughtering their classmates. It captures what must be beyond a bizarre mode of being for two humans who have decided to withdraw from common civility in order to define their own/mutual world via coupled destruction.<br /><br />It is not a perfect movie but given what money/time the filmmaker and actors had - it is a remarkable product. In terms of explaining the motives and actions of the two young suicide/murderers it is better than 'Elephant' - in terms of being a film that gets under our 'rationalistic' skin it is a far, far better film than almost anything you are likely to see. <br /><br />Flawed but honest with a terrible honesty."

In [51]:
# br태그 제거
# 리스트 내포
text_train=[] #비어있는 리스트 만들기
for txt in reviews_train['data']:
    del_br=txt.replace(b"<br />",b"")
    text_train.append(del_br) # 값채우기

In [52]:
text_train=[txt.replace(b"<br />",b"") 
            for txt in reviews_train['data']] #for문을 리스트안에 적는다.

In [53]:
text_test=[txt.replace(b"<br />",b"") 
            for txt in reviews_test['data']]

In [54]:
text_train[0]

b"Zero Day leads you to think, even re-think why two boys/young men would do what they did - commit mutual suicide via slaughtering their classmates. It captures what must be beyond a bizarre mode of being for two humans who have decided to withdraw from common civility in order to define their own/mutual world via coupled destruction.It is not a perfect movie but given what money/time the filmmaker and actors had - it is a remarkable product. In terms of explaining the motives and actions of the two young suicide/murderers it is better than 'Elephant' - in terms of being a film that gets under our 'rationalistic' skin it is a far, far better film than almost anything you are likely to see. Flawed but honest with a terrible honesty."

####  1.토큰화
- 텍스트:"러시아 월드컵 "
- 띄어 쓰기 단위로 토큰화 : 러시아, 월드컵
- 2-gram 단위로 토큰화: 러시아, 월드컵, 러시아월드컵 -> 단어의개수가 늘어난 효과 -> 데이터가 늘어난다 

- 토큰화 대상의 수가 크게 증가한다
- 이결과를 수치화 : 원핫인코딩, BOW,단어벡트 방법이 존재

####  BOW (bag of word,단어모음) 
- 단어토큰화, 단어사전 구축을 통한 수치화
- 문장을 하나의 벡터(하나의 행)로 만드는 방법

####  BOW (bag of word,단어모음) 
- 단어토큰화, 단어사전 구축을 통한 수치화

In [55]:
from sklearn.feature_extraction.text import CountVectorizer

In [56]:
testCV= CountVectorizer()

In [57]:
test_txt= ['안녕하세요 저는 항나 예요.', # 1번문장
          '지금은 머신러닝 수업을 듣고 있슴둥',# 2번문장
          '배고파요 맥도날드 햄버거 먹고 싶어요',# 3번문장
          '왜 지금 점심시간 아니예요?']# 4번문장

In [58]:
testCV.fit(test_txt) # 토근화, 단어사전 구축

CountVectorizer()

In [59]:
testCV.vocabulary_ # 단어사전

{'안녕하세요': 8,
 '저는': 11,
 '항나': 15,
 '예요': 9,
 '지금은': 14,
 '머신러닝': 2,
 '수업을': 5,
 '듣고': 0,
 '있슴둥': 10,
 '배고파요': 4,
 '맥도날드': 1,
 '햄버거': 16,
 '먹고': 3,
 '싶어요': 6,
 '지금': 13,
 '점심시간': 12,
 '아니예요': 7}

In [60]:
testCV.transform(test_txt) # 수치화(벡터화)

<4x17 sparse matrix of type '<class 'numpy.int64'>'
	with 17 stored elements in Compressed Sparse Row format>

In [61]:
testCV.transform(test_txt).toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0],
       [1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0],
       [0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0]], dtype=int64)

#### 영화 리뷰 데이터로 토큰화, 수치화

In [62]:
movie_count =CountVectorizer()
movie_count.fit(text_train) # 토큰화,단어사전 구축

CountVectorizer()

In [63]:
# 단어사전 확인
len(movie_count.vocabulary_)

75911

In [64]:
# 수치화(벡터화)
# 문제 데이터
X_train =movie_count.transform(text_train)
X_test =movie_count.transform(text_test)

In [65]:
# 정답 데이터
y_train = reviews_train['target']
y_test=reviews_test['target']

# 4.탐색적 데이터 분석 -skip

# 5.모델 선택 및 하이퍼 파라미터 튜닝

In [66]:
# SVM
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
svm=LinearSVC()

In [67]:
svm_result=cross_val_score(svm,X_train,y_train,cv=5)
svm_result.mean()



0.86284

# 6. 모델 학습


In [68]:
# 0은 부정 1은 긍정
svm.fit(X_train,y_train)

LinearSVC()

# 7.평가

In [69]:
# 1번은 긍정 2번은 긍정 3번은 부정 리뷰
reviews = ["Wow. This is one of the most mind bending things in media. It makes Age Of Ultron better, it is very funny, the characters are so suprising and it also plays well into the MCU in other ways. The episode before the final one has great writing and the final showdowns are like watching an MCU film. This whole thing would be great as an MCU film, the way it plays. The return of people blipped also was interesting here and the story of Infinity War playing into the Vision storyline and how Wanda really just wanted more time with him. It is defismtly emotional and touching. Disney+ started off right with their first MCU tie in. The moral of this is that it seems to expose witchcraft rather then embrace it, which is good. (It might be exposing it). Jesus is our only hope.",
            """ "We are an unusual couple, you know." "Oh, I don't think that was ever in question." Now, before I begin, there are many opinions of this show. Many fans and people just introduced to the MCU have strong opinions which has made this a very divided addition to the MCU. Critically, it's been praised and there aren't many people who don't like it. But with the unique approach, some have been upset for such the high ratings. What I'm trying to say is I ask you to bear with me and respect my opinion. If you differ with it, there's nothing you can do to change my mind. The absolute masterpiece that is WandaVision blends the style of classic sitcoms with the MCU, in which Wanda Maximoff and Vision - two super-powered beings living their ideal suburban lives - begin to suspect that everything is not as it seems. From that teaser at the Super Bowl (I believe) over a year ago, this had been something exciting to look forward to. Literally no one knew what this would be about going in, really. I refrained from all trailers to go in as blind as possible. What I got was more than what I could imagine. Trying something new is what I've hoped for in the MCU for a while. As much as I do love a good majority of their movies, there's a fixed setup for almost all. WandaVision is something totally different and just what we needed. Had this been a show outside of the MCU, I think it would still be brilliant. Each episode is designed to go through a different era of television. How they handled it worked incredibly well. There's not a way of trying to modernize these old sitcoms from the 1950s and so forth, so it's like a replication of classic television. The sets, costumes, and camera lens and moment are amazing at doing so. One of my favorite games while watching is trying to see what show this episode was mainly based on. Accomplishing this couldn't have been easy and I applaud them for how they managed to get the feel of each era perfectly. At the heart of this story are amazing performances. Elizabeth Olson hasn't had such an amazing display of acting since Martha Marcy May Marlene. So much emotion is put forth into Wanda that she is by far the most developed character in the MCU even if she came in late. There are some really heavy scenes and she portrayed those flawlessly. It doesn't feel much like watching a magical being, but we understand she's just another person in this world. Scene 8 showcases it all. Alongside her is Paul Bettany. With his character of Vision — a literal computer-god-being — it's hard to get complete range. He has such a good start with the series by delivering comedy. He's really funny in it (and Olsen too). Once the mystery starts to unfold, he builds more and more character until episode 5 when he unleashes his full capabilities. Never has Vision felt so human before. I'd also like to highlight Kathryn Hahn, because she's such an amazing actress. Playing the nosy-neighbor of Agnes must've been such a fun time. She takes up all the screen time she can get. I won't spoil a thing, but later on in the series she gets her moment — literally everyone's favorite moment — and you just love her even more. Teyonah Paris, Kat Dennings, and Randall Park also do well with their supporting roles. This is an ensemble piece for sure and the way they work off of each other shows dedication. It's hard to discuss so much without spoiling because certain things do need to be addressed, but I shall refrain. Story is where people have been divided. Some thought it took way too long to get into, and I just don't see why. It's a series, not a movie, so setup is much different. And if it were to have moved on quickly, the mystery element would've lost its momentum. Every episode has some sort of question leading up to the finale. That's where fan theories came in and caused even more disappointments. I support fan theories, but I don't base my expectation on the rest of the show. Even if I had some hopeful thoughts, I never expected them to show up later on. With expectation, you can only be disappointed. My suggestion is to not have anything in mind when going in. Questions kept building and that's what made this the most gripping show I had seen in such a long time. I would stay up till 2 a.m. for the release of the new episodes because I just had to know what would come next. With a series, there was more time to develop and think about plot and character. Most importantly, though, there was enough time to build upon the past episodes and make an enjoyable time for both the sitcom moments and the Marvel storyline. Sure, not every episode is as great as the one before or after. I don't think you'll ever find a show with each episode being perfect as ever. Take a highly regarded show like Breaking Bad. Many think of it as a perfect show, but it's not like they think every episode is perfect. To quote Steven Universe, one of my favorite shows, "if every pork chop were perfect, we wouldn't have hotdogs." If every episode were perfect, it wouldn't have that range that it has. I don't grade a show based on each episode, but rather as a whole. And throughout the duration of WandaVision, I had an absolute blast. Marvel, Matt Shakman, and the whole crew made something unforgettable. I believe this to be the greatest thing the MCU has given us. There will never be a show quite like WandaVision. Only a few shows have gotten my perfect rating, and this ranks among them. """ ,
          ''' ( SPOILERS) Absolute garbage and a waste of time. Full of plot twists that end up being nothing. Vision having holes in his body had nothing to do with the plot. Pietro having holes in his body had nothing to do with the plot. Pietro being from X'men was just a random coincidence. Also, every time a new male character walked into the show you knew he was either a wimp or evil. They even made pietros real last name "bohner" to make fun of manhood. Imagine if a female character everyone was stoked on turned out to be some random lady named "Vachina". Also, the physical vision just flew off for no reason, and digital vision never decided to tell wanda about his existence. Why? Lazy writing. Additionally at the end rhambeaou tells wanda "they will never know what you sacrificed". What the heck?! Like maybe apologize for trapping and tormenting these people every day for like a month. How on earth is wanda the victim or the "good-guy" in this show. She is literally a villain causing everyone pain, but it is "ok" because she did it out of a place of pain. Im sorry, almost all villains do evil out of a place of pain, that doesnt make it ok. Stupid, sexist show with bad plot that treats its audience like idiots. '''
          ]

In [70]:
reviews_transform= movie_count.transform(reviews)

In [71]:
 svm.predict(reviews_transform)

array([1, 1, 0])

In [72]:
svm.decision_function(reviews_transform)

array([ 1.81890326, 11.12377546, -2.79260898])

In [73]:
from sklearn.linear_model import LogisticRegression
logi=LogisticRegression()
logi.fit(X_train,y_train)
logi.predict_proba(reviews_transform)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


array([[3.56977606e-03, 9.96430224e-01],
       [0.00000000e+00, 1.00000000e+00],
       [9.99980655e-01, 1.93452824e-05]])

In [74]:
# 데이터전처리 > 토큰화, 수치화  > 모델학습

In [75]:
#pipeline: 토큰화 + 모델학습
from sklearn.pipeline import make_pipeline

In [76]:
pipe = make_pipeline(CountVectorizer(), LinearSVC())

In [77]:
# 학습데이터를 잘 선택
# 토큰화 하기 전의 데이터를 집어 넣어야 한다
# text_train
pipe.fit(text_train,y_train)



Pipeline(steps=[('countvectorizer', CountVectorizer()),
                ('linearsvc', LinearSVC())])

In [78]:
# 예측
# 토큰화 하기 전 데이터를 집어넣어야 예측이 진행
pipe.predict(reviews)

array([1, 1, 0])

In [79]:
# CountVectorizer
# min_df : 전체문서 중에 등장해야하는 빈도의 최소치 설정
  # 문서에서 이만큼 나온것만 쓰겟어!
# Max_df: 전체문서 중에 등장해야하는 빈도의 최대치 설정

# 문서에서 너무 많이 나온것은 단어사전에 등록하지 않겠어!
# n_gram: 

In [81]:
from sklearn.model_selection import GridSearchCV

In [82]:
# 모델이름__하이퍼파라미터
grid_params = {
    'countvectorizer__min_df' : [3,5,10],
    'countvectorizer__max_df' : [20000,22000,24000],
    'countvectorizer__ngram_range' : [(1,2),(1,3),(2,2)],
    'linearsvc__C' : [0.001,0.01,0.1,10,100]
}
grid = GridSearchCV(pipe, grid_params, cv=3, n_jobs=-1)

In [85]:
# gridsearch 진행하고 나온 최고의 파라미터, 점수 출력
print(grid.best_score_)
print(grid.best_params_)

AttributeError: 'GridSearchCV' object has no attribute 'best_score_'

## tf-idf 를 활용한 시각화
- tf: 하나의 문서에서 단어가 등장하는 횟수
- idf: 전체문서에서 단어가 등장하는 횟수(df)의 역수 
> 적은문서에서 등장하는 단어일수록  값이 큼
- 적은문서에서 등장하고, 등장하는 문서에서는 많이 쓰이는 단어를 중요한 단어라고 인식 

In [86]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [87]:
tfidf= TfidfVectorizer()

In [88]:
tfidf.fit(test_txt)

TfidfVectorizer()

In [89]:
# 단어사전 확인
tfidf.vocabulary_

{'안녕하세요': 8,
 '저는': 11,
 '항나': 15,
 '예요': 9,
 '지금은': 14,
 '머신러닝': 2,
 '수업을': 5,
 '듣고': 0,
 '있슴둥': 10,
 '배고파요': 4,
 '맥도날드': 1,
 '햄버거': 16,
 '먹고': 3,
 '싶어요': 6,
 '지금': 13,
 '점심시간': 12,
 '아니예요': 7}

In [90]:
# 토큰화
tfidf.transform(test_txt).toarray()

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.5       , 0.5       ,
        0.        , 0.5       , 0.        , 0.        , 0.        ,
        0.5       , 0.        ],
       [0.4472136 , 0.        , 0.4472136 , 0.        , 0.        ,
        0.4472136 , 0.        , 0.        , 0.        , 0.        ,
        0.4472136 , 0.        , 0.        , 0.        , 0.4472136 ,
        0.        , 0.        ],
       [0.        , 0.4472136 , 0.        , 0.4472136 , 0.4472136 ,
        0.        , 0.4472136 , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.4472136 ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.57735027, 0.        , 0.        ,
        0.        , 0.        , 0.57735027, 0.57735027, 0.        ,
        0.        , 0.        ]])