<a href="https://colab.research.google.com/github/Takkar-915/movie_review/blob/main/movie_review_logistic_bag_of_word.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pyprind
!pip install nltk

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Mount Google Drive so the files stored there become reachable under
# /content/drive (the dataset archive is read from that location later on).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import tarfile

# Unpack the gzip-compressed IMDb archive stored on Google Drive into the
# current working directory.
with tarfile.open('/content/drive/MyDrive/3年前期/知的情報システム開発/データセット/aclImdb_v1.tar.gz', 'r:gz') as archive:
    archive.extractall()

In [None]:
import pyprind
import pandas as pd
import os

basepath = 'aclImdb'

# Sub-directory name -> class label (positive review = 1, negative review = 0).
labels = {'pos': 1, 'neg': 0}
# Progress bar sized for 50000 iterations; it is advanced once per file read.
pbar = pyprind.ProgBar(50000)

# Collect every (review text, label) pair in a plain list and build the
# DataFrame once at the end. Growing a DataFrame row by row copies the whole
# frame on every iteration (quadratic time), and DataFrame.append no longer
# exists in pandas >= 2.0.
rows = []
for split in ('test', 'train'):
    for sentiment_dir in ('pos', 'neg'):
        path = os.path.join(basepath, split, sentiment_dir)
        # sorted() makes the read order deterministic across file systems.
        for file in sorted(os.listdir(path)):
            with open(os.path.join(path, file), 'r', encoding='utf-8') as infile:
                txt = infile.read()
            rows.append([txt, labels[sentiment_dir]])
            pbar.update()
df = pd.DataFrame(rows, columns=['review', 'sentiment'])

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:01:32


In [None]:
import numpy as np

# Fix NumPy's global seed so the shuffle below is reproducible.
np.random.seed(0)
# Draw a random ordering of the existing index labels, then reorder the rows
# of the frame to follow that ordering.
shuffled_index = np.random.permutation(df.index)
df = df.reindex(shuffled_index)

In [None]:
# Persist the shuffled reviews to CSV; index=False keeps the (shuffled) row
# index out of the file.
df.to_csv('movie_data.csv',index = False,encoding = 'utf-8')

In [None]:
# Reload the dataset from the CSV file and preview the first five rows.
# The file was written without its index, so the reloaded frame gets a fresh
# 0..N-1 index in the shuffled row order.
df = pd.read_csv('movie_data.csv', encoding='utf-8')
df.head(5)

Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0
3,hi for all the people who have seen this wonde...,1
4,"I recently bought the DVD, forgetting just how...",0


In [None]:
import re

def preprocessor(text):
    """Clean one raw review string for bag-of-words tokenization.

    Steps:
      1. Remove HTML-style markup (anything between ``<`` and ``>``).
      2. Collect emoticons such as ``:)``, ``;-(``, ``=D`` or ``:P`` from the
         original-case text.
      3. Lower-case the text and collapse every run of non-word characters
         into a single space.
      4. Append the collected emoticons (with the ``-`` nose removed) to the
         end of the cleaned text, separated by spaces.

    Args:
        text: The raw review text.

    Returns:
        The cleaned text followed by the space-separated emoticons.
    """
    # Strip HTML markup.
    text = re.sub(r'<[^>]*>', '', text)

    # Emoticons must be captured before lower-casing, otherwise ':D' / ':P'
    # would no longer match. Raw strings avoid invalid-escape warnings.
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)

    cleaned = re.sub(r'[\W]+', ' ', text.lower())
    emoticon_str = ' '.join(emoticons).replace('-', '')

    # When the cleaned text ends with a word character, insert a separator so
    # the first emoticon is not glued onto the last word (e.g. 'much:)').
    if emoticon_str and not cleaned.endswith(' '):
        cleaned += ' '
    return cleaned + emoticon_str

# Clean every review in place by running it through preprocessor().
df['review'] = df['review'].apply(preprocessor)

In [None]:
from nltk.stem.porter import PorterStemmer

# Single stemmer instance shared by every call to tokenizer_porter().
porter = PorterStemmer()

def tokenizer(text):
    """Split ``text`` on runs of whitespace and return the list of tokens."""
    tokens = text.split()
    return tokens

def tokenizer_porter(text):
    """Split ``text`` on whitespace and return the stem of every token.

    Each token is reduced to its stem with the module-level ``porter``
    stemmer instance.
    """
    stems = []
    for token in text.split():
        stems.append(porter.stem(token))
    return stems

In [None]:
import nltk
# Download the NLTK stop-word corpus so that it can be loaded below.
nltk.download('stopwords')

from nltk.corpus import stopwords

stop = stopwords.words('english') # English stop words: extremely common words that can be excluded from the features

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Split the shuffled data into 25000 training rows and the remainder for testing.
# Slicing is done positionally on the underlying arrays: label-based
# `df.loc[:25000]` includes BOTH endpoints, which would yield 25001 training
# rows and put row 25000 into the training and the test set at the same time.
N_TRAIN = 25000

reviews = df['review'].values
sentiments = df['sentiment'].values

x_train, x_test = reviews[:N_TRAIN], reviews[N_TRAIN:]
y_train, y_test = sentiments[:N_TRAIN], sentiments[N_TRAIN:]

グリッドサーチによるごり押しで、ハイパーパラメータの最適な組み合わせを見つける。

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV

# Bag-of-words feature extractor. Lower-casing and cleaning were already done
# by preprocessor(), so the vectorizer's own text normalisation is switched off
# (this also keeps upper-case emoticons such as ':D' intact).
bag_of_words = CountVectorizer(strip_accents=None,  # do not strip/replace accented characters
                               lowercase=False,     # do not lower-case inside the vectorizer
                               preprocessor=None)   # no additional preprocessing callable

# Hyper-parameter candidates, given as {"<step>__<parameter>": [values, ...]}.
param_grid = [{'vect__ngram_range': [(1, 1)],                     # unigrams only
               'vect__stop_words': [stop, None],                  # with / without stop-word removal
               'vect__tokenizer': [tokenizer, tokenizer_porter],  # plain split vs. split + stemming
               'clf__penalty': ['l1', 'l2'],                      # L1 vs. L2 regularisation
               'clf__C': [10.0, 100.0]},                          # inverse regularisation strength (hand-picked values)
              ]

# Vectorizer followed by a logistic-regression classifier. The liblinear
# solver is used because it supports both the 'l1' and the 'l2' penalty.
lr_bag_of_words = Pipeline([('vect', bag_of_words),
                            ('clf', LogisticRegression(random_state=0, solver='liblinear'))])

# Exhaustive grid search over param_grid.
# NOTE: 'score' is not a scorer name known to scikit-learn and makes the
# search fail; classification accuracy (what is reported further down as
# "CV Accuracy") is spelled 'accuracy'.
gs_lr_bag_of_words = GridSearchCV(lr_bag_of_words,     # estimator to tune
                                  param_grid,          # candidate parameter values
                                  scoring='accuracy',  # evaluation metric
                                  cv=3,                # 3-fold cross-validation
                                  verbose=2,           # log verbosity
                                  n_jobs=-1)           # use every available CPU core

In [None]:
# Run the cross-validated grid search on the training split.
gs_lr_bag_of_words.fit(x_train, y_train)

Fitting 3 folds for each of 16 candidates, totalling 48 fits


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('vect',
                                        CountVectorizer(lowercase=False)),
                                       ('clf',
                                        LogisticRegression(random_state=0,
                                                           solver='liblinear'))]),
             n_jobs=-1,
             param_grid=[{'clf__C': [10.0, 100.0], 'clf__penalty': ['l1', 'l2'],
                          'vect__ngram_range': [(1, 1)],
                          'vect__stop_words': [['i', 'me', 'my', 'myself', 'we',
                                                'our', 'ours', 'ourselves',
                                                'you', "you're", "you've",
                                                "you'll", "you'd", 'your',
                                                'yours', 'yourself',
                                                'yourselves', 'he', 'him',
                                        

In [None]:
# Report the winning hyper-parameter combination and its mean cross-validation score.
best_params = gs_lr_bag_of_words.best_params_
best_cv_score = gs_lr_bag_of_words.best_score_
print('Best parameter set: %s ' % best_params)
print('CV Accuracy: %.3f' % best_cv_score)

Best parameter set: {'clf__C': 10.0, 'clf__penalty': 'l2', 'vect__ngram_range': (1, 1), 'vect__stop_words': None, 'vect__tokenizer': <function tokenizer at 0x7fa68f5a8320>} 
CV Accuracy: 0.874


In [None]:
# Take the best pipeline found by the grid search and score it on the
# held-out test split.
clf = gs_lr_bag_of_words.best_estimator_
test_accuracy = clf.score(x_test, y_test)
print('Test Accuracy: %.3f' % test_accuracy)

Test Accuracy: 0.880
