In [1]:
import pyprind
import pandas as pd
import os

In [2]:
# Root directory of the dataset; the loading cell reads the
# <basepath>/<split>/<label> subdirectories beneath it.
basepath = '../data/aclImdb'

In [3]:
# Map each class directory name to its integer sentiment label.
labels = {'pos': 1, 'neg': 0}
# One progress tick per review file; 50000 is the expected number of files.
pbar = pyprind.ProgBar(50000)
# Collect the rows in a plain list and build the DataFrame once at the end.
# Appending to a DataFrame inside the loop copies the whole frame on every
# iteration, and DataFrame.append no longer exists in pandas >= 2.0.
rows = []
for s in ('test', 'train'):
    for l in ('pos', 'neg'):
        path = os.path.join(basepath, s, l)
        # sorted() keeps the read order independent of the file system.
        for file in sorted(os.listdir(path)):
            # Explicit encoding so decoding does not depend on the
            # platform's default locale.
            with open(os.path.join(path, file), 'r', encoding='utf-8') as infile:
                txt = infile.read()
            rows.append([txt, labels[l]])
            pbar.update()
df = pd.DataFrame(rows, columns=['review', 'sentiment'])

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:03:01


In [4]:
import numpy as np

In [5]:
# Preview the first ten rows as loaded (before shuffling).
df.head(10)

Unnamed: 0,review,sentiment
0,I went and saw this movie last night after bei...,1
1,Actor turned director Bill Paxton follows up h...,1
2,As a recreational golfer with some knowledge o...,1
3,"I saw this film in a sneak preview, and it is ...",1
4,Bill Paxton has taken the true story of the 19...,1
5,"I saw this film on September 1st, 2005 in Indi...",1
6,"Maybe I'm reading into this too much, but I wo...",1
7,I felt this film did have many good qualities....,1
8,This movie is amazing because the fact that th...,1
9,"""Quitting"" may be as much about exiting a pre-...",1


In [6]:
# Shuffle the rows: the loading loop leaves them grouped by split and label.
# The permutation comes from a dedicated, fixed-seed generator so that a
# fresh kernel reproduces the same order without touching NumPy's global
# random state.
df = df.reindex(np.random.RandomState(0).permutation(df.index))

In [7]:
# Preview the first ten rows after shuffling; the index shows original positions.
df.head(10)

Unnamed: 0,review,sentiment
39414,Don't really know where to start with one of t...,0
24889,"What a mess--and I'm not referring to the ""des...",0
30145,This is a great movie but there could be more ...,1
21964,This movie is awful. At first I thought it may...,0
36763,"True, the idea for this TV series may have spr...",1
38850,Okay at first this movie seemed pretty good ev...,0
49662,There are a lot of highly talented filmmakers/...,0
20175,The actresses bra in a changing room--well I g...,0
49486,A bunch of kids set up a theatre to have an al...,0
9338,In this tale of a tightly wound Christian fami...,1


In [8]:
# Persist the shuffled reviews. The (shuffled) index is written as an unnamed
# first column because `index` is left at its default.
# NOTE(review): if nothing downstream needs the original row positions,
# consider index=False — confirm against whatever reads this file.
df.to_csv('movie_data.csv')

In [9]:
# Sanity check: (number of reviews, number of columns).
df.shape

(50000, 2)

In [10]:
from sklearn.feature_extraction.text import CountVectorizer

In [11]:
# Bag-of-words vectorizer with default settings.
# NOTE(review): `count` is not referenced in the cells visible here — confirm
# it is still needed.
count = CountVectorizer()

In [12]:
import re

In [13]:
def preprocessor(text):
    """Clean a raw review: strip HTML tags, lowercase, move emoticons to the end.

    Parameters
    ----------
    text : str
        Raw review text, possibly containing HTML markup.

    Returns
    -------
    str
        The lowercased text with every run of non-word characters collapsed
        into a single space, followed by the emoticons found in the input
        (with the '-' "nose" removed), separated by single spaces.
    """
    # Remove anything that looks like an HTML tag.
    text = re.sub(r'<[^>]*>', '', text)
    # Collect emoticons before punctuation is stripped:
    # eyes (: ; =), optional nose (-), mouth ( ) ( D P ).
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    cleaned = re.sub(r'[\W]+', ' ', text.lower())
    if emoticons:
        # Without a separator the first emoticon would fuse with the last
        # word whenever the cleaned text ends in a word character
        # (e.g. "great :) movie" -> "great movie:)").
        if not cleaned.endswith(' '):
            cleaned += ' '
        cleaned += ' '.join(emoticons).replace('-', '')
    return cleaned

In [14]:
# Clean every review with `preprocessor`. This overwrites the raw text in
# place, so re-running the cell applies the cleaning to already-cleaned text.
df['review'] = df['review'].apply(preprocessor)

In [15]:
def tokenizer(text):
    """Return the whitespace-separated tokens of ``text`` as a list."""
    tokens = text.split()
    return tokens

In [16]:
from sklearn.model_selection import train_test_split

In [22]:
# Hold out half of the reviews for testing. A fixed random_state makes the
# split (and therefore every score computed from it) reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df['review'], df['sentiment'], test_size=0.5, random_state=0)

In [29]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [36]:
# English stop-word list, offered to the vectorizer as a grid-search option.
stop = stopwords.words('english')
# Shared stemmer instance used by tokenizer_porter.
poter = PorterStemmer()


def tokenizer_porter(text):
    """Split ``text`` on whitespace and return the Porter stem of each token."""
    stems = []
    for word in text.split():
        stems.append(poter.stem(word))
    return stems

In [37]:
# Lowercasing and preprocessing are switched off in the vectorizer itself.
tfidf = TfidfVectorizer(
    strip_accents=None,
    lowercase=False,
    preprocessor=None,
)

param_grid = [
    # Grid 1: use_idf / norm are left at the vectorizer's defaults.
    dict(
        vect__ngram_range=[(1, 1)],
        vect__stop_words=[stop, None],
        vect__tokenizer=[tokenizer, tokenizer_porter],
        clf__penalty=['l1', 'l2'],
        clf__C=[1.0, 10.0, 100.0],
    ),
    # Grid 2: same search, but with idf weighting and normalisation disabled.
    dict(
        vect__ngram_range=[(1, 1)],
        vect__stop_words=[stop, None],
        vect__tokenizer=[tokenizer, tokenizer_porter],
        vect__use_idf=[False],
        vect__norm=[None],
        clf__penalty=['l1', 'l2'],
        clf__C=[1.0, 10.0, 100.0],
    ),
]

In [38]:
# Vectorize the text, then classify with logistic regression. liblinear
# supports both the 'l1' and 'l2' penalties searched in param_grid; it
# shuffles the data internally, so random_state is fixed to make the fitted
# models reproducible.
lr_tfidf = Pipeline([('vect', tfidf),
                     ('clf', LogisticRegression(solver='liblinear',
                                                random_state=0))])
# 5-fold cross-validated search over param_grid, scored by accuracy.
# NOTE: this is expensive — the recorded run below needed about 4395 minutes
# for 240 fits with n_jobs=2.
gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid, scoring='accuracy',
                           cv=5, verbose=1, n_jobs=2)
gs_lr_tfidf.fit(X_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed: 39.0min
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed: 3659.9min
[Parallel(n_jobs=2)]: Done 240 out of 240 | elapsed: 4394.9min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('vect',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=False,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                         