# Sentiment Analysis on IMDB Dataset

### Python Machine Learning

In [2]:
import pandas as pd
import numpy as np

# Get Data

In [4]:
! tar -zxf aclImdb_v1.tar.gz

In [48]:
import pyprind
import pandas as pd
import os
pbar = pyprind.ProgBar(50000)
labels = {'pos':1, 'neg':0}
df = pd.DataFrame()
for s in ('test', 'train'):
    for l in ('pos', 'neg'):
        path ='./aclImdb/%s/%s' % (s, l)
        for file in os.listdir(path):
            with open(os.path.join(path, file), 'r') as infile:
                txt = infile.read()
            df = df.append([[txt, labels[l]]], ignore_index=True)
            pbar.update()
df.columns = ['review', 'sentiment']

0%                          100%
[##############################] | ETA[sec]: 0.000 
Total time elapsed: 107.874 sec


In [52]:
df.head()

Unnamed: 0,review,sentiment
0,I went and saw this movie last night after bei...,1
1,Actor turned director Bill Paxton follows up h...,1
2,As a recreational golfer with some knowledge o...,1
3,"I saw this film in a sneak preview, and it is ...",1
4,Bill Paxton has taken the true story of the 19...,1


In [53]:
df.shape

(50001, 2)

# Save a csv copy

In [54]:
import numpy as np
np.random.seed(0)
df = df.reindex(np.random.permutation(df.index))
df.to_csv('movie_data.csv', index=False)

In [3]:
df = pd.read_csv('movie_data.csv')

In [6]:
df.head()

Unnamed: 0,review,sentiment
0,Although there are some snippets in this 4-par...,0
1,I am astounded that so many people find this f...,0
2,A man and his wife are not getting along becau...,1
3,Safer indeed. Hitchcock is cinema's all time p...,1
4,"it was a very well written movie, and the acto...",1


# Remove Punctuation Marks

In [7]:
import re
def preprocessor(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    text = re.sub('[\W]+', ' ', text.lower()) + '\''.join(emoticons).replace('-', '')
    return text

In [8]:
df = df[df['review'].notnull()]

In [9]:
df['review'] = df['review'].apply(preprocessor)

In [10]:
df.head()

Unnamed: 0,review,sentiment
0,although there are some snippets in this 4 par...,0
1,i am astounded that so many people find this f...,0
2,a man and his wife are not getting along becau...,1
3,safer indeed hitchcock is cinema s all time pe...,1
4,it was a very well written movie and the actor...,1


# Stemming

In [11]:
def tokenizer(text):
    return text.split()

In [12]:
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()
def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]

In [18]:
print(porter.stem('getting')) 
print(porter.stem('get') )

get
get


# Stopword Removal

In [19]:
import nltk
nltk.download('stopwords')

[nltk_data] Error loading stopwords: <urlopen error [Errno 8] nodename
[nltk_data]     nor servname provided, or not known>


False

In [14]:
from nltk.corpus import stopwords
stop = stopwords.words('english')
[w for w in tokenizer_porter('a runner likes running and runs a lot')[-10:] if w not in stop]

[u'runner', u'like', u'run', u'run', u'lot']

# Train Logistic Regression

### Split a training and test set

In [15]:
X_train = df.loc[:25000, 'review'].values
y_train = df.loc[:25000, 'sentiment'].values
X_test = df.loc[25000:, 'review'].values
y_test = df.loc[25000:, 'sentiment'].values

### Cross Validate and train logistic regression

In [72]:
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(strip_accents=None, lowercase=False, preprocessor=None)
param_grid = [{'vect__ngram_range': [(1,1)],
                  'vect__stop_words': [stop, None],
                  'vect__tokenizer': [tokenizer,
                                      tokenizer_porter],
                  'clf__penalty': ['l1', 'l2'],
                  'clf__C': [1.0, 10.0, 100.0]}
                ]
lr_tfidf = Pipeline([('vect', tfidf),
                        ('clf',
                         LogisticRegression(random_state=0))])
gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid, 
                              scoring='accuracy',
                              cv=5, verbose=1,
                              n_jobs=-1)
gs_lr_tfidf.fit(X_train, y_train)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Done   1 jobs       | elapsed:   27.4s
[Parallel(n_jobs=-1)]: Done  50 jobs       | elapsed:  7.8min
[Parallel(n_jobs=-1)]: Done 106 out of 120 | elapsed: 15.6min remaining:  2.1min
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed: 16.7min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(steps=[('vect', TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=False, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm=u'l2', preprocessor=None, smooth_idf=Tru...ovr',
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0))]),
       fit_params={}, iid=True, loss_func=None, n_jobs=-1,
       param_grid=[{'vect__ngram_range': [(1, 1)], 'vect__tokenizer': [<function tokenizer at 0x10a9378c0>, <function tokenizer_porter at 0x10e605a28>], 'clf__penalty': ['l1', 'l2'], 'clf__C': [1.0, 10.0, 100.0], 'vect__stop_words': [[u'i', u'me', u'my', u'myself', u'we', u'our', u'ours', u'ourselves', u'y... u'than', u'too', u'very', u's', u't', u'can', u'will', u'just', u'don', u'should', u'now'], None]}],
       pre_dispatch='2*n_jobs', refit=True, score_func=No

In [73]:
print('CV Accuracy: %.3f' % gs_lr_tfidf.best_score_)
clf = gs_lr_tfidf.best_estimator_
print('Test Accuracy: %.3f' % clf.score(X_test, y_test))

CV Accuracy: 0.897
Test Accuracy: 0.900


# Let's do this on a smaller sample

In [101]:
df_sampled = df.loc[np.random.choice(range(df.shape[0]), 20000, replace=False)]
X_train_sampled = df_sampled.iloc[:10000].review.values
y_train_sampled = df_sampled.iloc[:10000].sentiment.values
X_test_sampled = df_sampled.iloc[10000:].review.values
y_test_sampled = df_sampled.iloc[10000:].sentiment.values

In [102]:
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_sampled = TfidfVectorizer(strip_accents=None, lowercase=False, preprocessor=None)
param_grid = [{'vect__ngram_range': [(1,1)],
                  'vect__stop_words': [stop, None],
                  'vect__tokenizer': [tokenizer,
                                      tokenizer_porter],
                  'clf__penalty': ['l1', 'l2'],
                  'clf__C': [1.0, 10.0, 100.0]}
                ]
lr_tfidf_sampled = Pipeline([('vect', tfidf_sampled),
                        ('clf',
                         LogisticRegression(random_state=0))])
gs_lr_tfidf_sampled = GridSearchCV(lr_tfidf_sampled, param_grid, 
                              scoring='accuracy',
                              cv=5, verbose=1,
                              n_jobs=-1)
gs_lr_tfidf.fit(X_train_sampled, y_train_sampled)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Done   1 jobs       | elapsed:   10.9s
[Parallel(n_jobs=-1)]: Done  50 jobs       | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 106 out of 120 | elapsed:  6.4min remaining:   51.0s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:  6.8min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(steps=[('vect', TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=False, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm=u'l2', preprocessor=None, smooth_idf=Tru...ovr',
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0))]),
       fit_params={}, iid=True, loss_func=None, n_jobs=-1,
       param_grid=[{'vect__ngram_range': [(1, 1)], 'vect__tokenizer': [<function tokenizer at 0x10a9378c0>, <function tokenizer_porter at 0x10e605a28>], 'clf__penalty': ['l1', 'l2'], 'clf__C': [1.0, 10.0, 100.0], 'vect__stop_words': [[u'i', u'me', u'my', u'myself', u'we', u'our', u'ours', u'ourselves', u'y... u'than', u'too', u'very', u's', u't', u'can', u'will', u'just', u'don', u'should', u'now'], None]}],
       pre_dispatch='2*n_jobs', refit=True, score_func=No

In [103]:
print('CV Accuracy: %.3f' % gs_lr_tfidf.best_score_)
clf = gs_lr_tfidf.best_estimator_
print('Test Accuracy: %.3f' % clf.score(X_test_sampled, y_test_sampled))

CV Accuracy: 0.880
Test Accuracy: 0.883


# Down sample even more

In [105]:
df_sampled_8000 = df.loc[np.random.choice(range(df.shape[0]), 8000, replace=False)]
X_train_sampled_8000 = df_sampled_8000.iloc[:4000].review.values
y_train_sampled_8000 = df_sampled_8000.iloc[:4000].sentiment.values
X_test_sampled_8000 = df_sampled_8000.iloc[4000:].review.values
y_test_sampled_8000 = df_sampled_8000.iloc[4000:].sentiment.values

In [108]:
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_sampled_8000 = TfidfVectorizer(strip_accents=None, lowercase=False, preprocessor=None)
param_grid = [{'vect__ngram_range': [(1,1)],
                  'vect__stop_words': [stop, None],
                  'vect__tokenizer': [tokenizer,
                                      tokenizer_porter],
                  'clf__penalty': ['l1', 'l2'],
                  'clf__C': [1.0, 10.0, 100.0]}
                ]
lr_tfidf_sampled_8000 = Pipeline([('vect', tfidf_sampled_8000),
                        ('clf',
                         LogisticRegression(random_state=0))])
gs_lr_tfidf_sampled_8000 = GridSearchCV(lr_tfidf_sampled_8000, param_grid, 
                              scoring='accuracy',
                              cv=5, verbose=1,
                              n_jobs=-1)
gs_lr_tfidf_sampled_8000.fit(X_train_sampled_8000, y_train_sampled_8000)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Done   1 jobs       | elapsed:    4.1s
[Parallel(n_jobs=-1)]: Done  50 jobs       | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 106 out of 120 | elapsed:  2.6min remaining:   20.7s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:  2.8min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(steps=[('vect', TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=False, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm=u'l2', preprocessor=None, smooth_idf=Tru...ovr',
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0))]),
       fit_params={}, iid=True, loss_func=None, n_jobs=-1,
       param_grid=[{'vect__ngram_range': [(1, 1)], 'vect__tokenizer': [<function tokenizer at 0x10a9378c0>, <function tokenizer_porter at 0x10e605a28>], 'clf__penalty': ['l1', 'l2'], 'clf__C': [1.0, 10.0, 100.0], 'vect__stop_words': [[u'i', u'me', u'my', u'myself', u'we', u'our', u'ours', u'ourselves', u'y... u'than', u'too', u'very', u's', u't', u'can', u'will', u'just', u'don', u'should', u'now'], None]}],
       pre_dispatch='2*n_jobs', refit=True, score_func=No

In [109]:
print('CV Accuracy: %.3f' % gs_lr_tfidf_sampled_8000.best_score_)
clf = gs_lr_tfidf_sampled_8000.best_estimator_
print('Test Accuracy: %.3f' % clf.score(X_test_sampled_8000, y_test_sampled_8000))

CV Accuracy: 0.867
Test Accuracy: 0.862


# Try it out

In [110]:
df.head()

Unnamed: 0,review,sentiment
0,although there are some snippets in this 4 par...,0
1,i am astounded that so many people find this f...,0
2,a man and his wife are not getting along becau...,1
3,safer indeed hitchcock is cinema s all time pe...,1
4,it was a very well written movie and the actor...,1


In [120]:
comments = ['that first movie was awesome', 'the second movie sucked', 'the third movie was ehh', 'it had good and bad parts']
print(clf.predict_proba(comments))
print(clf.predict(comments))

[[ 0.22662417  0.77337583]
 [ 0.77889924  0.22110076]
 [ 0.64605786  0.35394214]
 [ 0.58457825  0.41542175]]
[1 0 0 0]
