In [1]:
import os
import pandas as pd

In [2]:
base = '/home/vaibhav/aclImdb'
labels = {'pos': 1, 'neg' : 0}
df = pd.DataFrame()
for folder in ('test', 'train'):
    for filename in ('pos', 'neg'):
        path = os.path.join(base, folder, filename)
        for file in os.listdir(path):
            with open(os.path.join(path, file), 'r') as infile:
                txt = infile.read()
            df = df.append([[txt, labels[filename]]], ignore_index=True)
            
df.columns = ['review', 'sentiment']

In [3]:
df.head(5)

Unnamed: 0,review,sentiment
0,I watch tons of movies and had no idea this wo...,1
1,Richard Norton really lights the screen up in ...,1
2,One of eastwood's best movies after he had sep...,1
3,The best bit in the film was when Alan pulled ...,1
4,Richard Schickel's 1991 documentary about Gary...,1


In [4]:
#To reorder the index i.e. Randomize
import numpy as np

np.random.seed(0)
df = df.reindex(np.random.permutation(df.index))

In [5]:
df.head()

Unnamed: 0,review,sentiment
11841,No one would ever question that director Leos ...,1
19602,"Very odd, this seems like a very average movie...",0
45519,This is actually an insult to the victims and ...,0
25747,What does the Marquis de Sade have to do with ...,1
42642,"Beyond dirt cheap, this shot-on-video exercise...",0


In [13]:
df.to_csv('movie_reviews.csv', index=False)

In [9]:
import re

In [6]:
import string

In [10]:
def text_process(text):
    exclude = string.punctuation
    text = re.sub('<[^>]*>', '', text)
    text = ''.join(character for character in text.lower() if character not in exclude)
    return text

In [11]:
df['review'] = df['review'].apply(text_process)

In [45]:
df.to_csv("clean_movie_review.csv", index=False)

In [12]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
def vect(text): 
    stemmer = PorterStemmer()
    tokens = text.split()
    tokens_filtered = [w for w in tokens if not w in stopwords.words('english')]
    stems = [stemmer.stem(t) for t in tokens_filtered]
    stems_nopunct = [s for s in stems if re.match('^[a-zA-Z]+$', s) is not None]
    return (stems_nopunct)

In [35]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(encoding='utf-8', decode_error='replace', 
                        smooth_idf = True,
                        tokenizer = vect,
                        ngram_range=(1,2),
                        max_df=0.6,
                        min_df=0.1)

In [19]:
#Train Test split
from sklearn.cross_validation import train_test_split
train = df.sample(frac=0.8, random_state=1)
test = df.loc[~df.index.isin(train.index)]
train.shape
test.shape

(10000, 2)

In [22]:
tfidf

TfidfVectorizer(analyzer=u'word', binary=False, decode_error='replace',
        dtype=<type 'numpy.int64'>, encoding='utf-8', input=u'content',
        lowercase=True, max_df=0.6, max_features=None, min_df=1,
        ngram_range=(1, 2), norm=u'l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=<function vect at 0x7fa1ddfb7668>, use_idf=True,
        vocabulary=None)

In [23]:
train.describe

<bound method DataFrame.describe of                                                   review  sentiment
46198  the main problem with power is that it feature...          0
12925  leave ed wood alone to call plan 9 from outer ...          0
44852  i stopped watching this pos as soon as the sna...          0
26174  a chance encounter between a salesman and a hi...          1
23099  i really wanted to like this film especially a...          0
29863  i just went to a screening of the film during ...          1
15075  spoilersi hate this one but it is better than ...          0
15670  i truly was disappointed by this film which i ...          0
21166  i accidentally happened upon this movie when i...          0
17543  i want to believe all new horror films coming ...          0
13647  a time to kill is based on john grishams first...          0
15011  corny and horrible i was not surprised this sh...          0
4685   so the wwe has done it they have poured over i...          1
44622  the s

In [30]:
import time

In [51]:
tfidf_train = tfidf.fit_transform(train.review)


In [52]:
tfidf_test = tfidf.transform(test.review)


In [53]:
tfidf_train.shape

(40000, 140)

In [54]:
tfidf_test.shape

(10000, 140)

In [42]:
type(train.sentiment)

pandas.core.series.Series

In [55]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(tfidf_train.toarray(), train.sentiment)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [56]:
pred = clf.predict(tfidf_test.toarray())

In [57]:
from sklearn.metrics import accuracy_score
print accuracy_score(pred, test.sentiment)

0.7439
