In [1]:
import os
import pandas as pd
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords
from stemming.porter2 import stem
import random

from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer


path_pos="C:\\Users\\Algorathmic\\Downloads\\Python Programming\\Python Programming\\movie_reviews\\pos"
path_neg="C:\\Users\\Algorathmic\\Downloads\\Python Programming\\Python Programming\\movie_reviews\\neg"

def preprocess( raw_review ):
    review_text = BeautifulSoup(raw_review).get_text()        
    letters_only = re.sub("[^a-zA-Z]", " ", review_text) 
    words = letters_only.lower().replace("\'","").encode('Utf-8').split()
    
    # Stop Word Removal
    stops = set(stopwords.words("english"))                  
    meaningful_words = [stem(w) for w in words]   
    
    # Stemming
    for i in range(1,len(meaningful_words)):
            meaningful_words[i]=stem(meaningful_words[i])
            
    return( " ".join( meaningful_words))  

def create_dataframe(path,name):
    df = pd.DataFrame(columns=['File','Text','Review'])
    for root, dirs, files in os.walk(path):
        for fname in files:
            src = os.path.join(root, fname)
            text=open(src, "r")
            x = preprocess(text.read())
            y = name
            df=df.append(pd.Series([fname,x,y], index=['File','Text','Review']), ignore_index=True)
    return df
            

In [2]:
df1 = create_dataframe(path_pos,'pos')
df2 = create_dataframe(path_neg,'neg')
Frame = df1.append(pd.DataFrame(data=df2),ignore_index=True)
Frame['Review'] = Frame['Review'].map({'pos': 1, 'neg': 0})

In [51]:
n1 = len(df1)
n2 = len(df2)
total = n1+n2
split_n = (30*total)/100
rows = random.sample(Frame.index, split_n)
test = Frame.ix[rows]
train = Frame.drop(rows)

In [4]:
train.groupby('Review').count()  # 50 : 50

Unnamed: 0_level_0,File,Text
Review,Unnamed: 1_level_1,Unnamed: 2_level_1
neg,709,709
pos,691,691


In [5]:
test.groupby('Review').count() # 49 : 51

Unnamed: 0_level_0,File,Text
Review,Unnamed: 1_level_1,Unnamed: 2_level_1
neg,291,291
pos,309,309


In [52]:
vectorizer = CountVectorizer(analyzer = "word",   \
                             tokenizer = None,    \
                             preprocessor = None, \
                             stop_words = None,   \
                             max_features = 5000) 

train_data_features = vectorizer.fit_transform(train['Text']).toarray()
test_data_features = vectorizer.fit_transform(test['Text']).toarray()

In [91]:
##### Get The List of Features
vocab = vectorizer.get_feature_names()
print vocab

[u'abandon', u'abil', u'abl', u'aboard', u'abort', u'abound', u'abrupt', u'absenc', u'absent', u'absolut', u'absorb', u'absurd', u'abus', u'abyss', u'academi', u'accent', u'accept', u'access', u'accid', u'accident', u'acclaim', u'accompani', u'accomplish', u'accord', u'account', u'accur', u'accuraci', u'accus', u'ace', u'achiev', u'acid', u'acknowledg', u'across', u'act', u'action', u'activ', u'actor', u'actress', u'actual', u'ad', u'adam', u'adapt', u'add', u'addict', u'addit', u'address', u'adequ', u'admir', u'admit', u'adolesc', u'adopt', u'ador', u'adrenalin', u'adult', u'advanc', u'advantag', u'adventur', u'advertis', u'advic', u'advis', u'advoc', u'affair', u'affect', u'affleck', u'afford', u'aforement', u'afraid', u'africa', u'african', u'afternoon', u'afterward', u'age', u'agenda', u'agent', u'aggress', u'ago', u'agre', u'ahead', u'ahm', u'aid', u'aim', u'ain', u'air', u'airplan', u'airport', u'al', u'ala', u'alain', u'alan', u'albeit', u'albert', u'alcohol', u'alec', u'alex', 

In [92]:
#### Count the Frequecy of Each Feature
dist = np.sum(train_data_features, axis=0)
for tag, count in zip(vocab, dist):
    print count, tag

70 abandon
120 abil
226 abl
26 aboard
15 abort
13 abound
17 abrupt
18 absenc
22 absent
159 absolut
37 absorb
54 absurd
55 abus
17 abyss
69 academi
89 accent
123 accept
22 access
75 accid
48 accident
30 acclaim
43 accompani
84 accomplish
42 accord
43 account
34 accur
16 accuraci
47 accus
21 ace
117 achiev
17 acid
19 acknowledg
152 across
788 act
911 action
51 activ
871 actor
174 actress
712 actual
148 ad
111 adam
117 adapt
203 add
43 addict
95 addit
26 address
34 adequ
93 admir
133 admit
15 adolesc
37 adopt
31 ador
13 adrenalin
127 adult
58 advanc
30 advantag
130 adventur
34 advertis
33 advic
24 advis
29 advoc
76 affair
79 affect
68 affleck
13 afford
37 aforement
49 afraid
31 africa
58 african
13 afternoon
35 afterward
208 age
13 agenda
159 agent
19 aggress
132 ago
95 agre
43 ahead
13 ahm
71 aid
51 aim
25 ain
130 air
32 airplan
22 airport
60 al
43 ala
14 alain
77 alan
36 albeit
34 albert
37 alcohol
29 alec
52 alex
19 alexand
28 alfr
32 ali
46 alic
41 alicia
499 alien
14 alik
87 aliv
15 

In [61]:
## Naive Bayes (49% Acuuracy)
## Confusion Matrix For Testing DataSet
from sklearn.naive_bayes import GaussianNB,BernoulliNB
nb = GaussianNB().fit( train_data_features, train["Review"] )
result = nb.predict(test_data_features)
confusion_matrix(result, test['Review'])

array([[162, 163],
       [125, 150]])

In [62]:
## Naive Bayes 
## Confusion Matrix For Training DataSet
from sklearn.naive_bayes import GaussianNB,BernoulliNB
nb = GaussianNB().fit( train_data_features, train["Review"] )
result = nb.predict(train_data_features,)
confusion_matrix(result, train['Review'])

array([[703, 107],
       [ 10, 580]])

In [59]:
# Roc For Training DataSet
classifier = GaussianNB()
probas_ = classifier.fit(train_data_features, train['Review']).predict_proba(train_data_features)
fpr, tpr, thresholds = roc_curve(train['Review'], probas_[:, 1])
roc_auc = auc(fpr, tpr)
print "Area under the ROC curve : %f" % roc_auc

# Plot ROC curve
pl.clf()
pl.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
pl.plot([0, 1], [0, 1], 'k--')
pl.xlim([0.0, 1.0])
pl.ylim([0.0, 1.0])
pl.xlabel('False Positive Rate')
pl.ylabel('True Positive Rate')
pl.title('Receiver operating characteristic example')
pl.legend(loc="lower right")
pl.show()


Area under the ROC curve : 0.919130


In [58]:
# Roc For Testing DataSet DataSet
classifier = GaussianNB()
probas_ = classifier.fit(train_data_features, train['Review']).predict_proba(test_data_features)
fpr, tpr, thresholds = roc_curve(test['Review'], probas_[:, 1])
roc_auc = auc(fpr, tpr)
print "Area under the ROC curve : %f" % roc_auc

# Plot ROC curve
pl.clf()
pl.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
pl.plot([0, 1], [0, 1], 'k--')
pl.xlim([0.0, 1.0])
pl.ylim([0.0, 1.0])
pl.xlabel('False Positive Rate')
pl.ylabel('True Positive Rate')
pl.title('Receiver operating characteristic example')
pl.legend(loc="lower right")
pl.show()


In [None]:
#### In this we have used the basic NaiveBayes
#### If we choose some better classifier 
#### Decision Trees / Random Forest
#### Neural Networks
#### Support Vector Machines

#### Even Dimensionality Reduction would be very much usefull
#### Becasue here we have large number of features thus some method like
#### Principal Component Analysis Would be Helpful in this case before fitting a classifier

#### If We would use some other Vectorizer rather than CountVectorizer
#### Like BM25 which Normalizes the Term Weights with respect to the Length of the document and average length of the corpus

#### If we could make an ensemble of Term Weights and Part of Speech then I guess the classifier would work better