In [1]:
import pandas as pd
import nltk
import sklearn as sk
import numpy as np
from nltk.tokenize import word_tokenize

In [2]:
from nltk.corpus import stopwords
stop=stopwords.words('english')

In [3]:
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [4]:
from sklearn.model_selection import train_test_split

### READ DATA FILE

In [5]:
data=pd.read_excel('C:\\Users\\Ammick\\Google Drive\\PHD\\Research Paper\\PYTHON\\2016 Movies Reviews\\All Data + Extraction Code\\Paper II Working\\Review_Extract_Label_Bollywood.xlsx')

In [6]:
df=data.copy()

#### POLARITY ADJUSTMENT

In [7]:
df['Polarity']=df['Polarity'].replace(['Very Postive','Postive','Neutral','Very Negative'],['Positive','Positive','Negative','Negative'])

In [8]:
df[df.Polarity=='Positive'].count()

Reviews     2143
Polarity    2143
dtype: int64

#### TOKENIZE

In [9]:
df['Tokens']=df['Reviews'].apply(lambda token : word_tokenize(token))

#### REMOVE STOP WORDS

In [10]:
def token_stop(Tokens):
    token_stops=[w.lower() for w in Tokens if w not in stop]
    return token_stops

df['Stop_Tokens']=df.loc[:,'Tokens'].apply(lambda token : token_stop(token))
   

#### STEMMING

In [11]:
def token_stem(Tokens):
    token_stems=[PorterStemmer().stem(w) for w in Tokens]
    return token_stems

df['Stem_Tokens']=df['Stop_Tokens'].apply(lambda token: token_stem(token))

#### LEMMATIZING

In [12]:
def token_lemma(Tokens):
    token_lemmas=[WordNetLemmatizer().lemmatize(w) for w in Tokens]
    return token_lemmas

df['Lemma_Tokens']=df['Stop_Tokens'].apply(lambda token: token_lemma(token))

#### POS_TAGGING

In [13]:
df['POS_Tokens']=df['Lemma_Tokens'].apply(lambda token: nltk.pos_tag(token))

#### TRAIN TEST SPLIT 

In [14]:
train, test=train_test_split(df,test_size=0.20)

In [15]:
X_train, Y_train= train['Stem_Tokens'].values, train['Polarity']
X_test, Y_test= test['Stem_Tokens'], test['Polarity']
train_data=[]
for X,Y in zip(X_train, Y_train):
    X_t={}
    for x in set(X):
        X_t[x]='True'
    train_data.append((X_t,Y))
  

In [16]:
test_data=[]
for X,Y in zip(X_test, Y_test):
    X_t={}
    for x in set(X):
        X_t[x]='True'
    test_data.append((X_t,Y))  

### CLASSIFICATION

#### NAIVE BAYES

In [17]:
from nltk.classify import NaiveBayesClassifier
import nltk.classify.util
from nltk import precision, recall, f_measure

In [18]:
nbClassifier=NaiveBayesClassifier.train(train_data)


#### SVM CLASSIFIER

In [19]:
import nltk.classify
from sklearn.svm import LinearSVC

In [20]:
svmClassifier=nltk.classify.SklearnClassifier(LinearSVC()).train(train_data)

### TESTING CLASSIFIER WITH METRICES

In [21]:
import collections
refsets = collections.defaultdict(set)
testsets = collections.defaultdict(set)
 
for i, (feats, label) in enumerate(test_data):
    refsets[label].add(i)
    observed = nbClassifier.classify(feats)
    testsets[observed].add(i)

In [22]:
print ('accuracy:', nltk.classify.util.accuracy(nbClassifier, test_data)*100)
print ('pos precision:', precision(refsets['Positive'], testsets['Positive']))
print ('pos recall:', recall(refsets['Positive'], testsets['Positive']))
print ('pos F-measure:', f_measure(refsets['Positive'], testsets['Positive']))
print ('neg precision:', precision(refsets['Negative'], testsets['Negative']))
print ('neg recall:', recall(refsets['Negative'], testsets['Negative']))
print ('neg F-measure:', f_measure(refsets['Negative'], testsets['Negative']))

accuracy: 69.98444790046656
pos precision: 0.9619771863117871
pos recall: 0.5802752293577982
pos F-measure: 0.7238912732474964
neg precision: 0.5184210526315789
neg recall: 0.9516908212560387
neg F-measure: 0.6712095400340715
