# Movie Reviews

In [70]:
## importing packages
import nltk
import random
from nltk import word_tokenize
from nltk.corpus import movie_reviews
from nltk.classify.scikitlearn import SklearnClassifier

In [71]:
movie = [(list(movie_reviews.words(fileid)), category)
        for category in movie_reviews.categories()
        for fileid in movie_reviews.fileids(category)]

random.shuffle(movie)

In [72]:
all_words = []
for w in movie_reviews.words():
    all_words.append(w.lower())
    
all_words = nltk.FreqDist(all_words)

word_features = list(all_words.keys())[:3000]

def find_features(document):
    words = set(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)
    
    return features
    


featuresets = [(find_features(rev), category) for (rev, category) in movie]
print(len(featuresets))

2000


### Training and testing 

In [73]:
training_set = featuresets[:1500]
testing_set = featuresets[1500:]

## Decision Tree

In [74]:
#Decision tree
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

Tree_classifier = SklearnClassifier(DecisionTreeClassifier(min_samples_split =10))
Tree_classifier.train(training_set)
print("Tree_classifier accuracy percent:", (nltk.classify.accuracy(Tree_classifier, testing_set))*100)


Tree_classifier accuracy percent: 60.199999999999996


## NaiveBayes

In [75]:
# NaiveBayes
from sklearn.naive_bayes import MultinomialNB
classifier = nltk.NaiveBayesClassifier.train(training_set)
print("Naive Bayes Algorithm accuracy percent:", (nltk.classify.accuracy(classifier, testing_set))*100)

Naive Bayes Algorithm accuracy percent: 79.80000000000001


## LogisticRegression

In [76]:
# LogisticRegression
from sklearn.linear_model import LogisticRegression
LogisticRegression = SklearnClassifier(LogisticRegression())
LogisticRegression.train(training_set)
print("LogisticRegression accuracy percent:", (nltk.classify.accuracy(LogisticRegression, testing_set))*100)


LogisticRegression accuracy percent: 82.0


###  SVC

In [77]:
#  SVC
from sklearn.svm import SVC
SVC = SklearnClassifier(SVC())
SVC.train(training_set)
print("SVC accuracy percent:", (nltk.classify.accuracy(SVC, testing_set))*100)

SVC accuracy percent: 73.4


## Logistic Regression Gives the Highest accuracy

In [None]:
######################################################################################################################

# Twitter

In [55]:
## import packages
from nltk.corpus import twitter_samples
from nltk.tokenize import TweetTokenizer

In [78]:
# import dataset
documents_twitter = ([(t, "pos") for t in twitter_samples.strings("positive_tweets.json")] + 
             [(t, "neg") for t in twitter_samples.strings("negative_tweets.json")])
random.shuffle(documents_twitter)

## Train and Test

In [79]:
training_data = documents_twitter[:7000]
testing_data = documents_twitter[7000:10000]

In [80]:
#Required for Bag of words (unigram features) creation
vocabulary = [x.lower() for tagged_sent in training_data for x in tagged_sent[0].split()]
print(len(vocabulary))
vocabulary = list(set(vocabulary))
vocabulary.sort() #sorting the list
print(len(vocabulary))
# print(vocabulary)

81267
19555


In [81]:
def get_unigram_features(data,vocab):
    fet_vec_all = []
    for tup in data:
        single_feat_vec = []
        sent = tup[0].lower() #lowercasing the dataset
        for v in vocab:
            if sent.__contains__(v):
                single_feat_vec.append(1)
            else:
                single_feat_vec.append(0)
        fet_vec_all.append(single_feat_vec)
    return fet_vec_all

In [82]:
from nltk.corpus import sentiwordnet as swn
def get_senti_wordnet_features(data):
    fet_vec_all = []
    for tup in data:
        sent = tup[0].lower()
        words = sent.split()
        pos_score = 0
        neg_score = 0
        for w in words:
            senti_synsets = swn.senti_synsets(w.lower())
            for senti_synset in senti_synsets:
                p = senti_synset.pos_score()
                n = senti_synset.neg_score()
                pos_score+=p
                neg_score+=n
                break #take only the first synset (Most frequent sense)
        fet_vec_all.append([float(pos_score),float(neg_score)])
    return fet_vec_all

In [83]:
def merge_features(featureList1,featureList2):
    # For merging two features
    if featureList1==[]:
        return featureList2
    merged = []
    for i in range(len(featureList1)):
        m = featureList1[i]+featureList2[i]
        merged.append(m)
    return merged

In [84]:
#extract the sentiment labels by making positive reviews as class 1 and negative reviews as class 2
def get_lables(data):
    labels = []
    for tup in data:
        if tup[1].lower()=="neg":
            labels.append(-1)
        else:
            labels.append(1)
    return labels

In [85]:
def calculate_precision(prediction, actual):
    prediction = list(prediction)
    correct_labels = [predictions[i]  for i in range(len(predictions)) if actual[i] == predictions[i]]
    precision = float(len(correct_labels))/float(len(prediction))
    return precision

In [86]:
def real_time_test(classifier,vocab):
    print("Enter a sentence: ")
    inp = input()
    print(inp)
    feat_vec_uni = get_unigram_features(inp,vocab)
    feat_vec_swn =get_senti_wordnet_features(test_data)
    feat_vec = merge_features(feat_vec_uni, feat_vec_swn)

    predict = classifier.predict(feat_vec)
    if predict[0]==1:
        print("The sentiment expressed is: positive")
    else:
        print("The sentiment expressed is: negative")   

In [87]:
training_unigram_features = get_unigram_features(training_data,vocabulary) # vocabulary extracted in the beginning
training_swn_features = get_senti_wordnet_features(training_data)

training_features = merge_features(training_unigram_features,training_swn_features)

training_labels = get_lables(training_data)

test_unigram_features = get_unigram_features(testing_data,vocabulary)
test_swn_features=get_senti_wordnet_features(testing_data)
test_features= merge_features(test_unigram_features,test_swn_features)

test_gold_labels = get_lables(testing_data)

## Naive Bayes 

In [88]:

from sklearn.naive_bayes import MultinomialNB
nb_classifier = MultinomialNB().fit(training_features,training_labels) #training process
predictions = nb_classifier.predict(test_features)

print("Precision of NB classifier is")
predictions = nb_classifier.predict(training_features)
precision = calculate_precision(predictions,training_labels)
print("Training data\t" + str(precision))
predictions = nb_classifier.predict(test_features)
precision = calculate_precision(predictions,test_gold_labels)
print("Test data\t" + str(precision))

Precision of NB classifier is
Training data	0.9991428571428571
Test data	0.9903333333333333


## SVM Classifier

In [89]:
from sklearn.svm import LinearSVC
svm_classifier = LinearSVC(penalty='l2', C=0.01).fit(training_features,training_labels)
predictions = svm_classifier.predict(training_features)

print("Precision of linear SVM classifier is:")
precision = calculate_precision(predictions,training_labels)
print("Training data\t" + str(precision))
predictions = svm_classifier.predict(test_features)
precision = calculate_precision(predictions,test_gold_labels)
print("Test data\t" + str(precision))

Precision of linear SVM classifier is:
Training data	1.0
Test data	0.9993333333333333


In [90]:
#Decision tree
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
Tree_classifier = DecisionTreeClassifier(min_samples_split =10).fit(training_features,training_labels)
predictions = Tree_classifier.predict(training_features)
print("Precision of linear SVM classifier is:")
precision = calculate_precision(predictions,training_labels)
print("Training data\t" + str(precision))
predictions = Tree_classifier.predict(test_features)
precision = calculate_precision(predictions,test_gold_labels)
print("Test data\t" + str(precision))


Precision of linear SVM classifier is:
Training data	1.0
Test data	1.0


In [91]:
# LogisticRegression
from sklearn.linear_model import LogisticRegression
LogisticRegression = LogisticRegression().fit(training_features,training_labels)
predictions = LogisticRegression.predict(training_features)
print("Precision of linear SVM classifier is:")
precision = calculate_precision(predictions,training_labels)
print("Training data\t" + str(precision))
predictions = LogisticRegression.predict(test_features)
precision = calculate_precision(predictions,test_gold_labels)
print("Test data\t" + str(precision))


Precision of linear SVM classifier is:
Training data	1.0
Test data	0.9993333333333333


In [92]:
######################################################################################################################

# Accuracy

In [103]:
print("Dataset           NaiveBayes        SVM          DecisionTree  LogisticRegression ")
print("Movie Reviews        79.80         73.4           60.19             82.0              ")
print("Twitter_dataset      99.03         99.99          100              99.93                 ")

Dataset           NaiveBayes        SVM          DecisionTree  LogisticRegression 
Movie Reviews        79.80         73.4           60.19             82.0              
Twitter_dataset      99.03         99.99          100              99.93                 
