In [24]:
import random
import nltk
from nltk.corpus import movie_reviews 
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer ##For removing punctuation 
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.svm import SVC
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.datasets import load_files
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
import sklearn
from sklearn.metrics import confusion_matrix

## Using the Classifical Convention of stemming, converting to lowercase , removing stopwords and then using the SVM model to train the movie ratings. 

In [3]:
##Importing the movie reviews 
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [None]:
##Building the list of documents
##There are 2000 movie reviews with 1000 positive and 1000 negative movie reviews sorted out together.
document = []
category = []
print(movie_reviews.categories())

for category in movie_reviews.categories():
    if category != 'neg':
        print(len(movie_reviews.fileids(category)))


for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        if category == 'neg' or category != 'neg':
            document.append(movie_reviews.words(fileid))

print('First Review: {}'.format(' '.join(document[10])))
print('Total Length : {}'.format(len(document)))

In [44]:
##Creating a tuple with word tokenized review and the category for each of the 2000 documents 
documents = [(list(movie_reviews.words(fileid)),category)
            for category  in movie_reviews.categories()
            for fileid in movie_reviews.fileids(category)]

ps = PorterStemmer() ##For stemming 
stopwords = set(stopwords.words('english')) ##For stopwords
clean_words = []
    
    
##Shuffling the documents
random.shuffle(documents)

all_words = []
for w in movie_reviews.words():
    if w not in stopwords:
        w = filter(unicode.isalnum, w)
        if w != '':
            w = ps.stem(w) ##Stemming the words 
            all_words.append(w.lower())
    

all_words = nltk.FreqDist(all_words)  ##Arranges in the order of most common words 


print('Most Common Words used : {}'.format(all_words.most_common(20)))
# print len(documents[0][0])
# print len(all_words)

Most Common Words used : [(u'film', 11201), (u'movi', 6980), (u'one', 6030), (u'like', 4137), (u'charact', 3881), (u'make', 3243), (u'get', 3220), (u'time', 3047), (u'scene', 2671), (u'even', 2611), (u'good', 2475), (u'play', 2382), (u'stori', 2346), (u'see', 2224), (u'would', 2110), (u'much', 2051), (u'go', 2015), (u'well', 1968), (u'also', 1967), (u'two', 1912)]


In [77]:
##Out of all the words, I'm using the most common 4000 words as features

word_features = (all_words.keys())[:4000]

def find_features(document):
    words = set(document) ##Contains the word tokenizers present in document 
    features = {} ##empty dictionary
    
    for w in word_features:
        features[w] = (w in words)  ##th
    return features



features = find_features(movie_reviews.words('neg/cv000_29416.txt'))
for key, value in features.items():
    if value == True:
        print key

##words like proble, horror, confusing etc could be used as a feature to give a negative feedback

plot
music
want
arrow
away
bottom
down
concept
exact
fuck
still
stir
not
one
get
3
s
world
start
with
7
horror
more
american
also
dead
deal
into
video
insight
off
lost
problem
blair


In [78]:
##making the feature list for all the documents

feature_sets = [(find_features(rev),category) for rev,category in documents]

In [79]:
##Splitting the data into training and Testing

seed = 1

train,test = model_selection.train_test_split(feature_sets, test_size = 0.25, random_state = seed)

print(len(train))
print(len(test))

1500
500


In [80]:
model = SklearnClassifier(SVC(kernel= 'linear'))

In [81]:
model.train(train)

<SklearnClassifier(SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))>

In [82]:
accuracy  = nltk.classify.accuracy(model, test)
print('SVC Accuracy: {}'.format(accuracy))

SVC Accuracy: 0.692


## Trying out tf-idf and Count Vectorizer technique on real movie reviews

In [6]:
movie_vector = CountVectorizer(min_df = 2, tokenizer = nltk.word_tokenize)


In [7]:
moviedata = r'/home/umang/Desktop/NLP/corpora/movie_reviews'

In [8]:
# loading all files as training data. 
movie_data = load_files(moviedata, shuffle=True)

In [10]:
print(len(movie_data.data))
movie_data.target_names

2000


['neg', 'pos']

In [11]:
##The 1001th review in the movie_review corpus is a positive review
movie_data.target[1001]

1

In [14]:
##The tf vector 
movie_counts = movie_vector.fit_transform(movie_data.data)
print(movie_counts.todense())
print (movie_counts.shape)

[[0 0 2 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
(2000, 25286)


In [15]:
##Making the tf-idf vector
movie_tfidf = TfidfTransformer().fit_transform(movie_counts)
movie_tfidf.toarray()

array([[0.        , 0.        , 0.03844927, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [16]:
##There are 25286 unique indexed term in the vocabulary
movie_tfidf.shape

(2000, 25286)

In [17]:
## 1 represents pos whereas 0 represents negative 
movie_data.target

array([0, 1, 1, ..., 1, 0, 0])

## Training the model using the Naive Bayes Classifier 

In [110]:
X_train,X_test, y_train, y_test = train_test_split(
    movie_tfidf, movie_data.target, test_size = 0.20, random_state = 12)

In [111]:
##Fitting the model on the training data
model_nb = MultinomialNB().fit(X_train, y_train)

In [112]:
##Predicitng the model on the testing data and then determining the accuracy
predictions = model_nb.predict(X_test)
print("Accuracy through Naive Bayes Classifier is : {}".format(sklearn.metrics.accuracy_score(y_test, predictions)))

Accuracy through Naive Bayes Classifier is : 0.82


In [113]:
##Printing out the confusion matrix
cm = confusion_matrix(y_test, predictions)
print ("Matrix:{}".format(cm))

Matrix:[[175  31]
 [ 41 153]]


## Training the model using the Logistic Regression Model

In [87]:
X_train,X_test, y_train, y_test = train_test_split(
    movie_tfidf, movie_data.target, test_size = 0.20, random_state = 12)

In [88]:
##Fitting the model on the training data
model = LogisticRegression()
model.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [89]:
##Predicitng the model on the testing data and then determining the accuracy
predictions = model.predict(X_test)
print("Accuracy through Logistic Regression is : {}".format(sklearn.metrics.accuracy_score(y_test, predictions)))

Accuracy through Logistic Regression is : 0.7825


In [90]:
##Printing out the confusion matrix
cm = confusion_matrix(y_test, predictions)
print ("Matrix:{}".format(cm))

Matrix:[[152  54]
 [ 33 161]]


## We conclude that the Naive Bayes model with tf-idf features gives us the best results.
## We now predict the classification result based on a set of random reviews. 

In [182]:
 # very short and fake movie reviews
reviews = ['Awesome Horror movie','This movie was aweful', 'Absolute boring ride', 
            'awesome','YOLO', 'Steven Seagal shined through.', ':(',
              'This was certainly a movie', 'Two thumbs awesome up', 'I fell asleep halfway through', 
              "We can't wait for the sequel!!", '!', '?', 'I cannot recommend this highly enough', 
              'instant classic.', 'Steven Seagal was amazing. His performance was Oscar-worthy.','Just fucking awesome.']
reviews_new_counts = movie_vector.transform(reviews)
reviews_new_tfidf = TfidfTransformer().fit_transform(reviews_new_counts)

In [183]:
len(movie_vector.vocabulary_)

25286

In [184]:
print (reviews_new_tfidf.todense().shape)
prediction = model_nb.predict(reviews_new_tfidf)

(17, 25286)


In [185]:
prediction

array([1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0])

In [186]:
for review, segregation in zip(reviews_new, prediction):
    print('%r => %s' % (review, movie_data.target_names[segregation]))

'Awesome Horror movie' => pos
'This movie was aweful' => neg
'Absolute boring ride' => neg
'awesome' => pos
'YOLO' => pos
'Steven Seagal shined through.' => neg
':(' => neg
'This was certainly a movie' => neg
'Two thumbs awesome up' => pos
'I fell asleep halfway through' => neg
"We can't wait for the sequel!!" => neg
'!' => neg
'?' => neg
'I cannot recommend this highly enough' => neg
'instant classic.' => pos
'Steven Seagal was amazing. His performance was Oscar-worthy.' => pos
'Just fucking awesome.' => neg
