In [43]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from time import time

In [123]:
class Book_Reviews_Analysis(object):
    def __init__(self):
        self.bookReviews = None
        self.stopWords = None
        self.vectorizer = None
        self.model = None
        self.features = None
        self.labels = None
        self.features_train = None
        self.labels_train = None
        self.features_test = None
        self.labels_test = None
    
    # function to fetch the data....
    def extractData(self, url):
        try:
            print("Extracting data from url....")
            self.bookReviews = pd.read_csv(url, sep="\t", names=['Review', 'text'])
            
            print("Data extracted from url : ")
            
            return self.bookReviews
        except Exception as e:
            print("Exception Caught is : ", e)
        
    # function to create stopwords...
    def createStopWords(self):
        self.stopWords = set(stopwords.words('english'))
        print "Stopwords are : "
        
        return self.stopWords
    
    # function to pre-process our data...
    def preProcessData(self, data, vectorizer_type):
        transformedData = None
        if vectorizer_type == "TfIdf":
            print("Creating a Tf-Idf vectorizer for the given data...")
            self.vectorizer = self.createTFIDFVectorization()
            
            # transform the data....
            transformedData = self.vectorizer.fit_transform(data['text'])
            print("Data type is : ", type(transformedData))
            self.labels = data["Review"]
            
        elif vectorizer_type == "CV":
            print("Creating the count vectorizer for the given data...")
        
        return transformedData, self.labels
    
            
    # function to perform tf-idf vectorization...
    def createTFIDFVectorization(self):
        # instantiating a Tf-Idf vectorizer...
        self.vectorizer = TfidfVectorizer(use_idf=True, lowercase=True, strip_accents='ascii', stop_words=self.stopWords)
        
        return self.vectorizer
    
    # method to split the data....
    def split_the_data(self, transformedData):
        print("Splitting the data into train and test....")
        self.features = transformedData
        self.features_train, self.features_test, self.labels_train, self.labels_test = train_test_split(self.features, self.labels, random_state=42)
        
        print("Data split is completed....")
        
        return self.features_train, self.features_test, self.labels_train, self.labels_test
    
    # create a classifier....
    def createClassifier(self, classifier_type):
        if classifier_type == 'MNB':
            print("Creating a Multinomial Classifier....")
            self.model = MultinomialNB()
            
            return self.model
            
        elif classifier_type == "LR":
            print("Creating a Logistic Regression Classifier....")
            self.model = LogisticRegression()
            
            return self.model
            
    # train the data...
    def trainData(self, classifier_type, features, labels, data_type):
        try:
            # create a classifier...
            self.model = self.createClassifier(classifier_type)
            if data_type == 'train':
                print("Training the model on trained data...")
                start_time = time()
                
                # train the data....
                self.model.fit(features, labels)
        
                end_time = time()
    
                training_time = end_time - start_time
                print("Training data is trained on the "+classifier_type+" classifier. "+" The training time is : "+str(float(training_time)))
            
            elif data_type == 'test':
                # work on test data...
                print("Training the moel on test data....")
                start_time = time()
                
                # train the data...
                self.model.fit(features, labels)
                
                end_time = time()
                
                training_time = end_time - start_time
                print("Testing data is trained on the "+classifier_type+" classifier. "+" The training time is : "+str(float(training_time)))
            
            return self.model, training_time
        
        except Exception as ex:
            print("Exception Caught is : ", ex)
        
        
    # make predictions...
    def predictResult(self, classifier, feature):
        # predict the result....
        labels_pred = classifier.predict(feature)
        
        return labels_pred
    
    def checkForAccuracy(self, labels_pred, labels_true):
        # generate classification report...
        return classification_report(labels_pred, labels_true)
        
    def validateForNewVector(self, movie_array):
        movie_review_vector = self.vectorizer.transform(movie_array)
        sts_movie_review = ""
        result = int(self.model.predict(movie_review_vector))
        
        if result == 0:
            sts_movie_review = "Negative"
        elif result == 1:
            sts_movie_review = "Positive"
            
        return sts_movie_review
        
        

In [124]:
print("Creating object...")
br = Book_Reviews_Analysis()
print("Object Created")

Creating object...
Object Created


In [125]:
url = "https://raw.githubusercontent.com/mbernico/CS570/master/data/UMICH_SI650_Sentiment_Classification.txt"
bookReviews = br.extractData(url)
bookReviews.head(5)

Extracting data from url....
Data extracted from url : 


Unnamed: 0,Review,text
0,1,The Da Vinci Code book is just awesome.
1,1,this was the first clive cussler i've ever rea...
2,1,i liked the Da Vinci Code a lot.
3,1,i liked the Da Vinci Code a lot.
4,1,I liked the Da Vinci Code but it ultimatly did...


In [126]:
# create a lit of stopwords....
stopWords = br.createStopWords()
print(stopWords)

Stopwords are : 
set([u'all', u'just', u"don't", u'being', u'over', u'both', u'through', u'yourselves', u'its', u'before', u'o', u'don', u'hadn', u'herself', u'll', u'had', u'should', u'to', u'only', u'won', u'under', u'ours', u'has', u"should've", u"haven't", u'do', u'them', u'his', u'very', u"you've", u'they', u'not', u'during', u'now', u'him', u'nor', u"wasn't", u'd', u'did', u'didn', u'this', u'she', u'each', u'further', u"won't", u'where', u"mustn't", u"isn't", u'few', u'because', u"you'd", u'doing', u'some', u'hasn', u"hasn't", u'are', u'our', u'ourselves', u'out', u'what', u'for', u"needn't", u'below', u're', u'does', u"shouldn't", u'above', u'between', u'mustn', u't', u'be', u'we', u'who', u"mightn't", u"doesn't", u'were', u'here', u'shouldn', u'hers', u"aren't", u'by', u'on', u'about', u'couldn', u'of', u"wouldn't", u'against', u's', u'isn', u'or', u'own', u'into', u'yourself', u'down', u"hadn't", u'mightn', u"couldn't", u'wasn', u'your', u"you're", u'from', u'her', u'their', 

In [127]:
transformedData, target = br.preProcessData(bookReviews, "TfIdf")
print("Shape of the feature data is : ", transformedData.shape)

print("Shape of labels is : ", target.shape)

Creating a Tf-Idf vectorizer for the given data...
('Data type is : ', <class 'scipy.sparse.csr.csr_matrix'>)
('Shape of the feature data is : ', (6918, 2011))
('Shape of labels is : ', (6918L,))


In [128]:
# split the data....
features_train, features_test, labels_train, labels_test = br.split_the_data(transformedData)

print("Shape of train features are : ", features_train.shape)
print("Shape of the train labels are : ", labels_train.shape)

Splitting the data into train and test....
Data split is completed....
('Shape of train features are : ', (5188, 2011))
('Shape of the train labels are : ', (5188L,))


In [129]:
# train the training data...
classifier, training_time = br.trainData("MNB", features_train, labels_train, "train")

Creating a Multinomial Classifier....
Training the model on trained data...
Training data is trained on the MNB classifier.  The training time is : 0.00600004196167


In [130]:
# check for accuracy...
labels_pred_train = br.predictResult(classifier, features_train)

print("Classification report for the training data is : ")
print br.checkForAccuracy(labels_pred_train, labels_train)

print("Accuracy score for training data is : ", accuracy_score(labels_pred_train, labels_train)*100)

Classification report for the training data is : 
             precision    recall  f1-score   support

          0       0.99      1.00      0.99      2218
          1       1.00      0.99      0.99      2970

avg / total       0.99      0.99      0.99      5188

('Accuracy score for training data is : ', 99.402467232074017)


In [131]:
# work on testing data...
classifier, training_time = br.trainData("MNB", features_test, labels_test, "test")

Creating a Multinomial Classifier....
Training the moel on test data....
Testing data is trained on the MNB classifier.  The training time is : 0.00300002098083


In [132]:
# check for accuracy...
labels_pred_test = br.predictResult(classifier, features_test)

print("Classification report for the testing data is : ")
print br.checkForAccuracy(labels_pred_test, labels_test)

print("Accuracy score for testing data is : ", accuracy_score(labels_pred_test, labels_test)*100)

Classification report for the testing data is : 
             precision    recall  f1-score   support

          0       0.98      1.00      0.99       728
          1       1.00      0.99      0.99      1002

avg / total       0.99      0.99      0.99      1730

('Accuracy score for testing data is : ', 99.075144508670519)


*** Thats a pretty good and robust model...

In [143]:
# validating on a new array...
string = ["Jupyter Ascending is a decent movie"]

movie_review_array = np.array(string)

result = br.validateForNewVector(movie_review_array)

print("The Review Status is : ")
print(result)

The Review Status is : 
Negative
