In [1]:
import nltk
import pandas as pd
import sqlalchemy as sql
from nltk.classify.util import accuracy
from nltk.classify import NaiveBayesClassifier
from sklearn.metrics import confusion_matrix  
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

In [2]:
# Uncomment the next line if the punkt tokenizer data has not been downloaded yet
# nltk.download('punkt')

# Setting functions / globals

In [3]:
def format_sentence(sentence):
    """Convert a sentence into a bag-of-words feature dict.

    The sentence is split with ``nltk.word_tokenize`` and every distinct
    token becomes a key mapped to ``True``. Tokens are used as-is (no case
    folding), and repeated tokens collapse into a single key.
    """
    tokens = nltk.word_tokenize(sentence)
    return dict.fromkeys(tokens, True)


In [4]:
# Maximum number of reviews requested per sentiment class
MAX_LIMIT = 100_000
# Create database connection
def call_procedure(function_name, params):
    """Call a stored procedure and return the rows it produces.

    A fresh engine and raw DB-API connection are opened for each call and
    fully released before returning, whether or not the call succeeds.

    Args:
        function_name: Name of the stored procedure, passed to
            ``cursor.callproc``.
        params: Sequence of arguments passed to the procedure.

    Returns:
        A list of the rows obtained from a single ``cursor.fetchall()``.

    Raises:
        Whatever the database driver raises on connection or execution
        failure; errors are not swallowed.
    """
    # NOTE(review): the connection URL (root user, empty password) is
    # hard-coded; consider moving it to configuration / environment.
    engine = sql.create_engine('mysql://root:@localhost/hotel-reviews?charset=utf8')
    try:
        connection = engine.raw_connection()
        try:
            cursor = connection.cursor()
            try:
                cursor.callproc(function_name, params)
                results = list(cursor.fetchall())
            finally:
                # Close the cursor even when the procedure call fails.
                cursor.close()
            connection.commit()
            return results
        finally:
            # For a pooled raw connection this hands it back to the pool.
            connection.close()
    finally:
        # The engine is private to this call; dispose of its pool so the
        # underlying database connection is actually closed.
        engine.dispose()

## Getting all reviews

In [5]:
# Fetch up to MAX_LIMIT reviews per class via the 'getReviews' procedure:
# flag 1 is loaded into `pos`, flag 0 into `neg` (in that order).
pos, neg = (
    pd.DataFrame(
        call_procedure('getReviews', [MAX_LIMIT, positive_flag]),
        columns=['id', 'content', 'positive'],
    )
    for positive_flag in (1, 0)
)

# Sanity check: report how many rows came back for each class.
print('length pos:', len(pos), 'length neg:', len(neg))

length pos: 100000 length neg: 100000


In [6]:
# Pair each review's bag-of-words features with its class label, giving
# [features, label] entries for the classifier.
pos_list = [
    [format_sentence(text), 'Positive']
    for text in pos.loc[:, 'content'].tolist()
]

neg_list = [
    [format_sentence(text), 'Negative']
    for text in neg.loc[:, 'content'].tolist()
]

## Splitting the data into training and test sets

In [7]:
# Share of each class used for training; the rest is held out for testing.
TRAIN_FRACTION = .8

# Nominal split index based on the requested row count. Kept because later
# cells may refer to it; the actual split below uses the real list lengths.
training_limit = int(MAX_LIMIT * TRAIN_FRACTION)

# Split each class by the number of reviews actually loaded, so a class
# with fewer than MAX_LIMIT rows still contributes to both sets.
pos_split = int(len(pos_list) * TRAIN_FRACTION)
neg_split = int(len(neg_list) * TRAIN_FRACTION)

training = pos_list[:pos_split] + neg_list[:neg_split]
test = pos_list[pos_split:] + neg_list[neg_split:]

In [8]:
# Fit a Naive Bayes classifier on the labelled training feature dicts.
classifier = NaiveBayesClassifier.train(training)

# Print the ten most informative features for the trained model.
classifier.show_most_informative_features(n=10)

Most Informative Features
                  Superb = True           Positi : Negati =    145.0 : 1.0
                Friendly = True           Positi : Negati =    123.2 : 1.0
               Fantastic = True           Positi : Negati =    112.4 : 1.0
                Spacious = True           Positi : Negati =    111.4 : 1.0
                   Quiet = True           Positi : Negati =    105.9 : 1.0
                Pleasant = True           Positi : Negati =    101.7 : 1.0
                   Comfy = True           Positi : Negati =    100.5 : 1.0
               Beautiful = True           Positi : Negati =     99.1 : 1.0
               Excellent = True           Positi : Negati =     96.0 : 1.0
             Comfortable = True           Positi : Negati =     88.5 : 1.0


In [9]:
# Score the trained classifier on the held-out test set.
test_accuracy = accuracy(classifier, test)
print(test_accuracy)

0.906


In [10]:
# Build parallel lists over the test set: the classifier's predicted label
# and the true label for every [features, label] entry.
result_prediction = [classifier.classify(features) for features, _ in test]
result_actual = [label for _, label in test]

In [11]:
# Rows are actual labels, columns are predicted labels.
# NOTE(review): with no explicit `labels`, scikit-learn sorts the labels,
# so 'Negative' should come before 'Positive' — confirm against the docs.
review_confusion = confusion_matrix(result_actual, result_prediction)
print('\nConfusion matrix:\n', review_confusion)


Confusion matrix:
 [[19163   837]
 [ 2923 17077]]
