In [1]:
import os

import nltk
import pandas as pd
import sqlalchemy as sql
from nltk.classify import NaiveBayesClassifier
from nltk.classify.util import accuracy
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

In [2]:
# Uncomment the next line if punkt is not downloaded yet
# nltk.download('punkt')

# Setting functions / globals

In [3]:
def format_sentence(sentence):
    """Convert raw text into a token-presence feature dict.

    The text is split with ``nltk.word_tokenize``; every distinct token
    becomes a key mapped to ``True``.
    """
    tokens = nltk.word_tokenize(sentence)
    return dict.fromkeys(tokens, True)

In [4]:
# Number of reviews requested per sentiment class
MAX_LIMIT = 200000

# SQLAlchemy URL of the reviews database. Set the HOTEL_REVIEWS_DB_URL
# environment variable to point at another server or to supply credentials
# without editing the notebook; the default is the local development database.
DB_URL = os.environ.get(
    'HOTEL_REVIEWS_DB_URL',
    'mysql://root:@localhost/hotel-reviews?charset=utf8',
)


def call_procedure(function_name, params):
    """Call a stored procedure and return the rows it produces.

    A fresh engine is created for the call and disposed afterwards, so no
    pooled database connection outlives the call.

    Args:
        function_name: Name of the stored procedure, passed to ``cursor.callproc``.
        params: Sequence of arguments forwarded to the procedure.

    Returns:
        list: The rows returned by ``cursor.fetchall()``.

    Raises:
        Any exception raised by the driver is propagated; the cursor, the
        connection and the engine are released in every case.
    """
    engine = sql.create_engine(DB_URL)
    try:
        connection = engine.raw_connection()
        try:
            cursor = connection.cursor()
            try:
                cursor.callproc(function_name, params)
                results = list(cursor.fetchall())
            finally:
                cursor.close()
            connection.commit()
            return results
        finally:
            # Hands the raw connection back to the engine's pool ...
            connection.close()
    finally:
        # ... and this actually closes the pooled connection(s).
        engine.dispose()

## Getting all reviews

In [5]:
# Call getReviews once per class flag (1 -> `pos`, 0 -> `neg`) with MAX_LIMIT
# as first argument; presumably the procedure caps the row count at that value.
pos, neg = (
    pd.DataFrame(
        call_procedure('getReviews', [MAX_LIMIT, flag]),
        columns=['id', 'content', 'positive'],
    )
    for flag in (1, 0)
)

print('length pos:', len(pos), 'length neg:', len(neg))

length pos: 200000 length neg: 200000


In [6]:
# Pair each review's feature dict with its class label, as [features, label]
pos_list = [[format_sentence(text), 'pos'] for text in pos['content'].tolist()]
neg_list = [[format_sentence(text), 'neg'] for text in neg['content'].tolist()]

## Splitting the reviews into training and test sets

In [7]:
# MAX_LIMIT is the number of reviews requested, not necessarily the number that
# came back, so derive the 80/20 cut from the smallest class actually loaded.
# With MAX_LIMIT rows in both lists this is the same cut as int(MAX_LIMIT * .8).
training_limit = int(min(MAX_LIMIT, len(pos_list), len(neg_list)) * .8)
assert training_limit > 0, 'not enough reviews to build both a training and a test set'

# First `training_limit` samples of each class train the model, the rest evaluate it
training = pos_list[:training_limit] + neg_list[:training_limit]
test = pos_list[training_limit:] + neg_list[training_limit:]

In [8]:
# Fit a Naive Bayes model on the labelled feature dicts
classifier = NaiveBayesClassifier.train(training)

# Print the ten features with the most skewed pos/neg likelihood ratio
classifier.show_most_informative_features(n=10)

Most Informative Features
                Friendly = True              pos : neg    =    162.5 : 1.0
                   Comfy = True              pos : neg    =    148.5 : 1.0
                  leaked = True              neg : pos    =    144.3 : 1.0
               Brilliant = True              pos : neg    =    132.1 : 1.0
                  Superb = True              pos : neg    =    125.3 : 1.0
                Spacious = True              pos : neg    =    122.8 : 1.0
                 cracked = True              neg : pos    =    119.0 : 1.0
               Fantastic = True              pos : neg    =    115.8 : 1.0
               Excellent = True              pos : neg    =    105.9 : 1.0
                  loudly = True              neg : pos    =    103.0 : 1.0


In [9]:
# Share of held-out samples whose predicted label equals the reference label
test_accuracy = accuracy(classifier, test)
print(test_accuracy)

0.906725


In [10]:
# Predict a label for every held-out sample and keep the reference labels in the same order
result_prediction = [classifier.classify(features) for features, _label in test]
result_actual = [label for _features, label in test]

In [11]:
# Rows are the reference labels, columns the predicted ones
confusion = nltk.ConfusionMatrix(result_actual, result_prediction)
print(confusion)

    |     n     p |
    |     e     o |
    |     g     s |
----+-------------+
neg |<38539> 1461 |
pos |  6001<33999>|
----+-------------+
(row = reference; col = test)



In [12]:
# Classify the full text of each sample file as a single document
for review_path in ('../src/review3.csv', '../src/review4.csv'):
    with open(review_path, 'r') as review_file:
        review_text = review_file.read()
    print(classifier.classify(format_sentence(review_text)))

neg
neg
