In [1]:
import re, nltk
import sqlalchemy as sql
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

# Set important globals here

In [11]:
# Set limit for reviews
MAX_LIMIT = 100000
# Create database connection
ENGINE = sql.create_engine('mysql://root:@localhost/hotel-reviews?charset=utf8')

## Get positive and negative reviews

In [3]:
pos = pd.read_sql('SELECT * FROM `reviews` WHERE positive = 1 LIMIT {0}'.format(MAX_LIMIT), ENGINE)
neg = pd.read_sql('SELECT * FROM `reviews` WHERE positive = 0 LIMIT {0}'.format(MAX_LIMIT), ENGINE)

print('length pos:', len(pos), 'length neg:', len(neg))

length pos: 400000 length neg: 387848


In [4]:
pos_text = []
pos_labels = []
for df_index, df_value in pos.iterrows():
    pos_text.append(df_value['content'])
    pos_labels.append('pos')

neg_text = []
neg_labels = []
for neg_df_index, neg_df_value in neg.iterrows():
    neg_text.append(neg_df_value['content'])
    neg_labels.append('neg')
    
# all lengths should be same as MAX_LIMIT
print('len pos_text:', len(pos_text), 'len pos_labels:', len(pos_labels), 'len neg_text:', len(neg_text), 'len neg_labels:', len(neg_labels))

len pos_text: 400000 len pos_labels: 400000 len neg_text: 387848 len neg_labels: 387848


## Create training and test text/labels

In [5]:
# make the training set 80% of max limit
training_limit = int(MAX_LIMIT * .8)

# Training text and labels are 160% of MAX LIMIT (this is because of merging positive and negative) (and 80% of total)
training_text = pos_text[:training_limit] + neg_text[:training_limit]
training_labels = pos_labels[:training_limit] + neg_labels[:training_limit]

# Test text and labels are 40% of MAX LIMIT (and 20% of total)
test_text = pos_text[training_limit:] + neg_text[training_limit:]
test_labels = pos_labels[training_limit:] + neg_labels[training_limit:]

print('len training_text:', len(training_text), 'len training_labels:', len(training_labels), 'len test_text:', len(test_text), 'len test_labels:', len(test_labels))

len training_text: 640000 len training_labels: 640000 len test_text: 147848 len test_labels: 147848


## Other stuff

In [6]:
vectorizer = CountVectorizer(analyzer='word', lowercase=False, max_features=200)
features = vectorizer.fit_transform(training_text + test_text)
features_nd = features.toarray() 
len(features_nd)

787848

In [7]:
x_train, x_test, y_train, y_test = train_test_split(
    features_nd[0:len(training_text)], 
    training_labels,
    train_size=0.80, 
    random_state=1234
)

In [8]:
log_model = LogisticRegression(solver='lbfgs')
log_model = log_model.fit(X=x_train, y=y_train)
y_pred = log_model.predict(x_test)

In [9]:
print(nltk.ConfusionMatrix(list(y_test), list(y_pred)))

    |     n     p |
    |     e     o |
    |     g     s |
----+-------------+
neg |<59537> 4285 |
pos |  9229<54949>|
----+-------------+
(row = reference; col = test)



In [10]:
print(accuracy_score(y_test, y_pred))

0.894421875
