In [1]:
import numpy as np 
import pandas as pd 
import sqlalchemy as sql
import re
import nltk
from nltk.corpus import stopwords
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Enable the next line if stopwords isnt downloaded yet
# nltk.download('stopwords')

# Setting important globals here

In [3]:
# Set limit for reviews
MAX_LIMIT = 20000
# Create database connection
def call_procedure(function_name, params):
    
    connection = sql.create_engine('mysql://root:@localhost/hotel-reviews?charset=utf8').raw_connection()
    try:
        cursor = connection.cursor()
        cursor.callproc(function_name, params)
        results = list(cursor.fetchall())
        cursor.close()
        connection.commit()
        return results
    finally:
        connection.close()
        
def proces_text(the_text):
     # Remove all the special characters (pro_fea = processed feature)
    pro_fea = re.sub(r'\W', ' ', the_text)
    # remove all single characters
    pro_fea = re.sub(r'\s+[a-zA-Z]\s+', ' ', pro_fea)
    # Remove single characters from the start
    pro_fea = re.sub(r'\^[a-zA-Z]\s+', ' ', pro_fea) 
    # Substituting multiple spaces with single space
    pro_fea = re.sub(r'\s+', ' ', pro_fea, flags=re.I)
    # Removing prefixed 'b'
    pro_fea = re.sub(r'^b\s+', '', pro_fea)
    # Converting to Lowercase
    pro_fea = pro_fea.lower()
    return pro_fea

## Get positive and negative reviews

In [4]:
pos = pd.DataFrame(call_procedure('getReviews', [MAX_LIMIT, 1]), columns=['id', 'content', 'positive'])
neg = pd.DataFrame(call_procedure('getReviews', [MAX_LIMIT, 0]), columns=['id', 'content', 'positive'])

print('length pos:', len(pos), 'length neg:', len(neg))

length pos: 20000 length neg: 20000


In [5]:
pos_text = []
pos_labels = []
for df_index, df_value in pos.iterrows():
    pos_text.append(proces_text(df_value['content']))    
    pos_labels.append('pos')
    
neg_text = []
neg_labels = []
for df_index, df_value in neg.iterrows():
    neg_text.append(proces_text(df_value['content']))    
    neg_labels.append('neg')
    
print('len pos_text:', len(pos_text), 'len pos_labels:', len(pos_labels), 'len neg_text:', len(neg_text), 'len neg_labels:', len(neg_labels))

len pos_text: 20000 len pos_labels: 20000 len neg_text: 20000 len neg_labels: 20000


## Making training and testing set

In [6]:
# make the training set 80% of max limit
training_limit = int(MAX_LIMIT * .8)

# Training text and labels are 160% of MAX LIMIT (this is because of merging positive and negative) (and 80% of total)
training_text = pos_text[:training_limit] + neg_text[:training_limit]
training_labels = pos_labels[:training_limit] + neg_labels[:training_limit]

# Test text and labels are 40% of MAX LIMIT (and 20% of total)
test_text = pos_text[training_limit:] + neg_text[training_limit:]
test_labels = pos_labels[training_limit:] + neg_labels[training_limit:]

print('len training_text:', len(training_text), 'len training_labels:', len(training_labels), 'len test_text:', len(test_text), 'len test_labels:', len(test_labels))

len training_text: 32000 len training_labels: 32000 len test_text: 8000 len test_labels: 8000


In [7]:
vectorizer = TfidfVectorizer(max_features=2500, min_df=7, max_df=0.8, stop_words=stopwords.words('english'))
features = vectorizer.fit_transform(training_text + test_text)
features_nd = features.toarray()

In [8]:
# X_train, X_test, y_train, y_test = train_test_split(processed_features, labels, test_size=0.2, random_state=0)
x_train, x_test, y_train, y_test = train_test_split(
    features_nd[0:len(training_text)], 
    training_labels,
    train_size=0.80, 
    random_state=0
)

In [9]:
text_classifier = RandomForestClassifier(n_estimators=100, random_state=0)
text_classifier.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [10]:
predictions = text_classifier.predict(x_test)

In [11]:
confusion_matrix(y_test,predictions)

array([[2933,  271],
       [ 285, 2911]], dtype=int64)