In [1]:
import numpy as np
import pandas as pd 
import os 
import re 
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [18]:
data = pd.read_csv("Datasets/tripadvisor_hotel_reviews.csv")
data.shape

(20491, 2)

In [2]:
stop = stopwords.words("english")
port = PorterStemmer()

def tokenizer(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower())
    text = re.sub('[\W]+', ' ', text.lower()) + ' '.join(emoticons).replace('-', '')
    tokenized = [w for w in text.split() if w not in stop]
    return tokenized

def stream_docs(path):
    with open(path, 'r', encoding='utf-8') as csv:
        next(csv) 
        for line in csv:
            text, label = line[:-3], int(line[-2])
            yield text, label

In [26]:
next(stream_docs(path="Datasets/tripadvisor_hotel_reviews.csv"))[0]

'"nice hotel expensive parking got good deal stay hotel anniversary, arrived late evening took advice previous reviews did valet parking, check quick easy, little disappointed non-existent view room room clean nice size, bed comfortable woke stiff neck high pillows, not soundproof like heard music room night morning loud bangs doors opening closing hear people talking hallway, maybe just noisy neighbors, aveda bath products nice, did not goldfish stay nice touch taken advantage staying longer, location great walking distance shopping, overall nice experience having pay 40 parking night,  "'

In [4]:
def get_minibatch(doc_stream,size):
    docs,y = [] , []
    try:
        for _ in range(size):
            text,label = next(doc_stream)
            docs.append(text)
            y.append(label)
    except StopIteration:
        return None,None
    return docs,y

In [37]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier
vect = HashingVectorizer(decode_error="ignore",
                        n_features=2**21,
                        tokenizer=tokenizer)

clf = SGDClassifier(loss="log",
                   random_state=1,
                   max_iter=1)

In [38]:
classes = np.array([1,2,3,4,5])
for _ in range(45):
    X_train,y_train = get_minibatch(stream_docs(path="Datasets/tripadvisor_hotel_reviews.csv"),size=1000)
    if not X_train:
        break
    X_train = vect.transform(X_train)
    clf.partial_fit(X_train,y_train,classes=classes)

In [39]:
print("Prediction: {}".format(clf.score(X_train,y_train)))

Prediction: 0.992


In [44]:
X_test,y_test = get_minibatch(stream_docs(path="Datasets/tripadvisor_hotel_reviews.csv"),size=2000)
X_test = vect.transform(X_test)
print("Prediction: {}".format(clf.score(X_test,y_test)))

Prediction: 0.7985


In [42]:
clf = clf.partial_fit(X_test,y_test)

In [47]:
import pickle
import os

dest = os.path.join('classifier', 'pkl_objects')
if not os.path.exists(dest):
    os.makedirs(dest)

pickle.dump(stop, open(os.path.join(dest, 'stopwords.pkl'), 'wb'), protocol=4)   
pickle.dump(clf, open(os.path.join(dest, 'classifier.pkl'), 'wb'), protocol=4)

In [48]:
%%writefile classifier/vectorizer.py
from sklearn.feature_extraction.text import HashingVectorizer
import re
import os
import pickle

cur_dir = os.path.dirname(__file__)
stop = pickle.load(open(
                os.path.join(cur_dir, 
                'pkl_objects', 
                'stopwords.pkl'), 'rb'))

def tokenizer(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)',
                           text.lower())
    text = re.sub('[\W]+', ' ', text.lower()) \
                   + ' '.join(emoticons).replace('-', '')
    tokenized = [w for w in text.split() if w not in stop]
    return tokenized

vect = HashingVectorizer(decode_error='ignore',
                         n_features=2**21,
                         preprocessor=None,
                         tokenizer=tokenizer)

Writing classifier/vectorizer.py


In [49]:
import os 
os.chdir("classifier")

In [50]:
import pickle
import re 
import os 
from vectorizer import vect

clf = pickle.load(open(os.path.join("pkl_objects","classifier.pkl"),"rb"))

In [57]:
import numpy as np
label={1:"worse",2:"bad",3:"good",4:"better",5:"best"}
example = ['I love this place']
X = vect.transform(example)
print("Prediction: {} \nProbability: {}".format(label[clf.predict(X)[0]],
                                               np.max(clf.predict_proba(X)*100)))

Prediction: best 
Probability: 73.58510469989795


In [59]:
import sqlite3
import os

conn = sqlite3.connect('reviews.sqlite')
c = conn.cursor()

c.execute('DROP TABLE IF EXISTS review_db')
c.execute('CREATE TABLE review_db (review TEXT, sentiment INTEGER, date TEXT)')

conn.commit()
conn.close()

In [61]:
conn = sqlite3.connect("reviews.sqlite")
c = conn.cursor()
# example1 = "I love this place"

# c.execute("INSERT INTO review_db"\
#          " (review, sentiment, date) VALUES"\
#          " (?, ?, DATETIME('now'))",(example1,5))

# example2 = "I dislike this place"

# c.execute("INSERT INTO review_db"\
#          " (review, sentiment, date) VALUES"\
#          " (?, ?, DATETIME('now'))",(example2,1))
# conn.commit()

c.execute("SELECT * from review_db")
results = c.fetchall()
conn.close
print(results)

[('I love this place', 5, '2021-11-21 09:09:09'), ('I dislike this place', 1, '2021-11-21 09:09:09')]
