In [149]:
import pandas as pd
import numpy as np
import joblib
import pickle

dataset = pd.read_csv('Accommodation_Reviews.csv')

X_good = dataset.Positive_Review.values
X_bad = dataset.Negative_Review.values

y = dataset.Hotel_Name.values

In [150]:
#Some model will not be able to work with such a large dataset, so I split it into small batches,
#moreover, some model support batch learning so the split will be useful
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=40)

X_good_ = list()
X_bad_ = list()
y_ = list()

for train_index, test_index in skf.split(X_good, y):
    X_good_.append(X_good[test_index])
    X_bad_.append(X_bad[test_index])
    y_.append(y[test_index])



In [None]:
#The next functions are the same as in study_dataset file 

In [3]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
def embedding(X):
    X = X.tolist()
    X = [x.lower() for x in X]
    corpus = []
    nltk.download('stopwords')
    all_stopwords = stopwords.words('english')
    all_stopwords.remove('not')
    for i in range(0, len(X)):
        review = X[i]
        review = review.split()
        ps = PorterStemmer()
        review = [ps.stem(word) for word in review if not word in set(all_stopwords)]
        review = ' '.join(review)
        corpus.append(review)
    cv = CountVectorizer(max_features = 1500)
    cv.fit(corpus)
    return cv

In [12]:
def recom_score(predictor, X_test, y_test):
    X_test = X_test
    y_test = y_test
    success = 0
    for i in range(len(y_test)):
        y_pred = predictor.predict_proba([X_test[i]])
        y_pred_sorted = np.argsort(y_pred)
        top_100 = predictor.classes_[y_pred_sorted][0][-100:]
        if (y_test[i] in top_100):
            success+=1
    return success/len(y_test)

In [5]:
vectorizer = embedding(X_good_[0])
X_good_vect = vectorizer.transform(X_good_[0]).toarray()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Nikita\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
#I want to select models which can give me multiple probable hotels in descending probability order
# Hence I need a probabilistic model, so selecting among naive bayes, Random Forest, and Logistics Regression
from sklearn.naive_bayes import GaussianNB
gaus = GaussianNB()
gaus.fit(X_good_vect, y_[0])

In [16]:
recom_score(gaus, vectorizer.transform(X_good_[1][:1000]).toarray(), y_[1][:1000])

0.122

In [10]:
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(n_estimators = 100, max_depth = 10)
forest.fit(X_good_vect, y_[0])

In [14]:
recom_score(forest, vectorizer.transform(X_good_[1]).toarray(), y_[1])

0.38583837443772295

In [8]:
from sklearn.linear_model import SGDClassifier
sgd = SGDClassifier(loss = "log_loss", n_jobs = -1)
sgd.fit(X_good_vect, y_[0])

In [13]:
recom_score(sgd, vectorizer.transform(X_good_[1]).toarray(), y_[1])

0.4019699084845665

In [None]:
#SGDClassifier performed as the best probabilistic model. That is handy, because it also supports batch learning
#thus, I can improve the result

In [None]:
#Same tesing but for negative reviews

In [17]:
vectorizer_bad = embedding(X_good_[0])
X_bad_vect = vectorizer_bad.transform(X_bad_[0]).toarray()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Nikita\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [21]:
gaus_bad = GaussianNB()
gaus_bad.fit(X_bad_vect, y_[0])
recom_score(gaus, vectorizer_bad.transform(X_bad_[1][:1000]).toarray(), y_[1][:1000])

0.086

In [23]:
forest_bad = RandomForestClassifier(n_estimators = 100, max_depth = 10)
forest_bad.fit(X_bad_vect, y_[0])
recom_score(forest_bad, vectorizer_bad.transform(X_bad_[1]).toarray(), y_[1])

0.33030867069955017

In [24]:
sgd_bad = SGDClassifier(loss = "log_loss", n_jobs = -1)
sgd_bad.fit(X_bad_vect, y_[0])
recom_score(sgd_bad, vectorizer_bad.transform(X_bad_[1]).toarray(), y_[1])

0.31813246471226925

In [None]:
#Yex, Random Forest performed slightly better (~1.2%) but that is not significant, because 
#SGDClassifier has partial_fit and it can perfom better when trained on the whole dataset.

In [None]:
#Next I train each model (for good and bad reviews) on as many batches as possible until it starts overfitting

In [119]:
clf_good_reviews = SGDClassifier(loss = "log_loss", n_jobs = -1)
clf_good_reviews.partial_fit(X_good_vect, y_[0], classes = np.unique(y))

In [120]:
clf_good_reviews.partial_fit(vectorizer.transform(X_good_[1]).toarray(), y_[1])

In [121]:
#Once the model performs worse two sequential rounds, the trining is stopped
from sklearn import clone
previous_score = recom_score(clf_good_reviews, vectorizer.transform(X_good_[1]).toarray(), y_[1])
score_now = recom_score(clf_good_reviews, vectorizer.transform(X_good_[2]).toarray(), y_[2])
for i in range(2, 38):
    clf_good_reviews.partial_fit(vectorizer.transform(X_good_[i]).toarray(), y_[i])
    score_b4_previous = previous_score
    previous_score = score_now
    score_now = recom_score(clf_good_reviews, vectorizer.transform(X_good_[i+1]).toarray(), y_[i+1])
    print("batch number {number}; score is {score}".format(number = i, score = score_now))
    if (score_now < previous_score) and (previous_score < score_b4_previous):
        print("the training is stopped after {number} batches".format(number = i))
        break

joblib.dump(clf_good_reviews, "clf_good_reviews.pkl")

batch number 2; score is 0.40538234837909104
batch number 3; score is 0.42546921048549713
batch number 4; score is 0.4367147510469986
batch number 5; score is 0.4506747324336901
batch number 6; score is 0.45478517139754926
batch number 7; score is 0.46649604467194045
batch number 8; score is 0.46905537459283386
batch number 9; score is 0.47805180704203504
batch number 10; score is 0.4754924771211416
batch number 11; score is 0.4795253606328525
batch number 12; score is 0.47510469986040016
batch number 13; score is 0.4765782534512176
batch number 14; score is 0.47161470451372733
batch number 15; score is 0.4683573755234993
the training is stopped after 15 batches


['clf_good_reviews.pkl']

In [123]:
#Same for the bad reviews

clf_bad_reviews = SGDClassifier(loss = "log_loss", n_jobs = -1)
clf_bad_reviews.partial_fit(vectorizer_bad.transform(X_bad_[0]).toarray(), y_[0], classes = np.unique(y))
clf_bad_reviews.partial_fit(vectorizer_bad.transform(X_bad_[1]).toarray(), y_[1])
previous_score = recom_score(clf_bad_reviews, vectorizer_bad.transform(X_bad_[1]).toarray(), y_[1])
score_now = recom_score(clf_bad_reviews, vectorizer_bad.transform(X_bad_[2]).toarray(), y_[2])

for i in range(2, 38):
    clf_bad_reviews.partial_fit(vectorizer_bad.transform(X_bad_[i]).toarray(), y_[i])
    score_b4_previous = previous_score
    previous_score = score_now
    score_now = recom_score(clf_bad_reviews, vectorizer_bad.transform(X_bad_[i+1]).toarray(), y_[i+1])
    print("batch number {number}; score is {score}".format(number = i, score = score_now))
    if (score_now < previous_score) and (previous_score < score_b4_previous):
        print("the training is stopped after {number} batches".format(number = i))
        break
joblib.dump(clf_bad_reviews, "clf_bad_reviews.pkl")

batch number 2; score is 0.30533581510780206
batch number 3; score is 0.3146424693655964
batch number 4; score is 0.3301535597952536
batch number 5; score is 0.3395377695051962
batch number 6; score is 0.348146424693656
batch number 7; score is 0.34822398014580425
batch number 8; score is 0.3610981852024197
batch number 9; score is 0.36536373507057546
batch number 10; score is 0.3684659531565069
batch number 11; score is 0.3724212812160695
batch number 12; score is 0.3721886148596246
batch number 13; score is 0.37544594384985264
batch number 14; score is 0.3757561656584458
batch number 15; score is 0.38304637816038467
batch number 16; score is 0.3817279354738638
batch number 17; score is 0.3709764988753587
the training is stopped after 17 batches


['clf_bad_reviews.pkl']

In [124]:
joblib.dump(vectorizer_bad, "vectorizer_bad.pkl")
joblib.dump(vectorizer, "vectorizer.pkl")

['vectorizer.pkl']

In [144]:
#getting all the hotels names, their scores, and 3 corresponding comments to save it
#and load them on the web page when recommending
hotel_scores = dict()
hotel_reviews = dict()
for i in np.unique(y):
    hotel_scores[i] = dataset[dataset.Hotel_Name == i].Average_Score.values[0]
    hotel_reviews[i] = [dataset[dataset.Hotel_Name == i].Positive_Review.values[0],
                        dataset[dataset.Hotel_Name == i].Positive_Review.values[1],
                        dataset[dataset.Hotel_Name == i].Positive_Review.values[2]]

f = open("hotel_scores.pkl","wb")
pickle.dump(hotel_scores,f)
f.close()
f = open("hotel_reviews.pkl","wb")
pickle.dump(hotel_reviews,f)
f.close()

In [2]:
from joblib import load
from typing import Union
import numpy as np
import pickle


with open('hotel_scores.pkl', 'rb') as file:
    hotel_scores = pickle.load(file)

with open('hotel_reviews.pkl', 'rb') as file:
    hotel_reviews = pickle.load(file)

clf_bad_loaded = load('clf_bad_reviews.pkl')
clf_good_loaded = load('clf_good_reviews.pkl')
vectorizer_bad = load('vectorizer_bad.pkl')
vectorizer = load('vectorizer.pkl')


def recommender(positive: str, negative: Union[str, None] = None):
    y_pred = clf_bad_loaded.predict_proba(vectorizer.transform([positive]).toarray())
    y_pred_sorted = np.argsort(y_pred)

    top_50 = clf_bad_loaded.classes_[y_pred_sorted][0][-50:]
    top_50 = top_50.tolist()

    if negative:
        y_pred = clf_good_loaded.predict_proba(vectorizer_bad.transform([negative]).toarray())
        y_pred_sorted = np.argsort(y_pred)

        negative_50 = clf_good_loaded.classes_[y_pred_sorted][0][-50:]
        negative_50 = negative_50.tolist()

        for i in negative_50:
            if i in top_50:
                top_50.remove(i)

    top_50.reverse()
    scores = [float(hotel_scores[i]) for i in top_50]
    reviews = [hotel_reviews[i] for i in top_50]

    return top_50, scores, reviews


In [3]:
recommender("Clean bathroom, beach, large bed", "dogs, smell, loud")

(['Park Plaza Victoria London',
  'Club Quarters Hotel Gracechurch',
  'The Queens Gate Hotel',
  'Radisson Blu Edwardian Vanderbilt',
  'Holiday Inn London Bloomsbury',
  'De Vere Devonport House',
  'Hotel Cavendish',
  'Hotel SB Diagonal Zero Barcelona 4 Sup',
  'The Student Hotel Amsterdam City',
  'BEST WESTERN Maitrise Hotel Maida Vale',
  'Hampshire Hotel Rembrandt Square Amsterdam',
  'citizenM Tower of London',
  'Ramada Apollo Amsterdam Centre',
  'Novotel London Wembley',
  'Park Grand London Kensington',
  'Gainsborough Hotel',
  'Hilton London Canary Wharf',
  'citizenM London Bankside',
  'Ambassadors Bloomsbury',
  'Petit Palace Boqueria Garden'],
 [8.6,
  8.2,
  8.2,
  8.2,
  8.2,
  8.2,
  6.4,
  8.2,
  8.7,
  7.1,
  7.8,
  9.1,
  8.2,
  8.9,
  8.4,
  6.9,
  9.0,
  9.1,
  7.9,
  8.5],
 [[' The bed was so comfortable Great location near the Victoria Coach Station Victoria Station We walked everywhere To catch tours we only had to walk a few blocks Loved our stay at this 