In [21]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score,recall_score, confusion_matrix,  roc_auc_score
import numpy as np
import time
import pickle
"Own made libraries"
import database as db
import scrapping.scrapperSelenium as scrapperSelenium
import scrapping.scrapperSoup as scrapperSoup
import textCleaning as tc
import model.buildmodel as build
import scrapping.scrapperSoup as soup

In [4]:
dfReviewsDb = db.retrieve_table_into_df('cleanedhoteldata')
X = dfReviewsDb["lemReviews"]
y = dfReviewsDb["label"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=69)

In [5]:
vect = CountVectorizer(max_features=1000)

X_train_vect = vect.fit_transform(X_train)
X_test_vect = vect.transform(X_test)

In [6]:
randomForest = pickle.load(open('model/saved/finalized_random_model_best_params2.sav', 'rb'))

In [14]:
y_pred = randomForest.predict(X_test_vect)
y_prob = randomForest.predict_proba(X_test_vect)[::, 1]

In [15]:
print("Accuracy: {:.2f}%".format(accuracy_score(y_test, y_pred) * 100))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("AUC: \n", roc_auc_score(y_test, y_prob))

Accuracy: 92.12%
Confusion Matrix:
 [[   661  31751]
 [   742 379453]]
AUC: 
 0.6965041215355987


In [8]:
import pickle
filename = 'finalized_model_best_params.sav'
pickle.dump(randomForest, open(filename, 'wb'))

## Cleaning data from Hotel_Reviews.csv

In [2]:
start_time = time.time()

dfReviews = pd.read_csv("Hotel_Reviews.csv")
dfReviews = tc.kaggle_strip_reviews(dfReviews)
dfReviews = tc.kaggle_label_data(dfReviews)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfStrippedReviews["Review"] = dfReviews["Negative_Review"].apply(lambda x: x.replace("No Negative", "")) + \


In [4]:
dfReviews = tc.clean_data(dfReviews)

end_time = time.time()

In [6]:
db.insert_df_into_db(dfReviews, "cleanedhoteldata", "append")

Inserting data into database per 10000 rows


In [7]:
time_lapsed = end_time - start_time
print(time_lapsed)

282.41862750053406


### Clean own data

In [3]:
dfReviewsOwn = pd.read_csv("data/own_reviews.csv", sep=',')
dfReviewsOwn = tc.own_label_data(dfReviewsOwn)
dfReviewsOwn = tc.clean_data(dfReviewsOwn)

#### Insert own data into cleanedhoteldata database

In [None]:
db.insert_df_into_db(dfReviewsOwn, "cleanedhoteldata", "append")

In [None]:
dfReviews = db.retrieve_table_into_df('cleanedhoteldata')
svm = build.build_svm(dfReviews)
filename = 'model/svm_best_params.sav'
pickle.dump(randomForest, open(filename, 'wb'))

In [14]:
dfReviewsOwn = db.retrieve_table_into_df('scrappedrawhoteldata')
dfReviewsOwn = tc.clean_data(dfReviewsOwn)

In [15]:
Xown = dfReviewsOwn['lemReviews']
Yown = dfReviewsOwn['label']

Xown_vect = vect.transform(Xown)

In [12]:
y_pred = randomForest.predict(Xown_vect)
y_prob = randomForest.predict_proba(Xown_vect)[::, 1]
print("Accuracy: {:.2f}%".format(accuracy_score(Yown, y_pred) * 100))
print("Confusion Matrix:\n", confusion_matrix(Yown, y_pred))
print("AUC: \n", roc_auc_score(Yown, y_prob))

Accuracy: 94.17%
Confusion Matrix:
 [[  2  14]
 [  0 224]]
AUC: 
 0.7672991071428572


In [12]:
df = soup.soup_scrapper_booking("https://www.booking.com/reviews/nl/hotel/westcord-city-centre.en-gb.html", num_of_pages=10)

In [13]:
db.insert_df_into_db(df, 'scrappedrawbookinghoteldata', 'append')

Inserting data into database per 10000 rows


In [15]:
dfTrip = db.retrieve_table_into_df('scrappedrawtriphoteldata')
dfTrip.to_csv('trip_reviews.csv', index=False)

In [3]:
df = tc.clean_data(df)

In [None]:
Xsoup = df['lemReviews']
Ysoup = df['label']

Xsoup_vect = vect.transform(Xown)

In [11]:
y_pred = randomForest.predict(Xsoup_vect)
y_prob = randomForest.predict_proba(Xsoup_vect)[::, 1]
print("Accuracy: {:.2f}%".format(accuracy_score(Ysoup, y_pred) * 100))
print("Confusion Matrix:\n", confusion_matrix(Ysoup, y_pred))
print("AUC: \n", roc_auc_score(Yown, y_prob))

Accuracy: 94.17%
Confusion Matrix:
 [[  2  14]
 [  0 224]]
AUC: 
 0.7672991071428572


In [2]:
#randomForest = pickle.load(open('model/saved/finalized_random_model_best_params4.sav', 'rb'))
dfReviewsOwn = pd.read_csv('data/own_reviews.csv')
dfReviewsOwn = tc.own_label_data(dfReviewsOwn)
dfReviewsOwn = tc.clean_data(dfReviewsOwn)
dfTrip = pd.read_csv('data/trip_reviews.csv')
dfBooking = pd.read_csv('data/booking_reviews.csv')
dfHotels = pd.concat([dfBooking, dfTrip], ignore_index=True)
dfHotels = dfHotels[['Review', 'label']]
dfHotels = dfHotels.dropna()
dfHotels = tc.clean_data(dfHotels)
dfHotels = dfHotels.replace(r'^\s*$',np.nan, regex=True)
dfHotels = dfHotels.dropna()
dfKaggle = pd.read_csv("data/Hotel_Reviews.csv")
dfKaggle = tc.kaggle_strip_reviews(dfKaggle)
dfKaggle = tc.kaggle_label_data(dfKaggle)
dfKaggle = tc.clean_data(dfKaggle)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfStrippedReviews["Review"] = dfReviews["Negative_Review"].apply(lambda x: x.replace("No Negative", "")) + \


In [3]:
dfCombined = pd.concat([dfKaggle, dfHotels, dfReviewsOwn], ignore_index=True)

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=69)

vect = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))
X_train_vect = vect.fit_transform(X_train)
X_test_vect = vect.transform(X_test)

In [48]:
X = dfCombined['lemReviews']
y = dfCombined['label']

In [40]:
model = pickle.load(open('model/saved/finalized_lr_model_best_params2.sav', 'rb'))

In [49]:
vect = pickle.load(open('model/saved/lrvect.sav', 'rb'))

X_test_vect = vect.transform(X)
y_pred = model.predict(X_test_vect)
y_prob = model.predict_proba(X_test_vect)[::, 1]
print("Accuracy: {:.2f}%".format(accuracy_score(y, y_pred) * 100))
print("Confusion Matrix:\n", confusion_matrix(y, y_pred))
print("AUC: \n", roc_auc_score(y, y_prob))
print("Precision: \n", precision_score(y, y_pred, average='weighted'))
print("Recall: \n", recall_score(y, y_pred, average='weighted'))

Accuracy: 98.18%
Confusion Matrix:
 [[ 33553   6939]
 [  2469 473233]]
AUC: 
 0.9812581689297968
Precision: 
 0.9813058966645906
Recall: 
 0.9817742941607225
