# **Import libraries**

In [93]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import requests
import json

In [94]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split 
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.preprocessing import StandardScaler
from nltk.stem.snowball import SnowballStemmer
from string import punctuation
from textblob import TextBlob
import re

# Read dataset to Pandas Dataframe

In [95]:
fields = ['Positive_Review', 'Negative_Review']
df = pd.read_csv(
    '../input/515k-hotel-reviews-data-in-europe/Hotel_Reviews.csv',
    usecols= fields, nrows=5000)
df.head()

Unnamed: 0,Negative_Review,Positive_Review
0,I am so angry that i made this post available...,Only the park outside of the hotel was beauti...
1,No Negative,No real complaints the hotel was great great ...
2,Rooms are nice but for elderly a bit difficul...,Location was good and staff were ok It is cut...
3,My room was dirty and I was afraid to walk ba...,Great location in nice surroundings the bar a...
4,You When I booked with your company on line y...,Amazing location and building Romantic setting


# Stemming

In [96]:
stemmer = SnowballStemmer('english')
df['Positive_Review'] = df['Positive_Review'].apply(
    lambda x:' '.join([stemmer.stem(y) for y in x.split()]))
df['Negative_Review'] = df['Negative_Review'].apply(
    lambda x: ' '.join([stemmer.stem(y) for y in x.split()]))

# Removing Stopwords

In [97]:
url = "https://countwordsfree.com/stopwords/english/json"
response = pd.DataFrame(data = json.loads(requests.get(url).text))
SW = list(response['words'])
df['Positive_Review'] = df['Positive_Review'].apply(
    lambda x: ' '.join([word for word in x.split() if word not in (SW)]))
df['Negative_Review'] = df['Negative_Review'].apply(
    lambda x: ' '.join([word for word in x.split() if word not in (SW)]))

#  Initialize new Dataframe and Removing numbers

In [98]:
df_Positive = df['Positive_Review'].copy()
df_Positive = df_Positive.str.replace('\d+', '')
df_Negative = df['Negative_Review'].copy()
df_Negative = df_Negative.str.replace('\d+', '')

# Apply TF-IDF method

In [99]:
tfidf = TfidfVectorizer(min_df=2,max_df=0.5, ngram_range=(1,3))
features = tfidf.fit_transform(df_Positive)
df_Positive = pd.DataFrame(
    features.todense(),
    columns=tfidf.get_feature_names()
)
df_Positive['Target'] = '1'
df_Positive.head()

Unnamed: 0,abil,abl,abl book,abl check,abl leav,abl open,abl open window,abov,abov stay,absolut,...,wrap,wrong,xx,yard,year,young,young coupl,young ladi,zone,Target
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.147383,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [100]:
tfidf = TfidfVectorizer(min_df=2,max_df=.05, ngram_range=(1,3))
features = tfidf.fit_transform(df_Negative)
df_Negative = pd.DataFrame(
    features.todense(),
    columns=tfidf.get_feature_names()
)
df_Negative['Target'] = '0'
df_Negative.head()

Unnamed: 0,abit,abit tire,abl,abl open,abl open window,abl room,abov,abov air,abov air condit,abov bed,...,year room,year veri,yesterday,young,young child,young child sleep,youth,yr,zone,Target
0,0.0,0.0,0.059471,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [101]:
df = df_Positive.append(df_Negative)
df.shape

(10000, 12676)

In [102]:
%%time
df = df.fillna(0)

CPU times: user 801 ms, sys: 1.12 s, total: 1.92 s
Wall time: 1.92 s


In [103]:
df.tail()

Unnamed: 0,abil,abl,abl book,abl check,abl leav,abl open,abl open window,abov,abov stay,absolut,...,wrong night,xmas,year eve,year room,year veri,yesterday,young child,young child sleep,youth,yr
4995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4999,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Define x and y set

In [104]:
x = df.drop('Target',axis=1)
y = df['Target']

x.shape, y.shape

((10000, 12675), (10000,))

In [105]:
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size = 0.2, random_state = 0, stratify = y)

# Remove constant features

In [113]:
constant_filter = VarianceThreshold(threshold = 0.0002)
constant_filter.fit(x_train)
feature_list = x_train[x_train.columns[
    constant_filter.get_support(indices=True)]]

print('Number of selected features: ' ,len(list(feature_list)),'\n')
print('List of selected features: \n' ,list(feature_list))

Number of selected features:  716 

List of selected features: 
 ['abl', 'abov', 'absolut', 'absolut noth', 'accept', 'access', 'accommod', 'actual', 'ad', 'adequ', 'air', 'air condit', 'aircon', 'alway', 'am', 'amaz', 'ambianc', 'ambienc', 'amen', 'ani', 'anoth', 'anyth', 'apex', 'appoint', 'appreci', 'architectur', 'area', 'arriv', 'atmospher', 'attent', 'attract', 'avail', 'averag', 'aw', 'awesom', 'bad', 'bad experi', 'bag', 'balconi', 'bar', 'bar area', 'bar staff', 'basement', 'basic', 'bath', 'bathroom', 'bathroom small', 'bathroom tini', 'bathroom veri', 'beauti', 'beauti build', 'beauti hotel', 'becaus', 'bed', 'bed comfi', 'bed comfort', 'bed nice', 'bed pillow', 'bed realli', 'bed room', 'bed soft', 'bed veri', 'bed veri comfi', 'bed veri comfort', 'bedroom', 'bedroom veri', 'befor', 'big', 'bigger', 'birthday', 'bit small', 'book', 'bottl', 'breakfast', 'breakfast buffet', 'breakfast excel', 'breakfast expens', 'breakfast good', 'breakfast great', 'breakfast includ', 'break

In [114]:
x_train_filter = constant_filter.transform(x_train)
x_test_filter = constant_filter.transform(x_test)

In [115]:
x_train_filter.shape, x_test_filter.shape, x_train.shape

((8000, 716), (2000, 716), (8000, 12675))

In [116]:
x_train_filter = pd.DataFrame(x_train_filter)
x_test_filter = pd.DataFrame(x_test_filter)

#  Remove Correlated features

In [117]:
def get_correlation(data, threshold):
    corr_col = set()
    cormat = data.corr()
    for i in range(len(cormat.columns)):
        for j in range(i):
            if abs(cormat.iloc[i,j]) > threshold:
                colname = cormat.columns[i]
                corr_col.add(colname)
    return corr_col

In [118]:
corr_features = get_correlation(x_train_filter, 0.70)

In [119]:
x_train_uncorr = x_train_filter.drop(labels= corr_features, axis = 1)
x_test_uncorr = x_test_filter.drop(labels= corr_features, axis = 1)
x_train_uncorr = pd.DataFrame(x_train_uncorr)
x_test_uncorr = pd.DataFrame(x_test_uncorr)
x_train_uncorr.shape, x_test_uncorr.shape, x_train_filter.shape

((8000, 693), (2000, 693), (8000, 716))

# Applying Linear Discriminant Analysis

In [124]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

lda = LDA(n_components=1)
x_train_lda = lda.fit_transform(x_train_uncorr, y_train)
x_test_lda = lda.fit_transform(x_test_uncorr, y_test)

# Build a Random Forest Classifier (RFC)

In [125]:
def runRandomForest(x_train, x_test, y_train, y_test):
    clf = RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1)
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    print ('accracy is: ', accuracy_score(y_test, y_pred))

Run Random Forest Classifier using LDA model:

In [122]:
%%time
runRandomForest(x_train_lda, x_test_lda, y_train, y_test)

accracy is:  0.9795
CPU times: user 1.34 s, sys: 104 ms, total: 1.45 s
Wall time: 713 ms


Run calssifier without using LDA model:

In [123]:
%%time
runRandomForest(x_train, x_test, y_train, y_test)

accracy is:  0.9605
CPU times: user 48.3 s, sys: 106 ms, total: 48.4 s
Wall time: 12.9 s
