In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("/media/vedic/FEBE11DFBE11916F/STUDY/Summer Internship 19/Hotel_Reviews.csv")

In [3]:
df["review"] = df["Negative_Review"] + df["Positive_Review"]

In [4]:
df["is_bad_review"] = df["Reviewer_Score"].apply(lambda x: 1 if x < 5 else 0)

In [5]:
df = df[["review", "is_bad_review"]]

In [6]:
df = df.sample(frac = 0.1, replace = False, random_state=42)

In [7]:
df["review"] = df["review"].apply(lambda x: x.replace("No Negative", "").replace("No Positive", ""))

In [8]:
from nltk.corpus import wordnet
import string
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.tokenize import WhitespaceTokenizer
from nltk.stem import WordNetLemmatizer

In [9]:
def get_wordnet_pos(pos_tag):
    if pos_tag.startswith('J'):
        return wordnet.ADJ
    elif pos_tag.startswith('V'):
        return wordnet.VERB
    elif pos_tag.startswith('N'):
        return wordnet.NOUN
    elif pos_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [10]:
def clean_text(text):
    # lower text
    text = text.lower()
    # tokenize text and remove puncutation
    text = [word.strip(string.punctuation) for word in text.split(" ")]
    # remove words that contain numbers
    text = [word for word in text if not any(c.isdigit() for c in word)]
    # remove stop words
    stop = stopwords.words('english')
    text = [x for x in text if x not in stop]
    # remove empty tokens
    text = [t for t in text if len(t) > 0]
    # pos tag text
    pos_tags = pos_tag(text)
    # lemmatize text
    text = [WordNetLemmatizer().lemmatize(t[0], get_wordnet_pos(t[1])) for t in pos_tags]
    # remove words with only one letter
    text = [t for t in text if len(t) > 1]
    # join all
    text = " ".join(text)
    return(text)

In [11]:
df["review_clean"] = df["review"].apply(lambda x: clean_text(x))

In [12]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [13]:
df["sentiments"] = df["review"].apply(lambda x: SentimentIntensityAnalyzer().polarity_scores(x))
df = pd.concat([df.drop(['sentiments'], axis=1), df['sentiments'].apply(pd.Series)], axis=1)

In [14]:
df

Unnamed: 0,review,is_bad_review,review_clean,neg,neu,pos,compound
488440,Would have appreciated a shop in the hotel th...,0,would appreciate shop hotel sell drinking wate...,0.049,0.617,0.334,0.9924
274649,No tissue paper box was present at the room,0,tissue paper box present room,0.216,0.784,0.000,-0.2960
374688,Pillows Nice welcoming and service,0,pillow nice welcome service,0.000,0.345,0.655,0.6908
404352,Everything including the nice upgrade The Hot...,0,everything include nice upgrade hotel revamp s...,0.000,0.621,0.379,0.9153
451596,Lovely hotel v welcoming staff,0,lovely hotel welcome staff,0.000,0.230,0.770,0.7717
302161,They don t have free wifi The location is per...,0,free wifi location perfect lot time want look ...,0.000,0.735,0.265,0.8074
317079,Room generally a bit shabby with some lack of...,0,room generally bit shabby lack maintenance cru...,0.040,0.854,0.106,0.5859
13963,Executive rooms 9th Floor don t have a bath T...,0,executive room floor bath website make look li...,0.047,0.823,0.130,0.8316
159785,Pity about the two days of rain Its centralit...,0,pity two day rain centrality proximity destina...,0.155,0.845,0.000,-0.2960
195089,Didn t like it at all construction was in pro...,1,like construction progress stuff lie vacancy l...,0.108,0.660,0.231,0.6369


In [15]:
df["nb_chars"] = df["review"].apply(lambda x: len(x))
df["nb_words"] = df["review"].apply(lambda x: len(x.split(" ")))

In [16]:
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [17]:
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(df["review_clean"].apply(lambda x: x.split(" ")))]

In [18]:
model = Doc2Vec(documents, vector_size = 5, window = 2, min_count = 1, workers = 4)

In [19]:
doc2vec_df = df["review_clean"].apply(lambda x: model.infer_vector(x.split(" "))).apply(pd.Series)

In [20]:
doc2vec_df.columns = ["doc2vec_vector_" + str(x) for x in doc2vec_df.columns]

In [21]:
df = pd.concat([df, doc2vec_df], axis = 1)

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [23]:
tfidf = TfidfVectorizer(min_df = 10)

In [24]:
tfidf_result = tfidf.fit_transform(df["review_clean"]).toarray()

In [25]:
tfidf_df = pd.DataFrame(tfidf_result, columns=tfidf.get_feature_names())

In [26]:
tfidf_df.columns = ["word_" + str(x) for x in tfidf_df.columns]

In [27]:
tfidf_df.index = df.index

In [28]:
df = pd.concat([df , tfidf_df], axis = 1)

In [29]:
df["is_bad_review"].value_counts(normalize = True)    #Imbalanced_Dataset

0    0.956761
1    0.043239
Name: is_bad_review, dtype: float64

In [30]:
df

Unnamed: 0,review,is_bad_review,review_clean,neg,neu,pos,compound,nb_chars,nb_words,doc2vec_vector_0,...,word_yet,word_yoghurt,word_yogurt,word_young,word_yr,word_yummy,word_zero,word_ziggo,word_zone,word_zuid
488440,Would have appreciated a shop in the hotel th...,0,would appreciate shop hotel sell drinking wate...,0.049,0.617,0.334,0.9924,599,113,0.178099,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
274649,No tissue paper box was present at the room,0,tissue paper box present room,0.216,0.784,0.000,-0.2960,44,10,-0.065233,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
374688,Pillows Nice welcoming and service,0,pillow nice welcome service,0.000,0.345,0.655,0.6908,36,7,0.022825,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
404352,Everything including the nice upgrade The Hot...,0,everything include nice upgrade hotel revamp s...,0.000,0.621,0.379,0.9153,155,27,-0.061550,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
451596,Lovely hotel v welcoming staff,0,lovely hotel welcome staff,0.000,0.230,0.770,0.7717,32,7,-0.076619,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
302161,They don t have free wifi The location is per...,0,free wifi location perfect lot time want look ...,0.000,0.735,0.265,0.8074,130,32,0.135714,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
317079,Room generally a bit shabby with some lack of...,0,room generally bit shabby lack maintenance cru...,0.040,0.854,0.106,0.5859,318,57,0.157756,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
13963,Executive rooms 9th Floor don t have a bath T...,0,executive room floor bath website make look li...,0.047,0.823,0.130,0.8316,483,93,0.159010,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
159785,Pity about the two days of rain Its centralit...,0,pity two day rain centrality proximity destina...,0.155,0.845,0.000,-0.2960,76,14,0.077279,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
195089,Didn t like it at all construction was in pro...,1,like construction progress stuff lie vacancy l...,0.108,0.660,0.231,0.6369,186,42,0.097850,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
label = "is_bad_review"
ignore_cols = ["is_bad_review", "review", "review_clean"]

In [32]:
features = [c for c in df.columns if c not in ignore_cols]

In [33]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [34]:
X_train, X_test, y_train, y_test = train_test_split(df[features], df[label], test_size = 0.2, random_state = 42)

In [35]:
rf = RandomForestClassifier(n_estimators = 100, random_state = 42)
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [36]:
rf.score(X_test, y_test)

0.9560833737275812

In [83]:
final_result = rf.predict(X_test)

In [87]:
import numpy as np
np.unique(final_result, return_counts = True)

(array([0, 1]), array([10303,    12]))