# The Hotel Review Analysis

In [10]:
import numpy as np
import pandas as pd
import os
import string
string.punctuation
import re
from textblob import TextBlob # using text blob function
import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
import gensim
from pattern.text.en import singularize
punctuations = '!"#$%&\'()*+,-/:;<=>?@[\\]^_`{|}~'
from nltk.corpus import stopwords
stop_words = set(stopwords.words("english"))

#NLP Pre-Process:
def entier_func(directory,data,column_drop,col_name,hotel=0):
    os.chdir(directory)
    os.getcwd()
    
    df = pd.read_csv(data)
    df.drop('Unnamed: 0',axis=1,inplace=True)
    df.dropna(axis=0,inplace=True)
    if hotel == 0:
        df1 = df.drop(column_drop,axis=1)
    else:
        df1 = df[df['hotel_name'] == hotel]
        df1 = df1.drop(column_drop,axis=1)
    # pre-processing:
    
    # punctuation removal
    def remove_punctuation(text):
        punctuation_free="".join([i for i in text if i not in punctuations])
        return punctuation_free
    
    df1['Review_no_punc']=df1[col_name].apply(lambda x: remove_punctuation(x))
    
    # lowering text
    df1['Review_lower']= df1['Review_no_punc'].apply(lambda x: x.lower())
    
    # extra white space removal
    df1['Review_no_extra_space'] = df1['Review_lower'].apply(lambda text: re.sub(' +', ' ', text))
    
    # emoji removal
    def emoji(string):  # created a emoji removing funcction
        emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
        return emoji_pattern.sub(r'', string)
    
    df1['Review_no_emoji'] = df1['Review_no_extra_space'].apply(emoji)
    
    ## ------------- For compactness pruning and redundancy pruning----------------
    # Sentence Tokenization:
    from nltk.tokenize import sent_tokenize
    df1['Review_sent_token'] = df1['Review_no_emoji'].apply(sent_tokenize)
        
    # stop words removal:
    def stop_words_removal(text):
        result = []
        for token in gensim.utils.simple_preprocess(text):
            if token not in stop_words:
                result.append(token)
        return result
    
    b = df1['Review_no_emoji'].apply(stop_words_removal)
    df1['Stop_words_removal'] = b.apply(lambda x: " ".join(x))
    ## -----------------------------------------------------------------------------
    
    # word Tokenization:
    def tokenization(text):
        tokens = re.split('W+',text)
        return tokens
    
    a = df1['Review_no_emoji'].apply(lambda x: tokenization(x)) # calling the function
    df1['Review_token'] = a.apply(lambda x: " ".join(x)) # Join the words into the String
    
    # remove the words less than two words:
    def greater_than_2(text):
        result = []
        for token in gensim.utils.simple_preprocess(text):
            if len(token) > 3:
                result.append(token)
        return result
    
    b = df1['Review_token'].apply(lambda x: greater_than_2(x)) 
    df1['Review_above_3token'] = b.apply(lambda x: " ".join(x)) # Join the words into the String
    # Necessary Tagging:
    
    # parts of speech tagging
    def pos_tag(text):
        try:
            return TextBlob(text).tags
        except:
            return None
    
    df1['Review_pos_tag'] = df1['Review_above_3token'].apply((pos_tag)) # calling pos_tag function 
    
    # addjective tagging
    def get_adjectives(text):
        blob = TextBlob(text)
        return [ word for (word,tag) in blob.tags if tag == "JJ"]
    
    df1['Review_adjective'] = df1['Review_above_3token'].apply(get_adjectives)
    
    # Common Noun(NN and NNS) [which is called Noun and as well as common noun and NNP, NNPS is called proper Noun]
    def get_common_nouns(text):
        blob = TextBlob(text)
        return [ word for (word,tag) in blob.tags if tag == "NN" or tag == "NNS"]
    
    df1['Review_Common_Noun'] = df1['Review_above_3token'].apply(get_common_nouns)
    df1.reset_index(inplace = True, drop = True) # changed the index structure
    
    # converting it into singular
    def singularity(texts):
        return [singularize(text) for text in texts]
    # calling the function and saving it as like singular noun
    df1['Review_Singular_Cnoun'] = df1['Review_Common_Noun'].apply(lambda x: singularity(x)) # calling the function
    print(df1.shape)
    return df1

# Binary Conversion:
from sklearn.preprocessing import MultiLabelBinarizer
def binary_conversion(data,colname):
    mlb = MultiLabelBinarizer() # binary converter
    data = pd.DataFrame(mlb.fit_transform(df1[data]),columns=mlb.classes_)
    
    # getting only unique noun list(which is used in binary columns)
    unique_noun_list = []   # it is for unique words
    for i in range(0,len(df1[colname])): # iterating the len of noun column
        for j in df1[colname][i]:        # iterating the text of each index in noun column
            if j not in unique_noun_list:      # if the iterated text not presented inside the list
                unique_noun_list.append(j)     # then we append it into the list
            else:                              # if not we leave it and continue
                continue
    print("Unique Noun Lists: ")
    print("Showing the sample output: ",unique_noun_list[0:10])
    return data

# Association Rule
from mlxtend.frequent_patterns import apriori, association_rules
def association_rule(data,dup_col1,dup_col2):
    df2 = data.copy()
    #using Apriori rule to find the frequent set
    freq_reviewset = apriori(df = df2, min_support = 0.03, use_colnames=True) # fixing the support as 0.03
    freq_reviewset.to_csv('frequent_review_set.csv', encoding='utf-8')
    
    rules = association_rules(freq_reviewset,min_threshold=0.2) #fixing threshold as 0.2
    rules.to_csv('association_rule.csv', encoding='utf-8')
    
    rules.drop_duplicates(subset=[dup_col1,dup_col2],keep='first',inplace=True)
    return rules


# Finding the max support and removing the duplicates

def max_support_finding(rules,one_of_asso_column):
    rules1 = rules[['antecedents','consequents','support']]
    l1 = []
    l2 = []
    max = []
    
    df2 = rules1
    dfs =  pd.DataFrame()
    dbs =  pd.DataFrame()
    
    for x in range(len(df2)):
        l1.append(df2.iloc[x,0])
        l2.append(df2.iloc[x,1])
    
    for x in range(len(l1)):
        for y in range(len(df2)):
            if (l1[x] == df2.iloc[y,1]) and (l2[x] == df2.iloc[y,0]):
                kd = (df2.iloc[[x],:])
                dfs = pd.concat([dfs,kd],axis = 0)
                dfs.sort_values(by =one_of_asso_column, ascending = False)
    
    f = list(dfs.index)

    dfnon = df2.drop(f)
    

    for x in range(len(dfs)):
        if x % 2 == 0:
            s = dfs.iloc[x:x+2 , 0:2].max()
            max.append(dfs.iloc[x:x+2,2].max())
            #print(s)
            dbs = pd.concat([dbs,s],axis = 1)
    
    dbs = dbs.T.reset_index(drop=True)
    
    dbs[one_of_asso_column] = max

    new_data = pd.concat([dfnon , dbs])

    new_data.reset_index(drop=True,inplace = True)

    return (new_data)


#Compactness Pruning
def compact_pruning():
    prun_data = new_data.copy()

    # converting the frozenset to list
    prun_data['antecedents'] = prun_data[['antecedents']].applymap(lambda x: list(x))
    prun_data['consequents'] = prun_data[['consequents']].applymap(lambda x: list(x))

    # conversion from list to str
    prun_data['antecedents'] = prun_data[['antecedents']].applymap(lambda x: " ".join(x))
    prun_data['consequents'] = prun_data[['consequents']].applymap(lambda x: " ".join(x))

    comp = []
    first = []
    second = []
    count=0
    k=0
    mindis = []
    count = 0
    feat = []
    w = []

    s = []
    for i in df1['Review_sent_token']:
        for j in i:
            s.append(j.split( ))

    for i,j in zip(prun_data['antecedents'],prun_data['consequents']):
        feat.append(i)
        feat.append(j)
        #print(feat)
        for i in s:
            for word in i: 
                if word not in stop_words:
                    w.append(word)
            #print(w)
            if (feat[0] in w and feat[1] in w):
                #print(w)
                for j in w:
                    #print(j)
                    if j == feat[0]:
                        #print(j)
                        k =w.index(feat[0],k)
                        #print(k)
                        first.append(k)
                        k += 1
                k = 0
                for j in w:
                    #print(j)
                    if(j==feat[1]):
                        k = w.index(feat[1],k)
                        second.append(k)
                        k += 1
                for i in first:
                    for j in second:
                        if(i<j):
                            mindis.append((j-i)-1)
                    #print(mindis)
                    if(mindis == []):
                        break
                    if (min(mindis)) <= 5:
                        #print(min(mindis))
                        count+= 1
                    #print(count)
                k = 0
            w = []
            first = []
            second = []
            mindis = []
        if (count >= 2):
                comp.append(feat)
        feat = []
        count = 0
    df_comp = pd.DataFrame(comp)
    df_comp = df_comp.rename(columns = {0:'antecedents',1:'consequents'}, inplace = False)
    
    return (comp)

# Redundancy Pruning

# 1.two feature feature for redundancy pruning:
def twofeats():
    twofeat = {}
    w = []
    feat = []
    k = 0
    count = 0
    s = []
    for i in df1['Review_sent_token']:
        for j in i:
            s.append(j.split( ))
    for i,j in comp:
        feat.append(i)
        feat.append(j)
        #print(feat)
        for i in s:
            for word in i: 
                if word not in stop_words:
                    w.append(word)
            #print(w)
            if (feat[0] in w and feat[1] in w):
                #print(w)
                for j in w:
                    #print(j)
                    if j == feat[0]:
                        #print(j)
                        k =w.index(feat[0],k)
                        #print(k)
                        first.append(k)
                        k += 1
                k = 0
                for j in w:
                    #print(j)
                    if(j==feat[1]):
                        k = w.index(feat[1],k)
                        second.append(k)
                        k += 1
                for i in first:
                    for j in second:
                        if(i<j):
                            mindis.append((j-i)-1)
                    #print(mindis)
                    if(mindis == []):
                        break
                    if (min(mindis)) <= 5:
                        #print(min(mindis))
                        count+= 1
                    #print(count)
                k = 0
            w = []
            first = []
            second = []
            mindis = []
        #print(feat)
        twofeat[feat[0],feat[1]] = count
        feat = []
        #print(count)
        count = 0
    return twofeat

# 2. single feature for redundancy pruning:
def single_feat():
    count = 0
    sinfeat = {}
    s = []
    for i in df1['Review_sent_token']:
        for j in i:
            s.append(j.split( ))
    for l in np.unique(comp):
        for i in s:
            for j in i:
                if(j == l):
                    count+= 1
        #print(l)
        #print(count)
        sinfeat[l] = count
        count = 0
    return sinfeat
# 3. p support redundancy pruning:
def p_support(twofeat,sinfeat):
    sum2 = 0
    support = 0
    red = {}
    for j in sinfeat:
        #print(j)
        for i in twofeat:
            #print(i)
            if(j in i):
                sum2 += twofeat[i]
        support = sinfeat[j] - sum2
        red[j] = support
        support = 0
        sum2 = 0
    return red
    

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\vishw\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vishw\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## NLP Pre-Process

### 10 Hotel Names:
- Ocean Palms Goa
- Silver Sands Serenity
- Goa Woodlands Hotel
- The Byke Old Anchor Beach Resort & Spa
- Whispering Palms Beach Resort
- Rivasa Resort
- Rendezvous Beach Resort
- Hotel Campal
- Hotel MR Manfred
- Hotel Royal Palace

In [2]:
# NLP Pre-Process
df1 = entier_func(directory = r"D:\2.Praxis( all Stuff)\3. subject wise records\3.Term 3\1.CAPP\Data_File"
            ,data = "10_hotels_reviews.csv"
            ,column_drop = ['Rating','hotel_name']
            ,col_name = 'Review'
            ,hotel = 'Goa Woodlands Hotel')
df1.head()

(773, 13)


Unnamed: 0,Review,Review_no_punc,Review_lower,Review_no_extra_space,Review_no_emoji,Review_sent_token,Stop_words_removal,Review_token,Review_above_3token,Review_pos_tag,Review_adjective,Review_Common_Noun,Review_Singular_Cnoun
0,Toilet was not cleaned on arrival. Substandard...,Toilet was not cleaned on arrival. Substandard...,toilet was not cleaned on arrival. substandard...,toilet was not cleaned on arrival. substandard...,toilet was not cleaned on arrival. substandard...,"[toilet was not cleaned on arrival., substanda...",toilet cleaned arrival substandard service rep...,toilet was not cleaned on arrival. substandard...,toilet cleaned arrival substandard service rep...,"[(toilet, NN), (cleaned, VBD), (arrival, JJ), ...","[arrival, substandard, multiple, want, good, g...","[toilet, service, reponse, times, things, aver...","[toilet, service, reponse, time, thing, averag..."
1,Cleanliness is Average . . Spider webs on the ...,Cleanliness is Average . . Spider webs on the ...,cleanliness is average . . spider webs on the ...,cleanliness is average . . spider webs on the ...,cleanliness is average . . spider webs on the ...,"[cleanliness is average ., ., spider webs on t...",cleanliness average spider webs room wall chec...,cleanliness is average . . spider webs on the ...,cleanliness average spider webs room wall chec...,"[(cleanliness, JJ), (average, JJ), (spider, NN...","[cleanliness, average, half, good, tasty, over...","[spider, webs, room, wall, check, delay, hour,...","[spider, web, room, wall, check, delay, hmy, r..."
2,OK stay.,OK stay.,ok stay.,ok stay.,ok stay.,[ok stay.],ok stay,ok stay.,stay,"[(stay, NN)]",[],[stay],[stay]
3,Good central place in south goa.,Good central place in south goa.,good central place in south goa.,good central place in south goa.,good central place in south goa.,[good central place in south goa.],good central place south goa,good central place in south goa.,good central place south,"[(good, JJ), (central, JJ), (place, NN), (sout...","[good, central]","[place, south]","[place, south]"
4,"Business Visit to Goa, Venue is in perfect loc...",Business Visit to Goa Venue is in perfect loca...,business visit to goa venue is in perfect loca...,business visit to goa venue is in perfect loca...,business visit to goa venue is in perfect loca...,[business visit to goa venue is in perfect loc...,business visit goa venue perfect location hear...,business visit to goa venue is in perfect loca...,business visit venue perfect location heart ci...,"[(business, NN), (visit, NN), (venue, NN), (pe...","[perfect, good, various, comfortable, good, go...","[business, visit, venue, location, heart, city...","[busines, visit, venue, location, heart, city,..."


## Binary conversion

In [3]:
# Binary conversion
data = binary_conversion(data='Review_Singular_Cnoun',colname = 'Review_Singular_Cnoun')
data

Unique Noun Lists: 
Showing the sample output:  ['toilet', 'service', 'reponse', 'time', 'thing', 'average', 'hmy', 'reminder', 'staff', 'equipment']


Unnamed: 0,acces,accessibility,accessible,accommodation,action,actualniranjan,addition,adequate,adult,advait,...,work,working,world,worst,worth,year,yesterday,youre,yummy,zomato
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
768,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
769,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
770,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
771,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Association Rule

In [4]:
# Association Rule
rules = association_rule(data,dup_col1='antecedents',dup_col2='consequents')
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(bathroom),(room),0.040103,0.531695,0.032342,0.806452,1.516757,0.011019,2.419577
1,(beach),(hotel),0.080207,0.548512,0.068564,0.854839,1.558468,0.024570,3.110249
2,(beach),(location),0.080207,0.275550,0.034929,0.435484,1.580418,0.012828,1.283312
3,(beach),(room),0.080207,0.531695,0.047865,0.596774,1.122400,0.005220,1.161397
4,(beach),(staff),0.080207,0.350582,0.033635,0.419355,1.196167,0.005516,1.118442
...,...,...,...,...,...,...,...,...,...
812,"(value, money)","(room, staff)",0.099612,0.226391,0.031048,0.311688,1.376772,0.008497,1.123923
813,"(value, staff)","(room, money)",0.056921,0.065977,0.031048,0.545455,8.267380,0.027292,2.054851
814,"(money, staff)","(room, value)",0.062096,0.059508,0.031048,0.500000,8.402174,0.027353,1.880983
815,(value),"(room, money, staff)",0.108668,0.034929,0.031048,0.285714,8.179894,0.027252,1.351100


## Finding the Max Support

In [5]:
# Finding the Max Support and removing the duplicates
new_data = max_support_finding(rules,one_of_asso_column='support')
new_data

Unnamed: 0,antecedents,consequents,support
0,(bathroom),(room),0.032342
1,(beach),(hotel),0.068564
2,(beach),(location),0.034929
3,(beach),(room),0.047865
4,(beach),(staff),0.033635
...,...,...,...
667,"(service, staff)","(stay, hotel)",0.032342
668,"(room, money, staff)",(value),0.031048
669,"(room, money)","(value, staff)",0.031048
670,"(money, staff)","(room, value)",0.031048


## Compactness Pruning

In [6]:
# Compactness Pruning
comp = compact_pruning()
comp

[['beach', 'hotel'],
 ['beach', 'location'],
 ['money', 'breakfast'],
 ['value', 'breakfast'],
 ['city', 'hotel'],
 ['everything', 'room'],
 ['experience', 'food'],
 ['experience', 'hotel'],
 ['experience', 'location'],
 ['experience', 'room'],
 ['experience', 'service'],
 ['experience', 'staff'],
 ['facility', 'food'],
 ['facility', 'hotel'],
 ['facility', 'location'],
 ['facility', 'staff'],
 ['family', 'hotel'],
 ['family', 'staff'],
 ['family', 'stay'],
 ['market', 'food'],
 ['money', 'food'],
 ['place', 'food'],
 ['quality', 'food'],
 ['railway', 'food'],
 ['restaurant', 'food'],
 ['time', 'food'],
 ['value', 'food'],
 ['market', 'hotel'],
 ['money', 'hotel'],
 ['place', 'hotel'],
 ['restaurant', 'hotel'],
 ['station', 'hotel'],
 ['time', 'hotel'],
 ['trip', 'hotel'],
 ['value', 'hotel'],
 ['woodland', 'hotel'],
 ['money', 'location'],
 ['value', 'location'],
 ['market', 'service'],
 ['market', 'staff'],
 ['money', 'room'],
 ['money', 'service'],
 ['money', 'stay'],
 ['place', 'ro

## Redundancy Pruning

In [11]:
# Two Feature Counting
twofeat = twofeats()
twofeat

{('beach', 'hotel'): 9,
 ('beach', 'location'): 2,
 ('money', 'breakfast'): 2,
 ('value', 'breakfast'): 2,
 ('city', 'hotel'): 6,
 ('everything', 'room'): 4,
 ('experience', 'food'): 5,
 ('experience', 'hotel'): 7,
 ('experience', 'location'): 2,
 ('experience', 'room'): 3,
 ('experience', 'service'): 2,
 ('experience', 'staff'): 3,
 ('facility', 'food'): 2,
 ('facility', 'hotel'): 3,
 ('facility', 'location'): 2,
 ('facility', 'staff'): 2,
 ('family', 'hotel'): 3,
 ('family', 'staff'): 2,
 ('family', 'stay'): 4,
 ('market', 'food'): 4,
 ('money', 'food'): 3,
 ('place', 'food'): 3,
 ('quality', 'food'): 3,
 ('railway', 'food'): 2,
 ('restaurant', 'food'): 17,
 ('time', 'food'): 2,
 ('value', 'food'): 3,
 ('market', 'hotel'): 5,
 ('money', 'hotel'): 7,
 ('place', 'hotel'): 5,
 ('restaurant', 'hotel'): 5,
 ('station', 'hotel'): 4,
 ('time', 'hotel'): 7,
 ('trip', 'hotel'): 5,
 ('value', 'hotel'): 8,
 ('woodland', 'hotel'): 9,
 ('money', 'location'): 3,
 ('value', 'location'): 3,
 ('marke

In [8]:
# Single Feature Counting
sinfeat = single_feat()
sinfeat

{'beach': 37,
 'breakfast': 194,
 'city': 57,
 'everything': 45,
 'experience': 63,
 'facility': 25,
 'family': 52,
 'food': 279,
 'hotel': 582,
 'location': 199,
 'market': 60,
 'money': 47,
 'place': 65,
 'quality': 58,
 'railway': 69,
 'restaurant': 75,
 'room': 301,
 'service': 251,
 'staff': 244,
 'station': 51,
 'stay': 269,
 'time': 43,
 'trip': 27,
 'value': 82,
 'woodland': 37}

In [12]:
# Finding the P-Support form (Single Feature - sum(Two Feature))
p_support(twofeat,sinfeat)

{'beach': 26,
 'breakfast': 134,
 'city': 51,
 'everything': 41,
 'experience': 41,
 'facility': 16,
 'family': 43,
 'food': 116,
 'hotel': 329,
 'location': 107,
 'market': 45,
 'money': 23,
 'place': 46,
 'quality': 46,
 'railway': 61,
 'restaurant': 42,
 'room': 178,
 'service': 121,
 'staff': 158,
 'station': 40,
 'stay': 167,
 'time': 27,
 'trip': 22,
 'value': 60,
 'woodland': 28}