# The Hotel Review Analysis

In [1]:
#libraries
import numpy as np
import pandas as pd
import os
import string
string.punctuation
import re
from textblob import TextBlob # using text blob function
import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
import gensim
from pattern.text.en import singularize
punctuations = '!"#$%&\'()*+,-/:;<=>?@[\\]^_`{|}~'
from nltk.corpus import stopwords
stop_words = set(stopwords.words("english"))

#NLP Pre-Process:
def entier_func(directory,data,column_drop,col_name,hotel=0):
    os.chdir(directory)
    os.getcwd()
    
    df = pd.read_csv(data)
    df.drop('Unnamed: 0',axis=1,inplace=True)
    df.dropna(axis=0,inplace=True)
    if hotel == 0:
        df1 = df.drop(column_drop,axis=1)
    else:
        df1 = df[df['hotel_name'] == hotel]
        df1 = df1.drop(column_drop,axis=1)
    # pre-processing:
    
    # punctuation removal
    def remove_punctuation(text):
        punctuation_free="".join([i for i in text if i not in punctuations])
        return punctuation_free
    
    df1['Review_no_punc']=df1[col_name].apply(lambda x: remove_punctuation(x))
    
    # lowering text
    df1['Review_lower']= df1['Review_no_punc'].apply(lambda x: x.lower())
    
    # extra white space removal
    df1['Review_no_extra_space'] = df1['Review_lower'].apply(lambda text: re.sub(' +', ' ', text))
    
    # emoji removal
    def emoji(string):  # created a emoji removing funcction
        emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
        return emoji_pattern.sub(r'', string)
    
    df1['Review_no_emoji'] = df1['Review_no_extra_space'].apply(emoji)
    
    ## ------------- For compactness pruning and redundancy pruning----------------
    # Sentence Tokenization:
    from nltk.tokenize import sent_tokenize
    df1['Review_sent_token'] = df1['Review_no_emoji'].apply(sent_tokenize)
        
    # stop words removal:
    def stop_words_removal(text):
        result = []
        for token in gensim.utils.simple_preprocess(text):
            if token not in stop_words:
                result.append(token)
        return result
    
    b = df1['Review_no_emoji'].apply(stop_words_removal)
    df1['Stop_words_removal'] = b.apply(lambda x: " ".join(x))
    ## -----------------------------------------------------------------------------
    
    # word Tokenization:
    def tokenization(text):
        tokens = re.split('W+',text)
        return tokens
    
    a = df1['Review_no_emoji'].apply(lambda x: tokenization(x)) # calling the function
    df1['Review_token'] = a.apply(lambda x: " ".join(x)) # Join the words into the String
    
    # remove the words less than two words:
    def greater_than_2(text):
        result = []
        for token in gensim.utils.simple_preprocess(text):
            if len(token) > 3:
                result.append(token)
        return result
    
    b = df1['Review_token'].apply(lambda x: greater_than_2(x)) 
    df1['Review_above_3token'] = b.apply(lambda x: " ".join(x)) # Join the words into the String
    # Necessary Tagging:
    
    # parts of speech tagging
    def pos_tag(text):
        try:
            return TextBlob(text).tags
        except:
            return None
    
    df1['Review_pos_tag'] = df1['Review_above_3token'].apply((pos_tag)) # calling pos_tag function 
    
    # addjective tagging
    def get_adjectives(text):
        blob = TextBlob(text)
        return [ word for (word,tag) in blob.tags if tag == "JJ"]
    
    df1['Review_adjective'] = df1['Review_above_3token'].apply(get_adjectives)
    
    # Common Noun(NN and NNS) [which is called Noun and as well as common noun and NNP, NNPS is called proper Noun]
    def get_common_nouns(text):
        blob = TextBlob(text)
        return [ word for (word,tag) in blob.tags if tag == "NN" or tag == "NNS"]
    
    df1['Review_Common_Noun'] = df1['Review_above_3token'].apply(get_common_nouns)
    df1.reset_index(inplace = True, drop = True) # changed the index structure
    
    # converting it into singular
    def singularity(texts):
        return [singularize(text) for text in texts]
    # calling the function and saving it as like singular noun
    df1['Review_Singular_Cnoun'] = df1['Review_Common_Noun'].apply(lambda x: singularity(x)) # calling the function
    print(df1.shape)
    return df1

# Binary Conversion:
from sklearn.preprocessing import MultiLabelBinarizer
def binary_conversion(data,colname):
    mlb = MultiLabelBinarizer() # binary converter
    data = pd.DataFrame(mlb.fit_transform(df1[data]),columns=mlb.classes_)
    
    # getting only unique noun list(which is used in binary columns)
    unique_noun_list = []   # it is for unique words
    for i in range(0,len(df1[colname])): # iterating the len of noun column
        for j in df1[colname][i]:        # iterating the text of each index in noun column
            if j not in unique_noun_list:      # if the iterated text not presented inside the list
                unique_noun_list.append(j)     # then we append it into the list
            else:                              # if not we leave it and continue
                continue
    print("Unique Noun Lists: ")
    print("Showing the sample output: ",unique_noun_list[0:10])
    return data

# Association Rule
from mlxtend.frequent_patterns import apriori, association_rules
def association_rule(data,dup_col1,dup_col2):
    df2 = data.copy()
    #using Apriori rule to find the frequent set
    freq_reviewset = apriori(df = df2, min_support = 0.03, use_colnames=True) # fixing the support as 0.03
    freq_reviewset.to_csv('frequent_review_set.csv', encoding='utf-8')
    
    rules = association_rules(freq_reviewset,min_threshold=0.2) #fixing threshold as 0.2
    rules.to_csv('association_rule.csv', encoding='utf-8')
    
    rules.drop_duplicates(subset=[dup_col1,dup_col2],keep='first',inplace=True)
    return rules


# Finding the max support and removing the duplicates

def max_support_finding(rules,one_of_asso_column):
    rules1 = rules[['antecedents','consequents','support']]
    l1 = []
    l2 = []
    max = []
    
    df2 = rules1
    dfs =  pd.DataFrame()
    dbs =  pd.DataFrame()
    
    for x in range(len(df2)):
        l1.append(df2.iloc[x,0])
        l2.append(df2.iloc[x,1])
    
    for x in range(len(l1)):
        for y in range(len(df2)):
            if (l1[x] == df2.iloc[y,1]) and (l2[x] == df2.iloc[y,0]):
                kd = (df2.iloc[[x],:])
                dfs = pd.concat([dfs,kd],axis = 0)
                dfs.sort_values(by =one_of_asso_column, ascending = False)
    
    f = list(dfs.index)

    dfnon = df2.drop(f)
    

    for x in range(len(dfs)):
        if x % 2 == 0:
            s = dfs.iloc[x:x+2 , 0:2].max()
            max.append(dfs.iloc[x:x+2,2].max())
            #print(s)
            dbs = pd.concat([dbs,s],axis = 1)
    
    dbs = dbs.T.reset_index(drop=True)
    
    dbs[one_of_asso_column] = max

    new_data = pd.concat([dfnon , dbs])

    new_data.reset_index(drop=True,inplace = True)

    return (new_data)


#Compactness Pruning
def compact_pruning():
    prun_data = new_data.copy()

    # converting the frozenset to list
    prun_data['antecedents'] = prun_data[['antecedents']].applymap(lambda x: list(x))
    prun_data['consequents'] = prun_data[['consequents']].applymap(lambda x: list(x))

    # conversion from list to str
    prun_data['antecedents'] = prun_data[['antecedents']].applymap(lambda x: " ".join(x))
    prun_data['consequents'] = prun_data[['consequents']].applymap(lambda x: " ".join(x))

    comp = []
    first = []
    second = []
    count=0
    k=0
    mindis = []
    count = 0
    feat = []
    w = []

    s = []
    for i in df1['Review_sent_token']:
        for j in i:
            s.append(j.split( ))

    for i,j in zip(prun_data['antecedents'],prun_data['consequents']):
        feat.append(i)
        feat.append(j)
        #print(feat)
        for i in s:
            for word in i: 
                if word not in stop_words:
                    w.append(word)
            #print(w)
            if (feat[0] in w and feat[1] in w):
                #print(w)
                for j in w:
                    #print(j)
                    if j == feat[0]:
                        #print(j)
                        k =w.index(feat[0],k)
                        #print(k)
                        first.append(k)
                        k += 1
                k = 0
                for j in w:
                    #print(j)
                    if(j==feat[1]):
                        k = w.index(feat[1],k)
                        second.append(k)
                        k += 1
                for i in first:
                    for j in second:
                        if(i<j):
                            mindis.append((j-i)-1)
                    #print(mindis)
                    if(mindis == []):
                        break
                    if (min(mindis)) <= 5:
                        #print(min(mindis))
                        count+= 1
                    #print(count)
                k = 0
            w = []
            first = []
            second = []
            mindis = []
        if (count >= 2):
                comp.append(feat)
        feat = []
        count = 0
    df_comp = pd.DataFrame(comp)
    df_comp = df_comp.rename(columns = {0:'antecedents',1:'consequents'}, inplace = False)
    
    return (comp)

# Redundancy Pruning

# 1.two feature feature for redundancy pruning:
def twofeats():
    twofeat = {}
    w = []
    feat = []
    k = 0
    count = 0
    s = []
    for i in df1['Review_sent_token']:
        for j in i:
            s.append(j.split( ))
    for i,j in comp:
        feat.append(i)
        feat.append(j)
        #print(feat)
        for i in s:
            for word in i: 
                if word not in stop_words:
                    w.append(word)
            #print(w)
            if (feat[0] in w and feat[1] in w):
                #print(w)
                for j in w:
                    #print(j)
                    if j == feat[0]:
                        #print(j)
                        k =w.index(feat[0],k)
                        #print(k)
                        first.append(k)
                        k += 1
                k = 0
                for j in w:
                    #print(j)
                    if(j==feat[1]):
                        k = w.index(feat[1],k)
                        second.append(k)
                        k += 1
                for i in first:
                    for j in second:
                        if(i<j):
                            mindis.append((j-i)-1)
                    #print(mindis)
                    if(mindis == []):
                        break
                    if (min(mindis)) <= 5:
                        #print(min(mindis))
                        count+= 1
                    #print(count)
                k = 0
            w = []
            first = []
            second = []
            mindis = []
        #print(feat)
        twofeat[feat[0],feat[1]] = count
        feat = []
        #print(count)
        count = 0
    return twofeat

# 2. single feature for redundancy pruning:
def single_feat():
    count = 0
    sinfeat = {}
    s = []
    for i in df1['Review_sent_token']:
        for j in i:
            s.append(j.split( ))
    for l in np.unique(comp):
        for i in s:
            for j in i:
                if(j == l):
                    count+= 1
        #print(l)
        #print(count)
        sinfeat[l] = count
        count = 0
    return sinfeat
# 3. p support redundancy pruning:
def p_support(twofeat,sinfeat):
    sum2 = 0
    support = 0
    red = {}
    for j in sinfeat:
        #print(j)
        for i in twofeat:
            #print(i)
            if(j in i):
                sum2 += twofeat[i]
        support = sinfeat[j] - sum2
        red[j] = support
        support = 0
        sum2 = 0
    return red
    

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\vishw\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vishw\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## NLP Pre-Process

### 10 Hotel Names:
- Ocean Palms Goa
- Silver Sands Serenity
- Goa Woodlands Hotel
- The Byke Old Anchor Beach Resort & Spa
- Whispering Palms Beach Resort
- Rivasa Resort
- Rendezvous Beach Resort
- Hotel Campal
- Hotel MR Manfred
- Hotel Royal Palace

In [2]:
# NLP Pre-Process
df1 = entier_func(directory = r"D:\2.Praxis( all Stuff)\3. subject wise records\3.Term 3\1.CAPP\Data_File"
            ,data = "10_hotels_reviews.csv"
            ,column_drop = ['Rating','hotel_name']
            ,col_name = 'Review'
            )
df1.head()

(6046, 13)


Unnamed: 0,Review,Review_no_punc,Review_lower,Review_no_extra_space,Review_no_emoji,Review_sent_token,Stop_words_removal,Review_token,Review_above_3token,Review_pos_tag,Review_adjective,Review_Common_Noun,Review_Singular_Cnoun
0,"Good hotel, great staffs, nice food and servic...",Good hotel great staffs nice food and service....,good hotel great staffs nice food and service....,good hotel great staffs nice food and service....,good hotel great staffs nice food and service....,[good hotel great staffs nice food and service...,good hotel great staffs nice food service near...,good hotel great staffs nice food and service....,good hotel great staffs nice food service near...,"[(good, JJ), (hotel, NN), (great, JJ), (staffs...","[good, great, nice]","[hotel, staffs, food, service, beach, shopping...","[hotel, staff, food, service, beach, shopping,..."
1,"Very good hotel, nice service we will plan nex...",Very good hotel nice service we will plan next...,very good hotel nice service we will plan next...,very good hotel nice service we will plan next...,very good hotel nice service we will plan next...,[very good hotel nice service we will plan nex...,good hotel nice service plan next vacation als...,very good hotel nice service we will plan next...,very good hotel nice service will plan next va...,"[(very, RB), (good, JJ), (hotel, NN), (nice, J...","[good, nice, next, friendly]","[hotel, service, vacation, family, friends, st...","[hotel, service, vacation, family, friend, sta..."
2,Far away from the over crowded Calangute and B...,Far away from the over crowded Calangute and B...,far away from the over crowded calangute and b...,far away from the over crowded calangute and b...,far away from the over crowded calangute and b...,[far away from the over crowded calangute and ...,far away crowded calangute baga whispering pal...,far away from the over crowded calangute and b...,away from over crowded calangute baga whisperi...,"[(away, RB), (from, IN), (over, IN), (crowded,...","[crowded, candolim, perfect, commercial, allot...","[calangute, baga, palms, location, access, gat...","[calangute, baga, palm, location, acces, gate,..."
3,Very close to Candolim beach. Nice food and bu...,Very close to Candolim beach. Nice food and bu...,very close to candolim beach. nice food and bu...,very close to candolim beach. nice food and bu...,very close to candolim beach. nice food and bu...,"[very close to candolim beach., nice food and ...",close candolim beach nice food buffet swimming...,very close to candolim beach. nice food and bu...,very close candolim beach nice food buffet swi...,"[(very, RB), (close, RB), (candolim, JJ), (bea...","[candolim, nice, good]","[beach, food, buffet, pool]","[beach, food, buffet, pool]"
4,"Very nice ambience, just 2 mins walk from reso...",Very nice ambience just 2 mins walk from resor...,very nice ambience just 2 mins walk from resor...,very nice ambience just 2 mins walk from resor...,very nice ambience just 2 mins walk from resor...,[very nice ambience just 2 mins walk from reso...,nice ambience mins walk resort beach though ro...,very nice ambience just 2 mins walk from resor...,very nice ambience just mins walk from resort ...,"[(very, RB), (nice, JJ), (ambience, NN), (just...","[nice, good, good]","[ambience, mins, resort, beach, rooms, food, p...","[ambience, min, resort, beach, room, food, pla..."


## Binary conversion

In [3]:
# Binary conversion
data = binary_conversion(data='Review_Singular_Cnoun',colname = 'Review_Singular_Cnoun')
data

Unique Noun Lists: 
Showing the sample output:  ['hotel', 'staff', 'food', 'service', 'beach', 'shopping', 'market', 'vacation', 'family', 'friend']


Unnamed: 0,aand,aasma,absence,absolute,academy,acasium,accent,accept,acces,accessibility,...,youngster,your,youre,yourself,youve,yuck,yummy,zero,zomato,zone
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6041,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6042,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6043,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6044,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Association Rule

In [4]:
# Association Rule
rules = association_rule(data,dup_col1='antecedents',dup_col2='consequents')
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(area),(room),0.048627,0.468243,0.031426,0.646259,1.380176,0.008656,1.503235
1,(beach),(breakfast),0.217665,0.179788,0.047469,0.218085,1.213011,0.008336,1.048978
2,(breakfast),(beach),0.179788,0.217665,0.047469,0.264029,1.213011,0.008336,1.062998
3,(distance),(beach),0.043169,0.217665,0.035395,0.819923,3.766912,0.025999,4.344458
4,(beach),(food),0.217665,0.297056,0.081045,0.372340,1.253436,0.016387,1.119945
...,...,...,...,...,...,...,...,...,...
410,"(hotel, room, service)",(staff),0.086338,0.357757,0.037711,0.436782,1.220888,0.006823,1.140309
411,"(staff, room)","(hotel, service)",0.181277,0.137446,0.037711,0.208029,1.513531,0.012795,1.089123
412,"(staff, service)","(hotel, room)",0.101886,0.234535,0.037711,0.370130,1.578142,0.013815,1.215274
413,"(hotel, service)","(staff, room)",0.137446,0.181277,0.037711,0.274368,1.513531,0.012795,1.128290


## Finding the Max Support

In [5]:
# Finding the Max Support and removing the duplicates
new_data = max_support_finding(rules,one_of_asso_column='support')
new_data

Unnamed: 0,antecedents,consequents,support
0,(area),(room),0.031426
1,(distance),(beach),0.035395
2,(place),(beach),0.038538
3,(resort),(beach),0.031591
4,(breakfast),(food),0.048462
...,...,...,...
324,"(room, service)","(hotel, location)",0.033907
325,"(hotel, service)","(room, location)",0.033907
326,"(room, location)","(staff, hotel)",0.039530
327,"(hotel, location)","(staff, room)",0.039530


## Compactness Pruning

In [6]:
# Compactness Pruning
comp = compact_pruning()
comp

[['area', 'room'],
 ['distance', 'beach'],
 ['place', 'beach'],
 ['resort', 'beach'],
 ['breakfast', 'food'],
 ['breakfast', 'hotel'],
 ['breakfast', 'stay'],
 ['experience', 'food'],
 ['experience', 'hotel'],
 ['experience', 'room'],
 ['experience', 'service'],
 ['experience', 'staff'],
 ['facility', 'hotel'],
 ['facility', 'location'],
 ['facility', 'room'],
 ['facility', 'staff'],
 ['family', 'hotel'],
 ['family', 'room'],
 ['family', 'staff'],
 ['family', 'stay'],
 ['money', 'food'],
 ['place', 'food'],
 ['pool', 'food'],
 ['quality', 'food'],
 ['restaurant', 'food'],
 ['time', 'food'],
 ['market', 'hotel'],
 ['money', 'hotel'],
 ['night', 'hotel'],
 ['place', 'hotel'],
 ['pool', 'hotel'],
 ['quality', 'hotel'],
 ['restaurant', 'hotel'],
 ['time', 'hotel'],
 ['value', 'hotel'],
 ['water', 'hotel'],
 ['money', 'location'],
 ['place', 'location'],
 ['pool', 'location'],
 ['money', 'room'],
 ['money', 'staff'],
 ['night', 'room'],
 ['place', 'room'],
 ['place', 'service'],
 ['place', 

## Redundancy Pruning

In [7]:
# Two Feature Counting
twofeat = twofeats()
twofeat

{('area', 'room'): 5,
 ('distance', 'beach'): 54,
 ('place', 'beach'): 30,
 ('resort', 'beach'): 42,
 ('breakfast', 'food'): 65,
 ('breakfast', 'hotel'): 40,
 ('breakfast', 'stay'): 13,
 ('experience', 'food'): 18,
 ('experience', 'hotel'): 45,
 ('experience', 'room'): 13,
 ('experience', 'service'): 11,
 ('experience', 'staff'): 30,
 ('facility', 'hotel'): 11,
 ('facility', 'location'): 4,
 ('facility', 'room'): 4,
 ('facility', 'staff'): 9,
 ('family', 'hotel'): 18,
 ('family', 'room'): 14,
 ('family', 'staff'): 6,
 ('family', 'stay'): 33,
 ('money', 'food'): 9,
 ('place', 'food'): 25,
 ('pool', 'food'): 24,
 ('quality', 'food'): 44,
 ('restaurant', 'food'): 65,
 ('time', 'food'): 11,
 ('market', 'hotel'): 10,
 ('money', 'hotel'): 30,
 ('night', 'hotel'): 5,
 ('place', 'hotel'): 16,
 ('pool', 'hotel'): 33,
 ('quality', 'hotel'): 20,
 ('restaurant', 'hotel'): 30,
 ('time', 'hotel'): 46,
 ('value', 'hotel'): 27,
 ('water', 'hotel'): 23,
 ('money', 'location'): 7,
 ('place', 'location')

In [8]:
# Single Feature Counting
sinfeat = single_feat()
sinfeat

{'area': 215,
 'beach': 1003,
 'breakfast': 1235,
 'distance': 205,
 'experience': 482,
 'facility': 119,
 'family': 376,
 'food': 1796,
 'hotel': 3631,
 'location': 1634,
 'market': 215,
 'money': 307,
 'night': 138,
 'place': 669,
 'pool': 938,
 'property': 322,
 'quality': 400,
 'resort': 413,
 'restaurant': 396,
 'room': 2363,
 'service': 1429,
 'staff': 2022,
 'stay': 1727,
 'time': 447,
 'value': 408,
 'view': 232,
 'water': 425}

In [9]:
# Finding the P-Support form (Single Feature - sum(Two Feature))
p_support(twofeat,sinfeat)

{'area': 210,
 'beach': 399,
 'breakfast': 877,
 'distance': 151,
 'experience': 365,
 'facility': 91,
 'family': 305,
 'food': 1034,
 'hotel': 2015,
 'location': 848,
 'market': 205,
 'money': 233,
 'night': 126,
 'place': 399,
 'pool': 714,
 'property': 316,
 'quality': 319,
 'resort': 355,
 'restaurant': 258,
 'room': 1554,
 'service': 864,
 'staff': 801,
 'stay': 1122,
 'time': 328,
 'value': 360,
 'view': 202,
 'water': 372}