# Hotel Reviews: Text Pre-Processing

## Packages

In [1]:
import pandas as pd
import numpy as np
import re

import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

from textblob import TextBlob
from spellchecker import SpellChecker
from emot.emo_unicode import UNICODE_EMO, EMOTICONS

from collections import Counter

import warnings
warnings.filterwarnings('ignore')

## Data

In [2]:
# read cleaned df

hotel_df = pd.read_csv('data/hotel_reviews.csv')
hotel_df.head(2)

Unnamed: 0,index,hotel_city,latitude,longitude,name,hotel_state,date,rating,text,user_city,user_state,language,score
0,0,Mableton,45.421611,12.376187,Hotel Russo Palace,GA,2013-09-22 00:00:00+00:00,4.0,Pleasant 10 min walk along the sea front to th...,,,"('en', -390.9012541770935)",positive
1,1,Mableton,45.421611,12.376187,Hotel Russo Palace,GA,2015-04-03 00:00:00+00:00,5.0,Really lovely hotel. Stayed on the very top fl...,,,"('en', -535.3024659156799)",positive


In [3]:
hotel_df.shape

(31670, 13)

In [4]:
# select only the rating score and text columns

df = hotel_df[['score', 'text']]
df.head(2)

Unnamed: 0,score,text
0,positive,Pleasant 10 min walk along the sea front to th...
1,positive,Really lovely hotel. Stayed on the very top fl...


In [5]:
# create copy of original text column
df.loc[:,'text_cln'] = df['text']

In [6]:
df

Unnamed: 0,score,text,text_cln
0,positive,Pleasant 10 min walk along the sea front to th...,Pleasant 10 min walk along the sea front to th...
1,positive,Really lovely hotel. Stayed on the very top fl...,Really lovely hotel. Stayed on the very top fl...
2,positive,We stayed here for four nights in October. The...,We stayed here for four nights in October. The...
3,positive,We loved staying on the island of Lido! You ne...,We loved staying on the island of Lido! You ne...
4,positive,Lovely view out onto the lagoon. Excellent vie...,Lovely view out onto the lagoon. Excellent vie...
...,...,...,...
31665,positive,We truly enjoyed staying at the Elk Springs re...,We truly enjoyed staying at the Elk Springs re...
31666,positive,We were here for a 2nd time the last weekend o...,We were here for a 2nd time the last weekend o...
31667,positive,Best of the Best!!!! My family and I stayed in...,Best of the Best!!!! My family and I stayed in...
31668,positive,Amazing Vacation in Beautiful Cabin We stayed ...,Amazing Vacation in Beautiful Cabin We stayed ...


In [7]:
df.shape

(31670, 3)

## Text Pre-Processing

### Make Text Lower Case

Make text lower case so words with mixed cases are treated as the same word and not two separate words.

In [8]:
df.loc[:,'text_cln'] = df.loc[:,'text_cln'].str.lower()

In [9]:
df['text_cln'].head(20)

0     pleasant 10 min walk along the sea front to th...
1     really lovely hotel. stayed on the very top fl...
2     we stayed here for four nights in october. the...
3     we loved staying on the island of lido! you ne...
4     lovely view out onto the lagoon. excellent vie...
5     it was ok hotel is nice from in and out but ro...
6     the hotel staff was very friendly and helpful....
7     nice hotel , with very friendly staff and help...
8                                                   . .
9     don't stay here unless you're less than 2 feet...
10    we had absolutely no problems whatsoever with ...
11    lovely hotel, 10 min walk to the water bus sto...
12    located on the lido i would recommend this hot...
13    great stay...close to ferry.food not so good n...
14    stayed with parents, wife twin toddlers in two...
15    this hotel is in lido which is a better choice...
16    it was a 10 min+ walk to water bus, would have...
17    room was tiny-bed saggy-bathroom door didn

### Remove Punctuations

In [10]:
df.loc[:,'text_cln'] = df.loc[:,'text_cln'].str.replace('[^\w\s]',' ')

In [11]:
df['text_cln'].head(20)

0     pleasant 10 min walk along the sea front to th...
1     really lovely hotel  stayed on the very top fl...
2     we stayed here for four nights in october  the...
3     we loved staying on the island of lido  you ne...
4     lovely view out onto the lagoon  excellent vie...
5     it was ok hotel is nice from in and out but ro...
6     the hotel staff was very friendly and helpful ...
7     nice hotel   with very friendly staff and help...
8                                                      
9     don t stay here unless you re less than 2 feet...
10    we had absolutely no problems whatsoever with ...
11    lovely hotel  10 min walk to the water bus sto...
12    located on the lido i would recommend this hot...
13    great stay   close to ferry food not so good n...
14    stayed with parents  wife twin toddlers in two...
15    this hotel is in lido which is a better choice...
16    it was a 10 min  walk to water bus  would have...
17    room was tiny bed saggy bathroom door didn

### Remove Numbers & Strip Whitespace

In [12]:
def remove_numbers(text):
    """Function to remove numbers and whitespace."""
    
    # remove numbers
    pattern = r'[^a-zA-z\s*|\s\\s]' 
    text2 = re.sub(pattern, '', text)
    
    # remove white space where number was at
    pattern2 = r'^\s*|\s\s*'
    return re.sub(pattern2, ' ', text2).strip()

In [13]:
df.loc[:,'text_cln'] = df.loc[:,'text_cln'].apply(lambda x: str(remove_numbers(x)))
df['text_cln'].head(20)

0     pleasant min walk along the sea front to the w...
1     really lovely hotel stayed on the very top flo...
2     we stayed here for four nights in october the ...
3     we loved staying on the island of lido you nee...
4     lovely view out onto the lagoon excellent view...
5     it was ok hotel is nice from in and out but ro...
6     the hotel staff was very friendly and helpful ...
7     nice hotel with very friendly staff and helpfu...
8                                                      
9     don t stay here unless you re less than feet t...
10    we had absolutely no problems whatsoever with ...
11    lovely hotel min walk to the water bus stop on...
12    located on the lido i would recommend this hot...
13    great stay close to ferry food not so good nearby
14    stayed with parents wife twin toddlers in two ...
15    this hotel is in lido which is a better choice...
16    it was a min walk to water bus would have like...
17    room was tiny bed saggy bathroom door didn

### Remove Emoji's and Emoitcons

Emoticons are faces written with text :-)

In [14]:
def emoji(string):
    """Function to remove emoji."""
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', string)

def remove_emoticons(text):
    """Function to remove emoticons."""
    emoticon_pattern = re.compile(u'(' + u'|'.join(k for k in EMOTICONS) + u')')
    return emoticon_pattern.sub(r'', text)

In [15]:
# remove any emoji's
df.loc[:,'text_cln'] = df.loc[:,'text_cln'].apply(lambda x: str(emoji(x)))

In [16]:
# remove any emoticons
df.loc[:,'text_cln'] = df.loc[:,'text_cln'].apply(lambda x: str(remove_emoticons(x)))

### Remove URL's

Just in case there are any URL's in the text, these will be removed.

In [17]:
def remove_urls(text):
    """Function to remove urls."""
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', text)

In [18]:
# remove any url's
df.loc[:,'text_cln'] = df.loc[:,'text_cln'].apply(lambda x: str(remove_urls(x)))

### Spell Check Words

Make sure all misspelled words are treated as the correct spelled word.

In [19]:
spell = SpellChecker()

In [20]:
df.loc[:,'text_cln'] = df.loc[:,'text_cln'].apply(lambda x: str(spell.correction(x)))

In [21]:
df.to_csv('data/text_checkpoint1.csv', index=False)

### Normalization - Lemmatization

Lemmatization converts word to base form based off of context. It is more accurate than stemming.

In [22]:
lemmatizer = WordNetLemmatizer()

# part of speach tag
wordnet_map = {"N":wordnet.NOUN, "V":wordnet.VERB, "J":wordnet.ADJ, "R":wordnet.ADV} 

def lemmatize_words(text):
    """Function for lemmatization using part of speach tag"""
    pos_tagged_text = nltk.pos_tag(text.split())
    return " ".join([lemmatizer.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN)) 
                     for word, pos in pos_tagged_text])

In [23]:
df.loc[:,'text_cln'] = df.loc[:,'text_cln'].apply(lambda x: str(lemmatize_words(x)))

In [24]:
df['text_cln'].head(20)

0     pleasant min walk along the sea front to the w...
1     really lovely hotel stay on the very top floor...
2     we stay here for four night in october the hot...
3     we love stay on the island of lido you need to...
4     lovely view out onto the lagoon excellent view...
5     it be ok hotel be nice from in and out but roo...
6     the hotel staff be very friendly and helpful t...
7     nice hotel with very friendly staff and helpfu...
8                                                     i
9     don t stay here unless you re less than foot t...
10    we have absolutely no problem whatsoever with ...
11    lovely hotel min walk to the water bus stop on...
12    locate on the lido i would recommend this hote...
13    great stay close to ferry food not so good nearby
14    stay with parent wife twin toddler in two trip...
15    this hotel be in lido which be a good choice t...
16    it be a min walk to water bus would have like ...
17    room be tiny bed saggy bathroom door didn 

### Store Text as is for R Analysis

Store the text as is for R analyais before removing stop words.

In [25]:
cln_full_sentence = pd.Series(df['text_cln'], name='cln_full_sentence')

### Remove Stop Words

Remove words such as "the," "a," "and," "it," "is"

In [26]:
STOPWORDS = set(stopwords.words('english'))

def stopwords(text):
    """Function to remove stopwords"""
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])

In [27]:
# remove stopwrds
df.loc[:,'text_cln'] = df.loc[:,'text_cln'].apply(lambda x: str(stopwords(x)))

In [28]:
df['text_cln'].head(10)

0    pleasant min walk along sea front water bus re...
1    really lovely hotel stay top floor surprise ja...
2    stay four night october hotel staff welcome fr...
3    love stay island lido need take water venice g...
4    lovely view onto lagoon excellent view staff w...
5    ok hotel nice room small pay double bed bat at...
6    hotel staff friendly helpful room clean comfor...
7    nice hotel friendly staff helpful great choice...
8                                                     
9    stay unless less foot tall like sleep centiped...
Name: text_cln, dtype: object

### Rare Words

List of 100 rare words in the reviews. These are words that are most likely unique to an individual review and would not occur requently. They will be removed from the text.

In [29]:
freq = pd.Series(' '.join(df['text_cln']).split()).value_counts()[-100:] # 100 rare words
freq = list(freq.index)

In [30]:
freq

['psychological',
 'preschool',
 'inhouse',
 'neth',
 'bether',
 'antiquing',
 'inturotel',
 'sens',
 'faze',
 'wack',
 'transpire',
 'ge',
 'aaaahh',
 'submarine',
 'resurants',
 'regrettable',
 'merricks',
 'coarse',
 'motorize',
 'spectatular',
 'luminere',
 'workth',
 'yellowing',
 'bea',
 'krystie',
 'visiters',
 'tricked',
 'elavator',
 'imrovement',
 'latina',
 'clc',
 'heartless',
 'historian',
 'mored',
 'belgin',
 'elevatorhandy',
 'healthful',
 'azul',
 'quess',
 'annaheim',
 'sonic',
 'turnstile',
 'donkey',
 'hylton',
 'mcchicken',
 'eleventh',
 'laqunita',
 'hissy',
 'epoxy',
 'compareable',
 'unto',
 'wonferful',
 'brusque',
 'dinginess',
 'excelentemente',
 'sensibly',
 'shanna',
 'extortion',
 'leeway',
 'unloading',
 'inconvienant',
 'springer',
 'experiencein',
 'anteroom',
 'allergen',
 'cornelia',
 'friendyest',
 'propped',
 'cigerate',
 'dripping',
 'lounce',
 'taxing',
 'krista',
 'facilitator',
 'csumb',
 'renotivations',
 'persistently',
 'trackside',
 'bogans'

In [31]:
# if the word is not in the freq list, then keep it in the text_cln.

df.loc[:,'text_cln'] = df.loc[:,'text_cln'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))
df['text_cln'].head()

0    pleasant min walk along sea front water bus re...
1    really lovely hotel stay top floor surprise ja...
2    stay four night october hotel staff welcome fr...
3    love stay island lido need take water venice g...
4    lovely view onto lagoon excellent view staff w...
Name: text_cln, dtype: object

### Export Cleaned DF

In [32]:
text_clean = df[['text_cln']]

hotel_text_cln = pd.concat([hotel_df, cln_full_sentence, text_clean], axis=1)

In [33]:
hotel_text_cln.drop(['index'], axis=1, inplace=True)

In [34]:
hotel_text_cln.head(3)

Unnamed: 0,hotel_city,latitude,longitude,name,hotel_state,date,rating,text,user_city,user_state,language,score,cln_full_sentence,text_cln
0,Mableton,45.421611,12.376187,Hotel Russo Palace,GA,2013-09-22 00:00:00+00:00,4.0,Pleasant 10 min walk along the sea front to th...,,,"('en', -390.9012541770935)",positive,pleasant min walk along sea front water bus re...,pleasant min walk along sea front water bus re...
1,Mableton,45.421611,12.376187,Hotel Russo Palace,GA,2015-04-03 00:00:00+00:00,5.0,Really lovely hotel. Stayed on the very top fl...,,,"('en', -535.3024659156799)",positive,really lovely hotel stay top floor surprise ja...,really lovely hotel stay top floor surprise ja...
2,Mableton,45.421611,12.376187,Hotel Russo Palace,GA,2013-10-27 00:00:00+00:00,5.0,We stayed here for four nights in October. The...,,,"('en', -713.2087678909302)",positive,stay four night october hotel staff welcome fr...,stay four night october hotel staff welcome fr...


In [35]:
hotel_text_cln.to_csv('data/hotel_text_cln.csv', index=False)