In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import re
from bs4 import BeautifulSoup
import spacy
import unidecode
from word2number import w2n
import contractions
from collections import Counter
from text_preprocessing_function import *
import time
import json
#import concurrent.futures
import nltk

In [2]:
#Showing graphs inline
%matplotlib inline

In [3]:
nlp = spacy.load("en_core_web_lg")
#from spacy.lang.en import English
#nlp = English() 

In [4]:
# exclude words from spacy stopwords list
deselect_stop_words = ['no', 'not']
for w in deselect_stop_words:
    nlp.vocab[w].is_stop = False

In [10]:
print(nlp.Defaults.stop_words)

{'forty', 'only', 'nevertheless', 'whither', 'i', 'be', 'am', 'you', 'becomes', 'whether', 'anything', 'former', 'fifteen', 'then', 'sixty', 'beside', '’re', 'indeed', "'d", 'well', 'twelve', 'afterwards', 'which', 'such', 'onto', 'anywhere', 'into', 'within', '‘d', 'were', 'where', 'should', 'doing', 'n‘t', 'their', 'really', 'yours', 'no', 'except', 'first', "'s", 'those', 'alone', 'several', 'almost', 'via', 'been', 'by', 'my', 'due', "'re", 'on', 'somehow', 'same', 'this', 'whatever', 'however', 'him', 'anyhow', 'so', 'put', 'has', 'own', 'out', 'does', 'throughout', 'with', 'than', 'nor', 'third', 'elsewhere', 'do', 'some', "'ve", 'above', 'during', 'an', 'would', 'even', 'nowhere', 'take', 'before', 'he', 'them', 'whole', 'below', 'over', '‘re', 'our', 'yourselves', 'who', 'whenever', 'made', 'nothing', 'someone', 'whom', 'of', 'keep', 'hereafter', '’ll', 'namely', 'that', 'just', 'any', 'meanwhile', 'because', 'yet', 'can', 'everything', 'four', 'fifty', 'along', 'hereby', 'elev

In [11]:
#Load the data set
data_raw = pd.read_csv("Data/tripadvisor_hotel_reviews.csv")
data_raw = data_raw[:][0:30]

In [12]:
def text_preprocessing1(text, strip_html=True, extra_whitespace=True, accented_chars=True, contraction=True, lowercase=True,
                       space_correction =True,
                       stop_words=True, punctuations=True, special_chars=True, remove_num=True, convert_num=True, lemmatization=True         
                       ):
    """preprocess text with default option set to true for all steps"""
    
    if strip_html == True: #convert all characters to lowercase
        soup = BeautifulSoup(text, "html.parser")
        text = soup.get_text(separator=" ")
    if extra_whitespace == True: #remove extra whitespaces
        text = text.strip()
        text = " ".join(text.split())
    if accented_chars == True: #remove accented characters
        text = unidecode.unidecode(text)
    if contraction == True: #expand contractions
        text = contractions.fix(text)
    if lowercase == True: #convert all characters to lowercase
        text = text.lower()
    if space_correction == True: #The regular expression loocs for cases where there is no whitespace after the end of a sentace
        text = re.sub(r'(?<=[.,])(?=[^\s])', r' ', text)
    return text

In [13]:
#Apply basic preprocessing
data_raw['Review'] = data_raw['Review'].apply(lambda x: text_preprocessing1(x, lowercase=False))

#Lemmatize all words
for t in range(len(data_raw['Review'])):
    doc = nlp(data_raw['Review'][t])
    data_raw['Review'][t]= str(" ".join([token.lemma_ for token in doc]))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


In [14]:
#Calculate IDF_values
IDF_values = IDF(data_raw['Review'])

In [19]:
#Doing processing over the whole data, here 30
de = [ word2features(nlp(x), IDF_values,len(data_raw['Review'])) for x in data_raw['Review'] ]

In [23]:
de[1]

[[{'bias': 1.0,
   'word': 'ok',
   'isTitle': False,
   'entity': '',
   'tag': 'UH',
   'pos': 91,
   'Len': 0.16666666666666666,
   'PoS': 1,
   'TF*IDF': 0.1354025100551105,
   'DEP': 1,
   'vect_norm': 5.788086,
   'BOS': True,
   '+1:word': 'nothing',
   '+1:isTitle': False,
   '+1:entity': '',
   '+1:tag': 'NN',
   '+1:pos': 95,
   '+1:Len': 0.5833333333333334,
   '+1:PoS': 1,
   '+1:TF*IDF': 0.10074515102711323,
   '+1:DEP': 1,
   '+1:vect_norm': 5.004161},
  {'bias': 1.0,
   'word': 'nothing',
   'isTitle': False,
   'entity': '',
   'tag': 'NN',
   'pos': 95,
   'Len': 0.5833333333333334,
   'PoS': 1,
   'TF*IDF': 0.10074515102711323,
   'DEP': 1,
   'vect_norm': 5.004161,
   '-1:word': 'ok',
   '-1:isTitle': False,
   '-1:entity': '',
   '-1:tag': 'UH',
   '-1:pos': 91,
   '-1:Len': 0.16666666666666666,
   '-1:PoS': 1,
   '-1:TF*IDF': 0.1354025100551105,
   '-1:DEP': 1,
   '-1:vect_norm': 5.788086,
   'EOS': True}],
 [{'bias': 1.0,
   'word': 'special',
   'isTitle': False,


In [34]:
data_raw['Review'][1]

'ok nothing special charge diamond member hilton decide chain shoot 20th anniversary seattle , start book suite pay extra website description not , suite bedroom bathroom standard hotel room , take print reservation desk show say thing like tv couch ect desk clerk tell oh mixed suites description kimpton website sorry free breakfast , get kid , embassy suit sit room bathroom bedroom unlike kimpton call suite , 5 day stay offer correct false advertising , send kimpton preferred guest website email ask failure provide suite advertise website reservation description furnish hard copy reservation printout website desk manager duty do not reply solution , send email trip guest survey do not follow email mail , guess tell concerned guest . the staff range indifferent not helpful , ask desk good breakfast spot neighborhood hood tell no hotel , gee good breakfast spot seattle 1/2 block away convenient hotel do not know exist , arrive late night 11 pm inside run bellman busy chat cell phone hel