# Sentiment Analysis

Practicing s

In [49]:
import pandas as pd
import spacy
import nltk
from wordcloud import WordCloud
import plotly.express as px

In [50]:
# Initialize spacy ‘en’ model, keeping only component needed for lemmatization and creating an engine:
nlp = spacy.load('en', disable=['parser', 'ner'])

In [51]:
data = pd.read_csv("../tripadvisor_hotel_reviews.csv")

In [52]:
data.head()

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,3
3,"unique, great stay, wonderful time hotel monac...",5
4,"great stay great stay, went seahawk game aweso...",5


## Simple Visualization

In [53]:
fig = px.histogram(data,
             x = 'Rating',
             title = 'Histogram of Review Rating',
             template = 'plotly_dark',
             color = 'Rating',
             color_discrete_sequence= px.colors.sequential.Blues_r,
             opacity = 0.8,
             height = 525,
             width = 835,
            )

fig.update_yaxes(title='Count')
fig.show()

# Text Processing

## Tokenize

Tokenize: Remove punc, space, symbols, numbers

In [26]:
from nltk.tokenize import word_tokenize
import re

In [32]:
text = data["Review"][1]

In [28]:
print(word_tokenize(text))

['ok', 'nothing', 'special', 'charge', 'diamond', 'member', 'hilton', 'decided', 'chain', 'shot', '20th', 'anniversary', 'seattle', ',', 'start', 'booked', 'suite', 'paid', 'extra', 'website', 'description', 'not', ',', 'suite', 'bedroom', 'bathroom', 'standard', 'hotel', 'room', ',', 'took', 'printed', 'reservation', 'desk', 'showed', 'said', 'things', 'like', 'tv', 'couch', 'ect', 'desk', 'clerk', 'told', 'oh', 'mixed', 'suites', 'description', 'kimpton', 'website', 'sorry', 'free', 'breakfast', ',', 'got', 'kidding', ',', 'embassy', 'suits', 'sitting', 'room', 'bathroom', 'bedroom', 'unlike', 'kimpton', 'calls', 'suite', ',', '5', 'day', 'stay', 'offer', 'correct', 'false', 'advertising', ',', 'send', 'kimpton', 'preferred', 'guest', 'website', 'email', 'asking', 'failure', 'provide', 'suite', 'advertised', 'website', 'reservation', 'description', 'furnished', 'hard', 'copy', 'reservation', 'printout', 'website', 'desk', 'manager', 'duty', 'did', 'not', 'reply', 'solution', ',', 'se

In [35]:
#tokenization and remove punctuations
words = [str(token) for token in nlp(text) if not token.is_punct] 

#remove digits and other symbols except "@"--used to remove email
words = [re.sub(r"[^A-Za-z@]", "", word) for word in words]

#remove empty spaces 
words = [word for word in words if word!=' ']

In [36]:
words

['ok',
 'nothing',
 'special',
 'charge',
 'diamond',
 'member',
 'hilton',
 'decided',
 'chain',
 'shot',
 'th',
 'anniversary',
 'seattle',
 'start',
 'booked',
 'suite',
 'paid',
 'extra',
 'website',
 'description',
 'not',
 'suite',
 'bedroom',
 'bathroom',
 'standard',
 'hotel',
 'room',
 'took',
 'printed',
 'reservation',
 'desk',
 'showed',
 'said',
 'things',
 'like',
 'tv',
 'couch',
 'ect',
 'desk',
 'clerk',
 'told',
 'oh',
 'mixed',
 'suites',
 'description',
 'kimpton',
 'website',
 'sorry',
 'free',
 'breakfast',
 'got',
 'kidding',
 'embassy',
 'suits',
 'sitting',
 'room',
 'bathroom',
 'bedroom',
 'unlike',
 'kimpton',
 'calls',
 'suite',
 '',
 'day',
 'stay',
 'offer',
 'correct',
 'false',
 'advertising',
 'send',
 'kimpton',
 'preferred',
 'guest',
 'website',
 'email',
 'asking',
 'failure',
 'provide',
 'suite',
 'advertised',
 'website',
 'reservation',
 'description',
 'furnished',
 'hard',
 'copy',
 'reservation',
 'printout',
 'website',
 'desk',
 'manager',

## Remove stop words 

Removing stop words in the list

In [44]:
#import other lists of stopwords
from nltk.corpus import stopwords 

stopwords = nltk.corpus.stopwords.words('english')

In [47]:
words=[word.lower() for word in words if word.lower() not in stopwords]
     #combine a list into one string   
string = " ".join(words)

In [48]:
string

'ok nothing special charge diamond member hilton decided chain shot th anniversary seattle start booked suite paid extra website description suite bedroom bathroom standard hotel room took printed reservation desk showed said things like tv couch ect desk clerk told oh mixed suites description kimpton website sorry free breakfast got kidding embassy suits sitting room bathroom bedroom unlike kimpton calls suite  day stay offer correct false advertising send kimpton preferred guest website email asking failure provide suite advertised website reservation description furnished hard copy reservation printout website desk manager duty reply solution send email trip guest survey follow email mail guess tell concerned guestthe staff ranged indifferent helpful asked desk good breakfast spots neighborhood hood told hotels gee best breakfast spots seattle  block away convenient hotel know exist arrived late night  pm inside run bellman busy chating cell phone help bagsprior arrival emailed hote

# Testing Vectorizer

## Count Vectorizer

Count vectorizer: bag of word approach extracts informatio from the word counts. Transform doc to vector.

In [55]:
from sklearn.feature_extraction.text import CountVectorizer

In [62]:
vec_str = [string]

In [63]:
vectorize = CountVectorizer(stop_words="english")
vectorize.fit(vec_str)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words='english',
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [65]:
vector = vectorize.transform(vec_str)

In [69]:
print(vectorize.vocabulary_)

{'ok': 110, 'special': 143, 'charge': 30, 'diamond': 50, 'member': 101, 'hilton': 80, 'decided': 46, 'chain': 28, 'shot': 138, 'th': 158, 'anniversary': 4, 'seattle': 132, 'start': 147, 'booked': 18, 'suite': 150, 'paid': 111, 'extra': 58, 'website': 168, 'description': 48, 'bedroom': 12, 'bathroom': 11, 'standard': 146, 'hotel': 82, 'room': 127, 'took': 161, 'printed': 117, 'reservation': 125, 'desk': 49, 'showed': 139, 'said': 129, 'things': 159, 'like': 97, 'tv': 164, 'couch': 43, 'ect': 53, 'clerk': 35, 'told': 160, 'oh': 109, 'mixed': 102, 'suites': 151, 'kimpton': 92, 'sorry': 142, 'free': 64, 'breakfast': 20, 'got': 68, 'kidding': 91, 'embassy': 56, 'suits': 152, 'sitting': 140, 'unlike': 165, 'calls': 25, 'day': 45, 'stay': 148, 'offer': 108, 'correct': 42, 'false': 61, 'advertising': 2, 'send': 133, 'preferred': 116, 'guest': 72, 'email': 54, 'asking': 8, 'failure': 60, 'provide': 120, 'advertised': 1, 'furnished': 65, 'hard': 75, 'copy': 41, 'printout': 118, 'manager': 100, '

In [70]:
print(vector.shape)

(1, 169)


In [71]:
print(vector.toarray())

[[1 1 1 1 2 3 1 1 1 1 1 2 2 1 1 1 2 1 1 1 3 1 1 1 1 1 1 1 2 2 1 1 1 1 1 1
  1 1 1 1 1 1 1 1 1 1 1 1 3 4 1 1 1 1 4 1 1 1 1 1 1 1 2 1 1 1 1 4 3 1 1 1
  2 1 1 1 1 1 1 2 1 1 5 2 1 1 1 1 1 1 1 1 3 1 1 1 1 2 1 1 1 1 1 1 1 1 1 2
  1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 3 1 6 1 1 1 1 3 2 1 1 1 1 1 1 1 1 1 1
  2 2 1 2 1 2 4 1 1 1 1 1 1 1 2 1 3 1 1 1 1 1 1 1 5]]
