In [53]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
import re
import pandas as pd

In [54]:
data=pd.read_csv("./data_csv/tripadvisor_hotel_reviews.csv")

In [55]:
data.info()
data.head()
data["Review"][0]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 109 entries, 0 to 108
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Review  109 non-null    object
 1   Rating  109 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 1.8+ KB


'nice hotel expensive parking got good deal stay hotel anniversary, arrived late evening took advice previous reviews did valet parking, check quick easy, little disappointed non-existent view room room clean nice size, bed comfortable woke stiff neck high pillows, not soundproof like heard music room night morning loud bangs doors opening closing hear people talking hallway, maybe just noisy neighbors, aveda bath products nice, did not goldfish stay nice touch taken advantage staying longer, location great walking distance shopping, overall nice experience having pay 40 parking night,  '

We are preprocessing the data.

The first part is making all the data lowercase

In [56]:
data["review_lowercase"]=data["Review"].str.lower()
data.head()

Unnamed: 0,Review,Rating,review_lowercase
0,nice hotel expensive parking got good deal sta...,4,nice hotel expensive parking got good deal sta...
1,ok nothing special charge diamond member hilto...,2,ok nothing special charge diamond member hilto...
2,nice rooms not 4* experience hotel monaco seat...,3,nice rooms not 4* experience hotel monaco seat...
3,"unique, great stay, wonderful time hotel monac...",5,"unique, great stay, wonderful time hotel monac..."
4,"great stay great stay, went seahawk game aweso...",5,"great stay great stay, went seahawk game aweso..."


We will remove any stopwords within data

In [57]:
en_stopwords=stopwords.words("english")
en_stopwords.remove("not")
data["remove_no_stopwords"] = data["review_lowercase"].apply(lambda x: " ".join(word for word in x.split() if word not in en_stopwords))
data.head()
#lambda x: " ".join(word for word in x.split() if word not in en_stopwords
#x is each of the reviews, and doing x.split(), to seperate the sentences into words. 
#Afterwads we are  " ".join if the words is not part of en_stopwords. 


Unnamed: 0,Review,Rating,review_lowercase,remove_no_stopwords
0,nice hotel expensive parking got good deal sta...,4,nice hotel expensive parking got good deal sta...,nice hotel expensive parking got good deal sta...
1,ok nothing special charge diamond member hilto...,2,ok nothing special charge diamond member hilto...,ok nothing special charge diamond member hilto...
2,nice rooms not 4* experience hotel monaco seat...,3,nice rooms not 4* experience hotel monaco seat...,nice rooms not 4* experience hotel monaco seat...
3,"unique, great stay, wonderful time hotel monac...",5,"unique, great stay, wonderful time hotel monac...","unique, great stay, wonderful time hotel monac..."
4,"great stay great stay, went seahawk game aweso...",5,"great stay great stay, went seahawk game aweso...","great stay great stay, went seahawk game aweso..."


We are going to remove all punctuation except the * because this symbols plays a role in reviewing in stars. This is posisble through regex

In [58]:
data["remove_no_stopwords_no_punct"] = data.apply(lambda x: re.sub(r"[*]", "star", x["remove_no_stopwords"]), axis=1)
#pandas.apply() means it applys a funcitons along an axis, and by stating axis=1 it means that it goes down the column
#x represent each row and x["remove_no_stopwords"] means go through a section within each row. 
data.head()

Unnamed: 0,Review,Rating,review_lowercase,remove_no_stopwords,remove_no_stopwords_no_punct
0,nice hotel expensive parking got good deal sta...,4,nice hotel expensive parking got good deal sta...,nice hotel expensive parking got good deal sta...,nice hotel expensive parking got good deal sta...
1,ok nothing special charge diamond member hilto...,2,ok nothing special charge diamond member hilto...,ok nothing special charge diamond member hilto...,ok nothing special charge diamond member hilto...
2,nice rooms not 4* experience hotel monaco seat...,3,nice rooms not 4* experience hotel monaco seat...,nice rooms not 4* experience hotel monaco seat...,nice rooms not 4star experience hotel monaco s...
3,"unique, great stay, wonderful time hotel monac...",5,"unique, great stay, wonderful time hotel monac...","unique, great stay, wonderful time hotel monac...","unique, great stay, wonderful time hotel monac..."
4,"great stay great stay, went seahawk game aweso...",5,"great stay great stay, went seahawk game aweso...","great stay great stay, went seahawk game aweso...","great stay great stay, went seahawk game aweso..."


In [59]:
data["remove_no_stopwords_no_punct"] = data.apply(lambda x: re.sub(r"([^\w\s])", "", x["remove_no_stopwords"]), axis=1)
data.head()

Unnamed: 0,Review,Rating,review_lowercase,remove_no_stopwords,remove_no_stopwords_no_punct
0,nice hotel expensive parking got good deal sta...,4,nice hotel expensive parking got good deal sta...,nice hotel expensive parking got good deal sta...,nice hotel expensive parking got good deal sta...
1,ok nothing special charge diamond member hilto...,2,ok nothing special charge diamond member hilto...,ok nothing special charge diamond member hilto...,ok nothing special charge diamond member hilto...
2,nice rooms not 4* experience hotel monaco seat...,3,nice rooms not 4* experience hotel monaco seat...,nice rooms not 4* experience hotel monaco seat...,nice rooms not 4 experience hotel monaco seatt...
3,"unique, great stay, wonderful time hotel monac...",5,"unique, great stay, wonderful time hotel monac...","unique, great stay, wonderful time hotel monac...",unique great stay wonderful time hotel monaco ...
4,"great stay great stay, went seahawk game aweso...",5,"great stay great stay, went seahawk game aweso...","great stay great stay, went seahawk game aweso...",great stay great stay went seahawk game awesom...


The next task is to tokenize the text


In [60]:
data["tokenize"] = data.apply(lambda x: word_tokenize(x["remove_no_stopwords_no_punct"]), axis = 1)
data["tokenize"].iloc[0]

['nice',
 'hotel',
 'expensive',
 'parking',
 'got',
 'good',
 'deal',
 'stay',
 'hotel',
 'anniversary',
 'arrived',
 'late',
 'evening',
 'took',
 'advice',
 'previous',
 'reviews',
 'valet',
 'parking',
 'check',
 'quick',
 'easy',
 'little',
 'disappointed',
 'nonexistent',
 'view',
 'room',
 'room',
 'clean',
 'nice',
 'size',
 'bed',
 'comfortable',
 'woke',
 'stiff',
 'neck',
 'high',
 'pillows',
 'not',
 'soundproof',
 'like',
 'heard',
 'music',
 'room',
 'night',
 'morning',
 'loud',
 'bangs',
 'doors',
 'opening',
 'closing',
 'hear',
 'people',
 'talking',
 'hallway',
 'maybe',
 'noisy',
 'neighbors',
 'aveda',
 'bath',
 'products',
 'nice',
 'not',
 'goldfish',
 'stay',
 'nice',
 'touch',
 'taken',
 'advantage',
 'staying',
 'longer',
 'location',
 'great',
 'walking',
 'distance',
 'shopping',
 'overall',
 'nice',
 'experience',
 'pay',
 '40',
 'parking',
 'night']

After tokenizing the text, we use the PorterStemmer to reduce the words to their root forms.

In [61]:
ps=PorterStemmer()

In [62]:
data["stemmed"]=data.apply(lambda x: [ps.stem(word) for word in x["tokenize"]] , axis = 1)
data["stemmed2"]=data["tokenize"].apply(lambda tokens: [ps.stem(token) for token in tokens])
print(data["stemmed"].iloc[0])
print(data["stemmed2"].iloc[0])

['nice', 'hotel', 'expens', 'park', 'got', 'good', 'deal', 'stay', 'hotel', 'anniversari', 'arriv', 'late', 'even', 'took', 'advic', 'previou', 'review', 'valet', 'park', 'check', 'quick', 'easi', 'littl', 'disappoint', 'nonexist', 'view', 'room', 'room', 'clean', 'nice', 'size', 'bed', 'comfort', 'woke', 'stiff', 'neck', 'high', 'pillow', 'not', 'soundproof', 'like', 'heard', 'music', 'room', 'night', 'morn', 'loud', 'bang', 'door', 'open', 'close', 'hear', 'peopl', 'talk', 'hallway', 'mayb', 'noisi', 'neighbor', 'aveda', 'bath', 'product', 'nice', 'not', 'goldfish', 'stay', 'nice', 'touch', 'taken', 'advantag', 'stay', 'longer', 'locat', 'great', 'walk', 'distanc', 'shop', 'overal', 'nice', 'experi', 'pay', '40', 'park', 'night']
['nice', 'hotel', 'expens', 'park', 'got', 'good', 'deal', 'stay', 'hotel', 'anniversari', 'arriv', 'late', 'even', 'took', 'advic', 'previou', 'review', 'valet', 'park', 'check', 'quick', 'easi', 'littl', 'disappoint', 'nonexist', 'view', 'room', 'room', 'c

In [63]:
nltk.download("wordnet")
lemmatizer=WordNetLemmatizer()

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\tle72\AppData\Roaming\nltk_data...


In [64]:
data["lemmatize"]=data["tokenize"].apply(lambda tokens:[lemmatizer.lemmatize(token) for token in tokens])
data["lemmatize"].iloc[0]

['nice',
 'hotel',
 'expensive',
 'parking',
 'got',
 'good',
 'deal',
 'stay',
 'hotel',
 'anniversary',
 'arrived',
 'late',
 'evening',
 'took',
 'advice',
 'previous',
 'review',
 'valet',
 'parking',
 'check',
 'quick',
 'easy',
 'little',
 'disappointed',
 'nonexistent',
 'view',
 'room',
 'room',
 'clean',
 'nice',
 'size',
 'bed',
 'comfortable',
 'woke',
 'stiff',
 'neck',
 'high',
 'pillow',
 'not',
 'soundproof',
 'like',
 'heard',
 'music',
 'room',
 'night',
 'morning',
 'loud',
 'bang',
 'door',
 'opening',
 'closing',
 'hear',
 'people',
 'talking',
 'hallway',
 'maybe',
 'noisy',
 'neighbor',
 'aveda',
 'bath',
 'product',
 'nice',
 'not',
 'goldfish',
 'stay',
 'nice',
 'touch',
 'taken',
 'advantage',
 'staying',
 'longer',
 'location',
 'great',
 'walking',
 'distance',
 'shopping',
 'overall',
 'nice',
 'experience',
 'pay',
 '40',
 'parking',
 'night']

We are inspecting our data by using N-grams

First part is to put all the tokens into one list. By putting the lemmatize and an empty list into one list

In [66]:
tokens_clean=sum(data["lemmatize"],[])
unigrams=pd.Series(nltk.ngrams(tokens_clean, 1)).value_counts()
print(unigrams)

(hotel,)       292
(room,)        275
(great,)       126
(not,)         122
(stay,)         95
              ... 
(midweek,)       1
(smirking,)      1
(40,)            1
(diamond,)       1
(od,)            1
Name: count, Length: 2588, dtype: int64


In [68]:
tokens_clean=sum(data["lemmatize"],[])
bigrams=pd.Series(nltk.ngrams(tokens_clean, 2)).value_counts()
print(bigrams)

(great, location)    24
(space, needle)      21
(hotel, monaco)      16
(great, view)        12
(pike, place)        12
                     ..
(robe, slipper)       1
(slipper, walk)       1
(walk, right)         1
(right, hotel)        1
(hotel, store)        1
Name: count, Length: 8262, dtype: int64
