In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk import WordNetLemmatizer, word_tokenize, ngrams
from nltk.tokenize import WordPunctTokenizer
from nltk.corpus import stopwords
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
from langdetect import detect
import emoji
from transformers import BertTokenizer
import sentencepiece as spm
import inflect
import contractions

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


# Data Import & Initial Exploration

In [2]:
# load the raw individual datasets
colombo = pd.read_csv('../data/raw/dataset_colombo_raw.csv')
east = pd.read_csv('../data/raw/dataset_east_raw.csv')
ella = pd.read_csv('../data/raw/dataset_ella_raw.csv')
galle = pd.read_csv('../data/raw/dataset_galle_raw.csv')
kandy = pd.read_csv('../data/raw/dataset_kandy_raw_1.csv')
nuwaraeliya = pd.read_csv('../data/raw/dataset_nuwaraeliya_raw.csv')

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# print the shape of the datasets
colombo.shape, east.shape, ella.shape, galle.shape, kandy.shape, nuwaraeliya.shape

((2000, 1012), (1846, 91), (1951, 870), (2100, 1992), (2000, 90), (1932, 1011))

Checking which columns to keep

In [4]:
kandy.columns.tolist()

['helpfulVotes',
 'id',
 'lang',
 'locationId',
 'ownerResponse',
 'ownerResponse/connectionToSubject',
 'ownerResponse/id',
 'ownerResponse/lang',
 'ownerResponse/publishedDate',
 'ownerResponse/responder',
 'ownerResponse/text',
 'placeInfo/address',
 'placeInfo/addressObj/city',
 'placeInfo/addressObj/country',
 'placeInfo/addressObj/postalcode',
 'placeInfo/addressObj/state',
 'placeInfo/addressObj/street1',
 'placeInfo/addressObj/street2',
 'placeInfo/id',
 'placeInfo/latitude',
 'placeInfo/locationString',
 'placeInfo/longitude',
 'placeInfo/name',
 'placeInfo/numberOfReviews',
 'placeInfo/rating',
 'placeInfo/ratingHistogram/count1',
 'placeInfo/ratingHistogram/count2',
 'placeInfo/ratingHistogram/count3',
 'placeInfo/ratingHistogram/count4',
 'placeInfo/ratingHistogram/count5',
 'placeInfo/webUrl',
 'placeInfo/website',
 'publishedDate',
 'publishedPlatform',
 'rating',
 'roomTip',
 'subratings/0/name',
 'subratings/0/value',
 'subratings/1/name',
 'subratings/1/value',
 'subra

In [5]:
kandy.head()

Unnamed: 0,helpfulVotes,id,lang,locationId,ownerResponse,ownerResponse/connectionToSubject,ownerResponse/id,ownerResponse/lang,ownerResponse/publishedDate,ownerResponse/responder,...,user/contributions/helpfulVotes,user/contributions/totalContributions,user/link,user/name,user/userId,user/userLocation,user/userLocation/id,user/userLocation/name,user/userLocation/shortName,user/username
0,0,972640226,en,4173413,,,,,,,...,0,1,www.tripadvisor.com/Profile/homea575,Home A,C8DEEF169BD0B06A7116F5AA8CBEE505,,,,,homea575
1,0,942345342,en,4173413,,,,,,,...,1,20,www.tripadvisor.com/Profile/R5273HHvalentind,Valentin D,74A8DBD1CFC8AB82944759ED1DE3E355,,,,,R5273HHvalentind
2,0,932695686,en,4173413,,,,,,,...,0,1,www.tripadvisor.com/Profile/dinast2024,Dinas T,6984DC788123C57BB9DB35B5F53E8CE9,,,,,dinast2024
3,0,919333160,en,4173413,,,,,,,...,0,1,www.tripadvisor.com/Profile/madhawar2016,Madhawa R,9CEC5C98C2A8732A9E9C71E3A79113E8,,293962.0,"Colombo, Sri Lanka",Colombo,madhawar2016
4,0,914641015,en,4173413,,,,,,,...,0,12,www.tripadvisor.com/Profile/315chinthakaw,Chinthaka W,DA2EAB222D26CBEAC097EFCB1EB671FD,,,,,315chinthakaw


In [6]:
# based on manual analysis of the scrapped data the following columns were deemed relevant
useful_columns = [
    'helpfulVotes',
    'id',
    'placeInfo/address',
    'placeInfo/addressObj/city',
    'placeInfo/name',
    'placeInfo/numberOfReviews',
    'placeInfo/rating',
    'placeInfo/webUrl',
    'publishedDate',
    'rating',
    'roomTip',
    'text',
    'title',
    'travelDate',
    'tripType',
    'user/username',
]

In [7]:
# filter dataframes to only include useful columns
colombo = colombo[useful_columns]
east = east[useful_columns]
ella = ella[useful_columns]
galle = galle[useful_columns]
kandy = kandy[useful_columns]
nuwaraeliya = nuwaraeliya[useful_columns]

# check shape of dataframes after filteration
colombo.shape, east.shape, ella.shape, galle.shape, kandy.shape, nuwaraeliya.shape 

((2000, 16), (1846, 16), (1951, 16), (2100, 16), (2000, 16), (1932, 16))

In [8]:
# Concatenate dataframes
data = pd.concat([colombo, east, ella, galle, kandy, nuwaraeliya])
data.shape

(11829, 16)

In [9]:
data.head()

Unnamed: 0,helpfulVotes,id,placeInfo/address,placeInfo/addressObj/city,placeInfo/name,placeInfo/numberOfReviews,placeInfo/rating,placeInfo/webUrl,publishedDate,rating,roomTip,text,title,travelDate,tripType,user/username
0,0.0,978474125.0,"24 Dharmapala Mawatha Dharmapala Mawatha, Colo...",Colombo,Nh Collection Colombo,2392,4.5,https://www.tripadvisor.com/Hotel_Review-g2939...,2024-11-03,5.0,Great place,Very good service at Ayu and friendly staff. S...,Experience at Ayu,2024-11,SOLO,219nikal
1,0.0,978053018.0,"24 Dharmapala Mawatha Dharmapala Mawatha, Colo...",Colombo,Nh Collection Colombo,2392,4.5,https://www.tripadvisor.com/Hotel_Review-g2939...,2024-10-30,1.0,,Change u r lady manager of u r hotel morning s...,Change u r lady manager of u r hotel morning s...,2024-10,BUSINESS,rajacool1984itz
2,0.0,976992067.0,"24 Dharmapala Mawatha Dharmapala Mawatha, Colo...",Colombo,Nh Collection Colombo,2392,4.5,https://www.tripadvisor.com/Hotel_Review-g2939...,2024-10-28,5.0,,Just Perfect \nWe liked everything.\nStaffs ar...,Amazed !,2024-10,FAMILY,857navidj
3,0.0,976690540.0,"24 Dharmapala Mawatha Dharmapala Mawatha, Colo...",Colombo,Nh Collection Colombo,2392,4.5,https://www.tripadvisor.com/Hotel_Review-g2939...,2024-10-26,5.0,,Our stay was unforgettable! The hotel was beau...,Wonderful stay,2024-10,FAMILY,809mickaelt
4,0.0,976664122.0,"24 Dharmapala Mawatha Dharmapala Mawatha, Colo...",Colombo,Nh Collection Colombo,2392,4.5,https://www.tripadvisor.com/Hotel_Review-g2939...,2024-10-26,5.0,,Amazing hotel very well situated in Colombo! Y...,Amazing hotel !,2024-10,FAMILY,M1879HRchloet


In [10]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11829 entries, 0 to 1931
Data columns (total 16 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   helpfulVotes               11828 non-null  float64
 1   id                         11828 non-null  float64
 2   placeInfo/address          11829 non-null  object 
 3   placeInfo/addressObj/city  11779 non-null  object 
 4   placeInfo/name             11829 non-null  object 
 5   placeInfo/numberOfReviews  11829 non-null  int64  
 6   placeInfo/rating           11828 non-null  float64
 7   placeInfo/webUrl           11829 non-null  object 
 8   publishedDate              11828 non-null  object 
 9   rating                     11828 non-null  float64
 10  roomTip                    1989 non-null   object 
 11  text                       11828 non-null  object 
 12  title                      11828 non-null  object 
 13  travelDate                 11819 non-null  obje

In [11]:
# renaming columns to be more readable
column_names = {
    'helpfulVotes': 'helpfulVotes',
    'id': 'id',
    'placeInfo/address': 'address',
    'placeInfo/addressObj/city': 'city',
    'placeInfo/name': 'placeName',
    'placeInfo/numberOfReviews': 'numberOfReviews',
    'placeInfo/rating': 'placeRating',
    'placeInfo/webUrl': 'webUrl',
    'publishedDate': 'publishedDate',
    'rating': 'userRating',
    'roomTip': 'roomTip',
    'text': 'review_text',
    'title': 'review_title',
    'travelDate': 'travelDate',
    'tripType': 'tripType',
    'user/username': 'username',
}

data = data.rename(columns=column_names)
data.head()

Unnamed: 0,helpfulVotes,id,address,city,placeName,numberOfReviews,placeRating,webUrl,publishedDate,userRating,roomTip,review_text,review_title,travelDate,tripType,username
0,0.0,978474125.0,"24 Dharmapala Mawatha Dharmapala Mawatha, Colo...",Colombo,Nh Collection Colombo,2392,4.5,https://www.tripadvisor.com/Hotel_Review-g2939...,2024-11-03,5.0,Great place,Very good service at Ayu and friendly staff. S...,Experience at Ayu,2024-11,SOLO,219nikal
1,0.0,978053018.0,"24 Dharmapala Mawatha Dharmapala Mawatha, Colo...",Colombo,Nh Collection Colombo,2392,4.5,https://www.tripadvisor.com/Hotel_Review-g2939...,2024-10-30,1.0,,Change u r lady manager of u r hotel morning s...,Change u r lady manager of u r hotel morning s...,2024-10,BUSINESS,rajacool1984itz
2,0.0,976992067.0,"24 Dharmapala Mawatha Dharmapala Mawatha, Colo...",Colombo,Nh Collection Colombo,2392,4.5,https://www.tripadvisor.com/Hotel_Review-g2939...,2024-10-28,5.0,,Just Perfect \nWe liked everything.\nStaffs ar...,Amazed !,2024-10,FAMILY,857navidj
3,0.0,976690540.0,"24 Dharmapala Mawatha Dharmapala Mawatha, Colo...",Colombo,Nh Collection Colombo,2392,4.5,https://www.tripadvisor.com/Hotel_Review-g2939...,2024-10-26,5.0,,Our stay was unforgettable! The hotel was beau...,Wonderful stay,2024-10,FAMILY,809mickaelt
4,0.0,976664122.0,"24 Dharmapala Mawatha Dharmapala Mawatha, Colo...",Colombo,Nh Collection Colombo,2392,4.5,https://www.tripadvisor.com/Hotel_Review-g2939...,2024-10-26,5.0,,Amazing hotel very well situated in Colombo! Y...,Amazing hotel !,2024-10,FAMILY,M1879HRchloet


In [12]:
# data.to_csv('..data/dataset_raw.csv', index=False)

# Data Cleaning

In [13]:
df = pd.read_csv("../data/dataset_raw.csv")
df.head()

Unnamed: 0,helpfulVotes,id,address,city,placeName,numberOfReviews,placeRating,webUrl,publishedDate,userRating,roomTip,review_text,review_title,travelDate,tripType,username
0,0.0,978474125.0,"24 Dharmapala Mawatha Dharmapala Mawatha, Colo...",Colombo,Nh Collection Colombo,2392,4.5,https://www.tripadvisor.com/Hotel_Review-g2939...,2024-11-03,5.0,Great place,Very good service at Ayu and friendly staff. S...,Experience at Ayu,2024-11,SOLO,219nikal
1,0.0,978053018.0,"24 Dharmapala Mawatha Dharmapala Mawatha, Colo...",Colombo,Nh Collection Colombo,2392,4.5,https://www.tripadvisor.com/Hotel_Review-g2939...,2024-10-30,1.0,,Change u r lady manager of u r hotel morning s...,Change u r lady manager of u r hotel morning s...,2024-10,BUSINESS,rajacool1984itz
2,0.0,976992067.0,"24 Dharmapala Mawatha Dharmapala Mawatha, Colo...",Colombo,Nh Collection Colombo,2392,4.5,https://www.tripadvisor.com/Hotel_Review-g2939...,2024-10-28,5.0,,Just Perfect \nWe liked everything.\nStaffs ar...,Amazed !,2024-10,FAMILY,857navidj
3,0.0,976690540.0,"24 Dharmapala Mawatha Dharmapala Mawatha, Colo...",Colombo,Nh Collection Colombo,2392,4.5,https://www.tripadvisor.com/Hotel_Review-g2939...,2024-10-26,5.0,,Our stay was unforgettable! The hotel was beau...,Wonderful stay,2024-10,FAMILY,809mickaelt
4,0.0,976664122.0,"24 Dharmapala Mawatha Dharmapala Mawatha, Colo...",Colombo,Nh Collection Colombo,2392,4.5,https://www.tripadvisor.com/Hotel_Review-g2939...,2024-10-26,5.0,,Amazing hotel very well situated in Colombo! Y...,Amazing hotel !,2024-10,FAMILY,M1879HRchloet


In [14]:
# Identify incomplete reviews
print("Number of incomplete reviews:", df[df['review_text'].isnull()].shape[0])

Number of incomplete reviews: 1


### Handling Duplicates

In [15]:
duplicates = df.duplicated(subset=['review_text'])
print("Number of duplicate reviews removed:", df[df.duplicated(subset=['review_text'])].shape[0])

df = df[~duplicates]

Number of duplicate reviews removed: 1


### Handling Missing Values

In [16]:
# check for missing data
print(f"Number of missing data: \n{df.isnull().sum()}")

Number of missing data: 
helpfulVotes          1
id                    1
address               0
city                 50
placeName             0
numberOfReviews       0
placeRating           1
webUrl                0
publishedDate         1
userRating            1
roomTip            9839
review_text           1
review_title          1
travelDate           10
tripType             10
username              1
dtype: int64


In [17]:
# check null rows for review_text column
df[df['review_text'].isnull()]

Unnamed: 0,helpfulVotes,id,address,city,placeName,numberOfReviews,placeRating,webUrl,publishedDate,userRating,roomTip,review_text,review_title,travelDate,tripType,username
9917,,,"No 09 Little England, Nuwara Eliya 22200 Sri L...",Nuwara Eliya,Swiss Cottage Nuwara Eliya,0,,https://www.tripadvisor.com/Hotel_Review-g6085...,,,,,,,,


In [18]:
# drop null row
df.dropna(subset=['review_text'], inplace=True)

In [19]:
df.isnull().sum()

helpfulVotes          0
id                    0
address               0
city                 50
placeName             0
numberOfReviews       0
placeRating           0
webUrl                0
publishedDate         0
userRating            0
roomTip            9838
review_text           0
review_title          0
travelDate            9
tripType              9
username              0
dtype: int64

## Text Preprocessing

### Convert text to lowercase

In [20]:
df['review_text'] = df['review_text'].str.lower()

### Remove HTML tags

In [21]:
df['review_text'] = df['review_text'].str.replace(r'<.*?>', ' ', regex=True)

### Remove URLs/emails from reviews

In [22]:
df['review_text'] = df['review_text'].str.replace(r'http\S+|www.\S+|\S+@\S+', ' ', regex=True)

### Remove punctuation & special characters

In [23]:
df['review_text'] = df['review_text'].str.replace(r'[^\w\s]', ' ', regex=True)

In [24]:
# check the reviews for emojis using emoji library
def contains_emoji(text):
    return emoji.emoji_count(text) > 0

print(f"Review titles with emojis: {df['review_title'].apply(contains_emoji).sum()}")
print(f"Review text with emojis: {df['review_text'].apply(contains_emoji).sum()}")

Review titles with emojis: 177
Review text with emojis: 0


In [25]:
# handling line breaks
lb_count = df['review_text'].str.contains('\n').sum()

df['review_text'] = df['review_text'].str.replace('\n', ' ')

print(f"Number of line breaks before: {lb_count}")
print("Number of line breaks after: ", df['review_text'].str.contains('\n').sum())

Number of line breaks before: 5199
Number of line breaks after:  0


In [26]:
# expand contractions
df['review_text'] = df['review_text'].apply(lambda x: contractions.fix(x))

In [27]:
# dealing with numbers (inflecting numbers to words) 
p = inflect.engine()

# Function to replace numbers with words
def convert_numbers_to_words(text):
    return ' '.join([p.number_to_words(word) if word.isdigit() else word for word in text.split()])

df['review_text'] = df['review_text'].apply(convert_numbers_to_words)

### Removing Stopwords

In [28]:
stop_words = set(stopwords.words('english'))

In [29]:
# check stopword count in review_text column
pre_sw_count = df['review_text'].apply(lambda x: len([w for w in x.split() if w in stop_words])).sum()
df['review_text'] = df['review_text'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

post_sw_count = df['review_text'].apply(lambda x: len([w for w in x.split() if w in stop_words])).sum()

print(f"Number of stop words before: {pre_sw_count}")
print(f"Number of stop words after: {post_sw_count}")

Number of stop words before: 518762
Number of stop words after: 0


### Lemmatize Text

In [30]:
lemmatizer = WordNetLemmatizer()
df['review_text'] = df['review_text'].apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split()]))

### Handling Multilingual Data

In [31]:
# Detect language
df['language'] = df['review_text'].apply(detect)
df['language'].value_counts()

en    11809
fr        3
ja        2
sv        2
he        2
da        1
it        1
ta        1
pl        1
et        1
sq        1
es        1
sk        1
ar        1
Name: language, dtype: int64

In [32]:
pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', None)

In [33]:
df.loc[df['language'] != 'en', ['id', 'review_text', 'language']]

Unnamed: 0,id,review_text,language
531,978318618.0,ம ற ஜ வ ட த ய ல அன த த வசத கள ம ச றப ப க இர ந தத அற கள த ப பரவ டன ம வசத ய கவ ம ந ர த த ய கவ ம க ணப பட டத வழங கப பட ட உணவ கழ ம நல லத க இர ந தத வ ட த ய ல இர ந த ப ர க க ம ப த கடற கர அழக க த ர க றத அன த த ச வ க க ம நன ற,ta
559,973182740.0,cinnamon best restaurant five star hotel sri lanka best value money,sv
1124,971043814.0,shanuka damith janith tharindu inusha sachini bashitha give u great service thank marriot,sq
2089,485911118.0,awesome arugam bay never many yummy meal breakfast snack luch dinner juice local food amazing mexican food good staff really friendly make feel like old friend place really good two different area beach well relaxing lounge clean washroom toilet bagni puliti ottimo cibo buona vista mare frutta fresca e deliziosi succhi il personale è super amichevole soprattutto caso di bisogno problemi anche sanitari rafaiul uno dei manager lavora anche ospedale locale prezzi sono nella medium la qualità è decisamente migliore di molti altri posti,it
3051,395484171.0,stayed two night comfortable affordable people pretty nice reliable friendly really enjoyed thank much 初めてのアルガンベイで 滞在先に困っていましたが ここのホテルにしてとても最高でした フレンドリーで優しいスタッフの皆さんありがとうございました,ja
3273,502935778.0,fajne miejsce super lokalizacja wszędzie blisko znamy szefa stąd wielka sympatia wobec ludzi tutaj poproście będzie wam dane dobra kuchnia niskie ceny świetne miejsce na nurkowanie w standardzie lokalnym porządnie miło atmosfera problem polecam spędziłthem tu z żoną cudowny tydzień great place perfect localization beach front balcony diving site ten min walk center trinco another direction koneswaram temple swami rock nilaveli uppaveli five ten min tuktuk really cool food ask done easy young staff super cool owner call bos give respect spent wonderful week wife super miss sun set dawn really loved perfect easy local stay forget dive,pl
4189,963411238.0,nice experience amazing hotel good service staff view absolutely fantastic thank shantha help really like come next time מלון מצוין עם נוף מדהים העובדים מאוד חרוצים ועוזרים בכל דבר המקום נגיש וקרוב למקומות בילוי ואטרקציות,he
4438,821759456.0,bandra kindly man always smile room big comfortable priceless view kindly crew ישראלים מקום מאוד נחמד עם חדרים ברמה גבוהה באמת הבעלים אדיב ונעים לא הצטערנו,he
4731,715440763.0,hotel near centrum railway station near restaurant shoops room big balcony view adam peak brakfast perfect mix europen local foood fruit fresh juice host number one thank nice holiday hotel krasna horska chata vyhladom na adam speak blizko centrum vedla zeleznicnej stanice blizko obchod restiky ozba bola velka cista wifi tepla voda krasny vyhlad na adam peak ranajky boli bohate hostitelia boli uzastni dakujeme za krasny zazitok,sk
5869,931857605.0,great place wonderful room also enjoyed yoga class jsgjdgvh fnheh dv dv dvndhndcndgndhndgnsgnscbsc scc vsgbgbzgbdgbscbxv fsbdhndcndcndcndcn fjegbdgbdg vsgbdhnzgndgbscbxb gjdgbdgbdgbscbsgbdgbsc fbgbfsbgbdvfhng,da


In [34]:
# keep english reviews only
df = df[df['language'] == 'en']
df.drop(columns=['language'], inplace=True)

### Normalize Whitespace

In [35]:
df['review_text'] = df['review_text'].apply(lambda x: ' '.join(x.split()))

# Tokenization Exploration

Techniques explored:
1. Word Tokenization
2. Subword Tokenization (Byte-pair or BPE)
3. n-Gram Tokenization (bi-gram/tri-gram)
4. Whitespace Tokenization
5. Rule-Based Tokenization
6. SpaCy Tokenization
7. WordPiece Tokenization

In [36]:
tkn_results = []

In [37]:
def calculate_vocab_size(tokenized_column):
    vocab_size = len(set([token for tokens in tokenized_column for token in tokens]))
    return vocab_size

In [38]:
def calculate_average_token_length(tokenized_column):
    total_token_length = sum(len(token) for tokens in tokenized_column for token in tokens)
    total_tokens = sum(len(tokens) for tokens in tokenized_column)
    average_token_length = total_token_length / total_tokens
    return average_token_length

In [42]:
def calculate_tokenizer_metrics(tokenized_column, tokenizer_name):
    vocab_size = len(set([token for tokens in tokenized_column for token in tokens]))
    avg_tkn_len = calculate_average_token_length(tokenized_column)

    tkn_results.append({
        "Tokenizer": tokenizer_name,
        "Vocabulary Size": vocab_size,
        "Average Token Length": avg_tkn_len,
    })

    print(f'Vocab size: {vocab_size}')
    print(f'Average token length: {avg_tkn_len}')

### Word Tokenization

In [43]:
%%time
df['word_tokenization'] = df['review_text'].apply(lambda x: word_tokenize(x))

Wall time: 3.03 s


In [44]:
calculate_tokenizer_metrics(df['word_tokenization'], "Word Tokenization")

Vocab size: 19452
Average token length: 5.764739810526389


### BPE Tokenization

In [45]:
%%time
# Train BPE model
df['review_text'].to_csv('reviews.txt', index=False, header=False)
spm.SentencePieceTrainer.train(input='reviews.txt', model_prefix='bpe', vocab_size=5000, model_type='bpe')

# Load the model
sp = spm.SentencePieceProcessor(model_file='bpe.model')

def bpe_tokenize(text):
    return sp.encode(text, out_type=str)

df['bpe_tokenization'] = df['review_text'].apply(bpe_tokenize)

Wall time: 2.76 s


In [46]:
calculate_tokenizer_metrics(df['bpe_tokenization'], "BPE Tokenization")

Vocab size: 5070
Average token length: 5.939266018598967


### n-Gram Tokenization

Bigram & Trigram

In [47]:
%%time
# bigram tokenization
df['bigram_tokenization'] = df['review_text'].apply(lambda x: list(ngrams(x.split(), 2)))

Wall time: 249 ms


In [48]:
calculate_tokenizer_metrics(df['bigram_tokenization'], "Bigram Tokenization")

Vocab size: 274634
Average token length: 2.0


In [49]:
%%time
# trigram tokenization
df['trigram_tokenization'] = df['review_text'].apply(lambda x: list(ngrams(x.split(), 3)))

Wall time: 250 ms


In [50]:
calculate_tokenizer_metrics(df['trigram_tokenization'], "Trigram Tokenization")

Vocab size: 483244
Average token length: 3.0


### Whitespace Tokenization

In [51]:
%%time
df['whitespace_tokenization'] = df['review_text'].apply(lambda x: x.split())

Wall time: 100 ms


In [52]:
calculate_tokenizer_metrics(df['whitespace_tokenization'], "Whitespace Tokenization")

Vocab size: 19454
Average token length: 5.768445462808404


In [53]:
# check if whitespace tokenization is the same as word tokenization
df[df['word_tokenization'] != df['whitespace_tokenization']]['word_tokenization'].shape

(316,)

### Rule Based Tokenization

In [54]:
%%time
rule_tokenizer = WordPunctTokenizer()

def rule_tokenize(text):
    return rule_tokenizer.tokenize(text)

df['rule_tokenization'] = df['review_text'].apply(rule_tokenize)

Wall time: 259 ms


In [55]:
calculate_tokenizer_metrics(df['rule_tokenization'], "Rule Based Tokenization")

Vocab size: 19388
Average token length: 5.7547920197076605


### SpaCy Tokenization

In [56]:
# spacy.cli.download("en_core_web_sm")

In [57]:
%%time
nlp = spacy.load('en_core_web_sm')

def spacy_tokenize(text):
    return [token.text for token in nlp(text)]

df['spacy_tokenization'] = df['review_text'].apply(spacy_tokenize)

Wall time: 1min 52s


In [58]:
calculate_tokenizer_metrics(df['spacy_tokenization'], "SpaCy Tokenization")

Vocab size: 19354
Average token length: 5.748707148183291


### Wordpiece Tokenization

In [59]:
%%time
wordpiece_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def wordpiece_tokenize(text):
    return wordpiece_tokenizer.tokenize(text)

df['wordpiece_tokenization'] = df['review_text'].apply(wordpiece_tokenize)

Wall time: 18.2 s


In [60]:
calculate_tokenizer_metrics(df['wordpiece_tokenization'], "Wordpiece Tokenization")

Vocab size: 12442
Average token length: 5.389014950628555


### Comparison

In [61]:
results_df = pd.DataFrame(tkn_results)
results_df.head(10)

Unnamed: 0,Tokenizer,Vocabulary Size,Average Token Length
0,Word Tokenization,19452,5.76474
1,Word Tokenization,19452,5.76474
2,BPE Tokenization,5070,5.939266
3,Bigram Tokenization,274634,2.0
4,Trigram Tokenization,483244,3.0
5,Whitespace Tokenization,19454,5.768445
6,Rule Based Tokenization,19388,5.754792
7,SpaCy Tokenization,19354,5.748707
8,Wordpiece Tokenization,12442,5.389015


# Exporting

In [63]:
pd.reset_option('^display')

In [65]:
df.head()

Unnamed: 0,helpfulVotes,id,address,city,placeName,numberOfReviews,placeRating,webUrl,publishedDate,userRating,...,tripType,username,word_tokenization,bpe_tokenization,bigram_tokenization,trigram_tokenization,whitespace_tokenization,rule_tokenization,spacy_tokenization,wordpiece_tokenization
0,0.0,978474125.0,"24 Dharmapala Mawatha Dharmapala Mawatha, Colo...",Colombo,Nh Collection Colombo,2392,4.5,https://www.tripadvisor.com/Hotel_Review-g2939...,2024-11-03,5.0,...,SOLO,219nikal,"[good, service, ayu, friendly, staff, samadhi,...","[▁good, ▁service, ▁ayu, ▁friendly, ▁staff, ▁sa...","[(good, service), (service, ayu), (ayu, friend...","[(good, service, ayu), (service, ayu, friendly...","[good, service, ayu, friendly, staff, samadhi,...","[good, service, ayu, friendly, staff, samadhi,...","[good, service, ayu, friendly, staff, samadhi,...","[good, service, a, ##yu, friendly, staff, sam,..."
1,0.0,978053018.0,"24 Dharmapala Mawatha Dharmapala Mawatha, Colo...",Colombo,Nh Collection Colombo,2392,4.5,https://www.tripadvisor.com/Hotel_Review-g2939...,2024-10-30,1.0,...,BUSINESS,rajacool1984itz,"[change, r, lady, manager, r, hotel, morning, ...","[▁change, ▁r, ▁lady, ▁manager, ▁r, ▁hotel, ▁mo...","[(change, r), (r, lady), (lady, manager), (man...","[(change, r, lady), (r, lady, manager), (lady,...","[change, r, lady, manager, r, hotel, morning, ...","[change, r, lady, manager, r, hotel, morning, ...","[change, r, lady, manager, r, hotel, morning, ...","[change, r, lady, manager, r, hotel, morning, ..."
2,0.0,976992067.0,"24 Dharmapala Mawatha Dharmapala Mawatha, Colo...",Colombo,Nh Collection Colombo,2392,4.5,https://www.tripadvisor.com/Hotel_Review-g2939...,2024-10-28,5.0,...,FAMILY,857navidj,"[perfect, liked, everything, staff, kind, food...","[▁perfect, ▁liked, ▁everything, ▁staff, ▁kind,...","[(perfect, liked), (liked, everything), (every...","[(perfect, liked, everything), (liked, everyth...","[perfect, liked, everything, staff, kind, food...","[perfect, liked, everything, staff, kind, food...","[perfect, liked, everything, staff, kind, food...","[perfect, liked, everything, staff, kind, food..."
3,0.0,976690540.0,"24 Dharmapala Mawatha Dharmapala Mawatha, Colo...",Colombo,Nh Collection Colombo,2392,4.5,https://www.tripadvisor.com/Hotel_Review-g2939...,2024-10-26,5.0,...,FAMILY,809mickaelt,"[stay, unforgettable, hotel, beautiful, staff,...","[▁stay, ▁unforgettable, ▁hotel, ▁beautiful, ▁s...","[(stay, unforgettable), (unforgettable, hotel)...","[(stay, unforgettable, hotel), (unforgettable,...","[stay, unforgettable, hotel, beautiful, staff,...","[stay, unforgettable, hotel, beautiful, staff,...","[stay, unforgettable, hotel, beautiful, staff,...","[stay, un, ##for, ##get, ##table, hotel, beaut..."
4,0.0,976664122.0,"24 Dharmapala Mawatha Dharmapala Mawatha, Colo...",Colombo,Nh Collection Colombo,2392,4.5,https://www.tripadvisor.com/Hotel_Review-g2939...,2024-10-26,5.0,...,FAMILY,M1879HRchloet,"[amazing, hotel, well, situated, colombo, dire...","[▁amazing, ▁hotel, ▁well, ▁situated, ▁colombo,...","[(amazing, hotel), (hotel, well), (well, situa...","[(amazing, hotel, well), (hotel, well, situate...","[amazing, hotel, well, situated, colombo, dire...","[amazing, hotel, well, situated, colombo, dire...","[amazing, hotel, well, situated, colombo, dire...","[amazing, hotel, well, situated, colombo, dire..."


In [66]:
df.shape

(11809, 24)

In [68]:
df.columns.tolist()

['helpfulVotes',
 'id',
 'address',
 'city',
 'placeName',
 'numberOfReviews',
 'placeRating',
 'webUrl',
 'publishedDate',
 'userRating',
 'roomTip',
 'review_text',
 'review_title',
 'travelDate',
 'tripType',
 'username',
 'word_tokenization',
 'bpe_tokenization',
 'bigram_tokenization',
 'trigram_tokenization',
 'whitespace_tokenization',
 'rule_tokenization',
 'spacy_tokenization',
 'wordpiece_tokenization']

In [69]:
df.to_csv('../data/dataset_cleaned.csv', index=False)