In [None]:
import contractions
import emoji
import inflect
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import sentencepiece as spm
import spacy
from langdetect import DetectorFactory, LangDetectException, detect
from nltk import WordNetLemmatizer, word_tokenize, ngrams
from nltk.corpus import stopwords
from nltk.tokenize import WordPunctTokenizer, word_tokenize, RegexpTokenizer
from transformers import BertTokenizer

In [None]:
from tokenizers import Tokenizer
from tokenizers.models import BPE, WordPiece
from tokenizers.trainers import BpeTrainer, WordPieceTrainer
from tokenizers.pre_tokenizers import Whitespace

# Data Import & Initial Exploration

In [None]:
# Read the six raw per-region review exports, one DataFrame per region
colombo, east, ella, galle, kandy, nuwaraeliya = map(
    pd.read_csv,
    (
        '../data/raw/dataset_colombo_raw.csv',
        '../data/raw/dataset_east_raw.csv',
        '../data/raw/dataset_ella_raw.csv',
        '../data/raw/dataset_galle_raw.csv',
        '../data/raw/dataset_kandy_raw_1.csv',
        '../data/raw/dataset_nuwaraeliya_raw.csv',
    ),
)

In [None]:
# (rows, columns) of each regional dataset, in load order
tuple(frame.shape for frame in (colombo, east, ella, galle, kandy, nuwaraeliya))

Checking which columns to keep

In [None]:
kandy.columns.tolist()

In [None]:
kandy.head()

In [None]:
# Columns kept for the analysis, chosen by manually inspecting the scraped data.
# Every name must match a raw CSV header exactly, because the list is used to
# index each regional DataFrame.
useful_columns = [
    'helpfulVotes',
    'id',
    'placeInfo/address',
    'placeInfo/addressObj/city',
    'placeInfo/name',
    'placeInfo/numberOfReviews',
    'placeInfo/rating',
    'placeInfo/webUrl',
    'publishedDate',
    'rating',
    'roomTip',
    'text',
    'title',
    'travelDate',
    'tripType',
    'user/username',
]

In [None]:
# Restrict every regional frame to the selected columns
colombo, east, ella, galle, kandy, nuwaraeliya = (
    frame[useful_columns]
    for frame in (colombo, east, ella, galle, kandy, nuwaraeliya)
)

# Shapes after filtering: only the column count should have changed
tuple(frame.shape for frame in (colombo, east, ella, galle, kandy, nuwaraeliya))

In [None]:
# Stack the six regional frames into a single dataset.
# ignore_index=True gives the result a fresh 0..n-1 index; without it each
# region's own row labels are carried over and repeat across regions, which
# makes label-based lookups on `data` ambiguous.
data = pd.concat(
    [colombo, east, ella, galle, kandy, nuwaraeliya],
    ignore_index=True,
)
data.shape

In [None]:
data.head()

In [None]:
data.info()

In [None]:
# Map the raw column names to shorter, more readable ones.
# Entries that map a name to itself leave that column unchanged.
column_names = {
    'helpfulVotes': 'helpfulVotes',
    'id': 'id',
    'placeInfo/address': 'address',
    'placeInfo/addressObj/city': 'city',
    'placeInfo/name': 'placeName',
    'placeInfo/numberOfReviews': 'numberOfReviews',
    'placeInfo/rating': 'placeRating',
    'placeInfo/webUrl': 'webUrl',
    'publishedDate': 'publishedDate',
    'rating': 'userRating',
    'roomTip': 'roomTip',
    'text': 'review_text',
    'title': 'review_title',
    'travelDate': 'travelDate',
    'tripType': 'tripType',
    'user/username': 'username',
}

# rename returns a new frame, which replaces `data`
data = data.rename(columns=column_names)
data.head()

In [None]:
# Uncomment to (re)write the combined raw dataset to the path that the
# Data Cleaning section reads from.
# data.to_csv('../data/dataset_raw.csv', index=False)

# Data Cleaning

In [None]:
# Load the combined raw dataset from disk. NOTE(review): presumably this is the
# file produced by the commented-out export of `data` above -- confirm it exists
# before running the notebook from a fresh checkout.
df = pd.read_csv("../data/dataset_raw.csv")
df.head()

In [None]:
# Count the reviews that have no text at all
print("Number of incomplete reviews:", int(df['review_text'].isnull().sum()))

### Handling Duplicates

In [None]:
# Flag every repeat of an already-seen review_text (the first occurrence is kept)
duplicates = df.duplicated(subset=['review_text'])
print("Number of duplicate reviews removed:", int(duplicates.sum()))

# .copy() makes the de-duplicated frame independent of the original, so the
# column assignments in later cells do not trigger SettingWithCopyWarning
df = df[~duplicates].copy()

### Handling Missing Values

In [None]:
# Number of missing values in each column
missing_per_column = df.isnull().sum()
print(f"Number of missing data: \n{missing_per_column}")

In [None]:
# check null rows for review_text column
df[df['review_text'].isnull()]

In [None]:
# Drop rows that have no review text. Reassigning instead of using
# inplace=True produces a new frame, so nothing is mutated in place on a frame
# that was itself created by filtering an earlier one.
df = df.dropna(subset=['review_text'])

In [None]:
df.isnull().sum()

## Text Preprocessing

### Convert text to lowercase

In [None]:
# Lowercase every review so later token comparisons treat e.g. "Hotel" and
# "hotel" as the same word
df['review_text'] = df['review_text'].str.lower()

### Remove HTML tags

In [None]:
# Replace everything from a '<' up to the nearest following '>' (non-greedy)
# with a space, so the words on either side of a tag stay separated
df['review_text'] = df['review_text'].str.replace(r'<.*?>', ' ', regex=True)

### Remove URLs/emails from reviews

In [None]:
# Blank out URLs (tokens starting with "http" or "www.") and e-mail-like tokens
# (non-space text on both sides of an '@').
# The dot after "www" is escaped so it matches a literal '.', not any character.
df['review_text'] = df['review_text'].str.replace(
    r'http\S+|www\.\S+|\S+@\S+', ' ', regex=True
)

### Remove punctuation & special characters

In [None]:
# Expand contractions while the apostrophes are still present: once punctuation
# is stripped, "don't" has become "don t" and can no longer be expanded.
# Lowercase again afterwards in case an expansion introduces capitals.
df['review_text'] = df['review_text'].apply(contractions.fix).str.lower()

# Replace every character that is neither a word character nor whitespace with a space
df['review_text'] = df['review_text'].str.replace(r'[^\w\s]', ' ', regex=True)

In [None]:
# check the reviews for emojis using emoji library
def contains_emoji(text):
    """Return True if ``text`` is a string containing at least one emoji.

    Non-string values (for example the NaN pandas uses for a missing title)
    return False instead of being passed to ``emoji.emoji_count``.

    Args:
        text: The value to inspect, typically one cell of a text column.

    Returns:
        bool: True when ``text`` is a str and ``emoji.emoji_count(text) > 0``.
    """
    return isinstance(text, str) and emoji.emoji_count(text) > 0

# Number of titles that contain at least one emoji
titles_with_emoji = df['review_title'].apply(contains_emoji).sum()
print(f"Review titles with emojis: {titles_with_emoji}")

# Number of review bodies that contain at least one emoji
texts_with_emoji = df['review_text'].apply(contains_emoji).sum()
print(f"Review text with emojis: {texts_with_emoji}")

In [None]:
# Replace newlines with spaces; the counts are the number of reviews that
# contain at least one newline, before and after the replacement
newline = '\n'
lb_count = df['review_text'].str.contains(newline).sum()
df['review_text'] = df['review_text'].str.replace(newline, ' ')
remaining_lb_count = df['review_text'].str.contains(newline).sum()

print(f"Number of line breaks before: {lb_count}")
print("Number of line breaks after: ", remaining_lb_count)

In [None]:
# Run the contractions library's fixer over every review
df['review_text'] = df['review_text'].map(contractions.fix)

In [None]:
# dealing with numbers (inflecting numbers to words)
p = inflect.engine()

def convert_numbers_to_words(text):
    """Spell out every whitespace-separated, all-digit token in ``text``.

    Tokens that are not made up purely of ASCII digits are left untouched.
    The hyphens and commas that inflect puts in its output (for example
    "twenty-one") are replaced by spaces, so this step does not put
    punctuation back into text that has already had it stripped.

    Args:
        text (str): The review text to process.

    Returns:
        str: ``text`` with its tokens re-joined by single spaces and numeric
        tokens replaced by their English wording.
    """
    converted = []
    for word in text.split():
        # isascii() excludes characters such as '²', for which isdigit() is
        # True although they are not plain decimal numbers
        if word.isascii() and word.isdigit():
            spelled = p.number_to_words(word)
            word = ' '.join(spelled.replace('-', ' ').replace(',', ' ').split())
        converted.append(word)
    return ' '.join(converted)

df['review_text'] = df['review_text'].apply(convert_numbers_to_words)

### Removing Stopwords

In [None]:
# NLTK's stop-word list is a separately downloaded corpus. Fetch it on first
# use so the notebook also runs on an environment where it is not installed yet.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))

In [None]:
# Tally the stop words before and after filtering them out of every review
def _stop_word_total(texts):
    """Total number of whitespace-separated tokens in ``texts`` found in ``stop_words``."""
    return texts.apply(
        lambda text: sum(1 for token in text.split() if token in stop_words)
    ).sum()

pre_sw_count = _stop_word_total(df['review_text'])

df['review_text'] = df['review_text'].apply(
    lambda text: ' '.join(token for token in text.split() if token not in stop_words)
)

post_sw_count = _stop_word_total(df['review_text'])

print(f"Number of stop words before: {pre_sw_count}")
print(f"Number of stop words after: {post_sw_count}")

### Lemmatize Text

In [None]:
# Replace each whitespace-separated word with its WordNet lemma
lemmatizer = WordNetLemmatizer()
df['review_text'] = df['review_text'].map(
    lambda text: ' '.join(map(lemmatizer.lemmatize, text.split()))
)

### Handling Multilingual Data

In [None]:
# Detect language
# langdetect's algorithm is randomised; fixing the seed makes the labels
# reproducible from one run to the next
DetectorFactory.seed = 0

def detect_language(text):
    """Return langdetect's language code for ``text``, or 'unknown' on failure.

    ``detect`` raises LangDetectException when the text offers no usable
    features (for example a review left empty by the earlier cleaning steps);
    such rows are labelled 'unknown' instead of aborting the whole column.

    Args:
        text (str): The review text to classify.

    Returns:
        str: The detected language code, or 'unknown'.
    """
    try:
        return detect(text)
    except LangDetectException:
        return 'unknown'

df['language'] = df['review_text'].apply(detect_language)
df['language'].value_counts()

In [None]:
# Lift pandas' display limits (no per-cell truncation, no fixed total width)
# so the reviews inspected below are shown in full
pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', None)

In [None]:
df.loc[df['language'] != 'en', ['id', 'review_text', 'language']]

In [None]:
# keep english reviews only
# Filter and drop the helper column in one chain: drop() returns a new frame,
# so nothing is modified in place on a slice of the previous `df`
df = df.loc[df['language'] == 'en'].drop(columns=['language'])

### Normalize Whitespace

In [None]:
# Collapse every run of whitespace into a single space and trim both ends
df['review_text'] = df['review_text'].str.split().str.join(' ')

# Tokenization Exploration

Techniques explored:
1. Word Tokenization
2. Subword Tokenization (Byte-Pair Encoding, BPE)
3. n-Gram Tokenization (bi-gram/tri-gram)
4. Whitespace Tokenization
5. Rule-Based Tokenization
6. SpaCy Tokenization
7. WordPiece Tokenization

In [None]:
# Collects one metrics dict per tokenizer; calculate_tokenizer_metrics appends
# to it and the Comparison section turns it into a DataFrame
tkn_results = []

In [None]:
def calculate_vocab_size(tokenized_column):
    """Return the number of distinct tokens across all token lists.

    Args:
        tokenized_column: Iterable of token sequences (one per document).

    Returns:
        int: Count of unique tokens over the whole column.
    """
    vocabulary = set()
    for tokens in tokenized_column:
        vocabulary.update(tokens)
    return len(vocabulary)

In [None]:
def calculate_average_token_length(tokenized_column):
    """Return the mean ``len(token)`` over every token in the column.

    For string tokens this is the average number of characters. For tuple
    tokens (the n-gram columns) ``len`` is the tuple size, so the result is
    simply n.

    The column is traversed once, so one-shot iterables are supported.

    Args:
        tokenized_column: Iterable of token sequences (one per document).

    Returns:
        float: The average token length, or 0.0 when the column holds no
        tokens at all (instead of raising ZeroDivisionError).
    """
    total_token_length = 0
    total_tokens = 0
    for tokens in tokenized_column:
        total_tokens += len(tokens)
        total_token_length += sum(len(token) for token in tokens)

    if total_tokens == 0:
        return 0.0
    return total_token_length / total_tokens

In [None]:
def calculate_tokenizer_metrics(tokenized_column, tokenizer_name):
    """Compute, record and print the summary metrics for one tokenizer.

    The metrics are stored as a dict in the module-level ``tkn_results`` list.
    Any earlier entry with the same tokenizer name is replaced, so re-running
    a cell updates that tokenizer's row instead of adding a duplicate.

    Args:
        tokenized_column: Iterable of token sequences (one per document).
        tokenizer_name (str): Label stored under the "Tokenizer" key.

    Returns:
        None. Results are appended to ``tkn_results`` and printed.
    """
    vocab_size = calculate_vocab_size(tokenized_column)
    avg_tkn_len = calculate_average_token_length(tokenized_column)

    # Slice assignment mutates the shared list in place (no `global` needed)
    tkn_results[:] = [
        row for row in tkn_results if row["Tokenizer"] != tokenizer_name
    ]
    tkn_results.append({
        "Tokenizer": tokenizer_name,
        "Vocabulary Size": vocab_size,
        "Average Token Length": avg_tkn_len,
    })

    print(f'Vocab size: {vocab_size}')
    print(f'Average token length: {avg_tkn_len}')

### Word Tokenization

In [None]:
%%time
# NLTK word tokenizer applied to every review
df['word_tokens'] = df['review_text'].map(word_tokenize)

In [None]:
calculate_tokenizer_metrics(df['word_tokens'], "Word Tokenization")

### BPE Tokenization

In [None]:
# Untrained BPE model; "[UNK]" is the token used for unknown input
bpe_tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
bpe_tokenizer.pre_tokenizer = Whitespace()
trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])

# Train on the full cleaned corpus (every review, cast to str)
corpus = df['review_text'].astype(str).tolist()
bpe_tokenizer.train_from_iterator(corpus, trainer)

# Encode each review with the trained tokenizer and keep the token strings
df['bpe_tokens'] = df['review_text'].astype(str).map(
    lambda text: bpe_tokenizer.encode(text).tokens
)

In [None]:
calculate_tokenizer_metrics(df['bpe_tokens'], "BPE Tokenization")

### n-Gram Tokenization

Bigram & Trigram

In [None]:
%%time
# Bigrams: every pair of adjacent word tokens, stored as a list of 2-tuples
df['bigram_tokens'] = df['word_tokens'].map(
    lambda words: list(ngrams(words, 2))
)

In [None]:
calculate_tokenizer_metrics(df['bigram_tokens'], "Bigram Tokenization")

In [None]:
%%time
# Trigrams: every run of three adjacent word tokens, stored as a list of 3-tuples
df['trigram_tokens'] = df['word_tokens'].map(
    lambda words: list(ngrams(words, 3))
)

In [None]:
calculate_tokenizer_metrics(df['trigram_tokens'], "Trigram Tokenization")

### Whitespace Tokenization

In [None]:
%%time
# Split every review (cast to str) on runs of whitespace
df['whitespace_tokens'] = df['review_text'].astype(str).str.split()

In [None]:
calculate_tokenizer_metrics(df['whitespace_tokens'], "Whitespace Tokenization")

In [None]:
# Number of reviews where NLTK word tokens and plain whitespace tokens disagree
tokens_differ = df['word_tokens'] != df['whitespace_tokens']
df.loc[tokens_differ, 'word_tokens'].shape

### Rule Based Tokenization

In [None]:
%%time
# Tokens are maximal runs of word characters; everything else is a separator
rule_tokenizer = RegexpTokenizer(r'\w+')
df['rule_based_tokens'] = df['review_text'].astype(str).map(rule_tokenizer.tokenize)

In [None]:
calculate_tokenizer_metrics(df['rule_based_tokens'], "Rule Based Tokenization")

### SpaCy Tokenization

In [None]:
# One-time setup: uncomment to download the spaCy model that the next cell loads
# spacy.cli.download("en_core_web_sm")

In [None]:
%%time

nlp = spacy.load('en_core_web_sm')

# Only the tokens are needed, so call the tokenizer directly instead of
# nlp(...), which would also run every other pipeline component on each review
df['spacy_tokens'] = df['review_text'].apply(
    lambda x: [token.text for token in nlp.tokenizer(str(x))]
)

In [None]:
calculate_tokenizer_metrics(df['spacy_tokens'], "SpaCy Tokenization")

### WordPiece Tokenization

In [None]:
%%time

# Untrained WordPiece model; "[UNK]" is the token used for unknown input
wordpiece_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
wordpiece_tokenizer.pre_tokenizer = Whitespace()

# Trainer with a target vocabulary of 20,000 and the usual BERT-style special tokens
wordpiece_trainer = WordPieceTrainer(
    vocab_size=20000,
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
)

# Train on `corpus`, the list of review strings built in the BPE cell
wordpiece_tokenizer.train_from_iterator(corpus, wordpiece_trainer)

# Encode each review with the trained tokenizer and keep the token strings
df['wordpiece_tokens'] = df['review_text'].astype(str).map(
    lambda text: wordpiece_tokenizer.encode(text).tokens
)

In [None]:
calculate_tokenizer_metrics(df['wordpiece_tokens'], "Wordpiece Tokenization")

### Comparison

In [None]:
# Assemble the collected per-tokenizer metrics into one comparison table
results_df = pd.DataFrame(data=tkn_results)
results_df.iloc[:10]

# Exporting

In [None]:
pd.reset_option('^display')

In [None]:
df.head()

In [None]:
df.shape

In [None]:
df.columns.tolist()

In [None]:
# Write the cleaned dataset to disk without the index. The frame still holds
# the token columns added in the tokenization section, so they are exported too.
# NOTE(review): CSV stores those list/tuple values as plain text -- confirm
# downstream readers expect that.
df.to_csv('../data/dataset_cleaned.csv', index=False)