# Installations

In [None]:
!{sys.executable} -m pip install spacy

In [None]:
!{sys.executable} -m spacy download en_core_web_sm

In [None]:
pip install sentencepiece nltk

In [None]:
import sys
# Run pip through the interpreter that backs this kernel (sys.executable).
!{sys.executable} -m pip install scikit-learn

In [None]:
import sys
# Run pip through the interpreter that backs this kernel (sys.executable).
!{sys.executable} -m pip install gensim

In [None]:
import sys
# Run pip through the interpreter that backs this kernel (sys.executable).
!{sys.executable} -m pip install sentence-transformers

# Commit to GitHub

In [None]:
# Commit and push the current working tree to the `main` branch on `origin`.
# NOTE(review): `git add .` stages all changes under the current folder —
# confirm no data files or model artefacts are picked up unintentionally.
!pwd                # shows your current folder
!git status         # check uncommitted changes
!git add .
!git commit -m "comparing tokenization schemes,"
!git push origin main

# Import Statements

In [None]:
import torch

# Report whether CUDA is usable and, if so, the name of device 0.
# The name is only queried when a GPU exists, because
# torch.cuda.get_device_name(0) raises on CPU-only machines.
cuda_available = torch.cuda.is_available()
cuda_available, (torch.cuda.get_device_name(0) if cuda_available else None)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from collections import Counter
import re
from nltk import ngrams
from textblob import TextBlob
import nltk
import re
import emoji
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import sentencepiece as spm
import collections
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from collections import Counter
import itertools
from tqdm import tqdm
import sklearn
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec
import numpy as np
from sentence_transformers import SentenceTransformer

from langdetect import detect
import langdetect

# Download the NLTK resources first: stopwords.words() raises LookupError
# when the stopwords corpus has not been fetched yet (e.g. a fresh environment).
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

# English stopword set used by the cleaning and tokenization steps
stop_words = set(stopwords.words('english'))

import spacy
nlp = spacy.load("en_core_web_sm")
print("spaCy loaded OK!")

# Initial Preprocessing Steps

In [None]:
# Load dataset
# The path is relative, so the CSV is resolved against the kernel's working directory.
df_posts = pd.read_csv("Final_Posts_Data.csv")
df_posts.head()

In [None]:
# Dataset overview: (rows, columns) of the raw frame before any cleaning
print("Dataset shape:", df_posts.shape)

In [None]:
# Check for duplicates based on the 'content' column.
# duplicated() flags every repeat after the first occurrence, so the printout
# lists only the later copies. This cell reports them; it does not drop them.
duplicates_name = df_posts.duplicated(subset=['content'])
print("\nDuplicates based on 'content':")
print(df_posts[duplicates_name])

In [None]:
# Missing Values: number of null entries in each column
missing_counts = df_posts.isnull().sum()
print("Missing values per column:\n", missing_counts)

In [None]:
# Missing Values: number of null entries in each column
# NOTE(review): this cell repeats the previous one verbatim; it can likely be removed.
missing_counts = df_posts.isnull().sum()
print("Missing values per column:\n", missing_counts) 

In [None]:
# Text columns: replace missing entries with an explicit placeholder label
text_placeholders = {
    'keyword': 'no keyword',
    'author': 'no author',
    'parent_post': 'no post',
}
for column_name, placeholder in text_placeholders.items():
    df_posts[column_name] = df_posts[column_name].fillna(placeholder)

# 'score': missing entries take the column median
median_score = df_posts['score'].median()
df_posts['score'] = df_posts['score'].fillna(median_score)

# 'num_comments': fill with 0, but only on rows whose source is 'comments'
mask = df_posts['source'] == 'comments'
df_posts.loc[mask, 'num_comments'] = df_posts.loc[mask, 'num_comments'].fillna(0)

# Rows without 'content' cannot be analysed: drop them and renumber the index
df_posts = df_posts.dropna(subset=['content']).reset_index(drop=True)

# Re-check what is still missing after the fills
missing_counts = df_posts.isnull().sum()
print("Missing values per column:\n", missing_counts)

print("\n Dataset shape:", df_posts.shape)

df_posts.head()

In [None]:
# Parse the epoch-second timestamps once, then expose date and time separately
created_at = pd.to_datetime(df_posts['created_utc'], unit='s')
df_posts['created_date'] = created_at.dt.date
df_posts['created_time'] = created_at.dt.time

# The raw 'created_utc' column is no longer needed
df_posts = df_posts.drop(columns=['created_utc'])
df_posts


### Drop non-English data (Sinhala and Tamil)

In [None]:
# langdetect's detector is randomised; fixing the seed makes the same text
# receive the same label on every run, so the English filter is reproducible.
langdetect.DetectorFactory.seed = 0

def detect_language(text):
    """Return the language code langdetect assigns to `text`.

    Args:
        text: The post content to classify.

    Returns:
        str: The code returned by `detect`, or 'unknown' when `text` is not a
        string or langdetect raises LangDetectException.
    """
    # Guard against NaN / numeric cells, which detect() cannot process.
    if not isinstance(text, str):
        return 'unknown'
    try:
        return detect(text)
    except langdetect.lang_detect_exception.LangDetectException:
        return 'unknown'

# Create a new column for language
df_posts['language'] = df_posts['content'].apply(detect_language)

In [None]:
# Retain the rows labelled English, then discard the helper column
is_english = df_posts['language'] == 'en'
english_posts = df_posts.loc[is_english].copy()
df_posts = english_posts.drop(columns=['language'])
print(df_posts.shape)

df_posts.head()

# Text Cleaning Pipeline

In [None]:
# Load English model for lemmatization
# NOTE(review): the imports cell already assigns `nlp` from the same model,
# so this reload looks redundant on a top-to-bottom run.
nlp = spacy.load("en_core_web_sm")

In [None]:
# Define stopwords (NLTK's English list, stored as a set)
# NOTE(review): `stop_words` is also built in the imports cell with the same expression.
stop_words = set(stopwords.words('english'))

In [None]:
# Function to replace emojis with textual description
def emoji_to_text(text):
    """Return `text` with emojis replaced via emoji.demojize, using spaces as delimiters."""
    space_delimiters = (" ", " ")
    return emoji.demojize(text, delimiters=space_delimiters)

In [None]:
# Function to clean text
def clean_text(text):
    """Turn one raw post into a lowercase, stopword-free, lemmatized string.

    Steps, in order: emojis to text, URL removal, Reddit mention removal,
    whitespace collapsing, punctuation removal (apostrophes are kept),
    lowercasing, NLTK tokenization, stopword removal, spaCy lemmatization.

    Args:
        text: Raw post content (str).

    Returns:
        str: The lemmas joined by single spaces.
    """
    # Convert emojis to text
    text = emoji_to_text(text)

    # Remove URLs
    text = re.sub(r'http\S+|www\.\S+', '', text)

    # Remove Reddit platform metadata (u/username, r/subreddit, with or without
    # a leading slash). The look-behind requires the mention to start outside a
    # word; without it the pattern also matched inside ordinary text and turned
    # e.g. "either/or" into "eithe".
    text = re.sub(r'(?<!\w)/?[ur]/\w+', '', text)

    # Remove escape sequences and extra whitespace
    text = text.replace('\n', ' ').replace('\t', ' ').replace('\r', ' ')
    text = re.sub(r'\s+', ' ', text)

    # Remove punctuation; apostrophes are kept so contractions survive
    text = re.sub(r'[^\w\s\']', '', text)

    # Lowercase
    text = text.lower()

    # Tokenization
    tokens = word_tokenize(text)

    # Remove stopwords
    tokens = [t for t in tokens if t not in stop_words]

    # Lemmatization (spaCy re-tokenizes the joined string)
    doc = nlp(" ".join(tokens))
    lemmatized = [token.lemma_ for token in doc]

    # Join back into string
    cleaned_text = " ".join(lemmatized)

    return cleaned_text

In [None]:
# Apply cleaning function
# Values are cast to str first, so clean_text always receives a string.
df_posts['content_cleaned'] = df_posts['content'].astype(str).apply(clean_text)

In [None]:
# Count whitespace-separated words in each cleaned document
df_posts["word_count"] = df_posts["content_cleaned"].astype(str).str.split().str.len()

# Histogram of document lengths
fig, ax = plt.subplots(figsize=(8, 5))
ax.hist(df_posts["word_count"])
ax.set_xlabel("Word Count per Document")
ax.set_ylabel("Frequency")
ax.set_title("Distribution of Word Counts in df_posts['content_cleaned']")
plt.show()

In [None]:
# Compute recommended thresholds using quantiles.
# The printed labels are derived from these constants so they cannot drift from
# the values actually used (the lower label previously said "10th percentile"
# while the code used the 0.3 quantile).
LOWER_QUANTILE = 0.30
UPPER_QUANTILE = 0.95

lower_threshold = df_posts["word_count"].quantile(LOWER_QUANTILE)
upper_threshold = df_posts["word_count"].quantile(UPPER_QUANTILE)

print(f"Lower Threshold ({round(LOWER_QUANTILE * 100)}th percentile):", lower_threshold)
print(f"Upper Threshold ({round(UPPER_QUANTILE * 100)}th percentile):", upper_threshold)

In [None]:
# Keep an untouched copy so the length filter can be revisited
df_posts_backup = df_posts.copy()

# Keep documents whose word count lies within [lower_threshold, upper_threshold].
# .copy() detaches the result from the original frame: later cells assign to
# its columns, which on a boolean-indexed slice triggers SettingWithCopyWarning.
df_posts = df_posts[
    (df_posts["word_count"] >= lower_threshold) &
    (df_posts["word_count"] <= upper_threshold)
].copy()

In [None]:
# Report corpus statistics: concatenate every cleaned document, split on
# whitespace, then count all words and distinct words.
all_text = " ".join(df_posts['content_cleaned']) 
all_words = all_text.split()
total_words = len(all_words)
unique_words = len(set(all_words))

In [None]:
# Show the totals computed in the previous cell
print(f"Total words in cleaned corpus: {total_words}")
print(f"Unique words in cleaned corpus: {unique_words}") 

In [None]:
# Preview the first 20 rows after cleaning and length filtering
df_posts.head(20)

In [None]:
# (rows, columns) remaining after the length filter
df_posts.shape

In [None]:
import unicodedata

def normalize_unicode(text):
    """
    Return `text` in Unicode normalization form NFKC.

    Compatibility characters (for example full-width letters and ligatures)
    are mapped to their standard equivalents and combining sequences are
    composed, so visually equal strings share one representation.
    """
    normalized = unicodedata.normalize("NFKC", text)
    return normalized

# Normalize every cleaned document (cast to str first)
cleaned_as_text = df_posts["content_cleaned"].astype(str)
df_posts["content_cleaned"] = cleaned_as_text.map(normalize_unicode)

In [None]:
# Preview the first 10 rows after Unicode normalization
df_posts.head(10)

In [None]:
# save cleaned data (index=False: the row index is not written as a column)

df_posts.to_csv("cleaned_Posts_Data.csv", index=False) 

In [None]:
# Load cleaned data
df_posts = pd.read_csv("cleaned_Posts_Data.csv")

# A document that cleaned down to an empty string is written to CSV as an empty
# field and read back as NaN (a float). Restore it to "" so the string-based
# tokenizers and the corpus writer below do not fail on a non-string value.
df_posts["content_cleaned"] = df_posts["content_cleaned"].fillna("")

df_posts.head()

# Tokenization

## Traditional Word-Based

In [None]:
# Word-level tokenizer

def word_tokenizer(text):
    """Tokenize `text` with NLTK, keep alphabetic non-stopword tokens, and return their spaCy lemmas."""
    kept = []
    for candidate in word_tokenize(text):
        if candidate.isalpha() and candidate not in stop_words:
            kept.append(candidate)
    # spaCy processes the re-joined string, so lemmas follow its own tokenization
    return [piece.lemma_ for piece in nlp(" ".join(kept))]

In [None]:
# Store the word-level token list of every cleaned document
df_posts["processed"] = df_posts["content_cleaned"].apply(word_tokenizer)

## Prepare corpus for SentencePiece

In [None]:
# Write the cleaned documents to a text file, one document per line,
# as training input for SentencePiece
corpus_file = "corpus.txt"
with open(corpus_file, "w", encoding="utf-8") as corpus_out:
    corpus_out.writelines(doc + "\n" for doc in df_posts["content_cleaned"])

### BPE Tokenizer

In [None]:
# Train a BPE model on the corpus file with a requested vocabulary of 8000;
# model_prefix=bpe names the output that is loaded below as "bpe.model".
spm.SentencePieceTrainer.Train(
    f"--input={corpus_file} --model_prefix=bpe --vocab_size=8000 --model_type=bpe"
)
# Load the trained model into a processor used for encoding later on
bpe = spm.SentencePieceProcessor()
bpe.load("bpe.model")

### Unigram tokenizer

In [None]:
# Train a unigram model on the same corpus file with the same requested
# vocabulary size (8000); loaded below as "unigram.model".
spm.SentencePieceTrainer.Train(
    f"--input={corpus_file} --model_prefix=unigram --vocab_size=8000 --model_type=unigram"
)
# Load the trained model into a processor used for encoding later on
unigram = spm.SentencePieceProcessor()
unigram.load("unigram.model")

## Compare Tokenization Schemes Impact using Corpus Statistics

Calculating:
- vocabulary size
- average tokens per document
- total tokens
- rare token frequency (<3 occurrences)
- OOV rate (for word-based tokenizers only)

In [None]:
# Corpus statistics per tokenizer

def compute_stats(tokenizer_fn, tokenizer_name, texts=None):
    """Summarise how a tokenizer splits a collection of documents.

    Args:
        tokenizer_fn: Callable mapping one document string to a sequence of tokens.
        tokenizer_name: Label stored under the "Tokenizer" key of the result.
        texts: Iterable of document strings. Defaults to
            df_posts["content_cleaned"] when omitted.

    Returns:
        dict with keys "Tokenizer", "Vocabulary Size" (distinct tokens),
        "Total Tokens", "Avg Tokens per Doc" (0.0 when there are no documents)
        and "Rare Tokens (<3)" (distinct tokens occurring fewer than 3 times).
    """
    if texts is None:
        texts = df_posts["content_cleaned"]

    # Count tokens as they stream by instead of materialising one giant list.
    counter = collections.Counter()
    doc_lengths = []
    for text in texts:
        tokens = tokenizer_fn(text)
        counter.update(tokens)
        doc_lengths.append(len(tokens))

    total_tokens = sum(doc_lengths)
    # Guard the average against an empty corpus (previously ZeroDivisionError).
    avg_tokens = total_tokens / len(doc_lengths) if doc_lengths else 0.0

    return {
        "Tokenizer": tokenizer_name,
        "Vocabulary Size": len(counter),
        "Total Tokens": total_tokens,
        "Avg Tokens per Doc": avg_tokens,
        "Rare Tokens (<3)": sum(1 for count in counter.values() if count < 3)
    }

# One statistics dict per tokenizer. The SentencePiece processors are wrapped
# in lambdas and called with out_type=str.
stats_word = compute_stats(word_tokenizer, "Word+Lemmatization")
stats_bpe = compute_stats(lambda t: bpe.encode(t, out_type=str), "BPE")
stats_unigram = compute_stats(lambda t: unigram.encode(t, out_type=str), "Unigram LM")

# Stack the three dicts into one comparison table (one row per tokenizer)
stats_df = pd.DataFrame([stats_word, stats_bpe, stats_unigram])

In [None]:
# Display the tokenizer comparison table
stats_df

In [None]:
# Index by tokenizer name so each tokenizer becomes one group of bars
plot_df = stats_df.set_index("Tokenizer")

# Grouped bar chart: one bar per metric within each tokenizer group
ax = plot_df.plot(kind="bar", figsize=(10, 6), rot=0)
ax.set_title("Comparison of Tokenizer Statistics")
ax.set_xlabel("Tokenizer")
ax.set_ylabel("Value")
ax.figure.tight_layout()
plt.show()


The tokenization analysis shows clear differences between the three approaches—Word+Lemmatization, BPE, and Unigram LM—and the results strongly align with theoretical expectations. The word-level tokenizer produced an extremely large vocabulary of 47,222 items and a very high number of rare tokens (31,098), even after lemmatization and stopword removal. This indicates severe sparsity: each spelling variant, slang term, and morphological form becomes a separate token, making the representation unstable and prone to poor generalization. In contrast, BPE reduced the vocabulary size to 8,887 and dropped rare tokens to just 1,023, demonstrating its ability to decompose infrequent or noisy words into reusable subword units. This leads to a more robust and consistent representation, though at the cost of slightly longer token sequences, which is expected for subword models. The Unigram LM tokenizer achieved similar vocabulary compression (8,962 tokens) but yielded the lowest number of rare tokens (855), reflecting its probabilistic approach to selecting the most efficient and informative subword units. Overall, the results validate the progression predicted by NLP literature: word-level tokenization is the least efficient, BPE offers substantial improvements, and Unigram LM provides the most balanced and linguistically consistent tokenization strategy for noisy text data.


# Evaluate Tokenization Schemes and effectiveness for LLM Tasks

## LSTM Language Model

In [None]:
# Enable GPU Training
# Select CUDA when available, otherwise the CPU. The training and evaluation
# functions below read this module-level `device`.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

In [None]:
# Dataset class
class LMDataset(Dataset):
    """Sliding-window next-token dataset over one flat sequence of token ids.

    Item `idx` is the pair (x, y), where x is ids[idx : idx + seq_len] and y is
    the same window shifted one position to the right, i.e. the target for
    every position of x.

    Args:
        token_ids: Flat sequence of integer token ids.
        seq_len: Length of each window.
    """

    def __init__(self, token_ids, seq_len=30):
        self.seq_len = seq_len
        # Convert once up front. Building a new tensor from a Python list slice
        # on every __getitem__ call is slow when windows overlap at stride 1.
        self.data = torch.as_tensor(token_ids, dtype=torch.long)

    def __len__(self):
        # Clamp at zero: with fewer than seq_len + 1 ids there are no full
        # windows, and a negative value would make len() raise ValueError.
        return max(0, len(self.data) - self.seq_len)

    def __getitem__(self, idx):
        x = self.data[idx:idx + self.seq_len]
        y = self.data[idx + 1:idx + 1 + self.seq_len]
        return x, y

In [None]:
# LSTM LM model
class LSTMLM(nn.Module):
    """Embedding -> single LSTM (batch_first) -> linear projection onto the vocabulary.

    Args:
        vocab_size: Number of embedding rows and size of the output layer.
        embed_dim: Embedding width.
        hidden_dim: LSTM hidden size.
    """

    def __init__(self, vocab_size, embed_dim=128, hidden_dim=256):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        """Return one vocabulary-sized score vector for every position of `x`."""
        hidden_states, _final_state = self.lstm(self.embed(x))
        logits = self.fc(hidden_states)
        return logits

In [None]:
# Training function
def train_lm(model, dataloader, epochs=3, lr=0.001):
    """Train `model` as a next-token predictor with Adam and cross-entropy loss.

    Args:
        model: Module that returns per-position vocabulary scores for a batch of ids.
        dataloader: Iterable of (x, y) id batches.
        epochs: Number of full passes over `dataloader`.
        lr: Adam learning rate.

    Returns:
        The trained model, moved to the module-level `device`.
    """
    model = model.to(device)
    # Enter training mode explicitly: perplexity() switches a model to eval
    # mode, so a model that is evaluated and then trained again would otherwise
    # stay in eval mode.
    model.train()
    optim = torch.optim.Adam(model.parameters(), lr=lr)
    loss_fn = nn.CrossEntropyLoss()

    for epoch in range(epochs):
        for x, y in dataloader:
            x, y = x.to(device), y.to(device)
            optim.zero_grad()
            pred = model(x)
            # Flatten scores and targets to one row per token for the loss.
            loss = loss_fn(pred.view(-1, pred.size(-1)), y.view(-1))
            loss.backward()
            optim.step()
    return model

In [None]:
# Perplexity
def perplexity(model, dataloader):
    """Return exp(mean per-token cross-entropy) of `model` over `dataloader` as a 0-d tensor.

    The model is switched to eval mode and gradients are disabled. Each batch's
    mean loss is weighted by its number of tokens before averaging.
    """
    model.eval()
    criterion = nn.CrossEntropyLoss()
    summed_loss = 0
    seen_tokens = 0
    with torch.no_grad():
        for inputs, targets in dataloader:
            inputs = inputs.to(device)
            targets = targets.to(device)
            logits = model(inputs)
            batch_loss = criterion(logits.view(-1, logits.size(-1)), targets.view(-1))
            n_tokens = inputs.numel()
            summed_loss += batch_loss.item() * n_tokens
            seen_tokens += n_tokens
    mean_loss = summed_loss / seen_tokens
    return torch.exp(torch.tensor(mean_loss))

### Train LSTMs per Tokenizer

In [None]:
# Hyperparameters
# -------------------------------
# SEQ: window length passed to LMDataset; BATCH: DataLoader batch size.
SEQ = 30
BATCH = 64

In [None]:
# Word-level Tokenizer
# Tokenize the corpus once and reuse the result; word_tokenizer runs spaCy on
# every document, so a second pass over the whole corpus is wasted work.
word_docs = df_posts["content_cleaned"].apply(word_tokenizer)

# Sort the vocabulary before numbering it: iterating a raw set of strings gives
# a different order (and therefore different ids) from one kernel run to the
# next, because string hashing is randomised per process.
word_vocab = {w: i for i, w in enumerate(sorted(set(itertools.chain.from_iterable(word_docs))))}
word_id = lambda toks: [word_vocab[t] for t in toks if t in word_vocab]

# Flatten all documents into one id stream for the sliding-window dataset
word_ids = list(itertools.chain.from_iterable(word_docs.apply(word_id)))
word_dl = DataLoader(LMDataset(word_ids, SEQ), batch_size=BATCH)
lm_word = LSTMLM(len(word_vocab)).to(device)
lm_word = train_lm(lm_word, word_dl)
# NOTE(review): perplexity is measured on the same data the model was trained on.
pp_word = perplexity(lm_word, word_dl)
print("Word-level LM perplexity:", pp_word.item())

In [None]:
# BPE: encode every document to ids and flatten into one stream
bpe_ids = list(itertools.chain.from_iterable(
    bpe.encode(doc) for doc in df_posts["content_cleaned"]
))
bpe_dl = DataLoader(LMDataset(bpe_ids, SEQ), batch_size=BATCH)

# Train a fresh LSTM sized to the BPE vocabulary, then score it on the same loader
lm_bpe = train_lm(LSTMLM(bpe.get_piece_size()).to(device), bpe_dl)
pp_bpe = perplexity(lm_bpe, bpe_dl)
print("BPE LM perplexity:", pp_bpe.item())

In [None]:
# Unigram: encode every document to ids and flatten into one stream
uni_ids = list(itertools.chain.from_iterable(
    unigram.encode(doc) for doc in df_posts["content_cleaned"]
))
uni_dl = DataLoader(LMDataset(uni_ids, SEQ), batch_size=BATCH)

# Train a fresh LSTM sized to the unigram vocabulary, then score it on the same loader
lm_uni = train_lm(LSTMLM(unigram.get_piece_size()).to(device), uni_dl)
pp_uni = perplexity(lm_uni, uni_dl)
print("Unigram LM perplexity:", pp_uni.item())

### Analysis and Comparison

In [None]:
# Combine the corpus statistics with the language-model perplexities.
stat_rows = [stats_word, stats_bpe, stats_unigram]
metric_names = ["Vocabulary Size", "Total Tokens", "Avg Tokens per Doc", "Rare Tokens (<3)"]

final_results = pd.DataFrame({
    "Tokenizer": ["Word+Lemmatization", "BPE", "Unigram"],
    **{name: [row[name] for row in stat_rows] for name in metric_names},
    # perplexity() returns 0-d tensors; convert to plain floats so the column
    # is numeric instead of an object column printing "tensor(...)".
    "Perplexity": [float(pp) for pp in (pp_word, pp_bpe, pp_uni)],
})
print(final_results)

# comparison = pd.DataFrame({
#     "Tokenizer": ["Word", "BPE", "Unigram"],
#     "Vocabulary Size": [len(word_vocab), bpe.get_piece_size(), unigram.get_piece_size()],
#     "Perplexity": [pp_word.item(), pp_bpe.item(), pp_uni.item()]
# })
# print(comparison)