In [37]:
import re
import math
import copy

In [38]:
def preprocess(sentence):
    # 1. Handle zero-width joiner
    sentence = re.sub("\u200c", " ", sentence)

    # 2. Replace URLs (http, https, www)
    sentence = re.sub(r'https?://\S+|www\.\S+', '<URL>', sentence)

    # 3. Replace numbers (any continuous digits)
    sentence = re.sub(r'\d+', '<NUMBER>', sentence)

    sentence = re.sub(r'[^\w\s]', ' <PUNCT> ', sentence)

    sentence = sentence.lower()

    tokens = sentence.split()

    return tokens

In [39]:
def compute_tf_with_normalization(sentence, smoothing=False):
    TF = {}
    for word in sentence:
        try:
            TF[word] += 1
        except:
            TF[word] = 1

    length = len(sentence)

    if not smoothing:
        for key in TF.keys():
            TF[key] /= length

    else:
        denom = 0
        for key in TF.keys():
            denom += (1 + math.log(TF[key]))

        for key in TF.keys():
            TF[key] /= denom

    return TF


In [40]:
def compute_idf(sentence, sentences, smoothing=False):
    IDF = {}
    N = len(sentences)
    for word in sentence:
        IDF[word] = 0
        for s in sentences:
            if word in s:
                IDF[word] += 1

    if not smoothing:
        for key in IDF.keys():
            IDF[key] = math.log(N/IDF[key])

    else:
        for key in IDF.keys():
            IDF[key] = math.log(((1+N)/(1+IDF[key]))) + 1

    return IDF

In [41]:
def compute_tf_idf_scores(sentences, smoothing=False):
    TF_IDF = {}
    for sentence in sentences:
        List = []
        TF = compute_tf_with_normalization(sentence, smoothing)
        IDF = compute_idf(sentence, sentences, smoothing)
        for key in TF.keys():
            List.append(TF[key]*IDF[key])

        TF_IDF[tuple(sentence)] = List

    return TF_IDF

In [42]:
def main():
    sentences = [
        "Apple released the new iPhone 15 today! Visit https://apple.com",
        "The price of the iPhone 15 is 799 dollars.",
        "Check www.example.com for more details."
    ]

    preprocessed_sentences = [preprocess(s) for s in sentences]

    print("=== Preprocessed Sentences ===")
    for i, s in enumerate(preprocessed_sentences):
        print(f"Sentence {i+1}: {s}")

    tfidf_results = compute_tf_idf_scores(preprocessed_sentences, smoothing=False)

    print("\n=== TF-IDF Scores (per sentence) ===")
    for i, sentence_tokens in enumerate(preprocessed_sentences):
        print(f"\nSentence {i+1}:")
        tfidf_list = tfidf_results[tuple(sentence_tokens)]
        for token, score in zip(sentence_tokens, tfidf_list):
            print(f"{token:15} -> {score:.6f}")

main()

=== Preprocessed Sentences ===
Sentence 1: ['apple', 'released', 'the', 'new', 'iphone', '<punct>', 'number', '<punct>', 'today', '<punct>', 'visit', '<punct>', 'url', '<punct>']
Sentence 2: ['the', 'price', 'of', 'the', 'iphone', '<punct>', 'number', '<punct>', 'is', '<punct>', 'number', '<punct>', 'dollars', '<punct>']
Sentence 3: ['check', '<punct>', 'url', '<punct>', 'for', 'more', 'details', '<punct>']

=== TF-IDF Scores (per sentence) ===

Sentence 1:
apple           -> 0.078472
released        -> 0.078472
the             -> 0.028962
new             -> 0.078472
iphone          -> 0.028962
<punct>         -> 0.000000
number          -> 0.028962
<punct>         -> 0.078472
today           -> 0.078472
<punct>         -> 0.028962

Sentence 2:
the             -> 0.057924
price           -> 0.078472
of              -> 0.078472
the             -> 0.028962
iphone          -> 0.000000
<punct>         -> 0.057924
number          -> 0.078472
<punct>         -> 0.078472

Sentence 3:
check   

In [43]:
from collections import Counter, defaultdict

In [44]:
sentences = [
    "The boy hugs the cat.",
    "The boys are hugging the dogs.",
    "The dogs are chasing the cats.",
    "The dog and the cat sit quietly.",
    "The boy is sitting on the dog."
]

def preprocess(sentences):
    processed = []
    for s in sentences:
        s = s.lower()
        s = re.sub(r'([.,!?])', r' \1 ', s)
        s = re.sub(r'\s+', ' ', s).strip()
        tokens = s.split()
        processed.append(tokens)
    return processed

preprocessed_sentences = preprocess(sentences)
print(preprocessed_sentences)

[['the', 'boy', 'hugs', 'the', 'cat', '.'], ['the', 'boys', 'are', 'hugging', 'the', 'dogs', '.'], ['the', 'dogs', 'are', 'chasing', 'the', 'cats', '.'], ['the', 'dog', 'and', 'the', 'cat', 'sit', 'quietly', '.'], ['the', 'boy', 'is', 'sitting', 'on', 'the', 'dog', '.']]


In [45]:
# Initialize vocab with all characters and a special end-of-word token ##
vocab = Counter()

# Split each word into characters + end-of-word marker (##)
def get_initial_vocab(sentences):
    words = []
    for sent in sentences:
        for token in sent:
            chars = list(token)
            chars.append('##')
            words.append(chars)
    return words

word_sequences = get_initial_vocab(preprocessed_sentences)

for seq in word_sequences:
    vocab.update([''.join(seq)])

print("Initial vocab:", vocab)

Initial vocab: Counter({'the##': 10, '.##': 5, 'boy##': 2, 'cat##': 2, 'are##': 2, 'dogs##': 2, 'dog##': 2, 'hugs##': 1, 'boys##': 1, 'hugging##': 1, 'chasing##': 1, 'cats##': 1, 'and##': 1, 'sit##': 1, 'quietly##': 1, 'is##': 1, 'sitting##': 1, 'on##': 1})


In [46]:
def get_pair_counts(word_sequences):
    pairs = Counter()
    for seq in word_sequences:
        for i in range(len(seq)-1):
            pair = (seq[i], seq[i+1])
            pairs[pair] += 1
    return pairs

pair_counts = get_pair_counts(word_sequences)
print("Top pairs:", pair_counts.most_common(10))

Top pairs: [(('e', '##'), 12), (('t', 'h'), 10), (('h', 'e'), 10), (('s', '##'), 6), (('.', '##'), 5), (('g', '##'), 5), (('d', 'o'), 4), (('o', 'g'), 4), (('b', 'o'), 3), (('o', 'y'), 3)]


In [47]:
def merge_pair(pair_to_merge, word_sequences):
    new_sequences = []
    bigram = ''.join(pair_to_merge)
    for seq in word_sequences:
        i = 0
        new_seq = []
        while i < len(seq):
            if i < len(seq)-1 and seq[i] == pair_to_merge[0] and seq[i+1] == pair_to_merge[1]:
                new_seq.append(bigram)
                i += 2
            else:
                new_seq.append(seq[i])
                i += 1
        new_sequences.append(new_seq)
    return new_sequences

In [48]:
num_merges = 20

for i in range(num_merges):
    pairs = get_pair_counts(word_sequences)
    if not pairs:
        break
    most_freq_pair = pairs.most_common(1)[0][0]
    word_sequences = merge_pair(most_freq_pair, word_sequences)
    vocab[''.join(most_freq_pair)] += pairs[most_freq_pair]

print("Final WordPiece vocabulary:")
print(sorted(vocab.keys()))

Final WordPiece vocabulary:
['.##', 'and##', 'ar', 'are##', 'bo', 'boy', 'boy##', 'boys##', 'ca', 'cat', 'cat##', 'cats##', 'chasing##', 'do', 'dog##', 'dogs##', 'e##', 'g##', 'gs##', 'hu', 'hugging##', 'hugs##', 'in', 'ing##', 'is##', 'on##', 'quietly##', 's##', 'sit##', 'sitting##', 'th', 'the##']


In [49]:
def wordpiece_tokenize(sentence, vocab):
    sentence = sentence.lower()
    sentence = re.sub(r'([.,!?])', r' \1 ', sentence)
    sentence = re.sub(r'\s+', ' ', sentence).strip()
    words = sentence.split()

    tokens = []
    for word in words:
        chars = list(word) + ['##']
        i = 0
        current_tokens = []
        while i < len(chars):
            match = None
            for j in range(len(chars), i, -1):
                candidate = ''.join(chars[i:j])
                if candidate in vocab:
                    match = candidate
                    break
            if match:
                current_tokens.append(match)
                i += len(match)
            else:
                current_tokens.append(chars[i])
                i += 1
        tokens.extend(current_tokens)
    return tokens

test_sentence = "The cat is chasing the dog quietly."
tokenized_sentence = wordpiece_tokenize(test_sentence, vocab)
print("Tokenized sentence:", tokenized_sentence)

Tokenized sentence: ['the##', 'cat##', 'is##', 'chasing##', 'the##', 'dog##', 'quietly##', '.##']


In [50]:
def NGram(Gram: int = 1, Paragraph: list = None, Smoothing = None):
    L_List = []
    G_List = []

    if Paragraph is None:
        return

    # Function to flatten nested lists inside each sentence
    def flatten(sentence):
        flat = []
        for w in sentence:
            if isinstance(w, list):
                flat.extend(w)   # add inner tokens
            else:
                flat.append(w)
        return flat

    # --- Unigrams ---
    if Gram >= 1:
        Gram_1 = {}
        total = 0
        for sentence in Paragraph:
            sentence = flatten(sentence)   # ðŸŒŸ FIX APPLIED HERE
            for word in sentence:
                Gram_1[word] = Gram_1.get(word, 0) + 1
                total += 1

        Count_1 = copy.deepcopy(Gram_1)
        for k in Gram_1:
            Gram_1[k] = Gram_1[k] / total

        L_List.append(Count_1)
        G_List.append(Gram_1)

    # --- Bigrams ---
    if Gram >= 2:
        Gram_2 = {}
        for sentence in Paragraph:
            sentence = flatten(sentence)   # ðŸŒŸ FIX APPLIED HERE TOO
            for i in range(len(sentence)-1):
                s = tuple(sentence[i:i+2])
                Gram_2[s] = Gram_2.get(s, 0) + 1

        Count_2 = copy.deepcopy(Gram_2)
        for k in Gram_2:
            Gram_2[k] = Gram_2[k] / Count_1.get(k[0], 1)

        L_List.append(Count_2)
        G_List.append(Gram_2)

    # --- Trigrams ---
    if Gram >= 3:
        Gram_3 = {}
        for sentence in Paragraph:
            sentence = flatten(sentence)
            for i in range(len(sentence)-2):
                s = tuple(sentence[i:i+3])
                Gram_3[s] = Gram_3.get(s, 0) + 1

        Count_3 = copy.deepcopy(Gram_3)
        for k in Gram_3:
            Gram_3[k] = Gram_3[k] / Count_2.get(k[:2], 1)

        L_List.append(Count_3)
        G_List.append(Gram_3)

    # --- Quadgrams ---
    if Gram >= 4:
        Gram_4 = {}
        for sentence in Paragraph:
            sentence = flatten(sentence)
            for i in range(len(sentence)-3):
                s = tuple(sentence[i:i+4])
                Gram_4[s] = Gram_4.get(s, 0) + 1

        Count_4 = copy.deepcopy(Gram_4)
        for k in Gram_4:
            Gram_4[k] = Gram_4[k] / Count_3.get(k[:3], 1)

        L_List.append(Count_4)
        G_List.append(Gram_4)

    # --- Apply Smoothing if Provided ---
    if Smoothing is not None:
        L1, G1 = L_List[0], G_List[0]
        L2 = L3 = L4 = None
        G2 = G3 = G4 = None
        if Gram >= 2:
            L2, G2 = L_List[1], G_List[1]
        if Gram >= 3:
            L3, G3 = L_List[2], G_List[2]
        if Gram >= 4:
            L4, G4 = L_List[3], G_List[3]

        Smoothing(Gram=Gram, L1=L1, L2=L2, L3=L3, L4=L4,
                  G1=G1, G2=G2, G3=G3, G4=G4, Data=None)

    return L_List, G_List

In [51]:
def AddK_Smoothing(Gram=1, L1=None, L2=None, L3=None, L4=None, G1=None, G2=None, G3=None, G4=None, Data=None, K=0.3):

    if Gram >= 1:
        total_count = sum(L1.values())
        V = len(L1)  # vocabulary size
        for word in L1:
            G1[word] = (L1[word] + K) / (total_count + K * V)

    if Gram >= 2 and L2 is not None:
        for bigram in L2:
            history = bigram[0]
            history_count = L1.get(history, 0)
            G2[bigram] = (L2[bigram] + K) / (history_count + K * len(L1))

    if Gram >= 3 and L3 is not None:
        for trigram in L3:
            history = trigram[:2]
            history_count = L2.get(history, 0)
            G3[trigram] = (L3[trigram] + K) / (history_count + K * len(L1))

    if Gram >= 4 and L4 is not None:
        for quadgram in L4:
            history = quadgram[:3]
            history_count = L3.get(history, 0)
            G4[quadgram] = (L4[quadgram] + K) / (history_count + K * len(L1))

In [52]:
Inform = ["Check out https://example.com for more info!", "Your package #12345 will arrive tomorrow.", "Download the report from https://reports.com."]
Reminder = ["Meeting at 3pm, don't forget to bring the files.", "The meeting is starting in 10 minutes.", "Reminder: submit your timesheet by 5pm today."]
Promo = ["Order 3 items, get 1 free! Limited offer!!!", "Win $1000 now, visit http://winbig.com!!!", "Exclusive deal for you: buy 2, get 1 free!!!"]

Inform = [preprocess(s) for s in Inform]
Reminder = [preprocess(s) for s in Reminder]
Promo = [preprocess(s) for s in Promo]

In [53]:
Inform_Count, Inform_Probability = NGram(2, Inform, AddK_Smoothing)
Reminder_Count, Reminder_Probability = NGram(2, Reminder, AddK_Smoothing)
Promo_Count, Promo_Probability = NGram(2, Promo, AddK_Smoothing)

In [54]:
def sentence_probability(tokens, Count):
    # Flatten any nested lists (IMPORTANT FIX)
    flat = []
    for t in tokens:
        if isinstance(t, list):
            flat.extend(t)
        else:
            flat.append(t)
    tokens = flat  # overwrite

    unigram_counts = Count[0]
    bigram_counts = Count[1]

    V = len(unigram_counts)
    K = 1  # smoothing

    prob = 1

    for i in range(len(tokens)-1):
        bg = (tokens[i], tokens[i+1])  # ensure tuple, not list

        bg_count = bigram_counts.get(bg, 0)
        history_count = unigram_counts.get(tokens[i], 0)

        prob_bg = (bg_count + K) / (history_count + K * V)
        prob *= prob_bg

    return prob

In [55]:
test_sentence = "You will get an exclusive offer in the meeting!"
test_tokens = preprocess(test_sentence)

prob_Inform = sentence_probability(test_tokens, Inform_Count)
prob_Reminder = sentence_probability(test_tokens, Reminder_Count)
prob_Promo = sentence_probability(test_tokens, Promo_Count)

In [56]:
print(f"Inform: {prob_Inform} | Reminder: {prob_Reminder} | Promotion: {prob_Promo}")
categories = {
    "Inform": prob_Inform,
    "Reminder": prob_Reminder,
    "Promotion": prob_Promo
}
predicted_label = max(categories.items(), key=lambda x: x[1])[0]
print(f"Predicted class: {predicted_label}")

Inform: 9.691501262634235e-55 | Reminder: 7.623614053873445e-49 | Promotion: 4.102768402059395e-52
Predicted class: Reminder
