In [18]:
import re
import numpy as np
import pandas as pd

## prepare docs

In [27]:
d1 = "Climate change is accelerating faster than previously thought, \
      according to a new report from the Intergovernmental Panel on \
      Climate Change (IPCC). The report highlights that global \
      temperatures have risen by an average of 1.1°C since the late \
      19th century, with most of this increase occurring in the last 40 years. Rising sea levels, more frequent extreme weather events, and loss of biodiversity are some of the key impacts outlined in the report."
d2 = "The human genome consists of approximately 3 billion base pairs of DNA, which are organized into 23 pairs of chromosomes. Each gene contains the instructions necessary to build proteins, which are the molecular machines that carry out many functions in the body. The process of gene expression involves transcription, where DNA is copied into RNA, and translation, where RNA is used to build proteins."
d3 = "Person A: Hey, did you finish the report for the marketing team? \
      Person B: Not yet, I’m almost done. I just need to add a few more details. \
      Person A: Alright, make sure it’s submitted by 5 PM. \
      Person B: No problem, I’ve got it covered."
d4 = "n the year 1066, the Battle of Hastings took place in England, marking a pivotal moment in the country’s history. William the Conqueror, the Duke of Normandy, defeated King Harold II, which led to the Norman conquest of England. This event significantly influenced the culture, language, and governance of England."
d5 = "I recently bought the new XYZ smartphone, and I have to say, I’m really impressed. The camera quality is excellent, especially in low light. The battery life easily lasts a full day with heavy use, and the screen resolution is sharp and vibrant. My only complaint is the fingerprint sensor, which can be a bit finicky at times."


docs = [d1, d2, d3, d4, d5]

# Tokenization

## Normal Tokenization

In [30]:
def tokenize(docs:list[str]):
    sep = r'[ \n\r\t\f,.\(\)\?:]+'
    pattern = re.compile(sep)
    vocabs = set()
    for d in docs:
        words = pattern.split(d.strip().lower())
        vocabs.update(words)
    if '' in vocabs:
        vocabs.remove('')
    return vocabs

#print(tokenize(docs))

{'used', 'camera', 'say', 'risen', 'more', 'battery', 'bought', 'rising', 'william', 'no', 'by', 'governance', '3', 'organized', 'outlined', 'problem', 'where', 'intergovernmental', 'are', 'moment', 'you', 'conquest', 'rna', 'billion', 'i', 'make', 'instructions', 'with', 'events', 'climate', 'out', 'proteins', 'it', 'details', 'year', 'late', 'vibrant', 'bit', 'a', 'covered', 'influenced', 'expression', 'n', 'country’s', 'my', 'few', 'only', 'from', 'took', 'culture', 'times', 'gene', 'translation', 'machines', 'just', 'really', 'yet', 'for', 'since', 'this', 'not', 'history', 'into', 'ipcc', 'involves', 'life', '40', 'and', 'harold', 'report', 'some', 'excellent', 'norman', 'duke', 'significantly', 'sensor', 'according', 'biodiversity', 'sure', '23', '1°c', '5', 'did', 'occurring', 'it’s', 'got', 'marking', 'low', 'of', 'base', 'defeated', 'full', 'loss', 'average', 'weather', 'i’m', 'battle', 'key', 'transcription', 'on', 'century', 'chromosomes', 'alright', 'approximately', 'dna', 

## BPE (byte-pair encoding)

In [51]:
from collections import Counter


def get_pair_counts(tokenized_doc:list[str]):
    pair_counts = Counter()
    for word in tokenized_doc:
        chars = word.split()
        for i in range(len(chars)-1):
            pair = (chars[i],chars[i+1])
            pair_counts[pair] +=1
    return pair_counts


def BPE_training(docs:list[str], numMerges:int = 1000):
    vocabs = set()
    for d in docs:
        vocabs.update(re.split(r"", d.strip().lower()))
    if '' in vocabs:
        vocabs.remove('')

    word_tokens = []
    for d in docs:
        words = d.strip().lower().split()
        word_tokens.extend([' '.join(w) for w in words])

    merged_rule = []
    for i in range(numMerges):
        pair_counts = get_pair_counts(word_tokens)
        if not pair_counts:
            break
        pair = pair_counts.most_common(1)[0][0]
        vocabs.add(''.join(pair))

        new_word_tokens = []
        for word in word_tokens:
            pattern = re.escape(" ".join(pair))
            merged_word = re.sub(pattern, "".join(pair), word)
            new_word_tokens.append(merged_word)
        
        word_tokens = new_word_tokens
        merged_rule.append(pair)
    
    
    for word in word_tokens:
        vocabs.update(word.split())

    return vocabs, merged_rule



def BPE_infer(phrase:str, merged_rule:list[tuple]):
    tokenized_doc = [' '.join(w) for w in phrase.split()]
    for pair in merged_rule:
        pattern = re.escape(" ".join(pair))
        new_tokenized_doc = []
        for word in tokenized_doc:
            merged_word = re.sub(pattern, "".join(pair),word)
            new_tokenized_doc.append(merged_word)
            tokenized_doc = new_tokenized_doc
    return "  ".join(tokenized_doc)


vocabs , merged_rule = BPE_training(docs)
print(BPE_infer("Alright, make sure it’s submitted by", merged_rule))

A l r i ght ,  make  sure  it’s  submitted  by


# Term Frequency Table

In [26]:
tf_table = pd.DataFrame(columns=range(len(docs)), index=list(vocabs))
for v in vocabs:
    for di in range(len(docs)):
        d = docs[di]
        freq = len(re.findall(v, d, flags=re.IGNORECASE))
        tf_table.loc[v, di] = freq
tf_table.head()


Unnamed: 0,0,1,2,3,4
,475,402,254,315,328
used,0,1,0,0,0
camera,0,0,0,0,1
say,0,0,0,0,1
risen,1,0,0,0,0


# generate inverted indexed matrix

In [None]:
IDM = {}
for v in vocabs:
    wposi = {}
    for di in range(len(docs)):
        d = docs[di]
        matches = re.finditer(v, d, flags=re.IGNORECASE)
        posis = []
        for match in matches:
            pid = match.start()
            posis.append(pid)
        wposi[di] = sorted(posis)
    wposi['freq'] = len(wposi)
    IDM[v] = wposi

print(IDM['is'][0])