In [56]:
import re
import numpy as np
import pandas as pd
import os
import glob

## prepare docs

In [102]:
d1 = "Climate change is accelerating faster than previously thought, \
      according to a new report from the Intergovernmental Panel on \
      Climate Change (IPCC). The report highlights that global \
      temperatures have risen by an average of 1.1°C since the late \
      19th century, with most of this increase occurring in the last 40 years. Rising sea levels, more frequent extreme weather events, and loss of biodiversity are some of the key impacts outlined in the report."
d2 = "The human genome consists of approximately 3 billion base pairs of DNA, which are organized into 23 pairs of chromosomes. Each gene contains the instructions necessary to build proteins, which are the molecular machines that carry out many functions in the body. The process of gene expression involves transcription, where DNA is copied into RNA, and translation, where RNA is used to build proteins."
d3 = "Person A: Hey, did you finish the report for the marketing team? \
      Person B: Not yet, I’m almost done. I just need to add a few more details. \
      Person A: Alright, make sure it’s submitted by 5 PM. \
      Person B: No problem, I’ve got it covered."
d4 = "n the year 1066, the Battle of Hastings took place in England, marking a pivotal moment in the country’s history. William the Conqueror, the Duke of Normandy, defeated King Harold II, which led to the Norman conquest of England. This event significantly influenced the culture, language, and governance of England."
d5 = "I recently bought the new XYZ smartphone, and I have to say, I’m really impressed. The camera quality is excellent, especially in low light. The battery life easily lasts a full day with heavy use, and the screen resolution is sharp and vibrant. My only complaint is the fingerprint sensor, which can be a bit finicky at times."


docs = [d1, d2, d3, d4, d5]

## docs loader

In [59]:

def docs_loader(corpus_dir:str):
    fpaths = glob.iglob(os.path.join(corpus_dir, '*.txt'))
    docs = []
    for fp in fpaths:
        with open(fp, 'r', encoding= "utf-8") as file:
            doc = file.read()
            docs.append(doc)
    return docs
        


# Tokenization

## Normal Tokenization

In [104]:
def tokenize(docs:list[str]):
    sep = r'(?<=\w)(?=[,.!?:;"\'\[\]\(\)])|(?<=[,.!?:;"\'\[\]\(\)])(?=\w)|\s+'
    pattern = re.compile(sep)
    for d in docs:
        words = pattern.split(d.strip().lower())
        for w in words:
            yield w


#seen = set(tokenize(docs))
#print(seen)

{'used', 'camera', 'say', 'risen', 'more', 'battery', 'bought', 'rising', 'william', 'no', 'by', 'governance', '3', 'organized', 'outlined', 'problem', 'where', 'intergovernmental', 'are', 'moment', 'you', 'conquest', 'rna', 'billion', 'i', 'make', 'instructions', 'with', 'events', 'climate', 'out', 'proteins', 'it', 'details', 'year', '(', 'late', 'vibrant', 'bit', 'a', 'covered', 'influenced', 'expression', 'n', 'country’s', 'my', 'few', 'only', 'from', 'took', 'culture', 'times', 'gene', 'translation', 'machines', 'just', 'really', 'yet', 'for', 'since', 'this', 'not', 'history', 'into', 'ipcc', 'involves', 'life', '40', 'and', 'harold', 'report', 'some', 'excellent', 'norman', 'duke', 'significantly', 'sensor', 'according', 'biodiversity', 'sure', '23', '1°c', '5', 'did', 'occurring', 'it’s', 'got', 'marking', 'low', 'of', 'base', 'defeated', 'full', 'loss', 'average', 'weather', 'i’m', 'battle', 'key', 'transcription', 'on', 'century', 'chromosomes', 'alright', 'approximately', 'd

## BPE (byte-pair encoding)

In [98]:
from collections import Counter


def get_pair_counts(tokenized_doc:list[str]):
    pair_counts = Counter()
    for word in tokenized_doc:
        chars = word.split()
        for i in range(len(chars)-1):
            pair = (chars[i],chars[i+1])
            pair_counts[pair] += 1
    return pair_counts


def BPE_train_english(docs:list[str], numMerges:int = 500):
    vocabs = set()
    for d in docs:
        vocabs.update(re.split(r"", d.strip().lower()))
    if '' in vocabs:
        vocabs.remove('')

    word_tokens = []
    for d in docs:
        words = d.strip().lower().split()
        word_tokens.extend([' '.join(w) for w in words])

    merged_rule = []
    for i in range(numMerges):
        pair_counts = get_pair_counts(word_tokens)
        if not pair_counts:
            break
        pair = pair_counts.most_common(1)[0][0]
        vocabs.add(''.join(pair))

        new_word_tokens = []
        for word in word_tokens:
            pattern = re.escape(" ".join(pair))
            merged_word = re.sub(pattern, "".join(pair), word)
            new_word_tokens.append(merged_word)
        
        word_tokens = new_word_tokens
        merged_rule.append(pair)
    
    
    for word in word_tokens:
        vocabs.update(word.split())

    return vocabs, merged_rule



def BPE_infer_english(phrase:str, merged_rule:list[tuple]):
    tokenized_doc = [' '.join(w) for w in phrase.split()]
    for pair in merged_rule:
        pattern = re.escape(" ".join(pair))
        new_tokenized_doc = []
        for word in tokenized_doc:
            merged_word = re.sub(pattern, "".join(pair),word)
            new_tokenized_doc.append(merged_word)
            tokenized_doc = new_tokenized_doc
    return tokenized_doc


def BPE_train_chinese(docs:list[str], numMerge:int=500):
    # initialize vocabs
    vocabs = set()
    for d in docs:
        chars = list(d.strip())
        vocabs.update(chars)
    

    # initialze split
    tokenized_doc = []
    for d in docs:
        tokenized_doc.append(' '.join(d.strip()))
        #print(' '.join(d.strip()))
    
    # merge words
    merged_rule = []
    for i in range(numMerge):
        pair_counts = get_pair_counts(tokenized_doc)
        if not pair_counts:
            break
        pair = pair_counts.most_common(1)[0][0]
        merged_rule.append(pair)

        # substitude
        new_tokenized_doc = []
        for tokenized_str in tokenized_doc:
            merged_word = re.sub(re.escape(" ".join(pair)), ''.join(pair), tokenized_str)            
            new_tokenized_doc.append(merged_word)
        tokenized_doc = new_tokenized_doc
    
    #print(tokenized_doc)


    # update vocabs
    for phrase in tokenized_doc:
        vocabs.update(phrase.split())

    return vocabs, merged_rule


#print(BPE_infer("Alright, make sure it’s submitted by", merged_rule))

## Unigram Algorithm

In [110]:


def get_len_n_subwords(word:str, n:int):
    subwords = []
    for i in range(len(word)-n+1):
        ppp = []
        for k in range(n):
            ppp.append(word[i+k])
        subwords.append(tuple(ppp))
    return subwords

def segmente(vocabs, )
        

def get_loss(more_gram:Counter, docs:list[str],removal_word:str= ''):
    doc_size = len(docs)
    tot_freq = more_gram.total()

    
    for i in range(doc_size):
        summation = []

        offset = more_gram[removal_word] if removal_word else 0

        for pair, count in more_gram.items():
            n_gram = ' '.join(pair)
            if removal_word == n_gram:
                continue
            px = count/(tot_freq-offset)
            summation.append(px)

        inner = np.log10(sum(summation))




def unigram_algo_english(docs:list[str], n:int):
    # populate vocabs (unigram and more_gram)
    tokenized_doc = tokenize(docs)
    vocabs = set('abcdefghijklmnopqrstuvwxyz')
    
    more_gram = Counter()
    for i in range(n-1):
        length = i+2
        # determine what-gram we looking for, and iterate to get those grams
        for word in tokenized_doc:
            #pairs = get_pairs(word, length)
            more_gram.update()
    
    # pruning
    # get total loss


    # update final vocab

   


[('a', 'p', 'p'), ('p', 'p', 'l'), ('p', 'l', 'e')]

In [100]:
docs = docs_loader("./data")
vocabs , merged_rule = BPE_train_chinese(docs, 200)
print(vocabs)

{'客', '些', '修', '颁', '天癸', '界', '地', '惰', '无论', '潮', '趣', '步', '延', '诗', '现', '保养', '来', '将', '四', '生活', '的，以', '及', '俱', '时，任', '专', '腾', '万物的', '，牙齿', '很', '上', '刚', '压', '蓬', '精神', '养', '天', '睡', '域', '构', '不会', '随着', '干', '量', '区', '泻', '索', '即', '怒', '供', '围', '标', '暖', '七', '命', '炫', '阴阳', '岁，肾气', '葵', '岁', '惑', '蕴', '真气', '，形体', '润', '盛，人们', '世俗', '。其', '变化', '辰', '会', '厌', '答', '女子', '五脏', '受', '助', '般', '世', '传', '生命', '耗散', '的', '热', '可以', '和', '追', '耳', '歌', '云', '患', '场', '元', '操', '生育', '明', '厥', '色', '活', '炼', '喜', '仍', '月', '怨', '乱', '息', '止', '弱', '后', '同', '沉', '的时候', '晚', '。如果', '圣人', '良', '神', '散', '春', '合', '肃', '展', '有子', '秩', '眼', '，精神', '筋', '置', '。如', '歧伯', '者，以', '酒', '列', '度', '纪', '立', '茂', '呼', '久', '样', '排', '危', '生育子女', '，以致', '暴', '入', '治', '房', '伤', '脱', '顺从', '绪', '伐', '由', '俗', '成', '务', '男', '恨', '伏', '密', '衰', '肺', '的变化', '滋', '呢', '让', '东', '精神，使', '防', '验', '痿', '龟', '迅', '创', '肉', '前', '软', '四时', '望', '稳', '闭', '等', '子', '去', '掘', '未', '悠', '它', '

# Term Frequency Table

In [26]:
tf_table = pd.DataFrame(columns=range(len(docs)), index=list(vocabs))
for v in vocabs:
    for di in range(len(docs)):
        d = docs[di]
        freq = len(re.findall(v, d, flags=re.IGNORECASE))
        tf_table.loc[v, di] = freq
tf_table.head()


Unnamed: 0,0,1,2,3,4
,475,402,254,315,328
used,0,1,0,0,0
camera,0,0,0,0,1
say,0,0,0,0,1
risen,1,0,0,0,0


# generate inverted indexed matrix

In [None]:
IDM = {}
for v in vocabs:
    wposi = {}
    for di in range(len(docs)):
        d = docs[di]
        matches = re.finditer(v, d, flags=re.IGNORECASE)
        posis = []
        for match in matches:
            pid = match.start()
            posis.append(pid)
        wposi[di] = sorted(posis)
    wposi['freq'] = len(wposi)
    IDM[v] = wposi

print(IDM['is'][0])