In [1]:
import numpy as np
import re
from operator import itemgetter

In [2]:
Corpus = {
    'l o w _':5,
    'l o w e r _':2,
    'n e w e s t _':6,
    'w i d e s t _':3,
    'h a p p i e r _':2
}

In [3]:
def getPairCounts(Corpus):
    pairs = {}
    for word, fr in Corpus.items(): #fr = freequency
        symbols = word.split(' ')
        for i in range(len(symbols)-1):
            pair = (symbols[i], symbols[i+1])
            cfr = pairs.get(pair, 0) #cfr = current freequency
            pairs[pair] = cfr + fr
    return pairs

In [4]:
pairsCounts = getPairCounts(Corpus)

In [5]:
pairsCounts

{('l', 'o'): 7,
 ('o', 'w'): 7,
 ('w', '_'): 5,
 ('w', 'e'): 8,
 ('e', 'r'): 4,
 ('r', '_'): 4,
 ('n', 'e'): 6,
 ('e', 'w'): 6,
 ('e', 's'): 9,
 ('s', 't'): 9,
 ('t', '_'): 9,
 ('w', 'i'): 3,
 ('i', 'd'): 3,
 ('d', 'e'): 3,
 ('h', 'a'): 2,
 ('a', 'p'): 2,
 ('p', 'p'): 2,
 ('p', 'i'): 2,
 ('i', 'e'): 2}

In [6]:
def getBestPair(pairsCounts):
    return max(pairsCounts, key = pairsCounts.get)

In [7]:
bestPair = getBestPair(pairsCounts)
bestPair

('e', 's')

In [8]:
def mergeInCorpus(Corpus, bestPair):
    newCorpus = {}
    for word in Corpus:
        newWord = re.sub(' '.join(bestPair), ''.join(bestPair), word)
        newCorpus[newWord] = Corpus[word]
    return newCorpus

In [9]:
# bestPair = getBestPair(pairsCounts)
newCorpus = mergeInCorpus(Corpus, bestPair)

In [10]:
newCorpus

{'l o w _': 5,
 'l o w e r _': 2,
 'n e w es t _': 6,
 'w i d es t _': 3,
 'h a p p i e r _': 2}

In [11]:
def runBPE(Corpus, k):
    bpeStats = {}
    for i in range (k):
        pairsCounts = getPairCounts(Corpus)
        if not pairsCounts:
            break
        bestPair = getBestPair(pairsCounts)
        bpeStats[bestPair] = i
        Corpus = mergeInCorpus(Corpus, bestPair)
    return Corpus, bpeStats

In [12]:
Corpus = {
    'l o w _':5,
    'l o w e r _':2,
    'n e w e s t _':6,
    'w i d e s t _':3,
    'h a p p i e r _':2
}

newCorpus, bpeStats = runBPE(Corpus, 10)

In [13]:
newCorpus

{'low_': 5, 'low er _': 2, 'newest_': 6, 'w i d est_': 3, 'h a p p i er _': 2}

In [14]:
bpeStats

{('e', 's'): 0,
 ('es', 't'): 1,
 ('est', '_'): 2,
 ('l', 'o'): 3,
 ('lo', 'w'): 4,
 ('n', 'e'): 5,
 ('ne', 'w'): 6,
 ('new', 'est_'): 7,
 ('low', '_'): 8,
 ('e', 'r'): 9}

In [15]:
newWord = 'lowest'
newWord2 = ' '.join(list(newWord))+' _'

word3 = newWord2.split(' ')
word4 = list(newWord+'_')
l1 = word3[2]
l2 = word4[2]

In [16]:
print(newWord)
print(newWord2)
print(word3)
print(word4)
print(l1)
print(l2)

lowest
l o w e s t _
['l', 'o', 'w', 'e', 's', 't', '_']
['l', 'o', 'w', 'e', 's', 't', '_']
w
w


In [17]:
def getAllPairs(word):
    pairs = []
    # word = list(word)
    word = word.split(' ') #if sent splitted word
    # word = list(word+'_') #if sent direct word
    prevChar = word[0]
    for char in word[1:]:
        pairs.append((prevChar, char))
        prevChar = char
    return pairs

In [18]:
pairs = getAllPairs(newWord2) #if sent splitted word
# pairs = getAllPairs(newWord) #if sent direct word

In [19]:
pairs

[('l', 'o'), ('o', 'w'), ('w', 'e'), ('e', 's'), ('s', 't'), ('t', '_')]

In [20]:
def getPairToBeMerged(bpeStats, pairs):
    # bpeCodes = []
    # for pair in pairs:
    #     if pair in bpeStats:
    #         bpeCodes.append((pair, bpeStats[pair]))
        
    bpeCodes = [(pair, bpeStats[pair]) for pair in pairs if pair in bpeStats]
    if len(bpeCodes) == 0:
        return (-1,-1)
    pairToBeMerged = min(bpeCodes, key=itemgetter(1))[0]
    return pairToBeMerged

In [21]:
pairToBeMerged = getPairToBeMerged(bpeStats, pairs)
pairToBeMerged

('e', 's')

In [22]:
def mergeLetters(word, pairToBeMerged):
    # word = ' '.join(list(word))+' _' #if sent direct word
    newWord = re.sub(' '.join(pairToBeMerged), ''.join(pairToBeMerged), word)
    return newWord

In [23]:
mergeLetters(newWord2, pairToBeMerged)
# mergeLetters(newWord, pairToBeMerged) #if sent direct word

'l o w es t _'

In [24]:
def bpeTokenize(word, bpeStats):
    if len(word) == 1:
        return word
    word = ' '.join(list(word))+' _' #if sent direct word

    while True:
        pairs = getAllPairs(word)
        pairToBeMerged = getPairToBeMerged(bpeStats, pairs)
        if pairToBeMerged[0] == -1:
            break
        word = mergeLetters(Word, pairToBeMerged)
    return word        

In [25]:
bpeTokenize('lowest', bpeStats)

NameError: name 'Word' is not defined