In [None]:
import math
from collections import defaultdict

# Counts of adjacent word pairs, keyed by (first_word, second_word).
# Each pair is listed exactly once: a repeated key in a dict literal silently
# overwrites the earlier entry, which hides typos in the table.
bigram_counts = {
    ('the', 'longest'): 3000,
    ('longest', 'list'): 1000,
    ('list', 'of'): 2000,
    ('of', 'the'): 2500,
    ('stuff', 'at'): 1000,
    ('at', 'the'): 2000,
    ('longest', 'domain'): 500,
    ('domain', 'name'): 1500,
    ('name', 'at'): 800,
    ('at', 'long'): 1000,
    ('long', 'last'): 1200,
}

# Counts of individual words. The keys double as the segmenter's vocabulary,
# and the values are the denominators used by get_bigram_cost.
unigram_counts = {
    'the': 50000,
    'longest': 2000,
    'list': 3000,
    'of': 40000,
    'stuff': 1500,
    'at': 35000,
    'domain': 1200,
    'name': 2000,
    'long': 2500,
    'last': 2200,
}

def get_bigram_cost(w1, w2, unseen_cost=10):
    """Return the cost of word ``w2`` directly following word ``w1``.

    The cost is ``-log(count(w1, w2) / count(w1))``, with counts read from the
    module-level ``bigram_counts`` and ``unigram_counts`` tables.

    Args:
        w1: The preceding word.
        w2: The word that follows ``w1``.
        unseen_cost: Penalty returned when the pair cannot be scored. Defaults
            to 10, the value that was previously hard-coded.

    Returns:
        The negative log conditional probability estimate, or ``unseen_cost``
        when ``w1`` or the pair is missing from the tables or has a
        non-positive count.
    """
    pair_count = bigram_counts.get((w1, w2), 0)
    first_count = unigram_counts.get(w1, 0)

    # A zero count would otherwise divide by zero or take log(0); treat it the
    # same as a pair that was never observed.
    if pair_count <= 0 or first_count <= 0:
        return unseen_cost

    return -math.log(pair_count / first_count)

# Length of the longest vocabulary word; the segmenter uses it to cap the
# candidate word lengths it tries. default=0 keeps an empty vocabulary from
# raising ValueError here.
max_word_len = max(map(len, unigram_counts), default=0)


In [None]:
def segment_text_bigram(text, unigram_counts, max_word_len, unknown_char_cost=10):
    """Split an unspaced string into vocabulary words using bigram costs.

    Dynamic programme over end positions: ``cost[i]`` is the cost of the chosen
    segmentation of ``text[:i]`` and ``backtrace[i]`` is ``(last_word, start)``
    for it. ``last_word`` is the lower-cased word ending at ``i``, ``''`` at the
    origin, or ``None`` when no vocabulary word ends at ``i``. Each position
    keeps a single last word, and that word is the bigram context for any word
    starting there.

    Args:
        text: String to segment. Lookup is done on the lower-cased substring;
            returned pieces keep the original casing.
        unigram_counts: Container of known lower-case words; only membership
            is tested here.
        max_word_len: Longest candidate word length tried at each position.
        unknown_char_cost: Penalty added when no vocabulary word ends at a
            position. Defaults to 10, the value that was previously hard-coded.

    Returns:
        The chosen words as substrings of ``text``, in order. Characters that
        no chosen word covers are skipped and do not appear in the result.

    Notes:
        Transition costs come from the module-level ``get_bigram_cost``. A word
        that starts the text or follows an uncovered character has transition
        cost 0.
    """
    n = len(text)
    cost = [0]
    backtrace = [('', 0)]

    for end in range(1, n + 1):
        candidates = []

        for length in range(1, min(end, max_word_len) + 1):
            start = end - length
            word = text[start:end].lower()
            if word not in unigram_counts:
                continue

            # cost/backtrace already hold entries 0..end-1, so the predecessor
            # state can be indexed directly instead of searched for.
            prev_word = backtrace[start][0]
            transition = get_bigram_cost(prev_word, word) if prev_word else 0
            candidates.append((cost[start] + transition, word, start))

        if candidates:
            # Ties on cost are broken by tuple order: word, then start index.
            best_cost, best_word, best_start = min(candidates)
            cost.append(best_cost)
            backtrace.append((best_word, best_start))
        else:
            # No vocabulary word ends here: skip one character at a penalty.
            cost.append(cost[-1] + unknown_char_cost)
            backtrace.append((None, end - 1))

    # Walk the back-pointers from the end of the text to recover the words.
    words = []
    end = n
    while end > 0:
        word, start = backtrace[end]
        if word:
            words.append(text[start:end])
        end = start

    words.reverse()
    return words


`input_str` is the placeholder for the string to be segmented.

In [3]:
# Demo: segment a run-together sentence with the bigram model and print the result.
input_str = "thelongestlistofthelongeststuffatthelongestdomainnameatlonglast"
segmented_bigram = segment_text_bigram(
    text=input_str,
    unigram_counts=unigram_counts,
    max_word_len=max_word_len,
)
print("Segmented Output (Bigram):", segmented_bigram)


Segmented Output (Bigram): ['the', 'longest', 'list', 'of', 'the', 'longest', 'stuff', 'at', 'the', 'longest', 'domain', 'name', 'at', 'long', 'last']
