In [1]:
def generate_ngrams(sentence: list[str], order: int) -> list[tuple[str, ...]]:
    sentence = ["<s>"] * (order - 1) + sentence + ["</s>"]
    ngrams = []
    for i in range(len(sentence) - order + 1):
            ngrams.append(tuple(sentence[i:i+order]))
    return ngrams

In [2]:
generate_ngrams("был тихий серый вечер".split(), 3)

[('<s>', '<s>', 'был'),
 ('<s>', 'был', 'тихий'),
 ('был', 'тихий', 'серый'),
 ('тихий', 'серый', 'вечер'),
 ('серый', 'вечер', '</s>')]

In [3]:
from collections import Counter
def generate_model(text: list[list[str]], order: int) -> tuple[set, Counter, Counter]:
    ngram_counter = Counter()
    n1gram_counter = Counter()
    vocab = set()
    for line in text:
        vocab.update(line)
        ngram_counter.update(generate_ngrams(line, order))
        n1gram_counter.update(generate_ngrams(line, order - 1))
    return vocab, ngram_counter, n1gram_counter

In [51]:
from math import log
def calculate_logprob(sentence: list[str], model: tuple[set, Counter, Counter], order: int, k: int = 1) -> float:
    prob = 0
    for ngram in generate_ngrams(sentence, order):
        prob += log((k + model[1].get(ngram, 0)) / (len(model[0]) * k + model[2].get(ngram[:-1], 0)), 2)
    return prob

In [5]:
from collections import defaultdict
def good_turing(ngram_counter: Counter, k: int = 10) -> tuple[Counter, defaultdict]:
    smoothed_model = Counter
    count2ngrams = defaultdict(set)
    for ngram, count in ngram_counter.items():
        count2ngrams[count].add(ngram)
    for ngram, count in ngram_counter.items():
        smoothed_model[ngram] = (count + 1) * count2ngrams[count + 1] / count2ngrams[count]
    return smoothed_model, count2ngrams

In [6]:
def generate_backoff_model(text: list[list[str]], order: int) -> tuple[Counter, ...]:
    model = [Counter() for i in range(order)]
    for line in text:
        for i, ctr in enumerate(model):
            ctr.update(generate_ngrams(line), order=i+1)
    return model

In [8]:
def katz(ngram: tuple[str, ...], model: tuple[Counter, ...], k: int = 10) -> float:
    order = len(ngram) - 1
    if order == 0:
        return model[order][ngram] / sum(model[order].values())
    count2ngrams, smoothed_model = good_turing(model[order])
    r = model[order][ngram]
    r_ = smoothed_model[ngram]
    if r > k:
        return model[order][ngram] / model[order - 1][ngram[:-1]]
    elif k >= r > 0:
        modifier = (k + 1) * count2ngrams[k + 1] / count2ngrams[1]
        dr = (r_ / r - modifier) / (1 - modifier)
        return dr * model[order][ngram] / model[order - 1][ngram[:-1]]
    else:
        alpha_numerator = (1 - sum(katz(ngr, model) for ngr in model[order] if ngr[:-1] == ngram[:-1] and model[order][ngr] > 0))
        alpha_denominator = (1 - sum(katz(ngr[-1:], model) for ngr in model[order] if ngr[:-1] == ngram[:-1] and model[order][ngr] > 0))
        alpha = alpha_numerator / alpha_denominator
        return alpha * katz(ngram[1:], model)

In [28]:
import random
def generate_sentence(ngram_counter: Counter, order: int, limit = 20) -> list[str]:
    sentence = []
    first_ngrams = [i for i in ngram_counter if i[:-1] == ("<s>",) * (order - 1)]
    ngram = random.choices(first_ngrams, weights=[ngram_counter[i] for i in first_ngrams])[0]
    sentence.append(ngram[-1])
    while True:
        if len(sentence) >= limit or sentence[-1] == "</s>":
            break
        new_ngrams = [i for i in ngram_counter if i[:-1] == ngram[1:]]
        ngram = random.choices(new_ngrams, weights=[ngram_counter[i] for i in new_ngrams])[0]
        sentence.append(ngram[-1])
    return sentence

In [12]:
import glob
files = glob.glob("m:/corpres/careva/cta/**/*.seg_Y1", recursive=True)

In [14]:
import seg
import re
text = []
for file in files:
    s = seg.Seg()
    s.read_file(file)
    words = [label.name for label in s if label.name]
    words = [re.sub("\[[+-]\]", "", w) for w in words]
    text.append(words)

In [16]:
with open("obmen.txt", "w") as f:
    for line in text:
        f.write(" ".join(line) + "\n")

In [6]:
with open("obmen.txt") as f:
    text = [i.strip().split() for i in f.readlines()]

In [7]:
vocab, ngram_counter, n1gram_counter = generate_model(text, order=3)

In [None]:
ngram_counter.most_common()

In [49]:
" ".join(generate_sentence(ngram_counter, 3))

'положила голову ему на плечо спросила можно </s>'

In [53]:
calculate_logprob("положила на голову ему плечо".split(), (vocab, ngram_counter, n1gram_counter), 3)

-73.01485415644521