# Odia n-gram language model

In [40]:
import copy
import os
import random
from collections import defaultdict
from typing import List

import dill
import numpy as np
from indicnlp.tokenize.indic_tokenize import trivial_tokenize_indic
from tqdm import tqdm

In [2]:
# set random seed
random_seed = 123
random.seed(random_seed)
np.random.seed(random_seed)

## Load data

In [3]:
data_filepath = os.path.join('data/or')
assert os.path.isfile(data_filepath)  # sanity check
with open(data_filepath, 'r', encoding='utf-8') as f:
    lines = [s.strip() for s in tqdm(f.readlines(), desc='read lines from file')]

read lines from file: 100%|██████████| 3594672/3594672 [00:02<00:00, 1642670.08it/s]


## Tokenize

In [4]:
def tokenize_text(text: List[str]) -> List[List[str]]:
    """Tokenize text"""
    return [trivial_tokenize_indic(sample) for sample in tqdm(text, desc='tokenize', unit=' samples')]


In [5]:

tokenized_text = tokenize_text(lines)

tokenize: 100%|██████████| 10000/10000 [00:00<00:00, 42551.87 samples/s]


## Split into training and validation data

In [14]:

num_val = 500

# shuffle
random.shuffle(tokenized_text)

# split
tokenized_train, tokenized_val = tokenized_text[:-num_val], tokenized_text[-num_val:]

## Compute vocabulary

In [15]:
def compute_vocab(tok_text: List[List[str]]) -> List[str]:
    return list(set(
        [tok for tokens in tqdm(tok_text, unit=' samples') for tok in tokens]
    ))


In [16]:
odia_vocab = compute_vocab(tokenized_text)

100%|██████████| 10000/10000 [00:00<00:00, 716191.52 samples/s]


In [17]:
print('vocab size:', len(odia_vocab))

vocab size: 20216


## N-Gram language model

In [18]:
class NGramLM(object):
    def __init__(self, n: int, delta: float, vocab: List[str]):
        self.n = n
        self.delta = delta
        self.count = defaultdict(lambda: defaultdict(float))
        self.total = defaultdict(float)
        self.vocab = vocab
        if '<eos>' not in self.vocab:
            self.vocab.append('<eos>')
        self.vsize = len(vocab)

    def estimate(self, sequences: List[List[str]]) -> None:
        for sequence_raw in tqdm(sequences, unit=' sequences', desc='LM estimate'):
            sequence = ['<bos>'] * (self.n - 1) + sequence_raw + ['<eos>']
            for i in range(len(sequence) - self.n + 1):
                ngram = tuple(sequence[i:i + self.n])
                prefix, word = ngram[:-1], ngram[-1]
                self.count[prefix][word] += 1
                self.total[prefix] += 1

    def sequence_logp(self, sequence_raw: List[str]) -> float:
        """Compute perplexity for a sequence"""
        sequence = ['<bos>'] * (self.n - 1) + sequence_raw + ['<eos>']
        total_logp = 0
        for i in range(len(sequence) - self.n + 1):
            ngram = tuple(sequence[i:i + self.n])
            prefix = ngram[:-1]
            word = ngram[-1]
            logp = np.log2((self.delta + self.count[prefix][word]) /
                           (self.total[prefix] + self.delta * self.vsize))
            total_logp += logp
        return total_logp

    def perplexity(self, sequences: List[List[str]]) -> float:
        """Compute perplexity for multiple sequences"""
        n_total = 0
        logp_total = 0
        for sequence_raw in tqdm(sequences, unit=' sequences'):
            logp_total += self.sequence_logp(sequence_raw)
            n_total += len(sequence_raw) + 1  # add 1 for <eos>
        ppl = 2 ** (- (1.0 / n_total) * logp_total)  # the log needs to be in base 2!
        return ppl

    def generate(self, context: List[str] = None) -> str:
        """Generate text"""

        if context is None:
            prefix = None
            context = ['<bos>'] * (self.n - 1)
        elif len(context) < self.n - 1:
            prefix = copy.deepcopy(context)
            context = ['<bos>'] * (self.n - 1 - len(context)) + context
        elif len(context) > self.n - 1:
            prefix = copy.deepcopy(context)
            context = context[-(self.n - 1):]
        else:  # len(context) = self.n - 1
            prefix = None

        output = context
        while output[-1] != '<eos>':
            # Form conditional distribution to sample from
            probs, tokens = [], []
            for token in self.count[tuple(context)]:
                p = self.count[tuple(context)][token] / self.total[tuple(context)]
                probs.append(p)
                tokens.append(token)
            # Sample
            wt = np.random.choice(tokens, p=probs)
            output = output + [wt]
            context = context[1:] + [wt]
        if prefix is not None:
            return ' '.join(prefix + output[self.n - 1:])
        else:
            return ' '.join(output)

In [19]:
ns = [3]
deltas = [0.001]

lm_odia = {}
ppl_odia = {}

for n in ns:
    for delta in deltas:
        lm_odia[n, delta] = NGramLM(n=n, delta=delta, vocab=odia_vocab, )

        # estimate
        lm_odia[n, delta].estimate(tokenized_train)

        # compute perplexity
        ppl_odia[n, delta] = lm_odia[n, delta].perplexity(tokenized_val)


LM estimate: 100%|██████████| 9500/9500 [00:00<00:00, 20872.94 sequences/s]
100%|██████████| 500/500 [00:00<00:00, 17905.55 sequences/s]


## Generate Odia text

In [22]:
for n in ns:
    for delta in deltas:
        print(89 * '-')
        print(f'Odia LM, n = {n}, delta = {delta:.4f}')
        for _ in range(5):
            print(lm_odia[n, delta].generate())
        print(89 * '-')

-----------------------------------------------------------------------------------------
Odia LM, n = 3, delta = 0.0010
<bos> <bos> ଶାହାଙ୍କ ଗସ୍ତ ଲାଗି ରାଜ୍ୟ ସରକାରଙ୍କ ପାଖରେ ଜନସାଧାରଣଙ୍କ ସ୍ୱାର୍ଥର ସୁରକ୍ଷା ପାଇଁ ଦୃଢ଼ ପଦକ୍ଷେପ ଗ୍ରହଣ କରିବାକୁ ସାଧାରଣରେ ଦାବି ହୋଉଛି । <eos>
<bos> <bos> ବିଭାଗୀୟ ଉଚ୍ଚ କର୍ତ୍ତୃପକ୍ଷ ତୁରନ୍ତ ଭଦ୍ରେଶ୍ୱର ପଞ୍ଚାୟତରେ ହୋଇଥିବା ପ୍ରଧାନମନ୍ତ୍ରୀ ଆବାସ ଯୋଜନାରେ ଘର ଖଣ୍ଡିଏ ଯୋଗାଇ ଦେବା ସହିତ ମୁସଲିମଙ୍କୁ ଅନ୍ୟତ୍ର ବିକଳ୍ପ ଜାଗା ପ୍ରଦାନ ପାଇଁ ନିଦେ୍ର୍ଦଶ ଥିଲା ଏହା ମଧ୍ୟ କହିଛନ୍ତି କେବଳ ଭାରତରେ ନୁହେଁ ବିଶ୍ୱ ବଜାରରେ ମଧ୍ୟ ଏହି ଯୋଜନାରେ ଆଉ କାହାରିକୁ ଗ୍ୟାସ ସଂଯୋଗ ଲୁଟି ନେଇଥିଲେ । <eos>
<bos> <bos> ସେଥିପାଇଁ ଡାକ୍ତରଙ୍କୁ ଦ୍ୱିତୀୟ ଭଗବାନ ବୋଲି କୁହାଯାଇଥାଏ । <eos>
<bos> <bos> ପ୍ରାକୃତିକ ପ୍ରଣାଳୀରେ ପ୍ରସ୍ତୁତ ପ୍ରସାଧାନରେ ଶ୍ରୀବିଗ୍ରହମାନଙ୍କ ଶ୍ରୀମୁଖକୁ ଶୃଙ୍ଗାର କରାଯିବ । <eos>
<bos> <bos> 64 ପ୍ରସ୍ତୁତି ପ୍ରଣାଳୀ ପ୍ରଥମେ ପନିରକୁ ଛୋଟ ଛୋଟ କରି ସେଥିରେ କଟା ପିଆଜ , କଟା ଅଦା , କଟା ପି଼ଆଜ , ତେଜପତ୍ର , କଟା କଞ୍ଚା ଲଙ୍କା , କଟା କଞ୍ଚା ଲଙ୍କା ଓ କଟା ରସୁଣକୁ ଭାଜି ପୁର ପ୍ରସ୍ତୁତ କରନ୍ତୁ । <eos>
------------------------------------------------------------------------------------

In [27]:
lm_odia[3, deltas[0]].generate(context=['ଦୂରରେ'])

ValueError: 'a' cannot be empty unless no samples are taken

In [45]:
# dump lm to file
with open(os.path.join('ngram.lm.pkl'), 'wb') as f:
    s = dill.dumps(lm_odia[3, deltas[0]])
    f.write(s)