In [1]:
from datasets import load_dataset
import re
from collections import Counter, defaultdict
from nltk.util import ngrams
import itertools

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
dataset = load_dataset("ai4bharat/IndicCorpV2", "indiccorp_v2", streaming=True, split="tel_Telu")

In [3]:
def telugu_sentence_tokenizer(text):
    return re.split(r'(?<=[।!?॥.])\s+', text)

def telugu_word_tokenizer(text):
    url = r'https?://\S+'
    email = r'\b[\w\.-]+@[\w\.-]+\.\w+\b'
    date = r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b'
    decimal = r'\b\d+\.\d+\b'
    number = r'\b\d+\b'
    telugu = r'[\u0C00-\u0C7F]+'
    english = r'[a-zA-Z]+'
    punctuation = r'[.,!?;:"(){}\[\]<>|/@#$%^&*_+=~`\'“”‘’₹…-]'

    pattern = f'{url}|{email}|{date}|{decimal}|{number}|{telugu}|{english}|{punctuation}'
    return re.findall(pattern, text)

In [4]:
DOC_LIMIT = 1000
TRAIN_LIMIT = 990

unigram_counts = Counter()
bigram_counts = Counter()

dataset_subset = itertools.islice(dataset,TRAIN_LIMIT)

for i,item in enumerate(dataset_subset):
  text = item['text']
  sentences = telugu_sentence_tokenizer(text)

  for sentence in sentences:
    tokens = telugu_word_tokenizer(sentence)

    if not tokens:
      continue

    unigram_padded = ['<s>'] + tokens + ['</s>']
    unigram_counts.update(ngrams(unigram_padded,1))

    bigram_padded = ['<s>'] * 1 + tokens + ['</s>'] * 1
    bigram_counts.update(ngrams(bigram_padded,2))

print("\nFinished counting n-grams!")
print(f"Total unique unigrams: {len(unigram_counts):,}")
print(f"Total unique bigrams:  {len(bigram_counts):,}")


Finished counting n-grams!
Total unique unigrams: 8,277
Total unique bigrams:  17,031


In [5]:
def calculated_add_one(bigram,bigram_counts,unigram_counts, V):
  prefix = (bigram[0],)
  numerator = bigram_counts.get(bigram, 0) + 1
  denominator = unigram_counts.get(prefix, 0) + V
  return numerator / denominator

def calculated_add_k(bigram, bigram_counts, unigram_counts, V, k):
  prefix = (bigram[0],)
  numerator = bigram_counts.get(bigram, 0) + k
  denominator = unigram_counts.get(prefix, 0) + (k * V)
  return numerator / denominator

def calculate_add_token_type(bigram, bigram_counts, unigram_counts, V):
    prefix = (bigram[0],)
    numerator = bigram_counts.get(bigram, 0) + V
    denominator = unigram_counts.get(prefix, 0) + (V * V)
    if denominator == 0:
        return 0
    return numerator / denominator

In [6]:
from urllib.request import proxy_bypass
V = len(unigram_counts)
k = 0.1

remaining_docs = itertools.islice(dataset,TRAIN_LIMIT,DOC_LIMIT)

sentence_probs = []

for item in remaining_docs:
  text = item['text']
  sentences = telugu_sentence_tokenizer(text)

  for sentence in sentences:
    tokens = telugu_word_tokenizer(sentence)
    if not tokens:
      continue

    padded_tokens = ['<s>', '<s>', '<s>'] + tokens + ['</s>']
    bigrams = list(ngrams(padded_tokens,2))

    prob_add_one = 1
    prob_add_k = 1
    prob_token_type = 1

    for bg in bigrams:
      prob_add_one *= calculated_add_one(bg, bigram_counts, unigram_counts, V)
      prob_add_k *= calculated_add_k(bg, bigram_counts, unigram_counts, V, k)
      prob_token_type *= calculate_add_token_type(bg, bigram_counts, unigram_counts, V)

    sentence_probs.append({
        "sentence" : sentence,
        "add_one" : prob_add_one,
        "add_k" : prob_add_k,
        "add_token_type" : prob_token_type
    })

In [7]:
for i in range(min(5, len(sentence_probs))):
    print(f"Sentence: {sentence_probs[i]['sentence']}")
    print(f"Add-One Prob: {sentence_probs[i]['add_one']}")
    print(f"Add-k Prob: {sentence_probs[i]['add_k']}")
    print(f"Add-Token-Type Prob: {sentence_probs[i]['add_token_type']}\n")

Sentence: 1.
Add-One Prob: 1.1064344678686987e-16
Add-k Prob: 8.984097822469902e-16
Add-Token-Type Prob: 3.067881105950275e-20

Sentence: ఫిర్ నూనె.
Add-One Prob: 2.229275271489771e-21
Add-k Prob: 4.727144307255056e-22
Add-Token-Type Prob: 3.7051705801030824e-24

Sentence: ఇది ఒక చిన్న మొత్తంలో ప్రభావిత ప్రాంతం వర్తింప చేయాలి.
Add-One Prob: 1.6487628464848099e-43
Add-k Prob: 1.6407583783686973e-42
Add-Token-Type Prob: 1.1539825670649588e-47

Sentence: ఈ సాధనం వేగంగా మంట తొలగించడానికి సహాయపడుతుంది.
Add-One Prob: 1.2078529098592325e-34
Add-k Prob: 1.4393039451899222e-33
Add-Token-Type Prob: 7.97827631292114e-40

Sentence: - ఇది సక్సెస్ అయ్యాక మీ రిక్వెస్ట్ (మొబైల్ నెంబర్ చేంజ్) పూర్తవుతుంది.
Add-One Prob: 1.1501365169409796e-59
Add-k Prob: 1.7063564420327652e-59
Add-Token-Type Prob: 2.457221822947555e-63

