In [6]:
from datasets import load_dataset
import re
from collections import Counter, defaultdict
from nltk.util import ngrams
import itertools

In [7]:
dataset = load_dataset("ai4bharat/IndicCorpV2", "indiccorp_v2", streaming=True, split="tel_Telu")

sample = next(iter(dataset))
print("\nSample Text : ")
print(sample['text'])


Sample Text : 
అమెరికా అధ్యక్షుడు డొనాల్డ్ ట్రంప్ కు రాష్ట్రపతి  భవన్ వద్ద ఘనస్వాగతం లభించింది. ఆయనకు రాష్ట్రపతి రామ్ నాథ్ కోవింద్ దంపతులు, ప్రధాని మోదీ సాదరంగా ఆహ్వానం పలకడంతో పాటు సైనికులు గౌరవ వందనాన్ని అందించారు.


In [8]:
def telugu_sentence_tokenizer(text):
    return re.split(r'(?<=[।!?॥.])\s+', text)

def telugu_word_tokenizer(text):
    url = r'https?://\S+'
    email = r'\b[\w\.-]+@[\w\.-]+\.\w+\b'
    date = r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b'
    decimal = r'\b\d+\.\d+\b'
    number = r'\b\d+\b'
    telugu = r'[\u0C00-\u0C7F]+'
    english = r'[a-zA-Z]+'
    punctuation = r'[.,!?;:"(){}\[\]<>|/@#$%^&*_+=~`\'“”‘’₹…-]'

    pattern = f'{url}|{email}|{date}|{decimal}|{number}|{telugu}|{english}|{punctuation}'
    return re.findall(pattern, text)

In [9]:
DOC_LIMIT = 200000 # Limit for the number of documents to process.

# Initialize counters for each n-gram level
unigram_counts = Counter()
bigram_counts = Counter()
trigram_counts = Counter()
quadrigram_counts = Counter()

dataset_subset = itertools.islice(dataset,DOC_LIMIT)

for i,item in enumerate(dataset_subset):
  text = item['text']
  sentences = telugu_sentence_tokenizer(text)

  for sentence in sentences:
    tokens = telugu_word_tokenizer(sentence)

    if not tokens:
      continue

    # Add padding tokens for the models. We add n-1 start tokens,
    # where n is the size of our largest model (4 for Quadrigram).
    # We also add one end token.
    # padded_tokens = ['<s>', '<s>', '<s>'] + tokens + ['</s>']

    # unigram_counts.update(ngrams(padded_tokens,1))
    # bigram_counts.update(ngrams(padded_tokens,2))
    # trigram_counts.update(ngrams(padded_tokens,3))
    # quadrigram_counts.update(ngrams(padded_tokens,4))

    unigram_padded = ['<s>'] + tokens + ['</s>']
    unigram_counts.update(ngrams(unigram_padded,1))

    bigram_padded = ['<s>'] * 1 + tokens + ['</s>'] * 1
    bigram_counts.update(ngrams(bigram_padded,2))

    trigram_padded = ['<s>'] * 3 + tokens + ['</s>'] * 2
    trigram_counts.update(ngrams(trigram_padded,3))

    quadrigram_padded = ['<s>'] * 3 + tokens + ['</s>'] * 3
    quadrigram_counts.update(ngrams(quadrigram_padded,4))

print("\nFinished counting n-grams!")
print(f"Total unique unigrams: {len(unigram_counts):,}")
print(f"Total unique bigrams:  {len(bigram_counts):,}")
print(f"Total unique trigrams: {len(trigram_counts):,}")
print(f"Total unique quadrigrams: {len(quadrigram_counts):,}")

# Finished counting n-grams! # without extra ending
# Total unique unigrams: 314,285
# Total unique bigrams:  2,093,327
# Total unique trigrams: 3,276,969
# Total unique quadrigrams: 3,748,629


Finished counting n-grams!
Total unique unigrams: 314,285
Total unique bigrams:  2,093,326
Total unique trigrams: 3,286,149
Total unique quadrigrams: 3,824,965


In [10]:
from ast import If
unigram_model = defaultdict(float)
bigram_model = defaultdict(float)
trigram_model = defaultdict(float)
quadrigram_model = defaultdict(float)

# Unigram Model
total_tokens = sum(unigram_counts.values())
for unigram, count in unigram_counts.items():
  unigram_model[unigram] = count / total_tokens

# Bigram Model
for bigram, count in bigram_counts.items():
  prefix = bigram[0:1]
  prefix_count = unigram_counts[prefix]

  if prefix_count > 0:
    bigram_model[bigram] = count / prefix_count

# Trigram Model
for trigram, count in trigram_counts.items():
  prefix = trigram[0:2]
  prefix_count = bigram_counts[prefix]

  if prefix_count > 0:
    trigram_model[trigram] = count / prefix_count

# Quadrigram Model
for quadrigram, count in quadrigram_counts.items():
  prefix = quadrigram[0:3]
  prefix_count = trigram_counts[prefix]

  if prefix_count > 0:
    quadrigram_model[quadrigram] = count / prefix_count

# We will find a common bigram starting with 'ఈ'
for bg in bigram_model:
  if bg[0] == 'ఈ':
    print(f"\nExample probability: P({bg[1]} | {bg[0]}) = {bigram_model[bg]:.6f}")
    break


Example probability: P(మేరకు | ఈ) = 0.014012
