In [5]:
import os, nltk
import requests
import io #codecs
from nltk.util import pad_sequence
from nltk.util import bigrams
from nltk.util import ngrams
from nltk.util import everygrams
from nltk.lm.preprocessing import pad_both_ends
from nltk.lm.preprocessing import flatten
from nltk import word_tokenize, sent_tokenize, bigrams,trigrams
import pandas as pd
import codecs, re

# # Ensure necessary NLTK data packages are downloaded
# nltk.download('punkt_tab')

# Step 1: Load the Balochi corpus from a text file
if os.path.isfile('balochi.txt'):
    with io.open('balochi.txt', encoding='utf8') as fin:
        text = fin.read()
else:
    print('file not found')

In [7]:
# number of words in corpus
len(text.split())

317181

In [8]:
# Step 2: Split the text into training and testing sets (80% train, 20% test)
split = int(0.8 * len(text))

train = text[:split]
test = text[split:]

print(len(train.split()))
print(len(test.split()))

253652
63530


In [9]:
# Step 3: Split texts into sentences using '۔' as the delimiter
train_sentences = train.split('۔')
test_sentences = test.split('۔')

In [12]:
# # Step 3: Define a function to clean and tokenize text

# # Training text
# # Remove unwanted characters and digits from corpus as our file is already cleaned then we don't need this one
#  tokenized_text = [train_sentences.translate(str.maketrans('', '', '''’؟':؛،”“"…‘0123456789><*&,^%$#@!_.-:;?|\/][)(}{+='"''')) for sent in sentences]

# Step 4: Tokenize sentences using WhitespaceTokenizer

from nltk.tokenize import WhitespaceTokenizer
tokenizer = WhitespaceTokenizer()

# This tokenizer splits sentences into words based on whitespace

tokenized_train = [tokenizer.tokenize(sentence) for sentence in train_sentences if sentence.strip()]
tokenized_test = [tokenizer.tokenize(sentence) for sentence in test_sentences if sentence.strip()]


# tokenized_train

In [14]:
# tokenized_test

In [50]:
# Step 5: Prepare the data for the N-gram model
from nltk.lm.preprocessing import padded_everygram_pipeline

n = 3  # Trigram model
# Generate padded n-grams and corresponding padded sentences
train_data, padded_sents = padded_everygram_pipeline(n, tokenized_train)
test_data, padded_test = padded_everygram_pipeline(n, tokenized_test)

In [51]:
# Convert generators to lists for multiple iterations
train_data = list(train_data)
padded_sents = list(padded_sents)
test_data = list(test_data)
padded_test = list(padded_test)

In [52]:
# Step 6: Initialize and train the Lidstone N-gram model
from nltk.lm import Laplace, KneserNeyInterpolated, Lidstone

# Lidstone smoothing with gamma=0.5 helps handle zero probabilities for unseen n-grams
model = Lidstone(0.5, n)
model.fit(train_data, padded_sents)

In [53]:
# Print vocab and stats new
print("Vocabulary size:", len(model.vocab))
print("Ngram counts:", model.counts)


Vocabulary size: 10268
Ngram counts: <NgramCounter with 3 ngram orders and 993936 ngrams>


In [54]:
#number of unigrams, bigrams, trigrams, 4-grams and so on...
print(model.counts)

<NgramCounter with 3 ngram orders and 993936 ngrams>


In [55]:
# Step 7: Evaluate the model's perplexity on the test data
# Perplexity measures how well the model predicts the test data; lower is better
total_perplexity = 0
for i, test_ngram in enumerate(test_data):
    perplexity = model.perplexity(test_ngram)
    total_perplexity += perplexity

average_perplexity = total_perplexity / (i + 1)
print(f"Perplexity: {average_perplexity}")

Perplexity: 313.8818666764716


In [15]:
pip install python-bidi


Collecting python-bidi
  Downloading python_bidi-0.4.2-py2.py3-none-any.whl (30 kB)
Installing collected packages: python-bidi
Successfully installed python-bidi-0.4.2


In [56]:
# Import necessary modules
from collections import Counter  # For counting hashable objects
from itertools import chain      # For flattening lists
from bidi.algorithm import get_display  # For proper display of bidirectional text (e.g., Arabic, Balochi)
import pandas as pd  # For creating and handling dataframes

# Generate bigrams from the tokenized training data
# 'tokenized_train' is assumed to be a list of tokenized sentences (i.e., list of lists of tokens)
# For each sentence, zip the sentence with itself offset by one to create bigrams
# Then, flatten the list of bigrams from all sentences into a single iterable using chain.from_iterable
bigram_fd = Counter(
    chain.from_iterable(
        zip(sublist, sublist[1:]) for sublist in tokenized_train
    )
)

# Create a DataFrame from the 10 most common bigrams
# 'most_common(10)' returns a list of the 10 most frequent bigrams and their counts
bigram_df = pd.DataFrame(
    bigram_fd.most_common(10),
    columns=['bigrams', 'bigram_count']
)

# Display the DataFrame
bigram_df


Unnamed: 0,bigrams,bigram_count
0,"(تہ, ءَ)",3563
1,"(ءِ, تہ)",2415
2,"(سر, ءَ)",1146
3,"(کہ, آ)",1037
4,"(بوتگ, اَت)",888
5,"(وت, ءَ)",692
6,"(ءَ, یک)",672
7,"(ءِ, سر)",633
8,"(نہ, اَت)",615
9,"(لوٹ, اِت)",613


In [57]:
input_text = (input('Enter a word: ')).split()
sorted(model.counts[input_text].most_common(3))

Enter a word:  تو


[('من', 46), ('منی', 36), ('وتی', 35)]