In [16]:
import os, nltk
import requests
import io #codecs
from nltk.util import pad_sequence
from nltk.util import bigrams
from nltk.util import ngrams
from nltk.util import everygrams
from nltk.lm.preprocessing import pad_both_ends
from nltk.lm.preprocessing import flatten
from nltk import word_tokenize, sent_tokenize, bigrams,trigrams
import pandas as pd
import codecs, re

# # Ensure necessary NLTK data packages are downloaded
# nltk.download('punkt_tab')

# Step 1: Load the Balochi corpus from a text file
if os.path.isfile('balochi.txt'):
    with io.open('balochi.txt', encoding='utf8') as fin:
        text = fin.read()
else:
    print('file not found')

In [17]:
# number of words in corpus
len(text.split())

317181

In [18]:
# Step 2: Split the text into training and testing sets (80% train, 20% test)
split = int(0.8 * len(text))

train = text[:split]
test = text[split:]

print(len(train.split()))
print(len(test.split()))

253652
63530


In [19]:
# Step 3: Split texts into sentences using '۔' as the delimiter
train_sentences = train.split('۔')
test_sentences = test.split('۔')

# Step 4: Tokenize sentences using WhitespaceTokenizer
from nltk.tokenize import WhitespaceTokenizer
tokenizer = WhitespaceTokenizer()

# This tokenizer splits sentences into words based on whitespace
tokenized_train = [tokenizer.tokenize(sentence) for sentence in train_sentences if sentence.strip()]
tokenized_test = [tokenizer.tokenize(sentence) for sentence in test_sentences if sentence.strip()]


# tokenized_train
# tokenized_test

In [20]:
# Step 5: Prepare the data for the N-gram model
from nltk.lm.preprocessing import padded_everygram_pipeline

n = 4  # Trigram model
# Generate padded n-grams and corresponding padded sentences
train_data, padded_sents = padded_everygram_pipeline(n, tokenized_train)
test_data, padded_test = padded_everygram_pipeline(n, tokenized_test)

In [21]:
# Convert generators to lists for multiple iterations
train_data = list(train_data)
padded_sents = list(padded_sents)
test_data = list(test_data)
padded_test = list(padded_test)

In [22]:
# Step 6: Initialize and train the Lidstone N-gram model
from nltk.lm import Laplace, KneserNeyInterpolated, Lidstone

# Lidstone smoothing with gamma=0.5 helps handle zero probabilities for unseen n-grams
model = Lidstone(0.5, n)
model.fit(train_data, padded_sents)

In [23]:
# Print vocab and stats new
print("Vocabulary size:", len(model.vocab))
print("Ngram counts:", model.counts)


Vocabulary size: 10268
Ngram counts: <NgramCounter with 4 ngram orders and 1480570 ngrams>


In [24]:
# Step 7: Evaluate the model's perplexity on the test data
# Perplexity measures how well the model predicts the test data; lower is better
total_perplexity = 0
for i, test_ngram in enumerate(test_data):
    perplexity = model.perplexity(test_ngram)
    total_perplexity += perplexity

average_perplexity = total_perplexity / (i + 1)
print(f"Perplexity: {average_perplexity}")

Perplexity: 281.6997351199508


In [None]:
!pip install python-bidi


Collecting python-bidi
  Downloading python_bidi-0.6.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.9 kB)
Downloading python_bidi-0.6.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (292 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m292.9/292.9 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-bidi
Successfully installed python-bidi-0.6.6


In [26]:
from collections import Counter
from itertools import chain
from nltk.util import ngrams
import pandas as pd

# Generate bigrams (2-grams) from tokenized training sentences
bigrams = list(
    chain.from_iterable(
        ngrams(sent, 2) for sent in tokenized_train if len(sent) >= 2
    )
)

# Count the frequency of each bigram
bigram_fd = Counter(bigrams)

# Create a DataFrame of the 10 most common bigrams
bigram_df = pd.DataFrame(
    bigram_fd.most_common(10),
    columns=['bigrams', 'bigram_count']
)

# Display the DataFrame
bigram_df

Unnamed: 0,bigrams,bigram_count
0,"(تہ, ءَ)",3563
1,"(ءِ, تہ)",2415
2,"(سر, ءَ)",1146
3,"(کہ, آ)",1037
4,"(بوتگ, اَت)",888
5,"(وت, ءَ)",692
6,"(ءَ, یک)",672
7,"(ءِ, سر)",633
8,"(نہ, اَت)",615
9,"(لوٹ, اِت)",613


In [None]:
# Generate trigrams (3-grams) from tokenized training sentences
trigrams = list(
    chain.from_iterable(
        ngrams(sent, 3) for sent in tokenized_train if len(sent) >= 3
    )
)

# Count the frequency of each trigrams
trigram_fd = Counter(trigrams)

# Create a DataFrame of the 10 most common trigrams
trigram_df = pd.DataFrame(
    trigram_fd.most_common(10),
    columns=['trigrams', 'trigram_count']
)

# Display the DataFrame
trigram_df

Unnamed: 0,trigrams,trigram_count
0,"(ءِ, تہ, ءَ)",2403
1,"(ءِ, سر, ءَ)",628
2,"(وت, ءَ, را)",562
3,"(چی, ءَ, کہ)",330
4,"(میر, مُراد, جان)",319
5,"(وتی, جند, ءِ)",284
6,"(تہ, ءَ, یک)",230
7,"(ایشی, تہ, ءَ)",185
8,"(کہ, اد, ءَ)",175
9,"(نہ, زان, آں)",175


In [None]:
# Generate quadgrams (4-grams) from tokenized training sentences
quad_grams = list(
    chain.from_iterable(
        ngrams(sent, 4) for sent in tokenized_train if len(sent) >= 4
    )
)

# Count the frequency of each quadgram
quad_gram_fd = Counter(quad_grams)

# Create a DataFrame of the 10 most common quadgrams
quad_gram_df = pd.DataFrame(
    quad_gram_fd.most_common(10),
    columns=['quad_grams', 'quad_gram_count']
)

# Display the DataFrame
quad_gram_df

Unnamed: 0,quad_grams,quad_gram_count
0,"(دنیا, ءِ, تہ, ءَ)",165
1,"(ءِ, تہ, ءَ, یک)",154
2,"(دل, ءِ, تہ, ءَ)",112
3,"(آ, وت, ءَ, را)",111
4,"(کہ, ایشی, تہ, ءَ)",106
5,"(میر, احمد, یار, خان)",99
6,"(میر, مُراد, جان, گچکی)",93
7,"(اے, دنیا, ءِ, تہ)",91
8,"(ءَ, وت, ءَ, را)",90
9,"(ءِ, تہ, ءَ, آ)",84


In [None]:
import plotly.express as px

# Convert bigram tuples to strings
bigram_df['bigrams'] = bigram_df['bigrams'].apply(lambda x: ' '.join(x))

# Plot using px.bar
fig = px.bar(
    bigram_df,
    x='bigrams',
    y='bigram_count',
    title="Top 10 Most Frequent Bigrams",
    labels={'bigrams': 'Bigrams', 'bigram_count': 'Frequency'},
    color_discrete_sequence=['RebeccaPurple']
)

fig.update_layout(
    xaxis_title="Bigrams",
    yaxis_title="Frequency of Bigrams",
    font=dict(
        size=14,
        color="RebeccaPurple"
    )
)

fig.show()


In [None]:
# Convert trigram tuples to strings for better plotting
trigram_df['trigrams'] = trigram_df['trigrams'].apply(lambda x: ' '.join(x))

# Create bar plot using Plotly Express
fig = px.bar(
    trigram_df,
    x='trigrams',
    y='trigram_count',
    title="Top 10 Most Frequent Trigrams",
    labels={'trigrams': 'Trigrams', 'trigram_count': 'Frequency'},
    color_discrete_sequence=['RebeccaPurple']
)

# Customize layout
fig.update_layout(
    xaxis_title="Trigrams",
    yaxis_title="Frequency of Trigrams",
    font=dict(
        size=14,
        color="RebeccaPurple"
    )
)

# Show plot
fig.show()

In [None]:
# Convert quadgram tuples to strings for better plotting
quad_gram_df['quad_grams'] = quad_gram_df['quad_grams'].apply(lambda x: ' '.join(x))

# Create bar plot using Plotly Express
fig = px.bar(
    quad_gram_df,
    x='quad_grams',
    y='quad_gram_count',
    title="Top 10 Most Frequent Quadgrams",
    labels={'quad_grams': 'Quadgrams', 'quad_gram_count': 'Frequency'},
    color_discrete_sequence=['RebeccaPurple']
)

# Customize layout
fig.update_layout(
    xaxis_title="Quadgrams",
    yaxis_title="Frequency of Quadgrams",
    font=dict(
        size=14,
        color="RebeccaPurple"
    )
)

# Show plot
fig.show()


In [None]:
# Prompt the user to enter a word or phrase, split the input into tokens (words)
input_text = (input('Enter a word: ')).split()

# Retrieve the counts of n-grams starting with the input_text from the trained model
# Then get the 3 most common next words and sort them
sorted(model.counts[input_text].most_common(3))

Enter a word: تو


[('من', 46), ('منی', 36), ('وتی', 35)]

In [None]:
import dill as pickle  # Import dill library as pickle for enhanced serialization (can save complex objects)

# Save the trained language model to a file named 'Balochi_NextWordPrediction.pkl' in binary write mode
with open('Balochi_NextWordPrediction.pkl', 'wb') as fout:
    pickle.dump(model, fout)  # Serialize and write the 'model' object to the file
