Pre-trained mBERT (Transformer-based NLP)

In [1]:
from google.colab import files

# Upload the dataset
uploaded = files.upload()

# Confirm the upload
for filename in uploaded.keys():
    print(f"Uploaded {filename}")

Saving sinhala_grammar_dataset.txt to sinhala_grammar_dataset.txt
Uploaded sinhala_grammar_dataset.txt


In [2]:
!pip install transformers torch pandas



In [4]:
# Install required libraries (if not already installed)
# !pip install transformers torch huggingface_hub

# Import necessary libraries
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
import torch
from torch.utils.data import Dataset

# Dataset file name
data_file = "sinhala_grammar_dataset.txt"

# Process dataset
sentences = []
labels = []

with open(data_file, "r", encoding="utf-8") as f:
    lines = f.readlines()

# Parse dataset
for line in lines[1:]:  # Skip the header line
    line = line.strip()
    if "|" in line:
        try:
            sentence, label = line.split("|")
            sentences.append(sentence)
            labels.append(int(label))
        except ValueError:
            print(f"Skipping malformed line: {line}")
    else:
        print(f"Skipping malformed line: {line}")

# Ensure dataset integrity
assert len(sentences) == len(labels), "Mismatch between sentences and labels!"

# Split data into training and validation sets
train_size = int(0.8 * len(sentences))
train_sentences, val_sentences = sentences[:train_size], sentences[train_size:]
train_labels, val_labels = labels[:train_size], labels[train_size:]

# Load mBERT tokenizer and model
model_name = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Create dataset class
class SinhalaGrammarDataset(Dataset):
    def __init__(self, sentences, labels, tokenizer, max_length):
        self.sentences = sentences
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        sentence = str(self.sentences[idx])
        label = self.labels[idx]

        encoding = self.tokenizer.encode_plus(
            sentence,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors="pt",
            return_attention_mask=True
        )

        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(label, dtype=torch.long)
        }

# Create dataset splits
train_dataset = SinhalaGrammarDataset(train_sentences, train_labels, tokenizer, max_length=128)
val_dataset = SinhalaGrammarDataset(val_sentences, val_labels, tokenizer, max_length=128)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    learning_rate=5e-5,
    logging_dir="./logs",
    logging_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
)

# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
)

# Train the model
trainer.train()

# Save the fine-tuned model
model.save_pretrained("./sinhala_grammar_model")
tokenizer.save_pretrained("./sinhala_grammar_model")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,No log,0.707067
2,No log,0.750519
3,No log,0.711519


('./sinhala_grammar_model/tokenizer_config.json',
 './sinhala_grammar_model/special_tokens_map.json',
 './sinhala_grammar_model/vocab.txt',
 './sinhala_grammar_model/added_tokens.json',
 './sinhala_grammar_model/tokenizer.json')

In [5]:
!pip install sinling transformers

Collecting sinling
  Downloading sinling-0.3.6-py3-none-any.whl.metadata (3.0 kB)
Collecting emoji (from sinling)
  Downloading emoji-2.14.0-py3-none-any.whl.metadata (5.7 kB)
Collecting pygtrie (from sinling)
  Downloading pygtrie-2.5.0-py3-none-any.whl.metadata (7.5 kB)
Collecting sklearn-crfsuite (from sinling)
  Downloading sklearn_crfsuite-0.5.0-py2.py3-none-any.whl.metadata (4.9 kB)
Collecting python-crfsuite>=0.9.7 (from sklearn-crfsuite->sinling)
  Downloading python_crfsuite-0.9.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.3 kB)
Downloading sinling-0.3.6-py3-none-any.whl (20.0 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m20.0/20.0 MB[0m [31m37.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading emoji-2.14.0-py3-none-any.whl (586 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚

In [None]:
from google.colab import files
import matplotlib.pyplot as plt
from PIL import Image
from io import BytesIO
import os

# Sinhala Tokenizer Initialization and Spell Dictionary Loading (unchanged)
from sinling import SinhalaTokenizer
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from sklearn.metrics import accuracy_score
tokenizer = SinhalaTokenizer()

# Load spell dictionary
def load_spell_dictionary():
    """
    Load the Sinhala spell dictionary from an uploaded text file in Colab.
    """
    uploaded = files.upload()  # Interactive file upload
    file_path = list(uploaded.keys())[0]
    spell_dict = {}
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            for line in file:
                key, value = line.strip().split(',')
                spell_dict[key] = value
    except Exception as e:
        print(f"Error loading dictionary: {e}")
    return spell_dict

# Sinhala Spell Corrector (unchanged)
def sinhala_spell_corrector(text, spell_dict):
    tokens = tokenizer.tokenize(text)
    corrected_tokens = [spell_dict.get(token, token) for token in tokens]
    corrected_text = ' '.join(corrected_tokens)
    return corrected_text

# Grammar Checking Model Loading (unchanged)
def load_grammar_checker():
    model = AutoModelForSequenceClassification.from_pretrained("/content/sinhala_grammar_model")
    tokenizer = AutoTokenizer.from_pretrained("/content/sinhala_grammar_model")
    return pipeline("text-classification", model=model, tokenizer=tokenizer)

# Correction Functions (unchanged)
def apply_correction(sentence):
    if "‡∂î‡∑Ä‡∑î‡∑Ñ‡∑î ‡∂¥‡∑è‡∑É‡∂Ω‡∂ß ‡∂∫‡∂±‡∑Ä‡∑è" in sentence:
        sentence = sentence.replace("‡∂î‡∑Ä‡∑î‡∑Ñ‡∑î ‡∂¥‡∑è‡∑É‡∂Ω‡∂ß ‡∂∫‡∂±‡∑Ä‡∑è", "‡∂î‡∑Ä‡∑î‡∑Ñ‡∑î ‡∂¥‡∑è‡∑É‡∂Ω‡∂ß ‡∂∫‡∂≠‡∑í")
    if "‡∂∏‡∂∏ ‡∂∫‡∂±‡∑Ä‡∑è" in sentence:
        sentence = sentence.replace("‡∂∏‡∂∏ ‡∂∫‡∂∏‡∑í", "‡∂∏‡∂∏ ‡∂ú‡∑í‡∂∫‡∑ô‡∂∏‡∑í")
    if "‡∂Ö‡∂¥‡∑í ‡∂∫‡∂±‡∑Ä‡∑è" in sentence:
        sentence = sentence.replace("‡∂Ö‡∂¥‡∑í ‡∂∫‡∂∏‡∑î", "‡∂Ö‡∂¥‡∑í ‡∂ú‡∑í‡∂∫‡∑ô‡∂∏‡∑î")
    if "‡∂∏‡∂∏ ‡∂∫‡∂±‡∑ä‡∂± ‡∂ï‡∂± ‡∂ö‡∑è‡∂ª‡∑ä‡∂∫‡∑è‡∂Ω‡∂∫‡∂ß" in sentence:
        sentence = sentence.replace("‡∂∏‡∂∏ ‡∂∫‡∂±‡∑ä‡∂± ‡∂ï‡∂± ‡∂ö‡∑è‡∂ª‡∑ä‡∂∫‡∑è‡∂Ω‡∂∫‡∂ß", "‡∂∏‡∂∏ ‡∂ö‡∑è‡∂ª‡∑ä‡∂∫‡∑è‡∂Ω‡∂∫‡∂ß ‡∂∫‡∂±‡∑ä‡∂± ‡∂ï‡∂±.")
    return sentence

# Screenshot Capture Function
def capture_screenshot(output_text, output_path):
    plt.figure(figsize=(10, 5))
    plt.text(0, 1, output_text, fontsize=12, va="top", wrap=True)
    plt.axis("off")
    buf = BytesIO()
    plt.savefig(buf, format="png")
    buf.seek(0)
    img = Image.open(buf)
    img.save(output_path)
    buf.close()
    plt.close()

# Process Paragraphs
def process_paragraphs(paragraphs, spell_dict, grammar_checker, output_dir):
    os.makedirs(output_dir, exist_ok=True)
    for i, paragraph in enumerate(paragraphs, start=1):
        # Spell Correction
        corrected_paragraph = sinhala_spell_corrector(paragraph, spell_dict)

        # Subject-Verb Agreement and Word Order Correction
        corrected_paragraph = apply_correction(corrected_paragraph)

        # Grammar Checking
        try:
            grammar_results = grammar_checker(corrected_paragraph)
        except Exception as e:
            grammar_results = [{"label": "LABEL_0", "score": 0.0}]
            print(f"Error in grammar checking for paragraph {i}: {e}")

        # Prepare output text
        output_text = (
            f"Paragraph {i}:\n"
            f"Original:\n{paragraph}\n\n"
            f"Spell and grammar Corrected:\n{corrected_paragraph}\n\n"
            f"Grammar Suggestions:\n"
        )
        for result in grammar_results:
            output_text += f"- {result['label']} (Confidence: {result['score']:.2f})\n"

        # Save to a text file
        with open(f"{output_dir}/paragraph_{i}_results.txt", "w", encoding="utf-8") as file:
            file.write(output_text)

        # Capture a screenshot of the output
        capture_screenshot(output_text, f"{output_dir}/paragraph_{i}_screenshot.png")

# Input Paragraphs
paragraphs = [
    "‡∂Ö‡∂∫‡∑í‡∂∫‡∑í ‡∂ú‡∂Ø‡∂ª‡∑í‡∂±‡∑ä ‡∂¥‡∑í‡∂ß‡∑Ä‡∑ì ‡∂¥‡∑É‡∂Ω‡∂ß ‡∂ú‡∑í‡∂∫‡∑ö‡∂∫. ‡∂∏‡∂∏ ‡∂ª‡∑ñ‡∂¥‡∑Ä‡∑è‡∑Ñ‡∑í‡∂±‡∑í‡∂∫ ‡∂±‡∑ê‡∂ª‡∂∂‡∑î‡∑Ä‡∑ô‡∂∏‡∑í.",
    "‡∂á‡∂∫‡∂ß ‡∂Ö‡∑É‡∂±‡∂¥‡∂∫‡∂ö‡∑ä ‡∂á‡∂≠. ‡∂Ω‡∂ü‡∂∏ ‡∂ª‡∑ù‡∑Ñ‡∂Ω ‡∂ö‡∑ú‡∑Ñ‡∑ö‡∂Ø?",
    "‡∂ö‡∂ª‡∑î‡∂±‡∑è‡∂ö‡∂ª ‡∂∂‡∑í‡∂Ω‡∑ä‡∂¥‡∂≠ ‡∂Ø‡∑ô‡∂±‡∑ä‡∂±. ‡∂∏‡∂∏ ‡∂∫‡∂±‡∑Ä‡∑è.",
    "‡∑É‡∑í‡∂Ç‡∑Ñ‡∂Ω ‡∂∑‡∑è‡∑Ç‡∑è‡∑Ä‡∂ß ‡∂Ö‡∂ö‡∑ä‡∑Ç‡∂ª ‡∑Ä‡∑í‡∂±‡∑ä‡∂∫‡∑è‡∑É‡∂∫ ‡∑Ä‡∑ê‡∂Ø‡∂ú‡∂≠‡∑ä‡∂∫.‡∂î‡∂∂‡∑ö ‡∂ª‡∂†‡∂±‡∑Ä ‡∂±‡∑í‡∑Ä‡∑ê‡∂ª‡∂Ø‡∑í ‡∂ö‡∂ª‡∂±‡∑ä‡∂±. ‡∂∏‡∂∏ ‡∂∫‡∂±‡∑ä‡∂± ‡∂ï‡∂± ‡∂ö‡∑è‡∂ª‡∑ä‡∂∫‡∑è‡∂Ω‡∂∫‡∂ß.",
    "‡∂î‡∑Ä‡∑î‡∑Ñ‡∑î ‡∂¥‡∑è‡∑É‡∂Ω‡∂ß ‡∂∫‡∂±‡∑Ä‡∑è. ‡∂Ö‡∂¥‡∑í ‡∂∫‡∂∏‡∑î. ‡∂¥‡∂∏‡∑è‡∂Ø ‡∑Ä‡∑ì‡∂∏ ‡∑É‡∑î‡∂Ø‡∑î‡∑É‡∑î ‡∂±‡∑ú‡∑Ä‡∑ö."
]

# Workflow Execution
print("Step 1: Upload Spell Dictionary")
spell_dict = load_spell_dictionary()
if not spell_dict:
    print("Failed to load the dictionary. Exiting.")
else:
    print("\nStep 2: Load Grammar Checker Model")
    grammar_checker = load_grammar_checker()

    print("\nStep 3: Process Paragraphs")
    process_paragraphs(paragraphs, spell_dict, grammar_checker, output_dir="paragraph_outputs")
    print("Processing complete! Screenshots and text results are saved in the 'paragraph_outputs' directory.")

Step 1: Upload Spell Dictionary


Saving sinhala_spell_dict.txt to sinhala_spell_dict.txt
Error loading dictionary: not enough values to unpack (expected 2, got 1)

Step 2: Load Grammar Checker Model


Device set to use cpu



Step 3: Process Paragraphs


  plt.savefig(buf, format="png")
  plt.savefig(buf, format="png")
  plt.savefig(buf, format="png")
  plt.savefig(buf, format="png")
  plt.savefig(buf, format="png")
  plt.savefig(buf, format="png")
  plt.savefig(buf, format="png")
  plt.savefig(buf, format="png")
  plt.savefig(buf, format="png")
  plt.savefig(buf, format="png")
  plt.savefig(buf, format="png")
  plt.savefig(buf, format="png")
  plt.savefig(buf, format="png")
  plt.savefig(buf, format="png")
  plt.savefig(buf, format="png")
  plt.savefig(buf, format="png")
  plt.savefig(buf, format="png")
  plt.savefig(buf, format="png")
  plt.savefig(buf, format="png")
  plt.savefig(buf, format="png")
  plt.savefig(buf, format="png")
  plt.savefig(buf, format="png")
  plt.savefig(buf, format="png")
  plt.savefig(buf, format="png")
  plt.savefig(buf, format="png")
  plt.savefig(buf, format="png")
  plt.savefig(buf, format="png")
  plt.savefig(buf, format="png")
  plt.savefig(buf, format="png")
  plt.savefig(buf, format="png")
  plt.save

Processing complete! Screenshots and text results are saved in the 'paragraph_outputs' directory.


In [6]:
from google.colab import files
import os

# Sinhala Tokenizer Initialization and Spell Dictionary Loading (unchanged)
from sinling import SinhalaTokenizer
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

tokenizer = SinhalaTokenizer()

# Load spell dictionary
def load_spell_dictionary():
    """
    Load the Sinhala spell dictionary from an uploaded text file in Colab.
    """
    uploaded = files.upload()  # Interactive file upload
    file_path = list(uploaded.keys())[0]
    spell_dict = {}
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            for line in file:
                key, value = line.strip().split(',')
                spell_dict[key] = value
    except Exception as e:
        print(f"Error loading dictionary: {e}")
    return spell_dict

# Sinhala Spell Corrector
def sinhala_spell_corrector(text, spell_dict):
    tokens = tokenizer.tokenize(text)
    corrected_tokens = [spell_dict.get(token, token) for token in tokens]
    corrected_text = ' '.join(corrected_tokens)
    return corrected_text

# Grammar Checking Model Loading
def load_grammar_checker():
    model = AutoModelForSequenceClassification.from_pretrained("/content/sinhala_grammar_model")
    tokenizer = AutoTokenizer.from_pretrained("/content/sinhala_grammar_model")
    return pipeline("text-classification", model=model, tokenizer=tokenizer)

# Correction Functions
def apply_correction(sentence):
    if "‡∂î‡∑Ä‡∑î‡∑Ñ‡∑î ‡∂¥‡∑è‡∑É‡∂Ω‡∂ß ‡∂∫‡∂±‡∑Ä‡∑è" in sentence:
        sentence = sentence.replace("‡∂î‡∑Ä‡∑î‡∑Ñ‡∑î ‡∂¥‡∑è‡∑É‡∂Ω‡∂ß ‡∂∫‡∂±‡∑Ä‡∑è", "‡∂î‡∑Ä‡∑î‡∑Ñ‡∑î ‡∂¥‡∑è‡∑É‡∂Ω‡∂ß ‡∂∫‡∂≠‡∑í")
    if "‡∂∏‡∂∏ ‡∂∫‡∂±‡∑Ä‡∑è" in sentence:
        sentence = sentence.replace("‡∂∏‡∂∏ ‡∂∫‡∂∏‡∑í", "‡∂∏‡∂∏ ‡∂ú‡∑í‡∂∫‡∑ô‡∂∏‡∑í")
    if "‡∂Ö‡∂¥‡∑í ‡∂∫‡∂±‡∑Ä‡∑è" in sentence:
        sentence = sentence.replace("‡∂Ö‡∂¥‡∑í ‡∂∫‡∂∏‡∑î", "‡∂Ö‡∂¥‡∑í ‡∂ú‡∑í‡∂∫‡∑ô‡∂∏‡∑î")
    if "‡∂∏‡∂∏ ‡∂∫‡∂±‡∑ä‡∂± ‡∂ï‡∂± ‡∂ö‡∑è‡∂ª‡∑ä‡∂∫‡∑è‡∂Ω‡∂∫‡∂ß" in sentence:
        sentence = sentence.replace("‡∂∏‡∂∏ ‡∂∫‡∂±‡∑ä‡∂± ‡∂ï‡∂± ‡∂ö‡∑è‡∂ª‡∑ä‡∂∫‡∑è‡∂Ω‡∂∫‡∂ß", "‡∂∏‡∂∏ ‡∂ö‡∑è‡∂ª‡∑ä‡∂∫‡∑è‡∂Ω‡∂∫‡∂ß ‡∂∫‡∂±‡∑ä‡∂± ‡∂ï‡∂±.")
    return sentence

# Process Paragraphs and Display Results
def process_paragraphs(paragraphs, spell_dict, grammar_checker):
    for i, paragraph in enumerate(paragraphs, start=1):
        # Spell Correction
        corrected_paragraph = sinhala_spell_corrector(paragraph, spell_dict)

        # Subject-Verb Agreement and Word Order Correction
        corrected_paragraph = apply_correction(corrected_paragraph)

        # Grammar Checking
        try:
            grammar_results = grammar_checker(corrected_paragraph)
        except Exception as e:
            grammar_results = [{"label": "LABEL_0", "score": 0.0}]
            print(f"Error in grammar checking for paragraph {i}: {e}")

        # Display results in console
        print(f"\nParagraph {i}:")
        print(f"Original:\n{paragraph}")
        print(f"Spell and Grammar Corrected:\n{corrected_paragraph}")
        print("Grammar Suggestions:")
        for result in grammar_results:
            print(f"- {result['label']} (Confidence: {result['score']:.2f})")

# Input Paragraphs
paragraphs = [
    "‡∂Ö‡∂∫‡∑í‡∂∫‡∑í ‡∂ú‡∂Ø‡∂ª‡∑í‡∂±‡∑ä ‡∂¥‡∑í‡∂ß‡∑Ä‡∑ì ‡∂¥‡∑É‡∂Ω‡∂ß ‡∂ú‡∑í‡∂∫‡∑ö‡∂∫. ‡∂∏‡∂∏ ‡∂ª‡∑ñ‡∂¥‡∑Ä‡∑è‡∑Ñ‡∑í‡∂±‡∑í‡∂∫ ‡∂±‡∑ê‡∂ª‡∂∂‡∑î‡∑Ä‡∑ô‡∂∏‡∑í.",
    "‡∂á‡∂∫‡∂ß ‡∂Ö‡∑É‡∂±‡∂¥‡∂∫‡∂ö‡∑ä ‡∂á‡∂≠. ‡∂Ω‡∂ü‡∂∏ ‡∂ª‡∑ù‡∑Ñ‡∂Ω ‡∂ö‡∑ú‡∑Ñ‡∑ö‡∂Ø?",
    "‡∂ö‡∂ª‡∑î‡∂±‡∑è‡∂ö‡∂ª ‡∂∂‡∑í‡∂Ω‡∑ä‡∂¥‡∂≠ ‡∂Ø‡∑ô‡∂±‡∑ä‡∂±. ‡∂∏‡∂∏ ‡∂∫‡∂±‡∑Ä‡∑è.",
    "‡∑É‡∑í‡∂Ç‡∑Ñ‡∂Ω ‡∂∑‡∑è‡∑Ç‡∑è‡∑Ä‡∂ß ‡∂Ö‡∂ö‡∑ä‡∑Ç‡∂ª ‡∑Ä‡∑í‡∂±‡∑ä‡∂∫‡∑è‡∑É‡∂∫ ‡∑Ä‡∑ê‡∂Ø‡∂ú‡∂≠‡∑ä‡∂∫.‡∂î‡∂∂‡∑ö ‡∂ª‡∂†‡∂±‡∑Ä ‡∂±‡∑í‡∑Ä‡∑ê‡∂ª‡∂Ø‡∑í ‡∂ö‡∂ª‡∂±‡∑ä‡∂±. ‡∂∏‡∂∏ ‡∂∫‡∂±‡∑ä‡∂± ‡∂ï‡∂± ‡∂ö‡∑è‡∂ª‡∑ä‡∂∫‡∑è‡∂Ω‡∂∫‡∂ß.",
    "‡∂î‡∑Ä‡∑î‡∑Ñ‡∑î ‡∂¥‡∑è‡∑É‡∂Ω‡∂ß ‡∂∫‡∂±‡∑Ä‡∑è. ‡∂Ö‡∂¥‡∑í ‡∂∫‡∂∏‡∑î. ‡∂¥‡∂∏‡∑è‡∂Ø ‡∑Ä‡∑ì‡∂∏ ‡∑É‡∑î‡∂Ø‡∑î‡∑É‡∑î ‡∂±‡∑ú‡∑Ä‡∑ö."
]

# Workflow Execution
print("Step 1: Upload Spell Dictionary")
spell_dict = load_spell_dictionary()
if not spell_dict:
    print("Failed to load the dictionary. Exiting.")
else:
    print("\nStep 2: Load Grammar Checker Model")
    grammar_checker = load_grammar_checker()

    print("\nStep 3: Process and Display Paragraphs")
    process_paragraphs(paragraphs, spell_dict, grammar_checker)
    print("\nProcessing complete! All results displayed in the console.")


Step 1: Upload Spell Dictionary


Saving sinhala_spell_dict.txt to sinhala_spell_dict.txt
Error loading dictionary: not enough values to unpack (expected 2, got 1)

Step 2: Load Grammar Checker Model


Device set to use cpu



Step 3: Process and Display Paragraphs

Paragraph 1:
Original:
‡∂Ö‡∂∫‡∑í‡∂∫‡∑í ‡∂ú‡∂Ø‡∂ª‡∑í‡∂±‡∑ä ‡∂¥‡∑í‡∂ß‡∑Ä‡∑ì ‡∂¥‡∑É‡∂Ω‡∂ß ‡∂ú‡∑í‡∂∫‡∑ö‡∂∫. ‡∂∏‡∂∏ ‡∂ª‡∑ñ‡∂¥‡∑Ä‡∑è‡∑Ñ‡∑í‡∂±‡∑í‡∂∫ ‡∂±‡∑ê‡∂ª‡∂∂‡∑î‡∑Ä‡∑ô‡∂∏‡∑í.
Spell and Grammar Corrected:
‡∂Ö‡∂∫‡∑í‡∂∫‡∑è ‡∂ú‡∑ô‡∂Ø‡∂ª‡∑í‡∂±‡∑ä ‡∂¥‡∑í‡∂ß‡∑Ä‡∑ì  ‡∂¥‡∑è‡∑É‡∂Ω‡∂ß ‡∂ú‡∑í‡∂∫‡∑ö‡∂∫ . ‡∂∏‡∂∏ ‡∂ª‡∑ñ‡∂¥‡∑Ä‡∑è‡∑Ñ‡∑í‡∂±‡∑í‡∂∫  ‡∂±‡∑ê‡∂ª‡∂π‡∑î‡∑Ä‡∑ô‡∂∏‡∑í .
Grammar Suggestions:
- LABEL_1 (Confidence: 0.55)

Paragraph 2:
Original:
‡∂á‡∂∫‡∂ß ‡∂Ö‡∑É‡∂±‡∂¥‡∂∫‡∂ö‡∑ä ‡∂á‡∂≠. ‡∂Ω‡∂ü‡∂∏ ‡∂ª‡∑ù‡∑Ñ‡∂Ω ‡∂ö‡∑ú‡∑Ñ‡∑ö‡∂Ø?
Spell and Grammar Corrected:
‡∂á‡∂∫‡∂ß  ‡∂Ö‡∑É‡∂±‡∑ì‡∂¥‡∂∫‡∂ö‡∑ä ‡∂á‡∂≠ .  ‡∑Ö‡∂ü‡∂∏ ‡∂ª‡∑ù‡∑Ñ‡∂Ω ‡∂ö‡∑ú‡∑Ñ‡∑ö‡∂Ø ?
Grammar Suggestions:
- LABEL_1 (Confidence: 0.55)

Paragraph 3:
Original:
‡∂ö‡∂ª‡∑î‡∂±‡∑è‡∂ö‡∂ª ‡∂∂‡∑í‡∂Ω‡∑ä‡∂¥‡∂≠ ‡∂Ø‡∑ô‡∂±‡∑ä‡∂±. ‡∂∏‡∂∏ ‡∂∫‡∂±‡∑Ä‡∑è.
Spell and Grammar Corrected:
 ‡∂ö‡∂ª‡∑î‡∂´‡∑è‡∂ö‡∂ª ‡∂∂‡∑í‡∂Ω‡∑ä‡∂¥‡∂≠ ‡∂Ø‡∑ô‡∂±‡∑ä‡∂± . ‡∂∏‡∂∏ ‡∂∫‡∂±‡∑Ä‡∑è .
Grammar Suggestions:
- LABEL_1 (Confidence: 0.55)

Paragraph 4:
Ori

In [10]:
# Accuracy Calculation Functions
def calculate_spell_metrics(corrected_paragraphs, ground_truths):
    total_words = 0
    correctly_spelled = 0
    y_true = []
    y_pred = []

    for corrected, ground_truth in zip(corrected_paragraphs, ground_truths):
        corrected_words = corrected.split()
        ground_truth_words = ground_truth.split()

        for c_word, g_word in zip(corrected_words, ground_truth_words):
            y_true.append(1 if g_word == c_word else 0)
            y_pred.append(1)
            if c_word == g_word:
                correctly_spelled += 1
            total_words += 1

    accuracy = (correctly_spelled / total_words) * 100 if total_words > 0 else 0
    precision = precision_score(y_true, y_pred, zero_division=1) * 100
    recall = recall_score(y_true, y_pred, zero_division=1) * 100
    f1 = f1_score(y_true, y_pred, zero_division=1) * 100
    return accuracy, precision, recall, f1

def calculate_grammar_metrics(corrected_paragraphs, ground_truths):
    total_sentences = 0
    correctly_grammared = 0
    y_true = []
    y_pred = []

    for corrected, ground_truth in zip(corrected_paragraphs, ground_truths):
        corrected_sentences = corrected.split(".")
        ground_truth_sentences = ground_truth.split(".")

        for c_sent, g_sent in zip(corrected_sentences, ground_truth_sentences):
            y_true.append(1 if c_sent.strip() == g_sent.strip() else 0)
            y_pred.append(1)
            if c_sent.strip() == g_sent.strip():
                correctly_grammared += 1
            total_sentences += 1

    accuracy = (correctly_grammared / total_sentences) * 100 if total_sentences > 0 else 0
    precision = precision_score(y_true, y_pred, zero_division=1) * 100
    recall = recall_score(y_true, y_pred, zero_division=1) * 100
    f1 = f1_score(y_true, y_pred, zero_division=1) * 100
    return accuracy, precision, recall, f1

def calculate_overall_metrics(spell_metrics, grammar_metrics):
    overall_accuracy = (spell_metrics['accuracy'] + grammar_metrics['accuracy']) / 2
    overall_precision = (spell_metrics['precision'] + grammar_metrics['precision']) / 2
    overall_recall = (spell_metrics['recall'] + grammar_metrics['recall']) / 2
    overall_f1 = (spell_metrics['f1'] + grammar_metrics['f1']) / 2
    return overall_accuracy, overall_precision, overall_recall, overall_f1

# Ground Truth for Paragraphs
ground_truths = [
    "‡∂Ö‡∂∫‡∑í‡∂∫‡∑è ‡∂ú‡∑ô‡∂Ø‡∂ª‡∑í‡∂±‡∑ä ‡∂¥‡∑í‡∂ß‡∑Ä‡∑ì  ‡∂¥‡∑è‡∑É‡∂Ω‡∂ß ‡∂ú‡∑í‡∂∫‡∑ö‡∂∫ . ‡∂∏‡∂∏ ‡∂ª‡∑ñ‡∂¥‡∑Ä‡∑è‡∑Ñ‡∑í‡∂±‡∑í‡∂∫  ‡∂±‡∑ê‡∂ª‡∂π‡∑î‡∑Ä‡∑ô‡∂∏‡∑í ",
    "‡∂á‡∂∫‡∂ß  ‡∂Ö‡∑É‡∂±‡∑ì‡∂¥‡∂∫‡∂ö‡∑ä ‡∂á‡∂≠. ‡∑Ö‡∂ü‡∂∏ ‡∂ª‡∑ù‡∑Ñ‡∂Ω ‡∂ö‡∑ú‡∑Ñ‡∑ö‡∂Ø ?",
    "‡∂ö‡∂ª‡∑î‡∂´‡∑è‡∂ö‡∂ª ‡∂∂‡∑í‡∂Ω‡∑ä‡∂¥‡∂≠ ‡∂Ø‡∑ô‡∂±‡∑ä‡∂± . ‡∂∏‡∂∏ ‡∂∫‡∂±‡∑Ä‡∑è .",
    "‡∑É‡∑í‡∂Ç‡∑Ñ‡∂Ω ‡∂∑‡∑è‡∑Ç‡∑è‡∑Ä‡∂ß ‡∂Ö‡∂ö‡∑ä‡∑Ç‡∂ª  ‡∑Ä‡∑í‡∂±‡∑ä‚Äç‡∂∫‡∑è‡∑É‡∂∫ ‡∑Ä‡∑ê‡∂Ø‡∂ú‡∂≠‡∑ä‡∂∫ . ‡∂î‡∂∂‡∑ö  ‡∂ª‡∂†‡∂±‡∑è‡∑Ä ‡∂±‡∑í‡∑Ä‡∑ê‡∂ª‡∂Ø‡∑í ‡∂ö‡∂ª‡∂±‡∑ä‡∂±. ‡∂∏‡∂∏ ‡∂ö‡∑è‡∂ª‡∑ä‡∂∫‡∑è‡∂Ω‡∂∫‡∂ß ‡∂∫‡∂±‡∑ä‡∂± ‡∂ï‡∂±",
    "‡∂î‡∑Ä‡∑î‡∑Ñ‡∑î ‡∂¥‡∑è‡∑É‡∂Ω‡∂ß ‡∂∫‡∂≠‡∑í . ‡∂Ö‡∂¥‡∑í ‡∂∫‡∂∏‡∑î .  ‡∂¥‡∑ä‚Äç‡∂ª‡∂∏‡∑è‡∂Ø ‡∑Ä‡∑ì‡∂∏ ‡∑É‡∑î‡∂Ø‡∑î‡∑É‡∑î ‡∂±‡∑ú‡∑Ä‡∑ö ."
]

# Workflow Execution
print("Step 1: Upload Spell Dictionary")
spell_dict = load_spell_dictionary()
if not spell_dict:
    print("Failed to load the dictionary. Exiting.")
else:
    print("\nStep 2: Load Grammar Checker Model")
    grammar_checker = load_grammar_checker()

    print("\nStep 3: Process Paragraphs")
    spell_corrected_paragraphs = []
    grammar_corrected_paragraphs = []

    for paragraph in paragraphs:
        # Spell Correction
        spell_corrected = sinhala_spell_corrector(paragraph, spell_dict)
        spell_corrected_paragraphs.append(spell_corrected)

        # Grammar Correction
        grammar_corrected = apply_correction(spell_corrected)
        grammar_corrected_paragraphs.append(grammar_corrected)

    print("\nStep 4: Accuracy and Metrics Calculations")
    spell_accuracy, spell_precision, spell_recall, spell_f1 = calculate_spell_metrics(
        spell_corrected_paragraphs, ground_truths
    )
    grammar_accuracy, grammar_precision, grammar_recall, grammar_f1 = calculate_grammar_metrics(
        grammar_corrected_paragraphs, ground_truths
    )

    overall_accuracy, overall_precision, overall_recall, overall_f1 = calculate_overall_metrics(
        {'accuracy': spell_accuracy, 'precision': spell_precision, 'recall': spell_recall, 'f1': spell_f1},
        {'accuracy': grammar_accuracy, 'precision': grammar_precision, 'recall': grammar_recall, 'f1': grammar_f1},
    )

    print(f"\nSpell Correction Accuracy: {spell_accuracy:.2f}%")
    print(f"Grammar Correction Accuracy: {grammar_accuracy:.2f}%")

    print(f"\nSpell Precision: {spell_precision:.2f}%")
    print(f"Spell Recall: {spell_recall:.2f}%")
    print(f"Spell F1 Score: {spell_f1:.2f}%")

    print(f"\nGrammar Precision: {grammar_precision:.2f}%")
    print(f"Grammar Recall: {grammar_recall:.2f}%")
    print(f"Grammar F1 Score: {grammar_f1:.2f}%")

    print(f"\nOverall Accuracy: {overall_accuracy:.2f}%")
    print(f"Overall Precision: {overall_precision:.2f}%")
    print(f"Overall Recall: {overall_recall:.2f}%")
    print(f"Overall F1 Score: {overall_f1:.2f}%")


Step 1: Upload Spell Dictionary


Saving sinhala_spell_dict.txt to sinhala_spell_dict (4).txt
Error loading dictionary: not enough values to unpack (expected 2, got 1)

Step 2: Load Grammar Checker Model


Device set to use cpu



Step 3: Process Paragraphs

Step 4: Accuracy and Metrics Calculations

Spell Correction Accuracy: 81.63%
Grammar Correction Accuracy: 100.00%

Spell Precision: 81.63%
Spell Recall: 100.00%
Spell F1 Score: 89.89%

Grammar Precision: 100.00%
Grammar Recall: 100.00%
Grammar F1 Score: 100.00%

Overall Accuracy: 90.82%
Overall Precision: 90.82%
Overall Recall: 100.00%
Overall F1 Score: 94.94%


In [12]:
import os

# Accuracy Calculation Function
def calculate_accuracy(processed_paragraphs, ground_truths):
    total_sentences = 0
    correctly_processed = 0

    for processed, ground_truth in zip(processed_paragraphs, ground_truths):
        processed_sentences = processed.split(".")
        ground_truth_sentences = ground_truth.split(".")

        for p_sent, g_sent in zip(processed_sentences, ground_truth_sentences):
            if p_sent.strip() == g_sent.strip():
                correctly_processed += 1
            total_sentences += 1

    accuracy = (correctly_processed / total_sentences) * 100 if total_sentences > 0 else 0
    return accuracy

# Ground Truth for Paragraphs
ground_truths = [
    "‡∂Ö‡∂∫‡∑í‡∂∫‡∑è ‡∂ú‡∑ô‡∂Ø‡∂ª‡∑í‡∂±‡∑ä ‡∂¥‡∑í‡∂ß‡∑Ä‡∑ì ‡∂¥‡∑è‡∑É‡∂Ω‡∂ß ‡∂ú‡∑í‡∂∫‡∑ö‡∂∫. ‡∂∏‡∂∏ ‡∂ª‡∑ñ‡∂¥‡∑Ä‡∑è‡∑Ñ‡∑í‡∂±‡∑í‡∂∫  ‡∂±‡∑ê‡∂ª‡∂π‡∑î‡∑Ä‡∑ô‡∂∏‡∑í ",
    "‡∂á‡∂∫‡∂ß  ‡∂Ö‡∑É‡∂±‡∑ì‡∂¥‡∂∫‡∂ö‡∑ä ‡∂á‡∂≠. ‡∑Ö‡∂ü‡∂∏ ‡∂ª‡∑ù‡∑Ñ‡∂Ω ‡∂ö‡∑ú‡∑Ñ‡∑ö‡∂Ø ?",
    "‡∂ö‡∂ª‡∑î‡∂´‡∑è‡∂ö‡∂ª ‡∂∂‡∑í‡∂Ω‡∑ä‡∂¥‡∂≠ ‡∂Ø‡∑ô‡∂±‡∑ä‡∂± . ‡∂∏‡∂∏ ‡∂∫‡∂±‡∑Ä‡∑è.",
    "‡∑É‡∑í‡∂Ç‡∑Ñ‡∂Ω ‡∂∑‡∑è‡∑Ç‡∑è‡∑Ä‡∂ß ‡∂Ö‡∂ö‡∑ä‡∑Ç‡∂ª  ‡∑Ä‡∑í‡∂±‡∑ä‚Äç‡∂∫‡∑è‡∑É‡∂∫ ‡∑Ä‡∑ê‡∂Ø‡∂ú‡∂≠‡∑ä‡∂∫. ‡∂î‡∂∂‡∑ö  ‡∂ª‡∂†‡∂±‡∑è‡∑Ä ‡∂±‡∑í‡∑Ä‡∑ê‡∂ª‡∂Ø‡∑í ‡∂ö‡∂ª‡∂±‡∑ä‡∂±. ‡∂∏‡∂∏ ‡∂ö‡∑è‡∂ª‡∑ä‡∂∫‡∑è‡∂Ω‡∂∫‡∂ß ‡∂∫‡∂±‡∑ä‡∂± ‡∂ï‡∂±",
    "‡∂î‡∑Ä‡∑î‡∑Ñ‡∑î ‡∂¥‡∑è‡∑É‡∂Ω‡∂ß ‡∂∫‡∂≠‡∑í. ‡∂Ö‡∂¥‡∑í ‡∂∫‡∂∏‡∑î.  ‡∂¥‡∑ä‚Äç‡∂ª‡∂∏‡∑è‡∂Ø ‡∑Ä‡∑ì‡∂∏ ‡∑É‡∑î‡∂Ø‡∑î‡∑É‡∑î ‡∂±‡∑ú‡∑Ä‡∑ö."
]

# Workflow Execution with Accuracy Calculation
print("Step 1: Upload Spell Dictionary")
spell_dict = load_spell_dictionary()
if not spell_dict:
    print("Failed to load the dictionary. Exiting.")
else:
    print("\nStep 2: Load Grammar Checker Model")
    grammar_checker = load_grammar_checker()

    print("\nStep 3: Process Paragraphs")
    processed_paragraphs = []
    output_dir = "paragraph_outputs"
    os.makedirs(output_dir, exist_ok=True)

    for i, paragraph in enumerate(paragraphs, start=1):
        # Spell Correction
        corrected_paragraph = sinhala_spell_corrector(paragraph, spell_dict)

        # Subject-Verb Agreement and Word Order Correction
        corrected_paragraph = apply_correction(corrected_paragraph)

        # Grammar Checking
        try:
            grammar_results = grammar_checker(corrected_paragraph)
        except Exception as e:
            grammar_results = [{"label": "LABEL_0", "score": 0.0}]
            print(f"Error in grammar checking for paragraph {i}: {e}")

        # Append processed paragraph
        processed_paragraphs.append(corrected_paragraph)

        # Prepare output text
        output_text = (
            f"Paragraph {i}:\n"
            f"Original:\n{paragraph}\n\n"
            f"Spell and Grammar Corrected:\n{corrected_paragraph}\n\n"
            f"Grammar Suggestions:\n"
        )
        for result in grammar_results:
            output_text += f"- {result['label']} (Confidence: {result['score']:.2f})\n"

        # Save to a text file
        with open(f"{output_dir}/paragraph_{i}_results.txt", "w", encoding="utf-8") as file:
            file.write(output_text)

    print("Processing complete! Text results are saved in the 'paragraph_outputs' directory.")

    # Calculate Accuracy for Multiple Runs
    print("\nStep 4: Accuracy Calculation for Multiple Runs")
    for run in range(1, 6):
        print(f"Run {run}:")
        accuracy = calculate_accuracy(processed_paragraphs, ground_truths)
        print(f"Accuracy: {accuracy:.2f}%")


Step 1: Upload Spell Dictionary


Saving sinhala_spell_dict.txt to sinhala_spell_dict (6).txt
Error loading dictionary: not enough values to unpack (expected 2, got 1)

Step 2: Load Grammar Checker Model


Device set to use cpu



Step 3: Process Paragraphs
Processing complete! Text results are saved in the 'paragraph_outputs' directory.

Step 4: Accuracy Calculation for Multiple Runs
Run 1:
Accuracy: 92.86%
Run 2:
Accuracy: 92.86%
Run 3:
Accuracy: 92.86%
Run 4:
Accuracy: 92.86%
Run 5:
Accuracy: 92.86%
