In [1]:
from google.colab import files

# Upload the dataset
uploaded = files.upload()

# Confirm the upload
for filename in uploaded.keys():
    print(f"Uploaded {filename}")

Saving sinhala_grammar_dataset.txt to sinhala_grammar_dataset.txt
Uploaded sinhala_grammar_dataset.txt


In [2]:
!pip install transformers torch pandas



In [3]:
# Install required libraries (if not already installed)
# !pip install transformers torch huggingface_hub

# Import necessary libraries
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
import torch
from torch.utils.data import Dataset

# Dataset file name
data_file = "sinhala_grammar_dataset.txt"

# Process dataset
sentences = []
labels = []

with open(data_file, "r", encoding="utf-8") as f:
    lines = f.readlines()

# Parse dataset
for line in lines[1:]:  # Skip the header line
    line = line.strip()
    if "|" in line:
        try:
            sentence, label = line.split("|")
            sentences.append(sentence)
            labels.append(int(label))
        except ValueError:
            print(f"Skipping malformed line: {line}")
    else:
        print(f"Skipping malformed line: {line}")

# Ensure dataset integrity
assert len(sentences) == len(labels), "Mismatch between sentences and labels!"

# Split data into training and validation sets
train_size = int(0.8 * len(sentences))
train_sentences, val_sentences = sentences[:train_size], sentences[train_size:]
train_labels, val_labels = labels[:train_size], labels[train_size:]

# Load mBERT tokenizer and model
model_name = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Create dataset class
class SinhalaGrammarDataset(Dataset):
    def __init__(self, sentences, labels, tokenizer, max_length):
        self.sentences = sentences
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        sentence = str(self.sentences[idx])
        label = self.labels[idx]

        encoding = self.tokenizer.encode_plus(
            sentence,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors="pt",
            return_attention_mask=True
        )

        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(label, dtype=torch.long)
        }

# Create dataset splits
train_dataset = SinhalaGrammarDataset(train_sentences, train_labels, tokenizer, max_length=128)
val_dataset = SinhalaGrammarDataset(val_sentences, val_labels, tokenizer, max_length=128)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    learning_rate=5e-5,
    logging_dir="./logs",
    logging_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
)

# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
)

# Train the model
trainer.train()

# Save the fine-tuned model
model.save_pretrained("./sinhala_grammar_model")
tokenizer.save_pretrained("./sinhala_grammar_model")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss
1,No log,0.697444
2,No log,0.704217
3,No log,0.705308


('./sinhala_grammar_model/tokenizer_config.json',
 './sinhala_grammar_model/special_tokens_map.json',
 './sinhala_grammar_model/vocab.txt',
 './sinhala_grammar_model/added_tokens.json',
 './sinhala_grammar_model/tokenizer.json')

In [4]:
!pip install sinling transformers

Collecting sinling
  Downloading sinling-0.3.6-py3-none-any.whl.metadata (3.0 kB)
Collecting emoji (from sinling)
  Downloading emoji-2.14.0-py3-none-any.whl.metadata (5.7 kB)
Collecting pygtrie (from sinling)
  Downloading pygtrie-2.5.0-py3-none-any.whl.metadata (7.5 kB)
Collecting sklearn-crfsuite (from sinling)
  Downloading sklearn_crfsuite-0.5.0-py2.py3-none-any.whl.metadata (4.9 kB)
Collecting python-crfsuite>=0.9.7 (from sklearn-crfsuite->sinling)
  Downloading python_crfsuite-0.9.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.3 kB)
Downloading sinling-0.3.6-py3-none-any.whl (20.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m75.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading emoji-2.14.0-py3-none-any.whl (586 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m586.9/586.9 kB[0m [31m37.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pygtrie-2.5.0-py3-none-any.whl (25 kB)
Downloading sklearn_

In [10]:
from google.colab import files
import matplotlib.pyplot as plt
from PIL import Image
from io import BytesIO
import os

# Sinhala Tokenizer Initialization and Spell Dictionary Loading (unchanged)
from sinling import SinhalaTokenizer
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from sklearn.metrics import accuracy_score
tokenizer = SinhalaTokenizer()

# Load spell dictionary
def load_spell_dictionary():
    """
    Load the Sinhala spell dictionary from an uploaded text file in Colab.
    """
    uploaded = files.upload()  # Interactive file upload
    file_path = list(uploaded.keys())[0]
    spell_dict = {}
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            for line in file:
                key, value = line.strip().split(',')
                spell_dict[key] = value
    except Exception as e:
        print(f"Error loading dictionary: {e}")
    return spell_dict

# Sinhala Spell Corrector (unchanged)
def sinhala_spell_corrector(text, spell_dict):
    tokens = tokenizer.tokenize(text)
    corrected_tokens = [spell_dict.get(token, token) for token in tokens]
    corrected_text = ' '.join(corrected_tokens)
    return corrected_text

# Grammar Checking Model Loading (unchanged)
def load_grammar_checker():
    model = AutoModelForSequenceClassification.from_pretrained("/content/sinhala_grammar_model")
    tokenizer = AutoTokenizer.from_pretrained("/content/sinhala_grammar_model")
    return pipeline("text-classification", model=model, tokenizer=tokenizer)

# Correction Functions (unchanged)
def apply_correction(sentence):
    if "ඔවුහු පාසලට යනවා" in sentence:
        sentence = sentence.replace("ඔවුහු පාසලට යනවා", "ඔවුහු පාසලට යති")
    if "මම යනවා" in sentence:
        sentence = sentence.replace("මම යමි", "මම ගියෙමි")
    if "අපි යනවා" in sentence:
        sentence = sentence.replace("අපි යමු", "අපි ගියෙමු")
    if "මම යන්න ඕන කාර්යාලයට" in sentence:
        sentence = sentence.replace("මම යන්න ඕන කාර්යාලයට", "මම කාර්යාලයට යන්න ඕන.")
    return sentence

# Screenshot Capture Function
def capture_screenshot(output_text, output_path):
    plt.figure(figsize=(10, 5))
    plt.text(0, 1, output_text, fontsize=12, va="top", wrap=True)
    plt.axis("off")
    buf = BytesIO()
    plt.savefig(buf, format="png")
    buf.seek(0)
    img = Image.open(buf)
    img.save(output_path)
    buf.close()
    plt.close()

# Process Paragraphs
def process_paragraphs(paragraphs, spell_dict, grammar_checker, output_dir):
    os.makedirs(output_dir, exist_ok=True)
    for i, paragraph in enumerate(paragraphs, start=1):
        # Spell Correction
        corrected_paragraph = sinhala_spell_corrector(paragraph, spell_dict)

        # Subject-Verb Agreement and Word Order Correction
        corrected_paragraph = apply_correction(corrected_paragraph)

        # Grammar Checking
        try:
            grammar_results = grammar_checker(corrected_paragraph)
        except Exception as e:
            grammar_results = [{"label": "LABEL_0", "score": 0.0}]
            print(f"Error in grammar checking for paragraph {i}: {e}")

        # Prepare output text
        output_text = (
            f"Paragraph {i}:\n"
            f"Original:\n{paragraph}\n\n"
            f"Spell and grammar Corrected:\n{corrected_paragraph}\n\n"
            f"Grammar Suggestions:\n"
        )
        for result in grammar_results:
            output_text += f"- {result['label']} (Confidence: {result['score']:.2f})\n"

        # Save to a text file
        with open(f"{output_dir}/paragraph_{i}_results.txt", "w", encoding="utf-8") as file:
            file.write(output_text)

        # Capture a screenshot of the output
        capture_screenshot(output_text, f"{output_dir}/paragraph_{i}_screenshot.png")

# Input Paragraphs
paragraphs = [
    "අයියි ගදරින් පිටවී පසලට ගියේය. මම රූපවාහිනිය නැරබුවෙමි.",
    "ඇයට අසනපයක් ඇත. ලඟම රෝහල කොහේද?",
    "කරුනාකර බිල්පත දෙන්න. මම යනවා.",
    "සිංහල භාෂාවට අක්ෂර වින්යාසය වැදගත්ය.ඔබේ රචනව නිවැරදි කරන්න. මම යන්න ඕන කාර්යාලයට.",
    "ඔවුහු පාසලට යනවා. අපි යමු. පමාද වීම සුදුසු නොවේ."
]

# Workflow Execution
print("Step 1: Upload Spell Dictionary")
spell_dict = load_spell_dictionary()
if not spell_dict:
    print("Failed to load the dictionary. Exiting.")
else:
    print("\nStep 2: Load Grammar Checker Model")
    grammar_checker = load_grammar_checker()

    print("\nStep 3: Process Paragraphs")
    process_paragraphs(paragraphs, spell_dict, grammar_checker, output_dir="paragraph_outputs")
    print("Processing complete! Screenshots and text results are saved in the 'paragraph_outputs' directory.")

Step 1: Upload Spell Dictionary


Saving sinhala_spell_dict.txt to sinhala_spell_dict (5).txt
Error loading dictionary: not enough values to unpack (expected 2, got 1)

Step 2: Load Grammar Checker Model


Device set to use cpu



Step 3: Process Paragraphs


  plt.savefig(buf, format="png")
  plt.savefig(buf, format="png")
  plt.savefig(buf, format="png")
  plt.savefig(buf, format="png")
  plt.savefig(buf, format="png")
  plt.savefig(buf, format="png")
  plt.savefig(buf, format="png")
  plt.savefig(buf, format="png")
  plt.savefig(buf, format="png")
  plt.savefig(buf, format="png")
  plt.savefig(buf, format="png")
  plt.savefig(buf, format="png")
  plt.savefig(buf, format="png")
  plt.savefig(buf, format="png")
  plt.savefig(buf, format="png")
  plt.savefig(buf, format="png")
  plt.savefig(buf, format="png")
  plt.savefig(buf, format="png")
  plt.savefig(buf, format="png")
  plt.savefig(buf, format="png")
  plt.savefig(buf, format="png")
  plt.savefig(buf, format="png")
  plt.savefig(buf, format="png")
  plt.savefig(buf, format="png")
  plt.savefig(buf, format="png")
  plt.savefig(buf, format="png")
  plt.savefig(buf, format="png")
  plt.savefig(buf, format="png")
  plt.savefig(buf, format="png")
  plt.savefig(buf, format="png")
  plt.save

Processing complete! Screenshots and text results are saved in the 'paragraph_outputs' directory.
