In [5]:
!pip install sinling transformers

Collecting sinling
  Downloading sinling-0.3.6-py3-none-any.whl.metadata (3.0 kB)
Collecting emoji (from sinling)
  Downloading emoji-2.14.0-py3-none-any.whl.metadata (5.7 kB)
Collecting pygtrie (from sinling)
  Downloading pygtrie-2.5.0-py3-none-any.whl.metadata (7.5 kB)
Collecting sklearn-crfsuite (from sinling)
  Downloading sklearn_crfsuite-0.5.0-py2.py3-none-any.whl.metadata (4.9 kB)
Collecting python-crfsuite>=0.9.7 (from sklearn-crfsuite->sinling)
  Downloading python_crfsuite-0.9.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.3 kB)
Downloading sinling-0.3.6-py3-none-any.whl (20.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading emoji-2.14.0-py3-none-any.whl (586 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m586.9/586.9 kB[0m [31m30.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pygtrie-2.5.0-py3-none-any.whl (25 kB)
Downloading sklearn_

In [10]:
from sinling import SinhalaTokenizer

# Initialize Sinhala tokenizer
tokenizer = SinhalaTokenizer()

# Define the spell correction dictionary
spell_dict = {
    "අයියි": "අයියා",
"කොහෙද":"කොහේද",
"පසලට":"පාසලට",
"ගදරින්":"ගෙදරින්",
"අසනපයක්": "අසනීපයක්",
"නැරබුවෙමි":"නැරඹුවෙමි",
"පසදින": "පසුදින",
"රහලට": "රෝහලට",

}

# Spell correction function
def sinhala_spell_corrector(paragraph, spell_dict, max_corrections=5):
    tokens = tokenizer.tokenize(paragraph)
    corrections = 0
    corrected_tokens = []

    for token in tokens:
        if token in spell_dict and corrections < max_corrections:
            corrected_tokens.append(spell_dict[token])
            corrections += 1
        else:
            corrected_tokens.append(token)

    corrected_paragraph = ' '.join(corrected_tokens)
    return corrected_paragraph, corrections

# Example usage
paragraph = "අසනපයක් තිබුනු නිසා මම ගදරින් පසලට නොගියෙමි. අයියි පසදින මා රෝහලට රැගෙන ගියේය."
corrected_paragraph, corrections = sinhala_spell_corrector(paragraph, spell_dict)

print("Original Paragraph:", paragraph)
print("Corrected Paragraph:", corrected_paragraph)
print("Total Corrections:", corrections)


Original Paragraph: අසනපයක් තිබුනු නිසා මම ගදරින් පසලට නොගියෙමි. අයියි පසදින මා රෝහලට රැගෙන ගියේය.
Corrected Paragraph: අසනීපයක් තිබුනු නිසා මම ගෙදරින් පාසලට නොගියෙමි . අයියා පසුදින මා රෝහලට රැගෙන ගියේය .
Total Corrections: 5


Approach 1: Rule-Based Grammar Checker


In [13]:
# Rule-based grammar correction
def rule_based_grammar_correction(text):
    corrections = {
        "ඔවුහු පාසලට යනවා": "ඔවුහු පාසලට යති",
        "මම යන්න ඕන කාර්යාලයට": "මම කාර්යාලයට යන්න ඕන.",
        "අපි ගෙදර යනවා": "අපි ගෙදර යමු",
        "මට  බලන්න ඕන එය": "මට එය බලන්න ඕනේ.",
        "ඔවුන් කාර්යාලයට ගිහින්": "ඔවුන් කාර්යාලයට ගියෝය"
    }
    if text in corrections:
        return corrections[text]
    return text

# Example
text = "ඔවුහු පාසලට යනවා"
corrected_text = rule_based_grammar_correction(text)
print("Corrected Text:", corrected_text)




Corrected Text: ඔවුහු පාසලට යති


Approach 2: Deep Learning Model (LSTM)


In [27]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split

# Sample dataset (Sentences and Labels)
data = [
    ("ඔවුහු පාසලට යනවා", 1),  # Incorrect
    ("ඔවුහු පාසලට යති", 0),  # Correct
    ("මම යන්න ඕන කාර්යාලයට", 1),  # Incorrect
    ("මම කාර්යාලයට යන්න ඕන.", 0),  # Correct
    ("අපි යමු", 0),  # Correct
    ("අපි යනවා", 1),  # Incorrect
]

# Separate sentences and labels
sentences, labels = zip(*data)
labels = np.array(labels)

# Tokenize sentences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)
sequences = tokenizer.texts_to_sequences(sentences)
word_index = tokenizer.word_index
vocab_size = len(word_index) + 1

# Pad sequences
max_len = max(len(seq) for seq in sequences)
padded_sequences = pad_sequences(sequences, maxlen=max_len, padding="post")

# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42)

# Build the LSTM model
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=128, input_length=max_len),
    LSTM(128, return_sequences=False),
    Dropout(0.5),
    Dense(64, activation="relu"),
    Dropout(0.5),
    Dense(1, activation="sigmoid")
])

# Compile the model
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

# Train the model
model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=10, batch_size=16)

# Test the model
def test_lstm_model(text):
    seq = tokenizer.texts_to_sequences([text])
    padded_seq = pad_sequences(seq, maxlen=max_len, padding="post")
    prediction = model.predict(padded_seq)
    return "Incorrect Grammar" if prediction[0] > 0.5 else "Correct Grammar"

# Example sentences for testing
test_sentences = [
    "ඔවුහු පාසලට යනවා",  # Incorrect
    "ඔවුහු පාසලට යති",    # Correct
    "මම කාර්යාලයට යන්න ඕන",  # Correct
    "අපි යමු",  # Correct

]

# Test the model
print("\nTesting Sentences:")
for sentence in test_sentences:
    result = test_lstm_model(sentence)
    print(f"Sentence: {sentence} => Prediction: {result}")


Epoch 1/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step - accuracy: 0.5000 - loss: 0.6935 - val_accuracy: 0.0000e+00 - val_loss: 0.6932
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 299ms/step - accuracy: 0.7500 - loss: 0.6904 - val_accuracy: 0.5000 - val_loss: 0.6930
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step - accuracy: 0.2500 - loss: 0.6978 - val_accuracy: 0.5000 - val_loss: 0.6928
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 67ms/step - accuracy: 0.7500 - loss: 0.6916 - val_accuracy: 0.5000 - val_loss: 0.6925
Epoch 5/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 79ms/step - accuracy: 0.5000 - loss: 0.6920 - val_accuracy: 0.5000 - val_loss: 0.6923
Epoch 6/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 123ms/step - accuracy: 0.2500 - loss: 0.6931 - val_accuracy: 0.5000 - val_loss: 0.6920
Epoch 7/10
[1m1/1[0m [32m━━━━━━━━━━━━━━

Approach 3: Pre-trained mBERT (Transformer-based NLP)

In [1]:
from google.colab import files

# Upload the dataset
uploaded = files.upload()

# Confirm the upload
for filename in uploaded.keys():
    print(f"Uploaded {filename}")
# Install required libraries (if not already installed)
# !pip install transformers torch huggingface_hub

# Import necessary libraries
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
import torch
from torch.utils.data import Dataset

# Dataset file name
data_file = "sinhala_grammar_dataset.txt"

# Process dataset
sentences = []
labels = []

with open(data_file, "r", encoding="utf-8") as f:
    lines = f.readlines()

# Parse dataset
for line in lines[1:]:  # Skip the header line
    line = line.strip()
    if "|" in line:
        try:
            sentence, label = line.split("|")
            sentences.append(sentence)
            labels.append(int(label))
        except ValueError:
            print(f"Skipping malformed line: {line}")
    else:
        print(f"Skipping malformed line: {line}")

# Ensure dataset integrity
assert len(sentences) == len(labels), "Mismatch between sentences and labels!"

# Split data into training and validation sets
train_size = int(0.8 * len(sentences))
train_sentences, val_sentences = sentences[:train_size], sentences[train_size:]
train_labels, val_labels = labels[:train_size], labels[train_size:]

# Load mBERT tokenizer and model
model_name = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Create dataset class
class SinhalaGrammarDataset(Dataset):
    def __init__(self, sentences, labels, tokenizer, max_length):
        self.sentences = sentences
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        sentence = str(self.sentences[idx])
        label = self.labels[idx]

        encoding = self.tokenizer.encode_plus(
            sentence,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors="pt",
            return_attention_mask=True
        )

        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(label, dtype=torch.long)
        }

# Create dataset splits
train_dataset = SinhalaGrammarDataset(train_sentences, train_labels, tokenizer, max_length=128)
val_dataset = SinhalaGrammarDataset(val_sentences, val_labels, tokenizer, max_length=128)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    learning_rate=5e-5,
    logging_dir="./logs",
    logging_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
)

# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
)

# Train the model
trainer.train()

# Save the fine-tuned model
model.save_pretrained("./sinhala_grammar_model")
tokenizer.save_pretrained("./sinhala_grammar_model")


Saving sinhala_grammar_dataset.txt to sinhala_grammar_dataset.txt
Uploaded sinhala_grammar_dataset.txt


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss
1,No log,0.780737
2,No log,0.717377
3,No log,0.702542


('./sinhala_grammar_model/tokenizer_config.json',
 './sinhala_grammar_model/special_tokens_map.json',
 './sinhala_grammar_model/vocab.txt',
 './sinhala_grammar_model/added_tokens.json',
 './sinhala_grammar_model/tokenizer.json')

In [3]:
# Testing data
test_sentences = [
    "ඔවුන් යති.",  # Correct
    "ඔවුන් යනවා.",  # Incorrect
    "මම කාර්යාලයට යන්න ඕන.",  # Correct
    "මම යන්න ඕන කාර්යාලයට.",  # Incorrect
]
test_labels = [1, 0, 1, 0]  # Corresponding labels: 1 = Correct, 0 = Incorrect

# Create test dataset
test_dataset = SinhalaGrammarDataset(test_sentences, test_labels, tokenizer, max_length=128)

# Evaluate the model on the test set
test_results = trainer.evaluate(test_dataset)

print("\nTest Results:")
print(test_results)

# Add prediction function
def predict(sentence, tokenizer, model):
    inputs = tokenizer.encode_plus(
        sentence,
        max_length=128,
        padding="max_length",
        truncation=True,
        return_tensors="pt"
    )
    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]

    # Set model to evaluation mode
    model.eval()

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        probabilities = torch.softmax(outputs.logits, dim=-1)
        predicted_class = torch.argmax(probabilities).item()

    return "Correct" if predicted_class == 1 else "Incorrect"

# Test the model on example sentences
print("\nTesting Individual Sentences:")
for sentence in test_sentences:
    result = predict(sentence, tokenizer, model)
    print(f"Sentence: '{sentence}' => Prediction: {result}")


Test Results:
{'eval_loss': 0.6937456130981445, 'eval_runtime': 2.1579, 'eval_samples_per_second': 1.854, 'eval_steps_per_second': 0.463, 'epoch': 3.0}

Testing Individual Sentences:
Sentence: 'ඔවුන් යති.' => Prediction: Correct
Sentence: 'ඔවුන් යනවා.' => Prediction: Correct
Sentence: 'මම කාර්යාලයට යන්න ඕන.' => Prediction: Correct
Sentence: 'මම යන්න ඕන කාර්යාලයට.' => Prediction: Correct
