<a href="https://colab.research.google.com/github/Nuwantha97/Sinhala_spell_and_grammer_checker/blob/Notebooks/Grammer_transformer_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# prompt: mount google drive

from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

In [3]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


# OLD

In [None]:
import pandas as pd
from datasets import Dataset
import os
import json
from transformers import (
    XLMRobertaTokenizer,
    XLMRobertaForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)
import numpy as np
import torch
import evaluate
from typing import Dict, List, Tuple
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix

class SinhalaGrammarChecker:
    def __init__(self):
        self.model_path = "/content/model"
        self.tokenizer = None
        self.model = None

    def preprocess_text(self, text: str) -> str:
        """Clean and normalize text"""
        return text.strip()

    def create_dataset(self, texts: List[str], labels: List[int]) -> Dataset:
        """Create a HuggingFace dataset"""
        return Dataset.from_dict({
            'text': [self.preprocess_text(str(text)) for text in texts],
            'label': labels
        })

    def prepare_training_data(self, file_path: str) -> Tuple[Dataset, Dataset]:
        """Prepare training and validation datasets"""
        df = pd.read_csv(file_path)

        texts = []
        labels = []

        # Add incorrect sentences (label 1)
        incorrect_sentences = df['incorrect_sentence'].tolist()
        texts.extend(incorrect_sentences)
        labels.extend([1] * len(incorrect_sentences))

        # Add correct sentences (label 0)
        correct_sentences = df['correct_sentence'].tolist()
        texts.extend(correct_sentences)
        labels.extend([0] * len(correct_sentences))

        # Shuffle the data
        combined = list(zip(texts, labels))
        np.random.shuffle(combined)
        texts, labels = zip(*combined)

        # Create train/validation split
        train_texts = texts[:int(0.9 * len(texts))]
        train_labels = labels[:int(0.9 * len(texts))]
        val_texts = texts[int(0.9 * len(texts)):]
        val_labels = labels[int(0.9 * len(texts)):]

        return (
            self.create_dataset(train_texts, train_labels),
            self.create_dataset(val_texts, val_labels)
        )

    def tokenize_function(self, examples: Dict) -> Dict:
        """Tokenize the texts and prepare for training"""
        tokenized = self.tokenizer(
            examples['text'],
            truncation=True,
            max_length=128,
            padding='max_length'
        )
        tokenized['labels'] = examples['label']
        return tokenized

    def compute_metrics(self, eval_pred: Tuple) -> Dict:
        """Compute evaluation metrics"""
        predictions, labels = eval_pred
        predictions = np.argmax(predictions, axis=1)

        metrics = {}

        # Calculate accuracy
        accuracy = evaluate.load("accuracy")
        metrics.update(accuracy.compute(predictions=predictions, references=labels))

        # Calculate precision, recall, and F1 score
        metrics['precision'] = float(precision_score(labels, predictions, average='binary'))
        metrics['recall'] = float(recall_score(labels, predictions, average='binary'))
        metrics['f1'] = float(f1_score(labels, predictions, average='binary'))

        return metrics

    def save_model(self, save_path: str = None) -> None:
        """
        Save the model and tokenizer to the specified path

        Args:
            save_path: Optional custom path to save the model. If None, uses self.model_path
        """
        if not self.model or not self.tokenizer:
            raise ValueError("Model and tokenizer must be initialized before saving")

        save_path = save_path or self.model_path

        # Create directory if it doesn't exist
        os.makedirs(save_path, exist_ok=True)

        # Save model
        print(f"Saving model to {save_path}...")
        self.model.save_pretrained(save_path)

        # Save tokenizer
        print("Saving tokenizer...")
        self.tokenizer.save_pretrained(save_path)

        # Save model configuration
        config = {
            'model_type': 'xlm-roberta',
            'num_labels': 2,
            'max_length': 128,
            'version': '1.0'
        }

        config_path = os.path.join(save_path, 'config.json')
        with open(config_path, 'w') as f:
            json.dump(config, f)

        print(f"Model, tokenizer, and configuration saved to {save_path}")

    def load_model(self, load_path: str = None) -> None:
        """
        Load the model and tokenizer from the specified path

        Args:
            load_path: Optional custom path to load the model from. If None, uses self.model_path
        """
        load_path = load_path or self.model_path

        if not os.path.exists(load_path):
            raise ValueError(f"Model path {load_path} does not exist")

        print(f"Loading model from {load_path}...")

        # Load tokenizer
        self.tokenizer = XLMRobertaTokenizer.from_pretrained(load_path)

        # Load model
        self.model = XLMRobertaForSequenceClassification.from_pretrained(load_path)

        # Move model to available device
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model = self.model.to(device)

        print(f"Model and tokenizer loaded successfully from {load_path}")

    def train(self, train_file: str):
        """Train the model"""
        print("Preparing datasets...")
        train_dataset, val_dataset = self.prepare_training_data(train_file)

        print("Initializing tokenizer...")
        self.tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')

        print("Tokenizing datasets...")
        tokenized_train = train_dataset.map(
            self.tokenize_function,
            batched=True,
            remove_columns=train_dataset.column_names
        )
        tokenized_val = val_dataset.map(
            self.tokenize_function,
            batched=True,
            remove_columns=val_dataset.column_names
        )

        data_collator = DataCollatorWithPadding(tokenizer=self.tokenizer)

        print("Initializing model...")
        self.model = XLMRobertaForSequenceClassification.from_pretrained(
            'xlm-roberta-base',
            num_labels=2
        )

        training_args = TrainingArguments(
            output_dir=self.model_path,
            learning_rate=1e-5,
            per_device_train_batch_size=16,
            per_device_eval_batch_size=16,
            num_train_epochs=3,
            weight_decay=0.01,
            evaluation_strategy="steps",
            eval_steps=100,
            save_strategy="steps",
            save_steps=100,
            load_best_model_at_end=True,
            metric_for_best_model="accuracy",
            greater_is_better=True,
            push_to_hub=False,
            warmup_ratio=0.1,
            logging_steps=50,
            gradient_accumulation_steps=2,
            fp16=True
        )

        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=tokenized_train,
            eval_dataset=tokenized_val,
            tokenizer=self.tokenizer,
            data_collator=data_collator,
            compute_metrics=self.compute_metrics
        )

        print("Training model...")
        trainer.train()

        print("Saving model...")
        trainer.save_model(self.model_path)
        self.tokenizer.save_pretrained(self.model_path)

        print("\nFinal Evaluation Metrics:")
        final_metrics = trainer.evaluate()
        for key, value in final_metrics.items():
            print(f"{key}: {value:.4f}")

    def get_correction(self, text: str, df: pd.DataFrame) -> str:
        """Get correction from dataset"""
        match = df[df['incorrect_sentence'] == text]
        if not match.empty:
            return match.iloc[0]['correct_sentence']
        return None

    def check_grammar(self, text: str, df: pd.DataFrame) -> Dict:
        """Check grammar and provide correction"""
        if not self.model or not self.tokenizer:
            self.tokenizer = XLMRobertaTokenizer.from_pretrained(self.model_path)
            self.model = XLMRobertaForSequenceClassification.from_pretrained(self.model_path)

        device = torch.device('cpu')
        self.model = self.model.to(device)

        text = self.preprocess_text(text)

        inputs = self.tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            max_length=128,
            padding='max_length'
        )

        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = self.model(**inputs)
            predictions = torch.softmax(outputs.logits, dim=1)
            has_error = torch.argmax(predictions).item()
            confidence = predictions[0][has_error].item()

        correction = None
        if has_error == 1:
            correction = self.get_correction(text, df)

        return {
            'text': text,
            'has_error': bool(has_error),
            'confidence': confidence,
            'correction': correction,
            'suggestion': correction if correction else ('Grammatical error detected' if has_error else 'No grammatical errors detected.')
        }



In [None]:
def evaluate_model(checker, test_df):
    """Evaluate model performance with balanced testing"""
    all_predictions = []
    all_labels = []
    results = []

    print("\nEvaluating model performance...")

    # Test both incorrect and correct sentences
    for _, row in test_df.iterrows():
        # Test incorrect sentence
        result = checker.check_grammar(row['incorrect_sentence'], test_df)
        all_predictions.append(int(result['has_error']))
        all_labels.append(1)
        results.append({
            'sentence': row['incorrect_sentence'],
            'expected': 1,
            'predicted': int(result['has_error']),
            'confidence': result['confidence'],
            'correction': result['correction']
        })

        # Test correct sentence
        result = checker.check_grammar(row['correct_sentence'], test_df)
        all_predictions.append(int(result['has_error']))
        all_labels.append(0)
        results.append({
            'sentence': row['correct_sentence'],
            'expected': 0,
            'predicted': int(result['has_error']),
            'confidence': result['confidence'],
            'correction': result['correction']
        })

    # Calculate metrics
    accuracy = sum(1 for x, y in zip(all_predictions, all_labels) if x == y) / len(all_labels)
    precision = precision_score(all_labels, all_predictions, average='binary')
    recall = recall_score(all_labels, all_predictions, average='binary')
    f1 = f1_score(all_labels, all_predictions, average='binary')

    print("\nTest Metrics:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")

    # Display confusion matrix
    cm = confusion_matrix(all_labels, all_predictions)
    print("\nConfusion Matrix:")
    print("TN FP")
    print("FN TP")
    print(cm)

    # Show sample predictions
    print("\nSample Predictions (5 correct and 5 incorrect sentences):")
    correct_samples = [r for r in results if r['expected'] == 0][:5]
    incorrect_samples = [r for r in results if r['expected'] == 1][:5]

    print("\nCorrect Sentences:")
    for sample in correct_samples:
        print(f"\nInput: {sample['sentence']}")
        print(f"Predicted has error: {bool(sample['predicted'])}")
        print(f"Confidence: {sample['confidence']:.2f}")

    print("\nIncorrect Sentences:")
    for sample in incorrect_samples:
        print(f"\nInput: {sample['sentence']}")
        print(f"Predicted has error: {bool(sample['predicted'])}")
        print(f"Confidence: {sample['confidence']:.2f}")
        if sample['correction']:
            print(f"Suggested correction: {sample['correction']}")

def main():
    # Initialize checker
    checker = SinhalaGrammarChecker()

    # Load and split dataset
    print("Loading and splitting dataset...")
    full_df = pd.read_csv('/content/drive/MyDrive/projects/spell checker/merged_sentences.csv')

    # Shuffle and split the dataset
    train_df = full_df.sample(frac=0.8, random_state=42)
    test_df = full_df.drop(train_df.index)

    # Save splits
    train_df.to_csv('train_data.csv', index=False)
    test_df.to_csv('test_data.csv', index=False)

    print(f"Dataset split: {len(train_df)} training samples, {len(test_df)} test samples")

    # Train model
    print("\nTraining model...")
    checker.train('train_data.csv')

    # Save model to a custom path (optional)
    checker.save_model('/content/drive/MyDrive/projects/spell checker/model2')
    checker.save_model('/content/model')

    # Load model for inference (can be done in a separate script)
    new_checker = SinhalaGrammarChecker()
    new_checker.load_model('/content/model')

    # Evaluate model
    evaluate_model(new_checker, test_df)

if __name__ == "__main__":
    main()

Loading and splitting dataset...
Dataset split: 12081 training samples, 3020 test samples

Training model...
Preparing datasets...
Initializing tokenizer...
Tokenizing datasets...


Map:   0%|          | 0/21745 [00:00<?, ? examples/s]

Map:   0%|          | 0/2417 [00:00<?, ? examples/s]

Initializing model...


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Training model...


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
100,1.3994,0.694637,0.501862,0.0,0.0,0.0
200,1.3729,0.66011,0.633844,0.745763,0.401993,0.522396
300,1.2174,0.547602,0.740174,0.807036,0.628738,0.706816
400,1.1457,0.511899,0.753,0.850058,0.612126,0.711733
500,1.0708,0.522003,0.733968,0.919283,0.510797,0.6567
600,1.03,0.483288,0.768308,0.895577,0.605482,0.722498
700,1.0042,0.45075,0.795614,0.842664,0.725083,0.779464
800,0.9507,0.468704,0.776169,0.911801,0.609635,0.730712
900,0.8864,0.458867,0.788581,0.913978,0.635382,0.749633
1000,0.9536,0.449927,0.790236,0.922424,0.63206,0.750123


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification, XLMRobertaConfig
import torch

# Initialize configuration with the exact dimensions from the trained model
config = XLMRobertaConfig(
    vocab_size=250002,  # Match the checkpoint's vocab size
    max_position_embeddings=514,  # Match the position embeddings
    type_vocab_size=1,  # Match the token type embeddings
    hidden_size=768,
    num_labels=2  # Binary classification
)

# Load tokenizer
tokenizer = XLMRobertaTokenizer.from_pretrained(
    "/content/drive/MyDrive/projects/spell checker/model",
    use_fast=True
)

# Load model with the correct configuration
model = XLMRobertaForSequenceClassification.from_pretrained(
    "/content/drive/MyDrive/projects/spell checker/model",
    config=config,
    ignore_mismatched_sizes=True  # Add this to handle any remaining mismatches
)

# Move to available device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Example usage function remains the same
def check_grammar(text, model, tokenizer, device):
    text = text.strip()
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=128,
        padding='max_length'
    )

    inputs = {k: v.to(device) for k, v in inputs.items()}

    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
        predictions = torch.softmax(outputs.logits, dim=1)
        has_error = torch.argmax(predictions).item()
        confidence = predictions[0][has_error].item()

    return {
        'has_error': bool(has_error),
        'confidence': confidence
    }

In [None]:
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification, XLMRobertaConfig
import torch

def check_sentence_grammar(sentence: str, model_path: str) -> dict:
    """
    Check grammar for a given sentence using the trained model

    Args:
        sentence: Input sentence to check
        model_path: Path to the trained model

    Returns:
        Dictionary containing prediction results
    """
    # Initialize configuration
    config = XLMRobertaConfig(
        vocab_size=250002,
        max_position_embeddings=514,
        type_vocab_size=1,
        hidden_size=768,
        num_labels=2
    )

    # Load tokenizer and model
    tokenizer = XLMRobertaTokenizer.from_pretrained(model_path, use_fast=True)
    model = XLMRobertaForSequenceClassification.from_pretrained(
        model_path,
        config=config,
        ignore_mismatched_sizes=True
    )

    # Move to CPU device
    device = torch.device('cpu')
    model = model.to(device)

    # Preprocess and tokenize
    text = sentence.strip()
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=128,
        padding='max_length'
    )

    # Move inputs to device
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Get prediction
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
        predictions = torch.softmax(outputs.logits, dim=1)
        has_error = torch.argmax(predictions).item()
        confidence = predictions[0][has_error].item()

    return {
        'text': text,
        'has_error': bool(has_error),
        'confidence': confidence,
        'suggestion': 'Grammatical error detected' if has_error else 'No grammatical errors detected'
    }


In [None]:
# Example usage:
model_path = "/content/drive/MyDrive/projects/spell checker/model"
text = "මම ගියෙම් ගෙදර"
result = check_sentence_grammar(text, model_path)
print(f"Text: {result['text']}")
print(f"Has error: {result['has_error']}")
print(f"Confidence: {result['confidence']:.4f}")
print(f"Suggestion: {result['suggestion']}")

Text: මම ගියෙම් ගෙදර
Has error: True
Confidence: 0.9168
Suggestion: Grammatical error detected


In [None]:
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification, XLMRobertaConfig
import torch
import pandas as pd

def check_sentence_grammar(sentence: str, model_path: str, corrections_df: pd.DataFrame) -> dict:
    """
    Check grammar and get corrections for a given sentence

    Args:
        sentence: Input sentence to check
        model_path: Path to the trained model
        corrections_df: DataFrame containing incorrect_sentence and correct_sentence pairs

    Returns:
        Dictionary containing prediction results and corrections if available
    """
    # Initialize configuration
    config = XLMRobertaConfig(
        vocab_size=250002,
        max_position_embeddings=514,
        type_vocab_size=1,
        hidden_size=768,
        num_labels=2
    )

    # Load tokenizer and model
    tokenizer = XLMRobertaTokenizer.from_pretrained(model_path, use_fast=True)
    model = XLMRobertaForSequenceClassification.from_pretrained(
        model_path,
        config=config,
        ignore_mismatched_sizes=True
    )

    # Move to CPU device
    device = torch.device('cpu')
    model = model.to(device)

    # Preprocess and tokenize
    text = sentence.strip()
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=128,
        padding='max_length'
    )

    # Move inputs to device
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Get prediction
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
        predictions = torch.softmax(outputs.logits, dim=1)
        has_error = torch.argmax(predictions).item()
        confidence = predictions[0][has_error].item()

    # Get correction if error is detected
    correction = None
    if has_error == 1:
        # Look up correction in DataFrame
        match = corrections_df[corrections_df['incorrect_sentence'] == text]
        if not match.empty:
            correction = match.iloc[0]['correct_sentence']

    return {
        'text': text,
        'has_error': bool(has_error),
        'confidence': confidence,
        'correction': correction,
        'suggestion': correction if correction else ('Grammatical error detected' if has_error else 'No grammatical errors detected')
    }



In [None]:
model_path = "/content/drive/MyDrive/projects/spell checker/model"
corrections_df = pd.read_csv('/content/drive/MyDrive/projects/spell checker/merged_sentences.csv')  # Load your corrections DataFrame

# Test a sentence
result = check_sentence_grammar("මම යයි ගෙදර", model_path, corrections_df)
print(f"Text: {result['text']}")
print(f"Has error: {result['has_error']}")
print(f"Confidence: {result['confidence']:.4f}")
print(f"Correction: {result['correction']}")
print(f"Suggestion: {result['suggestion']}")


Text: මම යයි ගෙදර
Has error: False
Confidence: 0.8876
Correction: None
Suggestion: No grammatical errors detected


# New

In [15]:
import pandas as pd
from datasets import Dataset
import os
import json
from transformers import (
    XLMRobertaTokenizer,
    XLMRobertaForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)
import numpy as np
import torch
import evaluate
from typing import Dict, List, Tuple
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix

class SinhalaGrammarChecker:
    def __init__(self):
        self.model_path = None
        self.tokenizer = None
        self.model = None

    def preprocess_text(self, text: str) -> str:
        return text.strip()

    def tokenize_sentence(self, text: str) -> List[str]:
        return text.strip().split()

    def align_words(self, incorrect: str, correct: str) -> List[Tuple[str, str]]:
        incorrect_words = self.tokenize_sentence(incorrect)
        correct_words = self.tokenize_sentence(correct)
        return list(zip(incorrect_words, correct_words))

    def create_dataset(self, texts: List[str], labels: List[int]) -> Dataset:
        return Dataset.from_dict({
            'text': [self.preprocess_text(str(text)) for text in texts],
            'label': labels
        })

    def prepare_training_data(self, file_path: str) -> Tuple[Dataset, Dataset]:
        df = pd.read_csv(file_path)

        texts = []
        labels = []

        texts.extend(df['incorrect_sentence'].tolist())
        labels.extend([1] * len(df['incorrect_sentence']))

        texts.extend(df['correct_sentence'].tolist())
        labels.extend([0] * len(df['correct_sentence']))

        combined = list(zip(texts, labels))
        np.random.shuffle(combined)
        texts, labels = zip(*combined)

        split_idx = int(0.9 * len(texts))
        return (
            self.create_dataset(texts[:split_idx], labels[:split_idx]),
            self.create_dataset(texts[split_idx:], labels[split_idx:])
        )

    def tokenize_function(self, examples: Dict) -> Dict:
        tokenized = self.tokenizer(
            examples['text'],
            truncation=True,
            max_length=128,
            padding='max_length'
        )
        tokenized['labels'] = examples['label']
        return tokenized

    def compute_metrics(self, eval_pred: Tuple) -> Dict:
        predictions, labels = eval_pred
        predictions = np.argmax(predictions, axis=1)

        metrics = {}
        accuracy = evaluate.load("accuracy")
        metrics.update(accuracy.compute(predictions=predictions, references=labels))

        metrics['precision'] = float(precision_score(labels, predictions, average='binary'))
        metrics['recall'] = float(recall_score(labels, predictions, average='binary'))
        metrics['f1'] = float(f1_score(labels, predictions, average='binary'))

        return metrics

    def save_model(self, save_path: str = None) -> None:
        if not self.model or not self.tokenizer:
            raise ValueError("Model and tokenizer must be initialized before saving")

        save_path = save_path or self.model_path
        os.makedirs(save_path, exist_ok=True)

        self.model.save_pretrained(save_path)
        self.tokenizer.save_pretrained(save_path)

        config = {
            'model_type': 'xlm-roberta',
            'num_labels': 2,
            'max_length': 128,
            'version': '1.0'
        }

        with open(os.path.join(save_path, 'config.json'), 'w') as f:
            json.dump(config, f)

#    def load_model(self, load_path: str = None) -> None:
#        load_path = load_path or self.model_path
#        if not os.path.exists(load_path):
#            raise ValueError(f"Model path {load_path} does not exist")
#
#        self.tokenizer = XLMRobertaTokenizer.from_pretrained(load_path)
#        self.model = XLMRobertaForSequenceClassification.from_pretrained(
#            load_path,
#            num_labels=2,
#            ignore_mismatched_sizes=True  # Add this parameter
#        )
#        self.model = self.model.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))

    def load_model(self, load_path: str = None) -> None:
        load_path = load_path or self.model_path
        if not os.path.exists(load_path):
            raise ValueError(f"Model path {load_path} does not exist")

        # Load the tokenizer with potential unknown tokens
        self.tokenizer = XLMRobertaTokenizer.from_pretrained(
            load_path,
            unk_token="<unk>"  # Specify an unknown token handling
        )

        # Load the model with the correct number of labels and config
        self.model = XLMRobertaForSequenceClassification.from_pretrained(
            load_path,
            num_labels=2,
            ignore_mismatched_sizes=True
        )

        # Print vocabulary size and padding token for debugging
        print(f"Tokenizer vocabulary size: {self.tokenizer.vocab_size}")
        print(f"Tokenizer padding token: {self.tokenizer.pad_token}")

        # Manually set the padding token for the model
        self.model.config.pad_token_id = self.tokenizer.pad_token_id

        # Print the vocab size for the embedding to further check
        print(f"Model embedding vocab size: {self.model.roberta.embeddings.word_embeddings.weight.shape[0]}")

        self.model = self.model.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))

    def train(self, train_file: str):
        train_dataset, val_dataset = self.prepare_training_data(train_file)

        self.tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')

        tokenized_train = train_dataset.map(
            self.tokenize_function,
            batched=True,
            remove_columns=train_dataset.column_names
        )
        tokenized_val = val_dataset.map(
            self.tokenize_function,
            batched=True,
            remove_columns=val_dataset.column_names
        )

        self.model = XLMRobertaForSequenceClassification.from_pretrained(
            'xlm-roberta-base',
            num_labels=2
        )

        training_args = TrainingArguments(
            output_dir=self.model_path,
            learning_rate=1e-5,
            per_device_train_batch_size=16,
            per_device_eval_batch_size=16,
            num_train_epochs=3,
            weight_decay=0.01,
            evaluation_strategy="steps",
            eval_steps=100,
            save_strategy="steps",
            save_steps=100,
            load_best_model_at_end=True,
            metric_for_best_model="accuracy",
            greater_is_better=True,
            push_to_hub=False,
            warmup_ratio=0.1,
            logging_steps=50,
            gradient_accumulation_steps=2,
            fp16=True
        )

        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=tokenized_train,
            eval_dataset=tokenized_val,
            tokenizer=self.tokenizer,
            data_collator=DataCollatorWithPadding(tokenizer=self.tokenizer),
            compute_metrics=self.compute_metrics
        )

        trainer.train()
        trainer.save_model(self.model_path)
        self.tokenizer.save_pretrained(self.model_path)

        final_metrics = trainer.evaluate()
        return final_metrics

    def get_correction(self, text: str, df: pd.DataFrame) -> str:
        match = df[df['incorrect_sentence'] == text]
        return match.iloc[0]['correct_sentence'] if not match.empty else None

    def check_grammar(self, text: str, df: pd.DataFrame) -> Dict:
        if not self.model or not self.tokenizer:
            self.load_model()

        device = torch.device('cpu')
        self.model = self.model.to(device)

        text = self.preprocess_text(text)
        words = self.tokenize_sentence(text)

        inputs = self.tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            max_length=128,
            padding='max_length'
        )

        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = self.model(**inputs)
            predictions = torch.softmax(outputs.logits, dim=1)
            has_error = torch.argmax(predictions).item()
            confidence = predictions[0][has_error].item()

        correction = None
        problematic_words = []

        if has_error == 1:
            correction = self.get_correction(text, df)
            if correction:
                word_alignments = self.align_words(text, correction)
                for i, (incorrect, correct) in enumerate(word_alignments):
                    if incorrect != correct:
                        problematic_words.append({
                            'word': incorrect,
                            'position': i,
                            'correction': correct
                        })

        return {
            'text': text,
            'has_error': bool(has_error),
            'confidence': confidence,
            'correction': correction,
            'problematic_words': problematic_words,
            'suggestion': correction if correction else ('Grammatical error detected' if has_error else 'No grammatical errors detected.')
        }

    def display_errors(self, text: str, df: pd.DataFrame) -> None:
        result = self.check_grammar(text, df)

        if not result['has_error']:
            print("✓ No errors detected")
            return

        words = self.tokenize_sentence(text)
        error_positions = {w['position']: w for w in result['problematic_words']}

        print("Original text with errors highlighted:")
        for i, word in enumerate(words):
            if i in error_positions:
                print(f"\033[91m{word}\033[0m", end=' ')
            else:
                print(word, end=' ')
        print("\n")

        if result['problematic_words']:
            print("Suggested corrections:")
            for error in result['problematic_words']:
                print(f"• '{error['word']}' → '{error['correction']}'")



In [5]:
def evaluate_model(checker, test_df):
    all_predictions = []
    all_labels = []
    results = []

    for _, row in test_df.iterrows():
        result = checker.check_grammar(row['incorrect_sentence'], test_df)
        all_predictions.append(int(result['has_error']))
        all_labels.append(1)
        results.append({
            'sentence': row['incorrect_sentence'],
            'expected': 1,
            'predicted': int(result['has_error']),
            'confidence': result['confidence'],
            'correction': result['correction']
        })

        result = checker.check_grammar(row['correct_sentence'], test_df)
        all_predictions.append(int(result['has_error']))
        all_labels.append(0)
        results.append({
            'sentence': row['correct_sentence'],
            'expected': 0,
            'predicted': int(result['has_error']),
            'confidence': result['confidence'],
            'correction': result['correction']
        })

    metrics = {
        'accuracy': sum(1 for x, y in zip(all_predictions, all_labels) if x == y) / len(all_labels),
        'precision': precision_score(all_labels, all_predictions, average='binary'),
        'recall': recall_score(all_labels, all_predictions, average='binary'),
        'f1': f1_score(all_labels, all_predictions, average='binary'),
        'confusion_matrix': confusion_matrix(all_labels, all_predictions),
        'sample_results': {
            'correct': [r for r in results if r['expected'] == 0][:5],
            'incorrect': [r for r in results if r['expected'] == 1][:5]
        }
    }

    return metrics, results

def main():
    checker = SinhalaGrammarChecker()

    full_df = pd.read_csv('/content/drive/MyDrive/projects/spell check/merged_sentences.csv')
    train_df = full_df.sample(frac=0.8, random_state=42)
    test_df = full_df.drop(train_df.index)

    train_df.to_csv('train_data.csv', index=False)
    test_df.to_csv('test_data.csv', index=False)

    metrics = checker.train('train_data.csv')
    checker.save_model('/content/model')

    new_checker = SinhalaGrammarChecker()
    new_checker.load_model('/content/model')
    eval_metrics, results = evaluate_model(new_checker, test_df)

    return eval_metrics, results

if __name__ == "__main__":
    main()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

Map:   0%|          | 0/21745 [00:00<?, ? examples/s]

Map:   0%|          | 0/2417 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
100,1.4055,0.693478,0.50393,0.611111,0.02725,0.052174
200,1.3614,0.65812,0.604055,0.87574,0.244426,0.382182
300,1.2435,0.585168,0.679768,0.890877,0.41123,0.562712
400,1.1047,0.542356,0.727348,0.85567,0.548307,0.668344
500,1.0764,0.497217,0.756309,0.83731,0.63749,0.723863
600,0.9751,0.540675,0.741829,0.864596,0.574732,0.690476
700,0.9719,0.48187,0.774928,0.872626,0.644922,0.74169
800,0.9359,0.465539,0.786926,0.879913,0.665566,0.757875
900,0.9525,0.449644,0.793546,0.877119,0.683732,0.768445
1000,0.9024,0.449454,0.803475,0.876278,0.70768,0.783006


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at /content/model and are newly initialized because the shapes did not match:
- roberta.embeddings.position_embeddings.weight: found shape torch.Size([514, 768]) in the checkpoint and torch.Size([512, 768]) in the model instantiated
- roberta.embeddings.token_type_embeddings.weight: found shape torch.Size([1, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated
- roberta.embeddings.word_embeddings.weight: found shape torch.Size([250002, 768]) in the checkpoint and torch.Size([30522, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


IndexError: index out of range in self

In [16]:
# Initialize and load the model
checker = SinhalaGrammarChecker()
checker.load_model('/content/drive/MyDrive/projects/model1')

# Load your dataset (needed for corrections)
df = pd.read_csv('/content/drive/MyDrive/projects/spell check/merged_sentences.csv')

# Get prediction
sentence = "මම යයි ගෙදර"
result = checker.check_grammar(sentence, df)

# Access results
print(f"Has error: {result['has_error']}")
print(f"Confidence: {result['confidence']}")
print(f"Correction: {result['correction']}")
print(f"Problematic words: {result['problematic_words']}")

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at /content/drive/MyDrive/projects/model1 and are newly initialized because the shapes did not match:
- roberta.embeddings.position_embeddings.weight: found shape torch.Size([514, 768]) in the checkpoint and torch.Size([512, 768]) in the model instantiated
- roberta.embeddings.token_type_embeddings.weight: found shape torch.Size([1, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated
- roberta.embeddings.word_embeddings.weight: found shape torch.Size([250002, 768]) in the checkpoint and torch.Size([30522, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Tokenizer vocabulary size: 250002
Tokenizer padding token: <pad>
Model embedding vocab size: 30522


IndexError: index out of range in self

In [10]:
checker = SinhalaGrammarChecker()

# Load the tokenizer directly from the saved model path
checker.tokenizer = XLMRobertaTokenizer.from_pretrained(
    "/content/drive/MyDrive/projects/spell checker/model2",
    trust_remote_code=True
)

# Load the model with matching config
checker.model = XLMRobertaForSequenceClassification.from_pretrained(
    "/content/drive/MyDrive/projects/spell checker/model2",
    trust_remote_code=True,
    ignore_mismatched_sizes=True
)

# Load dataset
df = pd.read_csv('/content/drive/MyDrive/projects/spell checker/merged_sentences.csv')

# Get prediction
sentence = "මම යයි ගෙදර"
result = checker.check_grammar(sentence, df)

OSError: Incorrect path_or_model_id: '/content/drive/MyDrive/projects/spell checker/model2'. Please provide either the path to a local folder or the repo_id of a model on the Hub.

In [None]:
checker = SinhalaGrammarChecker()

# First load base model and tokenizer
base_tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
base_model = XLMRobertaForSequenceClassification.from_pretrained('xlm-roberta-base', num_labels=2)

# Save them to temp location
base_tokenizer.save_pretrained('/tmp/model')
base_model.save_pretrained('/tmp/model')

# Now load your trained model
checker.model_path = "/content/drive/MyDrive/projects/spell checker/model2"
checker.load_model()

# Load dataset
df = pd.read_csv('/content/drive/MyDrive/projects/spell checker/merged_sentences.csv')

# Test prediction
sentence = "මම යයි ගෙදර"
result = checker.check_grammar(sentence, df)

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at /content/drive/MyDrive/projects/spell checker/model2 and are newly initialized because the shapes did not match:
- roberta.embeddings.position_embeddings.weight: found shape torch.Size([514, 768]) in the checkpoint and torch.Size([512, 768]) in the model instantiated
- roberta.embeddings.token_type_embeddings.weight: found shape torch.Size([1, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated
- roberta.embeddings.word_embeddings.weight: found shape torch.Size([250002, 768]) in the checkpoint and torc

IndexError: index out of range in self

In [None]:
    def load_model(self, load_path: str = None) -> None:
        load_path = load_path or self.model_path
        if not os.path.exists(load_path):
            raise ValueError(f"Model path {load_path} does not exist")

        self.tokenizer = XLMRobertaTokenizer.from_pretrained(load_path)
        self.model = XLMRobertaForSequenceClassification.from_pretrained(load_path)
        self.model = self.model.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))

# 2

In [4]:
import pandas as pd
from datasets import Dataset
import os
import json
from transformers import (
    XLMRobertaTokenizer,
    XLMRobertaForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)
import numpy as np
import torch
import evaluate
from typing import Dict, List, Tuple
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix

class SinhalaGrammarChecker:
    def __init__(self):
        self.model_path = "/content/model"
        self.tokenizer = None
        self.model = None
        self.max_length = 128

    def preprocess_text(self, text: str) -> str:
        return text.strip()

    def tokenize_sentence(self, text: str) -> List[str]:
        return text.strip().split()

    def align_words(self, incorrect: str, correct: str) -> List[Tuple[str, str]]:
        incorrect_words = self.tokenize_sentence(incorrect)
        correct_words = self.tokenize_sentence(correct)
        return list(zip(incorrect_words, correct_words))

    def create_dataset(self, texts: List[str], labels: List[int]) -> Dataset:
        return Dataset.from_dict({
            'text': [self.preprocess_text(str(text)) for text in texts],
            'label': labels
        })

    def prepare_training_data(self, file_path: str) -> Tuple[Dataset, Dataset]:
        df = pd.read_csv(file_path)

        texts = []
        labels = []

        texts.extend(df['incorrect_sentence'].tolist())
        labels.extend([1] * len(df['incorrect_sentence']))

        texts.extend(df['correct_sentence'].tolist())
        labels.extend([0] * len(df['correct_sentence']))

        combined = list(zip(texts, labels))
        np.random.shuffle(combined)
        texts, labels = zip(*combined)

        split_idx = int(0.9 * len(texts))
        return (
            self.create_dataset(texts[:split_idx], labels[:split_idx]),
            self.create_dataset(texts[split_idx:], labels[split_idx:])
        )

    def tokenize_function(self, examples: Dict) -> Dict:
        tokenized = self.tokenizer(
            examples['text'],
            truncation=True,
            max_length=self.max_length,
            padding='max_length'
        )
        tokenized['labels'] = examples['label']
        return tokenized

    def compute_metrics(self, eval_pred: Tuple) -> Dict:
        predictions, labels = eval_pred
        predictions = np.argmax(predictions, axis=1)

        metrics = {}
        accuracy = evaluate.load("accuracy")
        metrics.update(accuracy.compute(predictions=predictions, references=labels))

        metrics['precision'] = float(precision_score(labels, predictions, average='binary'))
        metrics['recall'] = float(recall_score(labels, predictions, average='binary'))
        metrics['f1'] = float(f1_score(labels, predictions, average='binary'))

        return metrics

    def save_model(self, save_path: str = None) -> None:
        if not self.model or not self.tokenizer:
            raise ValueError("Model and tokenizer must be initialized before saving")

        save_path = save_path or self.model_path
        os.makedirs(save_path, exist_ok=True)

        self.model.save_pretrained(save_path)
        self.tokenizer.save_pretrained(save_path)

        config = {
            'model_type': 'xlm-roberta',
            'num_labels': 2,
            'max_length': self.max_length,
            'version': '1.0'
        }

        with open(os.path.join(save_path, 'config.json'), 'w') as f:
            json.dump(config, f)

    def load_model(self, load_path: str = None) -> None:
        load_path = load_path or self.model_path
        if not os.path.exists(load_path):
            raise ValueError(f"Model path {load_path} does not exist")

        self.tokenizer = XLMRobertaTokenizer.from_pretrained(load_path)
        self.model = XLMRobertaForSequenceClassification.from_pretrained(
            load_path,
            num_labels=2,
            problem_type="single_label_classification",
            ignore_mismatched_sizes=True
        )

        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.model = self.model.to(device)

    def train(self, train_file: str):
        train_dataset, val_dataset = self.prepare_training_data(train_file)

        model_name = 'xlm-roberta-base'
        self.tokenizer = XLMRobertaTokenizer.from_pretrained(model_name)
        self.tokenizer.save_pretrained(self.model_path)

        tokenized_train = train_dataset.map(
            self.tokenize_function,
            batched=True,
            remove_columns=train_dataset.column_names
        )
        tokenized_val = val_dataset.map(
            self.tokenize_function,
            batched=True,
            remove_columns=val_dataset.column_names
        )

        self.model = XLMRobertaForSequenceClassification.from_pretrained(
            model_name,
            num_labels=2,
            problem_type="single_label_classification"
        )

        training_args = TrainingArguments(
            output_dir=self.model_path,
            learning_rate=1e-5,
            per_device_train_batch_size=16,
            per_device_eval_batch_size=16,
            num_train_epochs=3,
            weight_decay=0.01,
            evaluation_strategy="steps",
            eval_steps=100,
            save_strategy="steps",
            save_steps=100,
            load_best_model_at_end=True,
            metric_for_best_model="accuracy",
            greater_is_better=True,
            push_to_hub=False,
            warmup_ratio=0.1,
            logging_steps=50,
            gradient_accumulation_steps=2,
            fp16=True if torch.cuda.is_available() else False
        )

        data_collator = DataCollatorWithPadding(
            tokenizer=self.tokenizer,
            padding=True,
            max_length=self.max_length
        )

        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=tokenized_train,
            eval_dataset=tokenized_val,
            tokenizer=self.tokenizer,
            data_collator=data_collator,
            compute_metrics=self.compute_metrics
        )

        trainer.train()
        trainer.save_model(self.model_path)
        self.tokenizer.save_pretrained(self.model_path)

        config = {
            'model_type': 'xlm-roberta',
            'num_labels': 2,
            'max_length': self.max_length,
            'version': '1.0'
        }

        with open(os.path.join(self.model_path, 'config.json'), 'w') as f:
            json.dump(config, f)

        final_metrics = trainer.evaluate()
        return final_metrics

    def get_correction(self, text: str, df: pd.DataFrame) -> str:
        match = df[df['incorrect_sentence'] == text]
        return match.iloc[0]['correct_sentence'] if not match.empty else None

    def check_grammar(self, text: str, df: pd.DataFrame) -> Dict:
        if not self.model or not self.tokenizer:
            self.load_model()

        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model = self.model.to(device)

        text = self.preprocess_text(text)
        words = self.tokenize_sentence(text)

        try:
            inputs = self.tokenizer(
                text,
                return_tensors="pt",
                truncation=True,
                max_length=self.max_length,
                padding='max_length'
            )

            inputs = {k: v.to(device) for k, v in inputs.items()}

            with torch.no_grad():
                outputs = self.model(**inputs)
                predictions = torch.softmax(outputs.logits, dim=1)
                has_error = torch.argmax(predictions).item()
                confidence = predictions[0][has_error].item()

            correction = None
            problematic_words = []

            if has_error == 1:
                correction = self.get_correction(text, df)
                if correction:
                    word_alignments = self.align_words(text, correction)
                    for i, (incorrect, correct) in enumerate(word_alignments):
                        if incorrect != correct:
                            problematic_words.append({
                                'word': incorrect,
                                'position': i,
                                'correction': correct
                            })

            return {
                'text': text,
                'has_error': bool(has_error),
                'confidence': confidence,
                'correction': correction,
                'problematic_words': problematic_words,
                'suggestion': correction if correction else ('Grammatical error detected' if has_error else 'No grammatical errors detected.')
            }

        except Exception as e:
            print(f"Error during grammar checking: {str(e)}")
            return {
                'text': text,
                'has_error': None,
                'confidence': None,
                'error': str(e)
            }

    def display_errors(self, text: str, df: pd.DataFrame) -> None:
        result = self.check_grammar(text, df)

        if not result['has_error']:
            print("✓ No errors detected")
            return

        words = self.tokenize_sentence(text)
        error_positions = {w['position']: w for w in result['problematic_words']}

        print("Original text with errors highlighted:")
        for i, word in enumerate(words):
            if i in error_positions:
                print(f"\033[91m{word}\033[0m", end=' ')
            else:
                print(word, end=' ')
        print("\n")

        if result['problematic_words']:
            print("Suggested corrections:")
            for error in result['problematic_words']:
                print(f"• '{error['word']}' → '{error['correction']}'")

def evaluate_model(checker, test_df):
    all_predictions = []
    all_labels = []
    results = []

    for _, row in test_df.iterrows():
        result = checker.check_grammar(row['incorrect_sentence'], test_df)
        all_predictions.append(int(result['has_error']))
        all_labels.append(1)
        results.append({
            'sentence': row['incorrect_sentence'],
            'expected': 1,
            'predicted': int(result['has_error']),
            'confidence': result['confidence'],
            'correction': result['correction']
        })

        result = checker.check_grammar(row['correct_sentence'], test_df)
        all_predictions.append(int(result['has_error']))
        all_labels.append(0)
        results.append({
            'sentence': row['correct_sentence'],
            'expected': 0,
            'predicted': int(result['has_error']),
            'confidence': result['confidence'],
            'correction': result['correction']
        })

    metrics = {
        'accuracy': sum(1 for x, y in zip(all_predictions, all_labels) if x == y) / len(all_labels),
        'precision': precision_score(all_labels, all_predictions, average='binary'),
        'recall': recall_score(all_labels, all_predictions, average='binary'),
        'f1': f1_score(all_labels, all_predictions, average='binary'),
        'confusion_matrix': confusion_matrix(all_labels, all_predictions),
        'sample_results': {
            'correct': [r for r in results if r['expected'] == 0][:5],
            'incorrect': [r for r in results if r['expected'] == 1][:5]
        }
    }

    return metrics, results

def main():
    checker = SinhalaGrammarChecker()

    # Load the full dataset
    full_df = pd.read_csv('/content/drive/MyDrive/projects/spell check/merged_sentences.csv')

    # Split into train and test sets
    train_df = full_df.sample(frac=0.8, random_state=42)
    test_df = full_df.drop(train_df.index)

    # Save splits to files
    train_df.to_csv('train_data.csv', index=False)
    test_df.to_csv('test_data.csv', index=False)

    # Train the model
    print("Training model...")
    metrics = checker.train('train_data.csv')
    print("Training metrics:", metrics)

    # Save the trained model
    print("Saving model...")
    checker.save_model('/content/model')

    # Load and evaluate
    print("Evaluating model...")
    new_checker = SinhalaGrammarChecker()
    new_checker.load_model('/content/model')
    eval_metrics, results = evaluate_model(new_checker, test_df)

    print("\nEvaluation metrics:")
    print(eval_metrics)

    return eval_metrics, results

if __name__ == "__main__":
    main()

Training model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

Map:   0%|          | 0/21745 [00:00<?, ? examples/s]

Map:   0%|          | 0/2417 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc




Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
100,1.3918,0.692363,0.508895,0.501813,0.92636,0.650985
200,1.3774,0.682915,0.559371,0.531988,0.904603,0.669972
300,1.1956,0.608014,0.686388,0.858785,0.437657,0.579823
400,1.1486,0.503092,0.748035,0.873724,0.573222,0.692269
500,1.0876,0.484289,0.759206,0.862722,0.610042,0.714706
600,1.0464,0.457806,0.783616,0.840771,0.693724,0.760202
700,0.9536,0.475921,0.776996,0.87788,0.637657,0.73873
800,0.9625,0.46902,0.785271,0.876392,0.658577,0.752031
900,0.9299,0.446068,0.802648,0.864837,0.712134,0.781092
1000,0.928,0.439818,0.790236,0.901869,0.646025,0.752804


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]



Training metrics: {'eval_loss': 0.4249119162559509, 'eval_accuracy': 0.8196110881257758, 'eval_precision': 0.8837209302325582, 'eval_recall': 0.7313807531380753, 'eval_f1': 0.8003663003663004, 'eval_runtime': 5.8654, 'eval_samples_per_second': 412.08, 'eval_steps_per_second': 25.915, 'epoch': 3.0}
Saving model...
Evaluating model...


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at /content/model and are newly initialized because the shapes did not match:
- roberta.embeddings.position_embeddings.weight: found shape torch.Size([514, 768]) in the checkpoint and torch.Size([512, 768]) in the model instantiated
- roberta.embeddings.token_type_embeddings.weight: found shape torch.Size([1, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated
- roberta.embeddings.word_embeddings.weight: found shape torch.Size([250002, 768]) in the checkpoint and torch.Size([30522, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Error during grammar checking: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.



TypeError: int() argument must be a string, a bytes-like object or a real number, not 'NoneType'

# Updated

In [20]:
import pandas as pd
from datasets import Dataset
import os
import json
from transformers import (
    XLMRobertaTokenizer,
    XLMRobertaForSequenceClassification,
    XLMRobertaConfig,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)
import numpy as np
import torch
import evaluate
from typing import Dict, List, Tuple
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix

class SinhalaGrammarChecker:
    def __init__(self):
        self.model_path = "/content/drive/MyDrive/projects/model1"
        self.tokenizer = None
        self.model = None
        self.max_length = 128
        self.model_name = 'xlm-roberta-base'

    def preprocess_text(self, text: str) -> str:
        return text.strip()

    def tokenize_sentence(self, text: str) -> List[str]:
        return text.strip().split()

    def align_words(self, incorrect: str, correct: str) -> List[Tuple[str, str]]:
        incorrect_words = self.tokenize_sentence(incorrect)
        correct_words = self.tokenize_sentence(correct)
        return list(zip(incorrect_words, correct_words))

    def create_dataset(self, texts: List[str], labels: List[int]) -> Dataset:
        return Dataset.from_dict({
            'text': [self.preprocess_text(str(text)) for text in texts],
            'label': labels
        })

    def prepare_training_data(self, file_path: str) -> Tuple[Dataset, Dataset]:
        df = pd.read_csv(file_path)

        texts = []
        labels = []

        texts.extend(df['incorrect_sentence'].tolist())
        labels.extend([1] * len(df['incorrect_sentence']))

        texts.extend(df['correct_sentence'].tolist())
        labels.extend([0] * len(df['correct_sentence']))

        combined = list(zip(texts, labels))
        np.random.shuffle(combined)
        texts, labels = zip(*combined)

        split_idx = int(0.9 * len(texts))
        return (
            self.create_dataset(texts[:split_idx], labels[:split_idx]),
            self.create_dataset(texts[split_idx:], labels[split_idx:])
        )

    def tokenize_function(self, examples: Dict) -> Dict:
        tokenized = self.tokenizer(
            examples['text'],
            truncation=True,
            max_length=self.max_length,
            padding='max_length'
        )
        tokenized['labels'] = examples['label']
        return tokenized

    def compute_metrics(self, eval_pred: Tuple) -> Dict:
        predictions, labels = eval_pred
        predictions = np.argmax(predictions, axis=1)

        metrics = {}
        accuracy = evaluate.load("accuracy")
        metrics.update(accuracy.compute(predictions=predictions, references=labels))

        metrics['precision'] = float(precision_score(labels, predictions, average='binary'))
        metrics['recall'] = float(recall_score(labels, predictions, average='binary'))
        metrics['f1'] = float(f1_score(labels, predictions, average='binary'))

        return metrics

    def initialize_model_and_tokenizer(self):
        # Initialize tokenizer
        self.tokenizer = XLMRobertaTokenizer.from_pretrained(
            self.model_name,
            model_max_length=self.max_length
        )

        # Initialize config with explicit values to ensure consistency
        config = XLMRobertaConfig(
            num_labels=2,
            vocab_size=self.tokenizer.vocab_size,  # Use tokenizer's vocab size
            max_position_embeddings=self.max_length + 2, # Accommodate special tokens
            type_vocab_size=1  # Ensure consistency
        )

        # Initialize model with config
        self.model = XLMRobertaForSequenceClassification.from_pretrained(
            self.model_name,
            config=config,
            ignore_mismatched_sizes=True
        )

    def save_model(self, save_path: str = None) -> None:
        if not self.model or not self.tokenizer:
            raise ValueError("Model and tokenizer must be initialized before saving")

        save_path = save_path or self.model_path
        os.makedirs(save_path, exist_ok=True)

        # Save model and tokenizer
        self.model.save_pretrained(save_path)
        self.tokenizer.save_pretrained(save_path)

        # Save configuration
        config = {
            'model_type': 'xlm-roberta',
            'num_labels': 2,
            'max_length': self.max_length,
            'version': '1.0',
            'vocab_size': self.tokenizer.vocab_size,  # Save tokenizer's vocab size
            'max_position_embeddings': self.max_length + 2,  # Save max position embeddings
            'type_vocab_size': 1  # Save token type embeddings size
        }

        with open(os.path.join(save_path, 'config.json'), 'w') as f:
            json.dump(config, f)

    def load_model(self, load_path: str = None) -> None:
        load_path = load_path or self.model_path
        if not os.path.exists(load_path):
            raise ValueError(f"Model path {load_path} does not exist")

        # Load configuration
        with open(os.path.join(load_path, 'config.json'), 'r') as f:
            config_dict = json.load(f)

        # Load tokenizer and config, using saved values
        self.tokenizer = XLMRobertaTokenizer.from_pretrained(
            load_path,
            model_max_length=config_dict['max_length']
        )

        config = XLMRobertaConfig.from_pretrained(
            load_path,
            num_labels=config_dict['num_labels'],
            vocab_size=config_dict['vocab_size'],  # Use saved vocab size
            max_position_embeddings=config_dict['max_position_embeddings'],  # Use saved max position embeddings
            type_vocab_size=config_dict['type_vocab_size']  # Use saved token type embeddings size
        )

        # Load model with config
        self.model = XLMRobertaForSequenceClassification.from_pretrained(
            load_path,
            config=config,
            ignore_mismatched_sizes=True
        )

        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.model = self.model.to(device)

    def train(self, train_file: str):
        train_dataset, val_dataset = self.prepare_training_data(train_file)

        # Initialize fresh model and tokenizer
        self.initialize_model_and_tokenizer()

        # Process datasets
        tokenized_train = train_dataset.map(
            self.tokenize_function,
            batched=True,
            remove_columns=train_dataset.column_names
        )
        tokenized_val = val_dataset.map(
            self.tokenize_function,
            batched=True,
            remove_columns=val_dataset.column_names
        )

        training_args = TrainingArguments(
            output_dir=self.model_path,
            learning_rate=1e-5,
            per_device_train_batch_size=16,
            per_device_eval_batch_size=16,
            num_train_epochs=3,
            weight_decay=0.01,
            evaluation_strategy="steps",
            eval_steps=100,
            save_strategy="steps",
            save_steps=100,
            load_best_model_at_end=True,
            metric_for_best_model="accuracy",
            greater_is_better=True,
            push_to_hub=False,
            warmup_ratio=0.1,
            logging_steps=50,
            gradient_accumulation_steps=2,
            fp16=True if torch.cuda.is_available() else False
        )

        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=tokenized_train,
            eval_dataset=tokenized_val,
            tokenizer=self.tokenizer,
            data_collator=DataCollatorWithPadding(tokenizer=self.tokenizer),
            compute_metrics=self.compute_metrics
        )

        print("Starting training...")
        trainer.train()

        print("Saving model...")
        self.save_model()

        print("Evaluating final model...")
        final_metrics = trainer.evaluate()
        return final_metrics

    def check_grammar(self, text: str, df: pd.DataFrame) -> Dict:
        if not self.model or not self.tokenizer:
            self.load_model()

        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model = self.model.to(device)

        text = self.preprocess_text(text)
        words = self.tokenize_sentence(text)

        try:
            inputs = self.tokenizer(
                text,
                return_tensors="pt",
                truncation=True,
                max_length=self.max_length,
                padding='max_length'
            )

            inputs = {k: v.to(device) for k, v in inputs.items()}

            with torch.no_grad():
                outputs = self.model(**inputs)
                predictions = torch.softmax(outputs.logits, dim=1)
                has_error = torch.argmax(predictions).item()
                confidence = predictions[0][has_error].item()

            correction = None
            problematic_words = []

            if has_error == 1:
                correction = self.get_correction(text, df)
                if correction:
                    word_alignments = self.align_words(text, correction)
                    for i, (incorrect, correct) in enumerate(word_alignments):
                        if incorrect != correct:
                            problematic_words.append({
                                'word': incorrect,
                                'position': i,
                                'correction': correct
                            })

            return {
                'text': text,
                'has_error': bool(has_error),
                'confidence': confidence,
                'correction': correction,
                'problematic_words': problematic_words,
                'suggestion': correction if correction else ('Grammatical error detected' if has_error else 'No grammatical errors detected.')
            }

        except Exception as e:
            print(f"Error during grammar checking: {str(e)}")
            return {
                'text': text,
                'has_error': None,
                'confidence': None,
                'error': str(e)
            }

    def get_correction(self, text: str, df: pd.DataFrame) -> str:
        match = df[df['incorrect_sentence'] == text]
        return match.iloc[0]['correct_sentence'] if not match.empty else None

    def display_errors(self, text: str, df: pd.DataFrame) -> None:
        result = self.check_grammar(text, df)

        if not result['has_error']:
            print("✓ No errors detected")
            return

        words = self.tokenize_sentence(text)
        error_positions = {w['position']: w for w in result['problematic_words']}

        print("Original text with errors highlighted:")
        for i, word in enumerate(words):
            if i in error_positions:
                print(f"\033[91m{word}\033[0m", end=' ')
            else:
                print(word, end=' ')
        print("\n")

        if result['problematic_words']:
            print("Suggested corrections:")
            for error in result['problematic_words']:
                print(f"• '{error['word']}' → '{error['correction']}'")



In [21]:
def main():
    # Initialize the checker
    checker = SinhalaGrammarChecker()

    # Load your dataset
    print("Loading dataset...")
    full_df = pd.read_csv('/content/drive/MyDrive/projects/spell check/merged_sentences.csv')

    # Split into train and test sets
    print("Splitting dataset...")
    train_df = full_df.sample(frac=0.8, random_state=42)
    test_df = full_df.drop(train_df.index)

    # Save splits
    train_df.to_csv('train_data.csv', index=False)
    test_df.to_csv('test_data.csv', index=False)

    # Train the model
    print("Training model...")
    metrics = checker.train('train_data.csv')
    print("Training metrics:", metrics)

    # Test the model
    print("\nTesting model...")
    sentence = "මම යයි ගෙදර"
    result = checker.check_grammar(sentence, full_df)
    print("Test result:", result)

    return checker, full_df

if __name__ == "__main__":
    checker, df = main()

Loading dataset...
Splitting dataset...
Training model...


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized because the shapes did not match:
- roberta.embeddings.position_embeddings.weight: found shape torch.Size([514, 768]) in the checkpoint and torch.Size([130, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/21745 [00:00<?, ? examples/s]

Map:   0%|          | 0/2417 [00:00<?, ? examples/s]

  trainer = Trainer(


Starting training...


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss,Validation Loss


KeyboardInterrupt: 