**Install Required Libraries**

In [None]:
pip install transformers datasets torch scikit-learn


Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m32.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

**Preprocess the Data, Fine-Tune the Model, and Validate**

In [19]:
import torch
from torch.utils.data import DataLoader
from transformers import XLMRobertaForSequenceClassification, XLMRobertaTokenizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Load the labelled sentence dataset (expects a "Sentence" and a "Label" column).
import pandas as pd

DATA_PATH = "/content/sinhala sentences for grammar checker.csv"
df = pd.read_csv(DATA_PATH)

# Fail fast on a malformed file. Without this check a missing column or an
# empty cell only surfaces later, as an obscure error during stratified
# splitting or tokenization.
required_columns = ["Sentence", "Label"]
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    raise ValueError(f"Dataset is missing required column(s): {missing_columns}")
if df[required_columns].isna().any().any():
    raise ValueError("Dataset contains rows with a missing 'Sentence' or 'Label' value")

# Split the data into training (80%), validation (10%), and test (10%) sets,
# stratified on the label so every split keeps the same class balance.
train_data, temp_data = train_test_split(df, test_size=0.2, stratify=df["Label"], random_state=42)
val_data, test_data = train_test_split(temp_data, test_size=0.5, stratify=temp_data["Label"], random_state=42)

# Persist the held-out test split so it is available as test_data.csv.
test_data.to_csv("test_data.csv", index=False)

# Reload the saved file to verify it was written correctly; from here on
# test_data is the copy read back from disk.
test_data = pd.read_csv("test_data.csv")
print("Test data loaded successfully with columns:", test_data.columns)


# Seed PyTorch's global RNG before anything random happens. The classification
# head is newly initialised on top of the pretrained encoder and the training
# batches are shuffled, so without a seed every run starts from different
# weights and sees a different batch order. 42 matches the random_state used
# for the data splits.
torch.manual_seed(42)

# Initialize tokenizer and model (binary classifier: num_labels=2)
tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")
model = XLMRobertaForSequenceClassification.from_pretrained("xlm-roberta-base", num_labels=2)

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Dataset wrapper that tokenizes one sentence per item on demand
class SinhalaGrammarDataset(torch.utils.data.Dataset):
    """Pairs each sentence with its label and tokenizes it lazily.

    Args:
        sentences: Indexable collection of sentence strings.
        labels: Indexable collection of integer labels, aligned with ``sentences``.
        tokenizer: Object exposing ``encode_plus``; called once per item.
        max_len: Length every encoded sentence is padded/truncated to.
    """

    def __init__(self, sentences, labels, tokenizer, max_len):
        self.sentences, self.labels = sentences, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Number of sentences in the dataset."""
        return len(self.sentences)

    def __getitem__(self, index):
        """Return the ``input_ids``, ``attention_mask`` and ``label`` tensors for one item."""
        text = self.sentences[index]
        target = self.labels[index]

        tokenized = self.tokenizer.encode_plus(
            text,
            return_tensors="pt",
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            add_special_tokens=True,
        )

        # The tokenizer is asked for "pt" tensors with a leading batch axis;
        # squeeze(0) removes it so the DataLoader can stack items itself.
        item = {
            field: tokenized[field].squeeze(0)
            for field in ("input_ids", "attention_mask")
        }
        item["label"] = torch.tensor(target, dtype=torch.long)
        return item

# Create datasets and dataloaders
max_len = 128
batch_size = 16

# One dataset per split, built in the fixed order train / validation / test.
train_dataset, val_dataset, test_dataset = (
    SinhalaGrammarDataset(split["Sentence"].tolist(), split["Label"].tolist(), tokenizer, max_len)
    for split in (train_data, val_data, test_data)
)

# Only the training batches are shuffled; the evaluation loaders keep the
# row order of their split.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
val_loader, test_loader = (
    DataLoader(dataset, shuffle=False, batch_size=batch_size)
    for dataset in (val_dataset, test_dataset)
)

# Optimizer
from torch.optim import AdamW
optimizer = AdamW(model.parameters(), lr=2e-5)

# Training loop: fine-tune for a fixed number of epochs and report the mean
# batch loss after each one.
epochs = 5
model.train()
for epoch in range(epochs):
    total_loss = 0
    for batch in train_loader:
        # Move the three batch tensors onto the model's device.
        input_ids, attention_mask, labels = (
            batch[field].to(device)
            for field in ("input_ids", "attention_mask", "label")
        )

        # Clear stale gradients, then forward / backward / update.
        optimizer.zero_grad()
        # Passing labels makes the model return the loss alongside the logits.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss / len(train_loader)}")

# Validation: collect predictions and gold labels over the validation split.
model.eval()
all_preds, all_labels = [], []

# Evaluation needs no gradients, so autograd is switched off for the whole pass.
with torch.no_grad():
    for batch in val_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        # The predicted class is the index of the largest logit.
        preds = outputs.logits.argmax(dim=-1)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Classification report
print(classification_report(all_labels, all_preds))


Test data loaded successfully with columns: Index(['Sentence', 'Label'], dtype='object')


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5, Loss: 0.5994288749668909
Epoch 2/5, Loss: 0.11346823041853697
Epoch 3/5, Loss: 0.042189715860371034
Epoch 4/5, Loss: 0.018151533956963405
Epoch 5/5, Loss: 0.016499707968804338
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        85
           1       1.00      1.00      1.00        98

    accuracy                           1.00       183
   macro avg       1.00      1.00      1.00       183
weighted avg       1.00      1.00      1.00       183



In [20]:
# Test the model on the held-out test split
model.eval()
test_preds, test_labels = [], []

# Inference only: gradients stay disabled for the entire loop.
with torch.no_grad():
    for batch in test_loader:
        input_ids, attention_mask, labels = (
            batch[field].to(device)
            for field in ("input_ids", "attention_mask", "label")
        )

        outputs = model(input_ids, attention_mask=attention_mask)
        # Highest-scoring class per sentence.
        preds = outputs.logits.argmax(dim=-1)

        test_preds.extend(preds.cpu().numpy())
        test_labels.extend(labels.cpu().numpy())

# Classification report for the test dataset
from sklearn.metrics import classification_report
print("Test Set Evaluation:")
print(classification_report(test_labels, test_preds))


Test Set Evaluation:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        85
           1       1.00      1.00      1.00        98

    accuracy                           1.00       183
   macro avg       1.00      1.00      1.00       183
weighted avg       1.00      1.00      1.00       183



In [21]:
def grammar_check(sentence: str, tokenizer, model, device, max_len: int = 128) -> int:
    """Classify a single sentence with the fine-tuned model.

    Args:
        sentence: The sentence to classify.
        tokenizer: Tokenizer exposing ``encode_plus``.
        model: Sequence-classification model; it is switched to eval mode.
        device: Device the input tensors are moved to before the forward
            pass. It should be the device the model lives on.
        max_len: Length the encoded sentence is padded/truncated to. Defaults
            to 128, the value previously hard-coded here; keep it equal to
            the ``max_len`` used for training.

    Returns:
        The index of the highest-scoring class as a Python int. The demo that
        follows treats 1 as "correct grammar" and anything else as incorrect.
    """
    model.eval()
    encoding = tokenizer.encode_plus(
        sentence,
        add_special_tokens=True,
        max_length=max_len,
        padding="max_length",
        truncation=True,
        return_tensors="pt"
    )

    input_ids = encoding["input_ids"].to(device)
    attention_mask = encoding["attention_mask"].to(device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        # One sentence in, so argmax yields a single-element tensor.
        pred = torch.argmax(outputs.logits, dim=-1).item()

    return pred

# Try the classifier on a pair of hand-written sentences
test_sentences = [
    "අපි ගෙදරට යමි",  # Incorrect grammar
    "අපි ගෙදරට යමු",  # Correct grammar
]

for sentence in test_sentences:
    result = grammar_check(sentence, tokenizer, model, device)
    # Class 1 is reported as grammatical; any other prediction as an error.
    message = (
        f"'{sentence}' -> Correct grammar"
        if result == 1
        else f"'{sentence}' -> Incorrect grammar: Suggest correction"
    )
    print(message)


'අපි ගෙදරට යමි' -> Incorrect grammar: Suggest correction
'අපි ගෙදරට යමු' -> Correct grammar
