In [1]:
!pip install chardet



In [2]:
import chardet

file_path = "/content/dataset.txt"

with open(file_path, "rb") as file:
    raw_data = file.read(1000)  # Read the first 1000 bytes
    result = chardet.detect(raw_data)
    print(result)


{'encoding': 'UTF-16', 'confidence': 1.0, 'language': ''}


In [3]:
!pip install sentence-transformers




In [4]:
import pandas as pd
from sentence_transformers import SentenceTransformer, util

# Load dataset
file_path = "dataset.txt"
data = []
with open(file_path, "r", encoding="utf-16") as file:
    for line in file:
        label, sentence = line.strip().split(" ", 1)
        data.append((int(label), sentence))

# Convert to DataFrame
df = pd.DataFrame(data, columns=["label", "sentence"])

# Divide dataset into correct and incorrect sentences
correct_sentences = df[df["label"] == 1]["sentence"].tolist()
incorrect_sentences = df[df["label"] == 0]["sentence"].tolist()

# Initialize a pre-trained embedding model (e.g., all-MiniLM-L6-v2)
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Generate embeddings
correct_embeddings = model.encode(correct_sentences, convert_to_tensor=True)
incorrect_embeddings = model.encode(incorrect_sentences, convert_to_tensor=True)

# Match each incorrect sentence to the most similar correct sentence
seq2seq_data = []
for idx, incorrect_embedding in enumerate(incorrect_embeddings):
    # Compute cosine similarity with all correct embeddings
    similarities = util.pytorch_cos_sim(incorrect_embedding, correct_embeddings)
    most_similar_idx = similarities.argmax().item()  # Index of the most similar correct sentence
    seq2seq_data.append({
        "input": incorrect_sentences[idx],
        "target": correct_sentences[most_similar_idx]
    })

# Save the seq2seq pairs as JSON
import json
with open("seq2seq_data.json", "w", encoding="utf-8") as json_file:
    json.dump(seq2seq_data, json_file, ensure_ascii=False, indent=4)

print("Seq2seq pairs have been saved to seq2seq_data.json")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Seq2seq pairs have been saved to seq2seq_data.json


In [5]:
from transformers import MBartForConditionalGeneration, MBartTokenizer, Trainer, TrainingArguments
import json
import torch
from torch.utils.data import Dataset

# Load preprocessed data
with open("/content/seq2seq_data.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Define custom Dataset
class Seq2SeqDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        inputs = self.tokenizer(item["input"], max_length=self.max_length, truncation=True, padding="max_length", return_tensors="pt")
        targets = self.tokenizer(item["target"], max_length=self.max_length, truncation=True, padding="max_length", return_tensors="pt")
        return {
            "input_ids": inputs["input_ids"].squeeze(),
            "attention_mask": inputs["attention_mask"].squeeze(),
            "labels": targets["input_ids"].squeeze()
        }

# Load tokenizer and model
model_name = "facebook/mbart-large-50"
tokenizer = MBartTokenizer.from_pretrained(model_name)
model = MBartForConditionalGeneration.from_pretrained(model_name)

# Prepare dataset
train_dataset = Seq2SeqDataset(data, tokenizer)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./mbart50-finetuned-sinhala",
    evaluation_strategy="no",
    save_strategy="epoch",
    logging_dir="./logs",
    per_device_train_batch_size=4,  # Smaller batch size
    num_train_epochs=3,  # Fewer epochs
    learning_rate=5e-5,
    save_total_limit=2,
    weight_decay=0.01,
    logging_steps=10,
    fp16=True,  # Enable mixed precision
    push_to_hub=False
)

# Freeze encoder layers
for param in model.model.encoder.parameters():
    param.requires_grad = False

# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    tokenizer=tokenizer
)

# Fine-tune the model
trainer.train()

# Save the model
trainer.save_model("./mbart50-finetuned-sinhala")


tokenizer_config.json:   0%|          | 0.00/531 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'MBart50Tokenizer'. 
The class this function is called from is 'MBartTokenizer'.


pytorch_model.bin:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

  trainer = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss
10,12.8567
20,10.1639
30,9.5309
40,9.193
50,9.0837
60,9.2433
70,9.6391
80,8.9133
90,8.8749
100,8.9179




In [10]:
def correct_sentence(input_sentence, model, tokenizer):
    inputs = tokenizer(input_sentence, return_tensors="pt", max_length=128, truncation=True)
    with torch.no_grad():
        outputs = model.generate(inputs["input_ids"], max_length=128, num_beams=5, early_stopping=True)
    corrected_sentence = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return corrected_sentence

# Load fine-tuned model and tokenizer
model = MBartForConditionalGeneration.from_pretrained("./mbart50-finetuned-sinhala")
tokenizer = MBartTokenizer.from_pretrained(model_name)

# Get user input and correct it
user_input = input("Enter a Sinhala sentence to correct: ")
corrected_output = correct_sentence(user_input, model, tokenizer)
print(f"Corrected Sentence: {corrected_output}")


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'MBart50Tokenizer'. 
The class this function is called from is 'MBartTokenizer'.


Enter a Sinhala sentence to correct: උකුස්සා සාර්ථකව සුනඛයකු පස්සේ එළවනවා
Corrected Sentence: ලස්සන මම ගෙදර යන්නෙමි
