In [1]:
!pip install transformers torch




In [1]:
import pandas as pd
from sentence_transformers import SentenceTransformer, util
import json

# Load the labelled dataset. Each non-empty line is "<label> <sentence>",
# where label 1 marks a correct sentence and label 0 an incorrect one.
file_path = "dataset.txt"
data = []
with open(file_path, "r", encoding="utf-16") as file:
    for line_number, line in enumerate(file, start=1):
        line = line.strip()
        if not line:
            # Blank lines carry no record; skipping them avoids an opaque
            # tuple-unpacking error further down.
            continue
        # Split on the first run of whitespace so tab- or multi-space-separated
        # records are accepted as well as single-space ones.
        parts = line.split(None, 1)
        if len(parts) != 2:
            raise ValueError(
                f"{file_path}:{line_number}: expected '<label> <sentence>', got {line!r}"
            )
        label, sentence = parts
        data.append((int(label), sentence))

# Convert to DataFrame
df = pd.DataFrame(data, columns=["label", "sentence"])

# Split into correct (label 1) and incorrect (label 0) sentences. `.loc` with a
# boolean mask selects rows and column in one step instead of chained indexing.
correct_sentences = df.loc[df["label"] == 1, "sentence"].tolist()
incorrect_sentences = df.loc[df["label"] == 0, "sentence"].tolist()

# Initialize a pre-trained embedding model (all-MiniLM-L6-v2)
model = SentenceTransformer('all-MiniLM-L6-v2')

# Embed both groups once up front; the resulting rows are index-aligned with
# the sentence lists they were computed from.
correct_embeddings = model.encode(correct_sentences, convert_to_tensor=True)
incorrect_embeddings = model.encode(incorrect_sentences, convert_to_tensor=True)

# Function to find most similar correct sentence for an incorrect one
def find_most_similar_sentence(user_input, incorrect_sentences, correct_sentences, incorrect_embeddings, correct_embeddings, threshold=0.7):
    """Suggest a correction for ``user_input`` via nearest-neighbour lookup.

    The input is embedded with the module-level ``model`` and compared by
    cosine similarity against the embeddings of the known incorrect sentences.
    If the best match scores strictly above ``threshold``, the correct sentence
    whose embedding is closest to the input is returned.

    Args:
        user_input: Sentence to look up.
        incorrect_sentences: Known incorrect sentences. Kept for interface
            compatibility; only their embeddings are consulted.
        correct_sentences: Known correct sentences, index-aligned with
            ``correct_embeddings``.
        incorrect_embeddings: Embeddings of the incorrect sentences.
        correct_embeddings: Embeddings of the correct sentences.
        threshold: Minimum (exclusive) cosine similarity to an incorrect
            sentence for the input to be treated as a known error.

    Returns:
        The closest correct sentence, or the string
        "No matching incorrect sentence found." when no incorrect sentence is
        similar enough.
    """
    # Generate embedding for the user input
    user_input_embedding = model.encode([user_input], convert_to_tensor=True)

    # A single query yields a 1xN similarity matrix; work on row 0 so that the
    # argmax index and the score lookup both refer to columns of that row.
    incorrect_scores = util.pytorch_cos_sim(user_input_embedding, incorrect_embeddings)[0]
    best_incorrect_idx = incorrect_scores.argmax().item()

    if not incorrect_scores[best_incorrect_idx] > threshold:
        return "No matching incorrect sentence found."

    # The input resembles a known incorrect sentence: return the correct
    # sentence that is closest to the input itself.
    correct_scores = util.pytorch_cos_sim(user_input_embedding, correct_embeddings)[0]
    return correct_sentences[correct_scores.argmax().item()]

# Build seq2seq training pairs: every incorrect sentence is paired with the
# correct sentence whose embedding has the highest cosine similarity to it.
seq2seq_data = [
    {
        "input": source_sentence,
        "target": correct_sentences[
            util.pytorch_cos_sim(source_embedding, correct_embeddings).argmax().item()
        ],
    }
    for source_sentence, source_embedding in zip(incorrect_sentences, incorrect_embeddings)
]

# Persist the pairs as pretty-printed JSON, keeping non-ASCII text readable.
with open("seq2seq_data.json", "w", encoding="utf-8") as json_file:
    json_file.write(json.dumps(seq2seq_data, ensure_ascii=False, indent=4))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [6]:
# Function to find most similar correct sentence for an incorrect one
def find_most_similar_sentence(user_input, incorrect_sentences, correct_sentences, incorrect_embeddings, correct_embeddings, threshold=0.7):
    """Suggest a correction for ``user_input`` via nearest-neighbour lookup.

    The input is embedded with the module-level ``model`` and compared by
    cosine similarity against the embeddings of the known incorrect sentences.
    If the best match scores strictly above ``threshold``, the correct sentence
    whose embedding is closest to the input is returned.

    Args:
        user_input: Sentence to look up.
        incorrect_sentences: Known incorrect sentences. Kept for interface
            compatibility; only their embeddings are consulted.
        correct_sentences: Known correct sentences, index-aligned with
            ``correct_embeddings``.
        incorrect_embeddings: Embeddings of the incorrect sentences.
        correct_embeddings: Embeddings of the correct sentences.
        threshold: Minimum (exclusive) cosine similarity to an incorrect
            sentence for the input to be treated as a known error.

    Returns:
        The closest correct sentence, or the string
        "No matching incorrect sentence found." when no incorrect sentence is
        similar enough.
    """
    # Generate embedding for the user input
    user_input_embedding = model.encode([user_input], convert_to_tensor=True)

    # A single query yields a 1xN similarity matrix; work on row 0 so that the
    # argmax index and the score lookup both refer to columns of that row.
    incorrect_scores = util.pytorch_cos_sim(user_input_embedding, incorrect_embeddings)[0]
    best_incorrect_idx = incorrect_scores.argmax().item()

    if not incorrect_scores[best_incorrect_idx] > threshold:
        return "No matching incorrect sentence found."

    # The input resembles a known incorrect sentence: return the correct
    # sentence that is closest to the input itself.
    correct_scores = util.pytorch_cos_sim(user_input_embedding, correct_embeddings)[0]
    return correct_sentences[correct_scores.argmax().item()]

# Prompt for a sentence, look up its correction against the precomputed
# sentence lists and embeddings, and show the result.
user_input = input("Enter a sentence: ")

corrected_sentence = find_most_similar_sentence(
    user_input,
    incorrect_sentences,
    correct_sentences,
    incorrect_embeddings,
    correct_embeddings,
)

print(f"Corrected Sentence: {corrected_sentence}")


Enter a sentence: අපි උදෙ පානදර නැගගිට්ට. පසල් යන්න ලස්ති උන. අපෙ අම්ම කැම හදුව. අපි කැම කාල පසල් ගිය. පසලෙදි අප සෙලම් කර. ගරුතුම අපිට දන්ඩුවම් දුන්න. අපි කනගටවෙන් හිටිය. අපෙ යලුවො අපිව සනසනන උත්සහ කල. පස්සෙ අපි ඔක්කම එකතු වෙල පන්තිය පිරසිදු කල. ගුරුතුම අපිට සමව දුන්න. අපි සතුතු උන. පස්සෙ අපි කවදවත් පන්තය කලුටු කරන්නෙ නැ කියල පොරොනාදු උන. ගෙදර යනකොට වැස්ස වැටුන. අපි ඔක්කොම තමිල ගිය.
Corrected Sentence: No matching incorrect sentence found.
