In [None]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import os

# Step 1: Scrape vocabulary from the website
def scrape_vocab(url, file_path, timeout=10):
    """Scrape words from the HTML tables of a web page and save them to a file.

    Every ``<table>`` on the page is scanned. For each row that has at least
    two ``<td>`` cells, the stripped text of the second cell is collected.
    The collected words are written to ``file_path``, one per line, as UTF-8.

    Args:
        url: Address of the page to download.
        file_path: Destination text file; overwritten if it already exists.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        None. The outcome is reported with ``print``. When the page cannot be
        fetched (network error or non-200 status) no file is written.
    """
    try:
        # Without an explicit timeout requests waits indefinitely on a
        # server that accepts the connection but never answers.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Report network failures the same way as HTTP failures below so the
        # rest of the script can fall back to the dataset vocabulary.
        print(f"Failed to fetch the webpage. Error: {exc}")
        return

    if response.status_code != 200:
        print(f"Failed to fetch the webpage. Status code: {response.status_code}")
        return

    soup = BeautifulSoup(response.content, "html.parser")
    word_list = []
    for table in soup.find_all("table"):
        for row in table.find_all("tr"):
            cells = row.find_all("td")
            if len(cells) >= 2:
                sinhala_word = cells[1].get_text(strip=True)
                # An empty cell would otherwise become an empty vocabulary
                # entry once the file is read back.
                if sinhala_word:
                    word_list.append(sinhala_word)

    with open(file_path, "w", encoding="utf-8") as file:
        file.write("\n".join(word_list))
    print(f"Scraped {len(word_list)} Sinhala words and saved to {file_path}.")

# Scrape Sinhala vocabulary and save
sinhala_vocab_path = "/content/drive/MyDrive/AI/project/sinhala_vocab.txt"
scrape_vocab("https://mylanguages.org/learn_sinhala.php", sinhala_vocab_path)

# Step 2: Load dataset vocabulary
file_path = "/content/drive/MyDrive/AI/project/data-spell-checker.xlsx"
data = pd.read_excel(file_path)
# Keep the 'word' values of rows whose 'label' is 1.  ``.str.strip()`` yields
# NaN for missing or non-string cells; drop those (and blank strings) so that
# every vocabulary entry is a non-empty str the distance code can measure.
correct_words = [
    word
    for word in data[data['label'] == 1]['word'].str.strip().dropna().tolist()
    if word
]
dataset_vocab = set(correct_words)

# Step 3: Load vocabulary from scraped file
if os.path.exists(sinhala_vocab_path):
    with open(sinhala_vocab_path, "r", encoding="utf-8") as file:
        # Skip blank lines so the vocabulary never contains an empty entry.
        scraped_vocab = {line for line in file.read().splitlines() if line}
else:
    # The scrape did not produce a file; continue with the dataset words only.
    scraped_vocab = set()

# Combine vocabularies
vocab = dataset_vocab.union(scraped_vocab)
print(f"Combined Vocabulary Size: {len(vocab)}")

# Step 4: Define spell-checker functions
# Context rule consulted by spell_check(): the word that directly follows one
# of these trigger words is kept exactly as typed instead of being corrected
# (presumably because a proper noun such as a name follows -- TODO confirm).
always_correct_triggers = {"නම", "ගම"}  # Words after these are always correct

def edit_distance(word1, word2):
    """Return the Levenshtein edit distance between two strings.

    The distance is the minimum number of single-character insertions,
    deletions and substitutions needed to turn ``word1`` into ``word2``.
    Characters are Python ``str`` elements (Unicode code points), so a base
    letter and a combining sign attached to it count as separate characters.

    Args:
        word1: First string.
        word2: Second string.

    Returns:
        The distance as a plain ``int`` (0 when the strings are equal).
    """
    if word1 == word2:
        return 0
    # The distance is symmetric, so keep the shorter string on the inner axis
    # to make the rolling row as small as possible.
    if len(word2) > len(word1):
        word1, word2 = word2, word1
    if not word2:
        return len(word1)

    # Only the previous row of the DP table is ever read, so two Python lists
    # replace the full matrix; this also avoids per-element NumPy indexing.
    previous = list(range(len(word2) + 1))
    for i, char1 in enumerate(word1, start=1):
        current = [i]
        for j, char2 in enumerate(word2, start=1):
            cost = 0 if char1 == char2 else 1
            current.append(min(previous[j] + 1,          # Deletion
                               current[j - 1] + 1,       # Insertion
                               previous[j - 1] + cost))  # Substitution
        previous = current
    return previous[-1]

def correct_word(word, vocab, max_distance=2):
    """Return the vocabulary word closest to ``word``, or ``word`` itself.

    A word that is already in ``vocab`` is returned unchanged. Otherwise the
    vocabulary entry with the smallest edit distance is returned, provided
    that distance does not exceed ``max_distance``. When several entries share
    the smallest distance the lexicographically smallest one is chosen, so the
    result does not depend on the iteration order of ``vocab``.

    Args:
        word: The word to check.
        vocab: Collection of known-correct words.
        max_distance: Largest edit distance accepted for a correction.

    Returns:
        The corrected word, or ``word`` when no entry is close enough.
    """
    if word in vocab:
        return word

    closest_word = None
    min_distance = float("inf")
    word_length = len(word)
    for v_word in vocab:
        # An empty entry is never a meaningful correction.
        if not v_word:
            continue
        # The edit distance is at least the length difference, so entries
        # that differ too much in length cannot qualify; skip the costly DP.
        if abs(len(v_word) - word_length) > max_distance:
            continue
        distance = edit_distance(word, v_word)
        if distance > max_distance:
            continue
        # Break ties on the word itself to keep the choice deterministic.
        if distance < min_distance or (
            distance == min_distance and v_word < closest_word
        ):
            min_distance = distance
            closest_word = v_word
    return word if closest_word is None else closest_word

def spell_check(sentence, vocab, max_distance=2):
    """Spell-check a sentence word by word, honouring the trigger-word rule.

    The text is split on whitespace and each word is passed to
    ``correct_word``. Once a word from ``always_correct_triggers`` has been
    seen, the words after it are left exactly as typed, up to and including
    the first word that ends with "."; correction resumes with the word after
    that.

    Args:
        sentence: Input text; may contain several sentences.
        vocab: Collection of known-correct words.
        max_distance: Largest edit distance accepted for a correction.

    Returns:
        The processed words joined by single spaces (runs of whitespace in the
        input are therefore collapsed).
    """
    words = sentence.split()
    corrected_words = []
    skip_correction = False

    for i, word in enumerate(words):
        if i > 0 and words[i - 1] in always_correct_triggers:
            skip_correction = True
        if skip_correction:
            corrected_words.append(word)
        else:
            corrected_words.append(correct_word(word, vocab, max_distance))
        # Reset only after the word has been handled: the sentence-final word
        # still belongs to the protected span, and a protected word that
        # itself ends the sentence must not leak the exemption into the next
        # sentence.
        if word.endswith("."):
            skip_correction = False
    return " ".join(corrected_words)

# Step 5: User input loop
print("Sinhala Spell Checker with Contextual Rules")
print("Type 'exit' to quit.")
# Read sentences until the user types 'exit' (case-insensitive).
while True:
    try:
        user_input = input("Enter a Sinhala sentence: ").strip()
    except (EOFError, KeyboardInterrupt):
        # stdin was closed or the prompt was interrupted: leave the loop
        # cleanly instead of ending the session with a traceback.
        print("\nGoodbye!")
        break
    if user_input.lower() == "exit":
        print("Goodbye!")
        break
    corrected_output = spell_check(user_input, vocab)
    print(f"Corrected Output: {corrected_output}")
