In [1]:
# Install the third-party packages and the NLTK "punkt" data via shell escapes.
# NOTE(review): versions are unpinned, and nltk/gensim are not imported in the
# visible cells -- confirm they are actually needed before keeping them.
!pip install PyPDF2 python-Levenshtein pandas nltk gensim scikit-learn beautifulsoup4 requests
!python -m nltk.downloader punkt

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting python-Levenshtein
  Downloading python_Levenshtein-0.26.1-py3-none-any.whl.metadata (3.7 kB)
Collecting Levenshtein==0.26.1 (from python-Levenshtein)
  Downloading levenshtein-0.26.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.2 kB)
Collecting rapidfuzz<4.0.0,>=3.9.0 (from Levenshtein==0.26.1->python-Levenshtein)
  Downloading rapidfuzz-3.11.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading python_Levenshtein-0.26.1-py3-none-any.whl (9.4 kB)
Downloading levenshtein-0.26.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (162 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m162.6/162.6 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0

In [2]:
import pickle
import re
from collections import defaultdict

import pandas as pd
import requests
from bs4 import BeautifulSoup

class SinhalaDictionaryBuilder:
    """Accumulate a set of Sinhala words and their occurrence counts.

    Words are fed in through :meth:`add_words` and persisted with
    :meth:`save_dictionary`. The two ``scrape_*`` methods are placeholders:
    they only build their request configuration and perform no network I/O.

    Attributes are documented inline in ``__init__``.
    """

    # A word is one or more runs of Sinhala-block code points (U+0D80-U+0DFF).
    # Runs may be glued together by ZWNJ/ZWJ (U+200C/U+200D): Sinhala conjunct
    # forms embed U+200D inside a single word, and a plain
    # ``[\u0D80-\u0DFF]+`` pattern would split such words into fragments.
    _WORD_PATTERN = re.compile(
        r'[\u0D80-\u0DFF]+(?:[\u200C\u200D][\u0D80-\u0DFF]+)*'
    )

    def __init__(self):
        # Distinct words seen so far.
        self.words = set()
        # word -> number of times it was seen across all add_words() calls.
        self.word_frequencies = defaultdict(int)

    def scrape_wikipedia(self, num_articles=50):
        """Placeholder for scraping random Sinhala Wikipedia articles.

        Args:
            num_articles: Number of random articles to request (used as the
                ``rnlimit`` query parameter).

        Returns:
            None. No request is sent yet; only the query parameters are built.
        """
        base_url = "https://si.wikipedia.org/w/api.php"
        params = {
            "action": "query",
            "format": "json",
            "list": "random",
            "rnlimit": num_articles,
            "rnnamespace": 0
        }
        # TODO: Implementation for Wikipedia scraping (fetch with
        # base_url/params and feed the article text to add_words()).

    def scrape_news_sites(self):
        """Placeholder for scraping Sinhala news websites.

        Returns:
            None. No request is sent yet; only the site list is defined.
        """
        news_sites = [
            "http://www.dinamina.lk/",
            "http://www.silumina.lk/"
        ]
        # TODO: Implementation for news site scraping (fetch each site and
        # feed the page text to add_words()).

    def add_words(self, text):
        """Extract Sinhala words from ``text`` and record them.

        Every extracted word is added to ``self.words`` and its counter in
        ``self.word_frequencies`` is incremented once per occurrence.

        Args:
            text: Arbitrary text. ``None`` or an empty string is ignored.
        """
        if not text:
            return
        for word in self._WORD_PATTERN.findall(text):
            self.words.add(word)
            self.word_frequencies[word] += 1

    def save_dictionary(self, filename='sinhala_dictionary.pkl'):
        """Pickle the collected words and frequencies to ``filename``.

        The file contains a dict with the keys ``'words'`` (a set) and
        ``'frequencies'`` (a plain dict).

        Args:
            filename: Destination path of the pickle file.
        """
        data = {
            'words': self.words,
            'frequencies': dict(self.word_frequencies)
        }
        with open(filename, 'wb') as f:
            pickle.dump(data, f)

In [3]:
from collections import defaultdict
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

class SinhalaNLPProcessor:
    """Dictionary lookup plus a character n-gram similarity model.

    The dictionary is loaded from a pickle file; a ``CountVectorizer`` over
    character 1- to 3-grams is then fitted on the dictionary words so that
    unknown words can be scored by their n-gram similarity to known words.
    """

    def __init__(self, dictionary_file='sinhala_dictionary.pkl'):
        """Load the dictionary and train the character model.

        Args:
            dictionary_file: Path of the pickle file to load. A missing file
                results in an empty dictionary.
        """
        # Stay None until train_character_model() has a non-empty corpus.
        self.char_features = None
        self._char_norms = None
        self.load_dictionary(dictionary_file)
        self.vectorizer = CountVectorizer(analyzer='char', ngram_range=(1, 3))
        self.train_character_model()

    def load_dictionary(self, filename):
        """Populate ``self.dictionary`` and ``self.word_frequencies``.

        The pickle is expected to hold a dict with the keys ``'words'`` and
        ``'frequencies'``. If the file does not exist, a message is printed
        and both attributes are initialised empty.

        Args:
            filename: Path of the pickle file.

        Raises:
            KeyError: If the pickle lacks ``'words'`` or ``'frequencies'``.
        """
        try:
            with open(filename, 'rb') as f:
                # SECURITY: pickle.load can execute arbitrary code. Only load
                # dictionary files that were produced by a trusted source.
                data = pickle.load(f)
        except FileNotFoundError:
            print("Dictionary file not found. Initializing empty dictionary.")
            self.dictionary = set()
            self.word_frequencies = defaultdict(int)
            return
        self.dictionary = set(data['words'])
        # Wrap in a defaultdict so both code paths expose the same type and
        # looking up an unseen word yields 0 instead of raising KeyError.
        self.word_frequencies = defaultdict(int, data['frequencies'])

    def train_character_model(self):
        """Fit the character n-gram vectorizer on the dictionary words.

        With an empty dictionary nothing can be fitted; the model is marked
        as untrained (``char_features`` is ``None``).
        """
        # Sorted so the feature-matrix row order is reproducible across runs.
        corpus = sorted(self.dictionary)
        if not corpus:
            self.char_features = None
            self._char_norms = None
            return
        self.vectorizer.fit(corpus)
        self.char_features = self.vectorizer.transform(corpus)
        # Pre-compute the Euclidean norm of every row once, for cosine scores.
        squared = self.char_features.multiply(self.char_features).sum(axis=1)
        self._char_norms = np.sqrt(np.asarray(squared, dtype=float).ravel())

    def get_word_probability(self, word):
        """Score how plausible ``word`` is, in the range ``[0.0, 1.0]``.

        Args:
            word: The word to score.

        Returns:
            ``1.0`` if the word is in the dictionary; otherwise the highest
            cosine similarity between the word's character n-gram counts and
            those of any dictionary word. ``0.0`` when the model is untrained
            or the word shares no n-gram with the fitted vocabulary.
        """
        if word in self.dictionary:
            return 1.0
        char_features = getattr(self, 'char_features', None)
        if char_features is None:
            # Empty dictionary: the vectorizer was never fitted.
            return 0.0
        word_vector = self.vectorizer.transform([word])
        word_norm = float(np.sqrt(word_vector.multiply(word_vector).sum()))
        if word_norm == 0.0:
            return 0.0
        # '@' is an explicit matrix product for both sparse matrices and
        # sparse arrays, whereas '*' is element-wise for sparse arrays.
        dots = np.asarray(
            (char_features @ word_vector.T).toarray(), dtype=float
        ).ravel()
        norms = self._char_norms * word_norm
        similarities = np.divide(
            dots, norms, out=np.zeros_like(dots), where=norms > 0
        )
        if similarities.size == 0:
            return 0.0
        # Clamp to guard against floating-point overshoot above 1.0.
        return float(min(1.0, similarities.max()))

import PyPDF2
from google.colab import files
import Levenshtein

class SinhalaSpellChecker:
    """Suggest and apply spelling corrections for Sinhala text.

    Corrections come from a small table of known misspellings and from
    dictionary words within a bounded Levenshtein distance of the input.
    """

    # A word is one or more runs of Sinhala-block code points (U+0D80-U+0DFF).
    # Runs may be glued together by ZWNJ/ZWJ (U+200C/U+200D): Sinhala conjunct
    # forms embed U+200D inside a single word, and a plain
    # ``[\u0D80-\u0DFF]+`` pattern would split such words into fragments.
    _WORD_PATTERN = re.compile(
        r'[\u0D80-\u0DFF]+(?:[\u200C\u200D][\u0D80-\u0DFF]+)*'
    )

    def __init__(self):
        self.nlp_processor = SinhalaNLPProcessor()
        self.setup_common_corrections()

    def setup_common_corrections(self):
        """Define the known-misspelling table and the regex rule table."""
        # misspelling -> correction
        self.common_corrections = {
            'මගයපන්වීම': 'මගපෙන්වීම',
            'දරුවන්යේ': 'දරුවන්ගේ',
            # Add more common corrections
        }

        # regex -> callback receiving the match. Not consulted by
        # suggest_corrections() or correct_text() at present.
        self.patterns = {
            r'([ක-ෆ])්([ක-ෆ])': self.check_joiner,
            # Add more patterns
        }

    def check_joiner(self, match):
        """Placeholder rule: decide whether a joiner is needed.

        Args:
            match: A regex match with exactly two groups (the characters on
                either side of the hal kirima sign).

        Returns:
            The matched text unchanged; no joiner rule is implemented yet.
        """
        first, second = match.groups()
        # TODO: Implementation for joiner rules (will use first/second).
        return match.group()

    def suggest_corrections(self, word, max_distance=2, max_suggestions=5):
        """Return ranked correction candidates for ``word``.

        Args:
            word: The word to look up.
            max_distance: Largest Levenshtein distance a dictionary word may
                have from ``word`` to be considered.
            max_suggestions: Maximum number of suggestions returned.

        Returns:
            An empty list if ``word`` is in the dictionary. Otherwise a list
            without duplicates: the known correction from
            ``common_corrections`` first (if any), followed by dictionary
            words ordered by ascending edit distance, then descending
            frequency, then alphabetically so the result is deterministic.
        """
        dictionary = self.nlp_processor.dictionary
        if word in dictionary:
            return []

        suggestions = []
        known_fix = self.common_corrections.get(word)
        if known_fix is not None:
            suggestions.append(known_fix)

        frequencies = getattr(self.nlp_processor, 'word_frequencies', None) or {}
        candidates = []
        for dict_word in dictionary:
            # The edit distance is at least the length difference, so this
            # skips hopeless candidates without computing the distance.
            if abs(len(dict_word) - len(word)) > max_distance:
                continue
            if dict_word == known_fix:
                continue
            distance = Levenshtein.distance(word, dict_word)
            if distance <= max_distance:
                # Every candidate is a dictionary word, so a dictionary-based
                # probability cannot tell them apart; rank by closeness and
                # corpus frequency instead.
                candidates.append(
                    (distance, -frequencies.get(dict_word, 0), dict_word)
                )

        candidates.sort()
        suggestions.extend(candidate[2] for candidate in candidates)
        return suggestions[:max_suggestions]

    def correct_text(self, text):
        """Replace misspelled Sinhala words in ``text`` by their top suggestion.

        Replacement is done token by token, so a misspelling that happens to
        be a substring of a longer word does not alter that longer word.

        Args:
            text: The text to correct.

        Returns:
            A tuple ``(corrected_text, corrections)`` where ``corrections``
            maps each replaced word to the replacement that was applied.
        """
        corrections = {}
        best_by_word = {}  # memoised top suggestion (or None) per distinct word

        def _replace(match):
            word = match.group()
            if word not in best_by_word:
                suggestions = self.suggest_corrections(word)
                best_by_word[word] = suggestions[0] if suggestions else None
            best = best_by_word[word]
            if best is None:
                return word
            corrections[word] = best
            return best

        corrected_text = self._WORD_PATTERN.sub(_replace, text)
        return corrected_text, corrections

In [4]:
def process_sinhala_pdf():
    """Upload a PDF in Colab, spell-check its text and download the results.

    Steps:
      1. Prompt for an upload; return early if nothing was uploaded.
      2. Extract the text of every page of the first uploaded file.
      3. Run ``SinhalaSpellChecker.correct_text`` on the extracted text.
      4. Write the original text, the corrected text and the list of
         corrections to three UTF-8 files and trigger a download of each.
      5. Print the individual corrections and their total count.

    Any exception raised after the upload step is caught and reported with
    a printed message instead of being propagated.
    """
    print("Upload your PDF file...")
    uploaded = files.upload()

    if not uploaded:
        print("No file uploaded.")
        return

    # Only the first uploaded file is processed.
    pdf_name = next(iter(uploaded))

    try:
        # Page text must be pulled while the file handle is still open.
        with open(pdf_name, 'rb') as pdf_handle:
            pdf_reader = PyPDF2.PdfReader(pdf_handle)
            text = "".join(
                page.extract_text() + "\n" for page in pdf_reader.pages
            )

        corrected_text, corrections = SinhalaSpellChecker().correct_text(text)

        change_lines = [
            f"{wrong} -> {right}" for wrong, right in corrections.items()
        ]
        outputs = (
            ('original_text.txt', text),
            ('corrected_text.txt', corrected_text),
            ('corrections.txt', '\n'.join(change_lines)),
        )

        # Each file is written completely before its download is requested.
        for out_name, body in outputs:
            with open(out_name, 'w', encoding='utf-8') as out_handle:
                out_handle.write(body)
            files.download(out_name)

        print("\n=== Corrections Made ===")
        for wrong, right in corrections.items():
            print(f"Changed '{wrong}' to '{right}'")

        print("\n=== Correction Summary ===")
        print(f"Total corrections made: {len(corrections)}")

    except Exception as err:
        print(f"Error processing file: {err!s}")

In [5]:
# Entry point: start the interactive upload/correct/download pipeline when
# this cell is executed as the main module.
if __name__ == "__main__":
    process_sinhala_pdf()

Upload your PDF file...


Saving AI PROJECT_1.pdf to AI PROJECT_1.pdf
Dictionary file not found. Initializing empty dictionary.


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


=== Corrections Made ===
Changed 'දරුවන්යේ' to 'දරුවන්ගේ'

=== Correction Summary ===
Total corrections made: 1
