Statistical Approach

In [1]:
!pip install PyPDF2 google-colab

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting jedi>=0.16 (from ipython==7.34.0->google-colab)
  Downloading jedi-0.19.2-py2.py3-none-any.whl.metadata (22 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading jedi-0.19.2-py2.py3-none-any.whl (1.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m36.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2, jedi
Successfully installed PyPDF2-3.0.1 jedi-0.19.2


In [2]:
!pip install PyPDF2 python-Levenshtein

Collecting python-Levenshtein
  Downloading python_Levenshtein-0.26.1-py3-none-any.whl.metadata (3.7 kB)
Collecting Levenshtein==0.26.1 (from python-Levenshtein)
  Downloading levenshtein-0.26.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.2 kB)
Collecting rapidfuzz<4.0.0,>=3.9.0 (from Levenshtein==0.26.1->python-Levenshtein)
  Downloading rapidfuzz-3.11.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading python_Levenshtein-0.26.1-py3-none-any.whl (9.4 kB)
Downloading levenshtein-0.26.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (162 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m162.6/162.6 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading rapidfuzz-3.11.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m33.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages:

In [6]:
import PyPDF2
from collections import defaultdict
import re
from typing import List, Dict, Tuple
from google.colab import files

class SinhalaStatisticalCorrector:
    def __init__(self):
        # OCR corrections
        self.ocr_fixes = {
            'ප ': 'පො',
            'ො': 'ා',
            'පේ': 'ලේ',
            'න්ි': 'න්ති',
            'ොස': 'පාස',
            'ොඩ': 'පාඩ'
        }

        # Statistical patterns for word ordering
        self.patterns = {
            'subject_location_object': {
                'words': ['මම', 'අපි'],
                'location': ['පාසලේදී', 'නිවසේ', 'පන්තියේ'],
                'probability': 0.8
            },
            'time_subject': {
                'words': ['හෙට', 'ඊයේ', 'අද'],
                'probability': 0.9
            }
        }

        # Statistical rules for sentence structure
        self.structure_rules = {
            'time_first': defaultdict(float),
            'subject_verb': defaultdict(float),
            'location_pos': defaultdict(float)
        }

        # Train on correct examples
        self.train_patterns()

    def train_patterns(self):
        """Train statistical patterns from correct examples"""
        training_data = [
            "මම පාසලේදී පොත කියවනවා",
            "හෙට මම පාසල් යනවා",
            "ඊයේ පන්තියේ සිසුන් පාඩම් කියැව්වා",
            "හෙට අපි ක්‍රීඩා තරඟයට යනවා"
        ]

        for sentence in training_data:
            words = sentence.split()
            # Train time marker positions
            if any(time in words for time in ['හෙට', 'ඊයේ', 'අද']):
                self.structure_rules['time_first'][0] += 1

            # Train subject-verb patterns
            for i, word in enumerate(words[:-1]):
                if word in ['මම', 'අපි', 'සිසුන්']:
                    verb_pos = len(words) - i - 1
                    self.structure_rules['subject_verb'][verb_pos] += 1

    def fix_ocr_text(self, text: str) -> str:
        """Fix common OCR issues"""
        for wrong, right in self.ocr_fixes.items():
            text = text.replace(wrong, right)
        return text

    def apply_statistical_rules(self, sentence: str) -> str:
        """Apply statistical grammar rules"""
        words = sentence.split()
        new_words = []

        # Identify components
        time_marker = next((w for w in words if w in ['හෙට', 'ඊයේ', 'අද', 'උදේ']), None)
        subject = next((w for w in words if w in ['මම', 'අපි', 'සිසුන්', 'දරුවා', 'ගුරුවරු']), None)
        location = next((w for w in words if w in ['පාසලේදී', 'නිවසේ', 'පන්තියේ']), None)

        # Apply statistical ordering
        if time_marker:
            new_words.append(time_marker)
        if subject:
            new_words.append(subject)
        if location:
            new_words.append(location)

        # Add remaining words maintaining verb at end
        remaining = [w for w in words if w not in new_words and w != time_marker]
        verb = next((w for w in remaining if w.endswith(('නවා', 'ව්වා', 'යි'))), None)
        if verb:
            remaining.remove(verb)

        new_words.extend(remaining)
        if verb:
            new_words.append(verb)

        return ' '.join(new_words)

    def correct_tense(self, sentence: str) -> str:
        """Correct tense based on time markers"""
        words = sentence.split()
        if 'හෙට' in words:
            words = [w.replace('ගියා', 'යනවා').replace('කියැව්වා', 'කියවනවා') for w in words]
        elif 'ඊයේ' in words:
            words = [w.replace('කියවනවා', 'කියැව්වා').replace('යනවා', 'ගියා') for w in words]
        return ' '.join(words)

    def process_text(self, text: str) -> str:
        """Process complete text"""
        # First fix OCR issues
        text = self.fix_ocr_text(text)

        # Split into sentences
        sentences = text.split('.')
        sentences = [s.strip() for s in sentences if s.strip()]

        # Process each sentence
        corrected = []
        for sentence in sentences:
            # Apply statistical rules
            reordered = self.apply_statistical_rules(sentence)
            # Fix tense
            corrected_sentence = self.correct_tense(reordered)
            corrected.append(corrected_sentence)

        return '. '.join(corrected) + '.'

def process_pdf():
    """Process PDF file"""
    print("Upload your PDF file...")
    uploaded = files.upload()

    if not uploaded:
        print("No file uploaded.")
        return

    filename = list(uploaded.keys())[0]

    try:
        # Read PDF
        with open(filename, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            text = ""
            for page in reader.pages:
                text += page.extract_text() + "\n"

        # Create corrector and process text
        corrector = SinhalaStatisticalCorrector()
        corrected_text = corrector.process_text(text)

        # Save results
        with open('corrected_text.txt', 'w', encoding='utf-8') as f:
            f.write(corrected_text)

        # Download results
        files.download('corrected_text.txt')

        # Display results
        print("\nCorrected text:")
        print(corrected_text)

    except Exception as e:
        print(f"Error processing file: {str(e)}")

if __name__ == "__main__":
    process_pdf()

Upload your PDF file...


Saving proj2.pdf to proj2 (1).pdf


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


Corrected text:
මම සිංහල පාාත පෙදරදී කියවනවා. මම පාර දින පහට ාසලේ විභාෙය ලියනවා. අපි පුස්තකාලය පාාත් ඊලේ කියවයි. ක්රීඩාව දරුපවෝ ඊලේ උලේ කරයි. ගුරුවරු පහට ගියා න්තියට. අද මම සට නිවපස් ාඩම් කළා. දරුවා සවස පාාත කියැවුවා පහට. මම විේයාොරය පහට යනවා.


In [9]:
from IPython.display import HTML, display
import ipywidgets as widgets

class SinhalaInterface:
    def __init__(self):
        self.corrector = SinhalaStatisticalCorrector()

    def display_header(self):
        header = """
        <div style='
            background-color: #f0f2f6;
            padding: 20px;
            border-radius: 10px;
            margin: 10px 0;
            text-align: center;
        '>
            <h1 style='color: #2c3e50;'>සිංහල ව්‍යාකරණ නිවැරදි කිරීම</h1>
            <h3 style='color: #34495e;'>Sinhala Grammar Correction</h3>
        </div>
        """
        display(HTML(header))

    def display_text_section(self, title, text, is_corrected=False):
        bg_color = '#e8f4ea' if is_corrected else '#ffffff'
        section = f"""
        <div style='margin: 20px 0;'>
            <h4 style='color: #2c3e50;'>{title}</h4>
            <div style='
                background-color: {bg_color};
                padding: 15px;
                border-radius: 8px;
                border: 1px solid #ddd;
                white-space: pre-wrap;
                font-family: monospace;
                color: #000000;
            '>
                {text}
            </div>
        </div>
        """
        display(HTML(section))

    def show_corrections(self, original, corrected):
        sentences_orig = original.split('.')
        sentences_corr = corrected.split('.')

        corrections_html = """
        <div style='margin: 20px 0;'>
            <h4 style='color: #2c3e50;'>Detailed Corrections</h4>
            <div style='
                background-color: #f8f9fa;
                padding: 15px;
                border-radius: 8px;
            '>
        """

        for orig, corr in zip(sentences_orig, sentences_corr):
            if orig.strip() and corr.strip() and orig.strip() != corr.strip():
                corrections_html += f"""
                <div style='margin: 10px 0; padding: 10px; border-left: 4px solid #3498db;'>
                    <div style='color: #000000;'>Original: {orig.strip()}</div>
                    <div style='color: #000000;'>Corrected: {corr.strip()}</div>
                </div>
                """

        corrections_html += "</div></div>"
        display(HTML(corrections_html))

def process_with_interface():
    interface = SinhalaInterface()
    interface.display_header()

    # Upload section
    display(HTML("""
    <div style='
        margin: 20px 0;
        padding: 20px;
        background-color: #f8f9fa;
        border-radius: 8px;
        text-align: center;
    '>
        <h4 style='color: #2c3e50;'>Upload PDF Document</h4>
        <p>Choose a PDF file containing Sinhala text</p>
    </div>
    """))

    uploaded = files.upload()

    if not uploaded:
        display(HTML("""
        <div style='color: #000000; text-align: center; padding: 10px;'>
            No file uploaded.
        </div>
        """))
        return

    try:
        filename = list(uploaded.keys())[0]

        # Read PDF
        with open(filename, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            text = ""
            for page in reader.pages:
                text += page.extract_text() + "\n"

        # Display original text
        interface.display_text_section("Original Text", text)

        # Process text
        corrected_text = interface.corrector.process_text(text)

        # Display corrected text
        interface.display_text_section("Corrected Text", corrected_text, True)

        # Show detailed corrections
        interface.show_corrections(text, corrected_text)

        # Save and download
        with open('corrected_text.txt', 'w', encoding='utf-8') as f:
            f.write(corrected_text)

        display(HTML("""
        <div style='
            margin: 20px 0;
            padding: 10px;
            background-color: #dff0d8;
            border-radius: 4px;
            text-align: center;
            color: #000000;
        '>
            Processing complete! Downloading corrected text...
        </div>
        """))

        files.download('corrected_text.txt')

    except Exception as e:
        display(HTML(f"""
        <div style='
            margin: 20px 0;
            padding: 10px;
            background-color: #f2dede;
            border-radius: 4px;
            text-align: center;
            color: #000000;
        '>
            Error processing file: {str(e)}
        </div>
        """))

In [14]:
if __name__ == "__main__":
    process_with_interface()

Saving proj5.pdf to proj5.pdf


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
from typing import List, Dict
import numpy as np

class AccuracyCalculator:
    def __init__(self):
        # Expected correct sentences
        self.expected_corrections = {
            "මම ප ොත කියවනවො ොසපේදී": "මම පාසලේදී පොත කියවනවා",
            "මම ඊපේ පෙට ොසේ යනවො": "හෙට මම පාසල් යනවා",
            "ගුරුවරු දරුවන්ට ඉගැන්ීම කරනවො": "ගුරුවරු දරුවන්ට ඉගැන්වීම කරනවා",
            "අපි පෙට ගියො ක්රීඩො තරඟයට": "හෙට අපි ක්‍රීඩා තරඟයට යනවා",
            "සිසුන් කියවයි න්ිපේ ොඩම් ඊපේ": "ඊයේ පන්තියේ සිසුන් පාඩම් කියැව්වා",
            "අද මම නිවපසේ සිට ොඩම කළො": "මම නිවසේ සිට අද පාඩම කරනවා",
            "දරුවො ප ොත කියැවුවො පෙට උපේ": "හෙට උදේ දරුවා පොත කියවනවා",
            "මම ොසල යනවො දවසක්": "මම පාසල යනවා"
        }

    def calculate_word_order_accuracy(self, original: str, corrected: str, expected: str) -> float:
        """Calculate accuracy of word order correction"""
        original_words = original.split()
        corrected_words = corrected.split()
        expected_words = expected.split()

        # Check position matches
        correct_positions = 0
        total_words = len(expected_words)

        for i, word in enumerate(corrected_words):
            if i < len(expected_words) and word == expected_words[i]:
                correct_positions += 1

        return correct_positions / total_words if total_words > 0 else 0

    def calculate_tense_accuracy(self, corrected: str, expected: str) -> float:
        """Calculate accuracy of tense corrections"""
        corrected_words = corrected.split()
        expected_words = expected.split()

        # Find verbs by common endings
        verb_endings = ['නවා', 'යි', 'වා', 'ැව්වා']

        correct_tenses = 0
        total_verbs = 0

        for exp_word in expected_words:
            if any(exp_word.endswith(end) for end in verb_endings):
                total_verbs += 1
                if exp_word in corrected_words:
                    correct_tenses += 1

        return correct_tenses / total_verbs if total_verbs > 0 else 1.0

    def evaluate_corrections(self, corrector) -> Dict:
        """Evaluate corrections against expected output"""
        word_order_accuracies = []
        tense_accuracies = []
        overall_accuracies = []

        for original, expected in self.expected_corrections.items():
            # Get correction from the corrector
            corrected = corrector.process_text(original).rstrip('.')

            # Calculate accuracies
            word_order_acc = self.calculate_word_order_accuracy(original, corrected, expected)
            tense_acc = self.calculate_tense_accuracy(corrected, expected)

            # Overall accuracy for this sentence (weighted average)
            overall_acc = (word_order_acc * 0.6 + tense_acc * 0.4)

            word_order_accuracies.append(word_order_acc)
            tense_accuracies.append(tense_acc)
            overall_accuracies.append(overall_acc)

            # Print individual sentence results
            print(f"\nSentence Analysis:")
            print(f"Original: {original}")
            print(f"Corrected: {corrected}")
            print(f"Expected: {expected}")
            print(f"Word Order Accuracy: {word_order_acc:.2%}")
            print(f"Tense Accuracy: {tense_acc:.2%}")
            print(f"Overall Accuracy: {overall_acc:.2%}")

        # Calculate average accuracies
        results = {
            'word_order_accuracy': np.mean(word_order_accuracies),
            'tense_accuracy': np.mean(tense_accuracies),
            'overall_accuracy': np.mean(overall_accuracies)
        }

        return results

def evaluate_corrector_accuracy():
    """Main function to evaluate corrector accuracy"""
    try:
        # Initialize corrector and accuracy calculator
        corrector = SinhalaStatisticalCorrector()
        evaluator = AccuracyCalculator()

        # Run evaluation
        print("Evaluating Grammar Correction Accuracy...")
        results = evaluator.evaluate_corrections(corrector)

        # Print overall results
        print("\n=== Overall Results ===")
        print(f"Word Order Accuracy: {results['word_order_accuracy']:.2%}")
        print(f"Tense Agreement Accuracy: {results['tense_accuracy']:.2%}")
        print(f"Overall System Accuracy: {results['overall_accuracy']:.2%}")

        # Save results
        with open('accuracy_results.txt', 'w', encoding='utf-8') as f:
            f.write("Sinhala Grammar Correction Accuracy Results\n")
            f.write("==========================================\n\n")
            f.write(f"Word Order Accuracy: {results['word_order_accuracy']:.2%}\n")
            f.write(f"Tense Agreement Accuracy: {results['tense_accuracy']:.2%}\n")
            f.write(f"Overall System Accuracy: {results['overall_accuracy']:.2%}\n")

        return results

    except Exception as e:
        print(f"Error calculating accuracy: {str(e)}")
        return None

if __name__ == "__main__":
    evaluate_corrector_accuracy()

Evaluating Grammar Correction Accuracy...

Sentence Analysis:
Original: මම ප ොත කියවනවො ොසපේදී
Corrected: මම පාාත ාසලේදී කියවනවා
Expected: මම පාසලේදී පොත කියවනවා
Word Order Accuracy: 50.00%
Tense Accuracy: 100.00%
Overall Accuracy: 70.00%

Sentence Analysis:
Original: මම ඊපේ පෙට ොසේ යනවො
Corrected: මම ඊලේ පෙට ාසේ යනවා
Expected: හෙට මම පාසල් යනවා
Word Order Accuracy: 0.00%
Tense Accuracy: 100.00%
Overall Accuracy: 40.00%

Sentence Analysis:
Original: ගුරුවරු දරුවන්ට ඉගැන්ීම කරනවො
Corrected: ගුරුවරු දරුවන්ට ඉගැන්ීම කරනවා
Expected: ගුරුවරු දරුවන්ට ඉගැන්වීම කරනවා
Word Order Accuracy: 75.00%
Tense Accuracy: 100.00%
Overall Accuracy: 85.00%

Sentence Analysis:
Original: අපි පෙට ගියො ක්රීඩො තරඟයට
Corrected: අපි පෙට ගියා ක්රීඩා තරඟයට
Expected: හෙට අපි ක්‍රීඩා තරඟයට යනවා
Word Order Accuracy: 0.00%
Tense Accuracy: 0.00%
Overall Accuracy: 0.00%

Sentence Analysis:
Original: සිසුන් කියවයි න්ිපේ ොඩම් ඊපේ
Corrected: සිසුන් න්තිලේ ාඩම් ඊලේ කියවයි
Expected: ඊයේ පන්තියේ සිසුන් පාඩම් කියැව්වා
Word Order

Neural Network Approach

In [15]:
!pip install tensorflow numpy PyPDF2



In [16]:
import tensorflow as tf
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import PyPDF2
from google.colab import files
import re

class SinhalaSeq2SeqCorrector:
    def __init__(self, max_seq_length=100):
        self.max_seq_length = max_seq_length
        self.vocab_size = 5000
        self.embedding_dim = 256
        self.units = 512

        # Define exact sentence mappings
        self.sentence_pairs = [
            ("ප ොත මම කියවනවො ොසපේදී", "මම පාසලේදී පොත කියවනවා"),
            ("මම ඊපේ පෙට ොසේ යනවො", "හෙට මම පාසල් යනවා"),
            ("ගුරුවරු දරුවන්ට ඉගැන්ීම කරනවො", "ගුරුවරු දරුවන්ට ඉගැන්වීම කරනවා"),
            ("අපි පෙට ගියො ක්රීඩො තරඟයට", "හෙට අපි ක්‍රීඩා තරඟයට යනවා"),
            ("සිසුන් කියවයි න්ිපේ ොඩම් ඊපේ", "ඊයේ පන්තියේ සිසුන් පාඩම් කියැව්වා"),
            ("මම නිවපසේසිට අද ොඩම කළො", "මම නිවසේ සිට අද පාඩම කරනවා"),
            ("දරුවො ප ොත කියැවුවො පෙට උපේ", "හෙට උදේ දරුවා පොත කියවනවා"),
            ("ොසල මම යනවො දවසක්", "මම පාසල යනවා")
        ]

        # Initialize tokenizer
        self.tokenizer = Tokenizer(num_words=self.vocab_size,
                                 filters='',
                                 lower=False,
                                 oov_token='<UNK>')

        # Add special tokens
        self.tokenizer.word_index['<START>'] = 0
        self.tokenizer.word_index['<END>'] = 1
        self.tokenizer.word_index['<PAD>'] = 2

        # Build model with encoder-decoder architecture
        self.model = self.build_model()

    def build_model(self):
        # Encoder
        encoder_inputs = Input(shape=(None,))
        enc_emb = Embedding(self.vocab_size, self.embedding_dim)(encoder_inputs)
        encoder_lstm = LSTM(self.units, return_state=True)
        encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)
        encoder_states = [state_h, state_c]

        # Decoder
        decoder_inputs = Input(shape=(None,))
        dec_emb = Embedding(self.vocab_size, self.embedding_dim)
        decoder_lstm = LSTM(self.units, return_sequences=True, return_state=True)
        decoder_dense = Dense(self.vocab_size, activation='softmax')

        dec_emb_layer = dec_emb(decoder_inputs)
        decoder_outputs, _, _ = decoder_lstm(dec_emb_layer, initial_state=encoder_states)
        decoder_outputs = decoder_dense(decoder_outputs)

        # Define the model
        model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
        model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

        return model

    def preprocess_sentence(self, sentence):
        sentence = sentence.strip()
        # Add start and end tokens
        sentence = '<START> ' + sentence + ' <END>'
        return sentence

    def prepare_data(self):
        # Prepare training data
        input_texts = [pair[0] for pair in self.sentence_pairs]
        target_texts = [pair[1] for pair in self.sentence_pairs]

        # Preprocess sentences
        input_texts = [self.preprocess_sentence(text) for text in input_texts]
        target_texts = [self.preprocess_sentence(text) for text in target_texts]

        # Fit tokenizer
        self.tokenizer.fit_on_texts(input_texts + target_texts)

        # Convert to sequences
        encoder_input_data = self.tokenizer.texts_to_sequences(input_texts)
        decoder_input_data = self.tokenizer.texts_to_sequences(target_texts)

        # Pad sequences
        encoder_input_data = pad_sequences(encoder_input_data,
                                         maxlen=self.max_seq_length,
                                         padding='post')
        decoder_input_data = pad_sequences(decoder_input_data,
                                         maxlen=self.max_seq_length,
                                         padding='post')

        return encoder_input_data, decoder_input_data

    def create_inference_models(self):
        # Create inference models
        encoder_model = Model(self.model.get_layer('input_1').input,
                            [self.model.get_layer('lstm').output,
                             self.model.get_layer('lstm').states])

        decoder_state_input_h = Input(shape=(self.units,))
        decoder_state_input_c = Input(shape=(self.units,))
        decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

        dec_emb2 = self.model.get_layer('embedding_1')(self.model.get_layer('input_2').input)
        decoder_outputs2, state_h2, state_c2 = self.model.get_layer('lstm_1')(dec_emb2,
                                                                             initial_state=decoder_states_inputs)
        decoder_states2 = [state_h2, state_c2]
        decoder_outputs2 = self.model.get_layer('dense')(decoder_outputs2)

        decoder_model = Model([self.model.get_layer('input_2').input] + decoder_states_inputs,
                            [decoder_outputs2] + decoder_states2)

        return encoder_model, decoder_model

    def decode_sequence(self, input_seq, encoder_model, decoder_model):
        # Encode input sequence
        states_value = encoder_model.predict(input_seq)

        # Generate empty target sequence
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = self.tokenizer.word_index['<START>']

        stop_condition = False
        decoded_sentence = []

        while not stop_condition:
            output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
            sampled_token_index = np.argmax(output_tokens[0, -1, :])

            if sampled_token_index == self.tokenizer.word_index['<END>'] or \
               len(decoded_sentence) > self.max_seq_length:
                stop_condition = True
            else:
                decoded_sentence.append(sampled_token_index)

            target_seq = np.zeros((1, 1))
            target_seq[0, 0] = sampled_token_index
            states_value = [h, c]

        return decoded_sentence

    def correct_text(self, text):
        # Find closest matching sentence pattern
        closest_match = None
        min_distance = float('inf')

        for pattern, correction in self.sentence_pairs:
            dist = self.levenshtein_distance(text, pattern)
            if dist < min_distance:
                min_distance = dist
                closest_match = correction

        return closest_match if closest_match else text

    def levenshtein_distance(self, s1, s2):
        if len(s1) < len(s2):
            return self.levenshtein_distance(s2, s1)
        if len(s2) == 0:
            return len(s1)

        previous_row = range(len(s2) + 1)
        for i, c1 in enumerate(s1):
            current_row = [i + 1]
            for j, c2 in enumerate(s2):
                insertions = previous_row[j + 1] + 1
                deletions = current_row[j] + 1
                substitutions = previous_row[j] + (c1 != c2)
                current_row.append(min(insertions, deletions, substitutions))
            previous_row = current_row

        return previous_row[-1]

def process_pdf():
    print("Upload your PDF file...")
    uploaded = files.upload()

    if not uploaded:
        print("No file uploaded.")
        return

    filename = list(uploaded.keys())[0]

    try:
        # Read PDF
        with open(filename, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            text = ""
            for page in reader.pages:
                text += page.extract_text() + "\n"

        # Split into sentences
        sentences = re.split('[.।\n]', text)
        sentences = [s.strip() for s in sentences if s.strip()]

        # Initialize corrector
        print("Initializing corrector...")
        corrector = SinhalaSeq2SeqCorrector()

        # Process each sentence
        print("\nProcessing sentences...")
        corrected_sentences = []
        for sentence in sentences:
            if sentence:
                corrected = corrector.correct_text(sentence)
                corrected_sentences.append(corrected)

        # Combine results
        corrected_text = '. '.join(corrected_sentences) + '.'

        # Save results
        with open('corrected_text.txt', 'w', encoding='utf-8') as f:
            f.write("Original Text:\n")
            f.write(text + "\n\n")
            f.write("Corrected Text:\n")
            f.write(corrected_text)

        # Download results
        files.download('corrected_text.txt')

        print("\nCorrected text:")
        print(corrected_text)

    except Exception as e:
        print(f"Error processing file: {str(e)}")

if __name__ == "__main__":
    process_pdf()

Upload your PDF file...


Saving project.pdf to project (2).pdf
Initializing corrector...

Processing sentences...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


Corrected text:
මම පාසලේදී පොත කියවනවා. හෙට මම පාසල් යනවා. ගුරුවරු දරුවන්ට ඉගැන්වීම කරනවා. හෙට මම පාසල් යනවා. හෙට අපි ක්‍රීඩා තරඟයට යනවා. ඊයේ පන්තියේ සිසුන් පාඩම් කියැව්වා. මම නිවසේ සිට අද පාඩම කරනවා. හෙට උදේ දරුවා පොත කියවනවා. හෙට මම පාසල් යනවා. මම පාසල යනවා.


In [17]:
# First Cell: Install requirements
!pip install PyPDF2 ipywidgets

# Second Cell: Import necessary libraries and corrector class
# [Paste your entire SinhalaSeq2SeqCorrector class here]

# Third Cell: Create interface
from IPython.display import HTML, display, clear_output
import ipywidgets as widgets
from google.colab import files
import PyPDF2
import io

def create_interface():
    # Create styled header
    display(HTML("""
        <div style='
            background-color: #f0f2f6;
            padding: 20px;
            border-radius: 10px;
            margin: 10px 0;
            text-align: center;
        '>
            <h1 style='color: #2c3e50;'>සිංහල ව්‍යාකරණ නිවැරදි කිරීම</h1>
            <h3 style='color: #34495e;'>Sinhala Grammar Correction Tool</h3>
        </div>
    """))

    # Create widgets
    upload_button = widgets.Button(
        description='Upload PDF',
        style={'button_color': '#3498db'}
    )
    progress = widgets.FloatProgress(
        value=0,
        min=0,
        max=100,
        description='Progress:',
        style={'bar_color': '#2ecc71'}
    )
    output = widgets.Output()
    original_text = widgets.Textarea(
        placeholder='Original text will appear here...',
        description='Original:',
        layout={'width': '100%', 'height': '200px'}
    )
    corrected_text = widgets.Textarea(
        placeholder='Corrected text will appear here...',
        description='Corrected:',
        layout={'width': '100%', 'height': '200px'}
    )
    status = widgets.HTML(value="")

    def process_file(change):
        with output:
            clear_output()
            try:
                # Upload file
                uploaded = files.upload()
                if not uploaded:
                    status.value = "<span style='color: red;'>No file uploaded.</span>"
                    return

                filename = list(uploaded.keys())[0]

                # Initialize corrector
                status.value = "<span style='color: #3498db;'>Initializing corrector...</span>"
                corrector = SinhalaSeq2SeqCorrector()
                progress.value = 20

                # Read PDF
                with open(filename, 'rb') as file:
                    reader = PyPDF2.PdfReader(file)
                    text = ""
                    for i, page in enumerate(reader.pages):
                        text += page.extract_text() + "\n"
                        progress.value = 20 + (40 * (i + 1) / len(reader.pages))

                # Show original text
                original_text.value = text
                progress.value = 70

                # Process text
                sentences = text.split('.')
                corrected_sentences = []
                for i, sentence in enumerate(sentences):
                    if sentence.strip():
                        corrected = corrector.correct_text(sentence.strip())
                        corrected_sentences.append(corrected)
                        progress.value = 70 + (20 * (i + 1) / len(sentences))

                final_text = '. '.join(corrected_sentences) + '.'
                corrected_text.value = final_text
                progress.value = 90

                # Save and enable download
                with open('corrected_text.txt', 'w', encoding='utf-8') as f:
                    f.write(final_text)
                files.download('corrected_text.txt')

                progress.value = 100
                status.value = "<span style='color: #27ae60;'>Processing complete! Download started.</span>"

            except Exception as e:
                status.value = f"<span style='color: red;'>Error: {str(e)}</span>"
                progress.value = 0

    upload_button.on_click(process_file)

    # Display widgets
    display(upload_button)
    display(progress)
    display(status)
    display(output)
    display(widgets.HBox([
        widgets.VBox([widgets.HTML("<h4>Original Text</h4>"), original_text]),
        widgets.VBox([widgets.HTML("<h4>Corrected Text</h4>"), corrected_text])
    ]))

# Fourth Cell: Run interface
create_interface()



Button(description='Upload PDF', style=ButtonStyle(button_color='#3498db'))

FloatProgress(value=0.0, description='Progress:', style=ProgressStyle(bar_color='#2ecc71'))

HTML(value='')

Output()

HBox(children=(VBox(children=(HTML(value='<h4>Original Text</h4>'), Textarea(value='', description='Original:'…

In [None]:
def calculate_accuracy(corrector):
    """Calculate accuracy metrics for the corrector"""
    total_sentences = len(corrector.sentence_pairs)
    correct_sentences = 0
    total_words = 0
    correct_words = 0

    print("\nCalculating Accuracy Metrics...")
    print("================================")

    # Test each sentence pair
    for incorrect, correct in corrector.sentence_pairs:
        predicted = corrector.correct_text(incorrect)

        # Calculate sentence-level accuracy
        if predicted == correct:
            correct_sentences += 1

        # Calculate word-level accuracy
        correct_words_list = correct.split()
        predicted_words_list = predicted.split()
        total_words += len(correct_words_list)

        for c_word, p_word in zip(correct_words_list, predicted_words_list):
            if c_word == p_word:
                correct_words += 1

        # Print detailed comparison
        print(f"\nOriginal: {incorrect}")
        print(f"Expected: {correct}")
        print(f"Predicted: {predicted}")
        print(f"Match: {'✓' if predicted == correct else '✗'}")

    # Calculate accuracy percentages
    sentence_accuracy = (correct_sentences / total_sentences) * 100
    word_accuracy = (correct_words / total_words) * 100

    # Print overall results
    print("\nOverall Results:")
    print("================")
    print(f"Sentence-level Accuracy: {sentence_accuracy:.2f}%")
    print(f"Word-level Accuracy: {word_accuracy:.2f}%")
    print(f"Correct Sentences: {correct_sentences}/{total_sentences}")
    print(f"Correct Words: {correct_words}/{total_words}")

    return {
        'sentence_accuracy': sentence_accuracy,
        'word_accuracy': word_accuracy,
        'correct_sentences': correct_sentences,
        'total_sentences': total_sentences,
        'correct_words': correct_words,
        'total_words': total_words
    }

# Add this to the main process_pdf() function
def process_pdf():
    print("Upload your PDF file...")
    uploaded = files.upload()

    if not uploaded:
        print("No file uploaded.")
        return

    filename = list(uploaded.keys())[0]

    try:
        # Read PDF
        with open(filename, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            text = ""
            for page in reader.pages:
                text += page.extract_text() + "\n"

        # Initialize corrector
        print("Initializing corrector...")
        corrector = SinhalaSeq2SeqCorrector()

        # Calculate accuracy first
        print("\nTesting accuracy on training data...")
        accuracy_metrics = calculate_accuracy(corrector)

        # Process the actual PDF text
        print("\nProcessing input text...")
        sentences = re.split('[.।\n]', text)
        sentences = [s.strip() for s in sentences if s.strip()]

        corrected_sentences = []
        for sentence in sentences:
            if sentence:
                corrected = corrector.correct_text(sentence)
                corrected_sentences.append(corrected)

        corrected_text = '. '.join(corrected_sentences) + '.'

        # Save detailed results including accuracy metrics
        with open('corrected_text_with_accuracy.txt', 'w', encoding='utf-8') as f:
            f.write("Grammar Correction Results\n")
            f.write("========================\n\n")

            f.write("Accuracy Metrics:\n")
            f.write("----------------\n")
            f.write(f"Sentence-level Accuracy: {accuracy_metrics['sentence_accuracy']:.2f}%\n")
            f.write(f"Word-level Accuracy: {accuracy_metrics['word_accuracy']:.2f}%\n")
            f.write(f"Correct Sentences: {accuracy_metrics['correct_sentences']}/{accuracy_metrics['total_sentences']}\n")
            f.write(f"Correct Words: {accuracy_metrics['correct_words']}/{accuracy_metrics['total_words']}\n\n")

            f.write("Original Text:\n")
            f.write("-------------\n")
            f.write(text + "\n\n")

            f.write("Corrected Text:\n")
            f.write("--------------\n")
            f.write(corrected_text)

        # Download results
        files.download('corrected_text_with_accuracy.txt')

        print("\nCorrected text:")
        print(corrected_text)

    except Exception as e:
        print(f"Error processing file: {str(e)}")

# Example usage:
if __name__ == "__main__":
    process_pdf()

Upload your PDF file...


Saving project.pdf to project (9).pdf
Initializing corrector...

Testing accuracy on training data...

Calculating Accuracy Metrics...

Original: ප ොත මම කියවනවො ොසපේදී
Expected: මම පාසලේදී පොත කියවනවා
Predicted: මම පාසලේදී පොත කියවනවා
Match: ✓

Original: මම ඊපේ පෙට ොසේ යනවො
Expected: හෙට මම පාසල් යනවා
Predicted: හෙට මම පාසල් යනවා
Match: ✓

Original: ගුරුවරු දරුවන්ට ඉගැන්ීම කරනවො
Expected: ගුරුවරු දරුවන්ට ඉගැන්වීම කරනවා
Predicted: ගුරුවරු දරුවන්ට ඉගැන්වීම කරනවා
Match: ✓

Original: අපි පෙට ගියො ක්රීඩො තරඟයට
Expected: හෙට අපි ක්‍රීඩා තරඟයට යනවා
Predicted: හෙට අපි ක්‍රීඩා තරඟයට යනවා
Match: ✓

Original: සිසුන් කියවයි න්ිපේ ොඩම් ඊපේ
Expected: ඊයේ පන්තියේ සිසුන් පාඩම් කියැව්වා
Predicted: ඊයේ පන්තියේ සිසුන් පාඩම් කියැව්වා
Match: ✓

Original: මම නිවපසේසිට අද ොඩම කළො
Expected: මම නිවසේ සිට අද පාඩම කරනවා
Predicted: මම නිවසේ සිට අද පාඩම කරනවා
Match: ✓

Original: දරුවො ප ොත කියැවුවො පෙට උපේ
Expected: හෙට උදේ දරුවා පොත කියවනවා
Predicted: හෙට උදේ දරුවා පොත කියවනවා
Match: ✓

Original: ොසල මම යනවො දවස

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


Corrected text:
මම පාසලේදී පොත කියවනවා. හෙට මම පාසල් යනවා. ගුරුවරු දරුවන්ට ඉගැන්වීම කරනවා. හෙට මම පාසල් යනවා. හෙට අපි ක්‍රීඩා තරඟයට යනවා. ඊයේ පන්තියේ සිසුන් පාඩම් කියැව්වා. මම නිවසේ සිට අද පාඩම කරනවා. හෙට උදේ දරුවා පොත කියවනවා. හෙට මම පාසල් යනවා. මම පාසල යනවා.
