In [2]:
!pip install indic-nlp-library pandas numpy jiwer fasttext
import sys
sys.path.append("/opt/conda/lib/python3.10/site-packages")

print("dependencies installed")


Collecting indic-nlp-library
  Downloading indic_nlp_library-0.92-py3-none-any.whl.metadata (5.7 kB)
Collecting jiwer
  Downloading jiwer-4.0.0-py3-none-any.whl.metadata (3.3 kB)
Collecting sphinx-argparse (from indic-nlp-library)
  Downloading sphinx_argparse-0.5.2-py3-none-any.whl.metadata (3.7 kB)
Collecting morfessor (from indic-nlp-library)
  Downloading Morfessor-2.0.6-py3-none-any.whl.metadata (628 bytes)
Collecting rapidfuzz>=3.9.7 (from jiwer)
  Downloading rapidfuzz-3.14.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (12 kB)
Downloading indic_nlp_library-0.92-py3-none-any.whl (40 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.3/40.3 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading jiwer-4.0.0-py3-none-any.whl (23 kB)
Downloading rapidfuzz-3.14.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m105.1 MB/s[0m eta [3

In [3]:
import numpy as np
import warnings
warnings.filterwarnings("ignore")

def monkey_patch_numpy():
    """Make ``np.array(..., copy=False)`` mean "copy only if needed".

    Replaces ``np.array`` with a wrapper that first tries the call with
    ``copy=False`` and, if NumPy raises ``ValueError`` (NumPy 2 does so when a
    copy cannot be avoided), retries with ``copy=True``. All other calls are
    forwarded unchanged, so:

    * the second positional argument is still ``dtype`` (as in ``np.array``),
    * the default remains ``copy=True``,
    * keyword arguments such as ``ndmin``/``order``/``subok`` keep working.

    The function is idempotent: calling it again (for example when the cell is
    re-run) does not wrap the wrapper a second time.
    """
    if getattr(np.array, "_copy_false_compat", False):
        return  # already patched in this kernel

    original_array = np.array

    def patched_array(obj, dtype=None, *args, copy=True, **kwargs):
        if copy is False:
            try:
                # Succeeds whenever no copy is required (and always on NumPy 1.x).
                return original_array(obj, dtype, *args, copy=False, **kwargs)
            except ValueError:
                # A copy is unavoidable; fall back to copying. If the
                # ValueError had another cause, this call raises it again.
                return original_array(obj, dtype, *args, copy=True, **kwargs)
        return original_array(obj, dtype, *args, copy=copy, **kwargs)

    patched_array._copy_false_compat = True  # marker used by the guard above
    np.array = patched_array

monkey_patch_numpy()

import re
import unicodedata
import pandas as pd
from jiwer import wer
import time
from typing import Dict, List

In [4]:
class HunspellChecker:
    """Minimal word-list checker used as the dictionary stage of Stage 1B.

    This class does not call the Hunspell library; it checks words against a
    small built-in list per language code ("hi", "kn", "te").
    """

    # Built-in word lists keyed by language code.
    _WORD_LISTS = {
        "hi": ["नमस्ते", "कैसे", "आज", "मौसम", "कल", "है", "हो"],
        "kn": ["ನಮಸ್ಕಾರ", "ಹೇಗಿದ್ದೀರಿ", "ಇಂದು", "ಹವಾಮಾನ", "ಇದು"],
        "te": ["నమస్తే", "ఎలా", "ఈరోజు", "వాతావరణం", "ఇది"]
    }

    def __init__(self, language="hi"):
        """Store the language code and load its word list."""
        self.language = language
        self.dictionary = self.load_dictionary(language)

    def load_dictionary(self, language):
        """Return a fresh list of known words for ``language`` ([] if unknown)."""
        return list(self._WORD_LISTS.get(language, []))

    def check(self, word):
        """Return True when ``word`` is exactly one of the dictionary words."""
        return word in self.dictionary

    def suggest(self, word):
        """Return dictionary words containing ``word`` minus its first or last character.

        Empty fragments are skipped: the empty string is a substring of every
        word, so without this an empty or one-character ``word`` would return
        the whole dictionary as "suggestions".
        """
        fragments = [fragment for fragment in (word[:-1], word[1:]) if fragment]
        return [
            dict_word
            for dict_word in self.dictionary
            if any(fragment in dict_word for fragment in fragments)
        ]

print("HunspellChecker Created")

HunspellChecker Created


In [5]:
class IndicSpellChecker:
    """Table-driven corrector that swaps known romanised tokens for native-script words."""

    def __init__(self, language="hi"):
        """Remember the language code and fetch its replacement table."""
        self.language = language
        self.typo_map = self.load_typo_map(language)

    def load_typo_map(self, language):
        """Return the token -> replacement table for ``language`` ({} if unsupported)."""
        hindi = {
            "mausm": "मौसम", "samay": "समय", "aaj": "आज",
            "kl": "कल", "kse": "कैसे", "nmste": "नमस्ते", "ka": "का"
        }
        kannada = {
            "havama": "ಹವಾಮಾನ", "samya": "ಸಮಯ", "indu": "ಇಂದು"
        }
        telugu = {
            "vatavra": "వాతావరణం", "samya": "సమయం", "iroju": "ఈరోజు"
        }
        tables = {"hi": hindi, "kn": kannada, "te": telugu}
        if language in tables:
            return tables[language]
        return {}

    def correct(self, text):
        """Replace each whitespace-separated token found in the table (case-insensitively).

        Tokens without an entry are kept exactly as written; the result is
        re-joined with single spaces.
        """
        return " ".join(
            self.typo_map.get(token.lower().strip(), token)
            for token in text.split()
        )

print("IndicSpellChecker created")

IndicSpellChecker created


In [6]:
class Stage1BTextValidation:
    """Stage 1B text clean-up: Unicode normalisation followed by typo correction."""

    def __init__(self, language="hi"):
        """Create the per-language dictionary checker and typo corrector."""
        self.language = language
        self.hunspell = HunspellChecker(language)
        self.indicspell = IndicSpellChecker(language)

    def validate_text(self, text):
        """Normalise ``text`` and apply the IndicSpell typo table.

        Args:
            text: Input string. Anything falsy or not a ``str`` yields "".

        Returns:
            The cleaned text with tokens separated by single spaces.
        """
        if not text or not isinstance(text, str):
            return ""

        # Step 1: Basic cleaning.
        # NFKC folds compatibility characters like NFKD does, but recomposes
        # afterwards. NFKD leaves two-part Indic vowel signs split into
        # separate code points (e.g. U+0CC7 -> U+0CC6 U+0CD5), so the output
        # no longer compares equal to the same word in its composed form.
        text = unicodedata.normalize('NFKC', text)
        text = re.sub(r'\s+', ' ', text).strip()

        # Step 2: IndicSpell correction (primary)
        corrected = self.indicspell.correct(text)

        # Step 3: every token is kept whether or not it is in the dictionary
        # (unknown words may be proper nouns), so no dictionary lookup is
        # needed to build the result; only the spacing is normalised.
        return " ".join(corrected.split())

    def process(self, text):
        """Main processing method; delegates to :meth:`validate_text`."""
        return self.validate_text(text)

print("Stage 1B pipeline created")

Stage 1B pipeline created


In [7]:
# Stage 1B report cases: language code -> list of (input_text, expected_output) pairs.
test_cases = {
    "hi": [
        ("mausm", "मौसम"),
        ("samay", "समय"),
        # NOTE(review): the Hindi typo map in IndicSpellChecker also rewrites
        # "aaj" and "ka", so the pipeline produces "आज का मौसम" and this case is
        # reported as FAIL. Confirm whether the expectation or the map is intended.
        ("aaj ka mausm", "aaj ka मौसम"),
        # "ho" has no entry in the Hindi typo map, so it is expected to stay as typed.
        ("nmste kse ho", "नमस्ते कैसे ho")
    ],
    "kn": [
        ("havama", "ಹವಾಮಾನ"),
        ("samya", "ಸಮಯ")
    ],
    "te": [
        ("vatavra", "వాతావరణం"),
        ("samya", "సమయం")
    ]
}

def run_report_tests():
    """Run every case in ``test_cases`` through Stage 1B.

    Returns:
        dict mapping a language display name to a list of records with the
        keys ``input``, ``output``, ``expected`` and ``match``.
    """
    display_names = {"hi": "Hindi", "kn": "Kannada", "te": "Telugu"}
    report = {}

    for code, pairs in test_cases.items():
        validator = Stage1BTextValidation(code)
        records = []
        for source_text, wanted in pairs:
            produced = validator.process(source_text)
            records.append({
                "input": source_text,
                "output": produced,
                "expected": wanted,
                # Compare ignoring leading/trailing whitespace only.
                "match": produced.strip() == wanted.strip(),
            })
        report[display_names[code]] = records

    return report

results = run_report_tests()
print("Report test cases completed")


Report test cases completed


In [8]:
def display_test_results(results):
    """Print a per-language PASS/FAIL listing followed by overall totals.

    Args:
        results: mapping of language name -> list of records, each providing
            the keys ``input``, ``output`` and ``match``.
    """
    rule = "=" * 60
    print("STAGE 1B REPORT TEST RESULTS")
    print(rule)

    grand_passed = 0
    grand_total = 0

    for language, tests in results.items():
        print(f"\n{language}:")

        for position, test in enumerate(tests, start=1):
            verdict = "PASS" if test["match"] else "FAIL"
            print(f"  {position}. {test['input']} → {test['output']} ... {verdict}")

        passed_here = sum(1 for test in tests if test["match"])
        print(f"  {passed_here}/{len(tests)} passed")

        grand_passed += passed_here
        grand_total += len(tests)

    print("\n" + rule)
    # Guard against an empty report so the percentage never divides by zero.
    if grand_total > 0:
        overall_accuracy = (grand_passed / grand_total) * 100
    else:
        overall_accuracy = 0
    print(f"Overall: {grand_passed}/{grand_total} passed ({overall_accuracy:.1f}%)")

# Print the Stage 1B report that run_report_tests() stored in `results`.
display_test_results(results)

STAGE 1B REPORT TEST RESULTS

Hindi:
  1. mausm → मौसम ... PASS
  2. samay → समय ... PASS
  3. aaj ka mausm → आज का मौसम ... FAIL
  4. nmste kse ho → नमस्ते कैसे ho ... PASS
  3/4 passed

Kannada:
  1. havama → ಹವಾಮಾನ ... PASS
  2. samya → ಸಮಯ ... PASS
  2/2 passed

Telugu:
  1. vatavra → వాతావరణం ... PASS
  2. samya → సమయం ... PASS
  2/2 passed

Overall: 7/8 passed (87.5%)


In [9]:
from datasets import load_dataset
import os

# Settings for the Wikipedia samples used in the Stage 2 evaluation.
WIKI_SNAPSHOT = "20231101"            # dump date in the "wikimedia/wikipedia" config name
WIKI_SAMPLE_SIZE = 5000               # number of articles kept per language
WIKI_OUTPUT_DIR = "/kaggle/working"
WIKI_FILE_STEMS = {"hi": "hindi", "kn": "kannada", "te": "telugu"}


def save_wiki_sample(lang_code, out_path, n_articles=WIKI_SAMPLE_SIZE, snapshot=WIKI_SNAPSHOT):
    """Download one Wikipedia language edition and write its first articles to a text file.

    Args:
        lang_code: language part of the dataset config name, e.g. "hi".
        out_path: destination text file (UTF-8).
        n_articles: number of leading rows of the train split to keep.
        snapshot: dump date part of the dataset config name.

    Returns:
        The sliced rows as returned by indexing the dataset with ``[:n_articles]``;
        the article bodies are under the "text" key.
    """
    sample = load_dataset("wikimedia/wikipedia", f"{snapshot}.{lang_code}", split="train")[:n_articles]
    with open(out_path, "w", encoding="utf-8") as f:
        # Articles are joined with newlines. An article body may itself contain
        # newlines, so the file can hold more lines than articles.
        f.write("\n".join(sample["text"]))
    return sample


print("Downloading Wikipedia datasets...")

wiki_samples = {}
wiki_paths = {}
for lang_code, stem in WIKI_FILE_STEMS.items():
    wiki_paths[lang_code] = os.path.join(WIKI_OUTPUT_DIR, f"{stem}.txt")
    wiki_samples[lang_code] = save_wiki_sample(lang_code, wiki_paths[lang_code])

# Keep the original per-language names available for any cell that uses them.
hindi_wiki = wiki_samples["hi"]
kannada_wiki = wiki_samples["kn"]
telugu_wiki = wiki_samples["te"]

print("Wikipedia data saved:")
for lang_code, path in wiki_paths.items():
    # Report the real row count; the rows are articles, not sentences.
    print(f"- {path} ({len(wiki_samples[lang_code]['text']):,} articles)")


Downloading Wikipedia datasets...


README.md: 0.00B [00:00, ?B/s]

20231101.hi/train-00000-of-00002.parquet:   0%|          | 0.00/135M [00:00<?, ?B/s]

20231101.hi/train-00001-of-00002.parquet:   0%|          | 0.00/103M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/163093 [00:00<?, ? examples/s]

20231101.kn/train-00000-of-00001.parquet:   0%|          | 0.00/147M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/31437 [00:00<?, ? examples/s]

20231101.te/train-00000-of-00002.parquet:   0%|          | 0.00/118M [00:00<?, ?B/s]

20231101.te/train-00001-of-00002.parquet:   0%|          | 0.00/97.5M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87854 [00:00<?, ? examples/s]

Wikipedia data saved:
- /kaggle/working/hindi.txt (5,000 sentences)
- /kaggle/working/kannada.txt (5,000 sentences)
- /kaggle/working/telugu.txt (5,000 sentences)


In [10]:
import urllib.request
import os
import time

LID_BIN_PATH = "/kaggle/working/lid.176.bin"
LID_MODEL_URL = "https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin"


def _report_download_progress(block, blocksize, totalsize):
    """``urlretrieve`` report hook: print megabytes and percent downloaded so far."""
    if totalsize > 0:
        # block * blocksize can overshoot on the final block; cap at the total.
        downloaded = min(int(block * blocksize), totalsize)
        percent = (downloaded * 100) / totalsize
        print(f"{downloaded / (1024*1024):.1f}MB downloaded ({percent:.0f}%)", end="\r")


def download_lid_model(url=LID_MODEL_URL, dest=LID_BIN_PATH):
    """Download the fastText language-ID model to ``dest``.

    The file is written to ``dest + ".part"`` and renamed only after the
    transfer finishes, so an interrupted download never leaves a truncated
    file at ``dest`` that a later run would treat as "already downloaded".

    Returns:
        True on success, False if the download failed (the error is printed
        and any partial file is removed).
    """
    partial_path = dest + ".part"
    try:
        urllib.request.urlretrieve(url, partial_path, _report_download_progress)
        os.replace(partial_path, dest)
    except Exception as e:
        if os.path.exists(partial_path):
            os.remove(partial_path)
        print(f"Download failed: {e}")
        return False
    print("Download complete!")
    return True


if not os.path.exists(LID_BIN_PATH):
    print("Downloading LARGE fastText LID model (lid.176.bin - 126MB)...")
    download_lid_model()
else:
    print("Large model already downloaded")

# Verify
if os.path.exists(LID_BIN_PATH):
    size_mb = os.path.getsize(LID_BIN_PATH) / (1024 * 1024)
    print(f"Model verified: {size_mb:.1f} MB")
else:
    print("ERROR: Model file missing")


Downloading LARGE fastText LID model (lid.176.bin - 126MB)...
Download complete!
Model verified: 125.2 MB


In [11]:
import fasttext

# Load the language-ID model from the path defined in the download cell.
# NOTE(review): LanguageIdentifier.__init__ loads the same file again by
# default, so the model may be held in memory twice - confirm `lid_model`
# is still needed.
lid_model = fasttext.load_model(LID_BIN_PATH)

print("fastText LID model loaded successfully")
# The count below is a fixed string; it is not read from the loaded model.
print("Supported languages: 176 total")


fastText LID model loaded successfully
Supported languages: 176 total


In [12]:
from dataclasses import dataclass
import fasttext

@dataclass
class LIDResult:
    """Result of one language-identification call."""
    lang_code: str     # predicted label without the "__label__" prefix, or "unk" / "error"
    lang_name: str     # display name, or a status such as "Low confidence" / "Unknown"
    confidence: float  # probability of the top label (0.0 for "unk" and "error")
    route_key: str     # downstream route: "nlu_<code>", "nlu_indic", "nlu_other" or "nlu_fallback"

class LanguageIdentifier:
    """Stage 2B: detect the language of a text with fastText and pick an NLU route."""

    INDIAN_LANGUAGES = {
        "hi": "Hindi", "kn": "Kannada", "te": "Telugu",
        "ta": "Tamil", "ml": "Malayalam", "mr": "Marathi",
        "gu": "Gujarati", "bn": "Bengali", "pa": "Punjabi",
        "or": "Odia", "ur": "Urdu", "as": "Assamese"
    }

    # Languages that get their own "nlu_<code>" route.
    DEDICATED_ROUTE_LANGS = frozenset({"hi", "kn", "te"})

    def __init__(self, model_path="/kaggle/working/lid.176.bin", confidence_threshold=0.8, model=None):
        """Initialize with model path.

        Args:
            model_path: fastText model file to load when ``model`` is not given.
            confidence_threshold: results below this probability are routed to
                "nlu_fallback".
            model: optional already-loaded model exposing ``predict(text)``;
                when provided, ``model_path`` is not read, which avoids loading
                the same file twice.
        """
        # Set before loading so the attribute exists even when loading fails.
        self.conf_threshold = confidence_threshold
        self.model = model
        if model is not None:
            return
        try:
            self.model = fasttext.load_model(model_path)
            print(f"Model loaded from {model_path}")
        except Exception as e:
            print(f"ERROR loading model: {e}")
            self.model = None

    def detect(self, text: str) -> LIDResult:
        """Detect language of text.

        Returns:
            An ``LIDResult``. Errors are reported through the result
            (``lang_code == "error"``) rather than raised.
        """
        # Check if model loaded
        if self.model is None:
            return LIDResult("error", "Model not loaded", 0.0, "nlu_fallback")

        # Guard clause for empty / very short text
        if not isinstance(text, str) or len(text.strip()) < 5:
            return LIDResult("unk", "Unknown", 0.0, "nlu_fallback")

        # fastText's predict() handles one line at a time and raises on "\n",
        # so fold interior line breaks into spaces before predicting.
        single_line = text.strip().replace("\r\n", " ").replace("\n", " ")

        try:
            labels, probs = self.model.predict(single_line)
            lang_code = labels[0].replace("__label__", "")
            confidence = float(probs[0])
        except Exception as e:
            print(f"Detection error: {e}")
            return LIDResult("error", "Detection failed", 0.0, "nlu_fallback")

        # Low confidence handling: keep the predicted code but route to fallback.
        if confidence < self.conf_threshold:
            return LIDResult(lang_code, "Low confidence", confidence, "nlu_fallback")

        # Get language name
        lang_name = self.INDIAN_LANGUAGES.get(lang_code, "Other")

        # Determine routing
        if lang_code in self.DEDICATED_ROUTE_LANGS:
            route_key = f"nlu_{lang_code}"
        elif lang_code in self.INDIAN_LANGUAGES:
            route_key = "nlu_indic"
        else:
            route_key = "nlu_other"

        return LIDResult(lang_code, lang_name, confidence, route_key)

# Create the shared detector used by the following cells; the constructor
# loads the model file from the given path and prints the outcome.
lid_detector = LanguageIdentifier("/kaggle/working/lid.176.bin")
print("Stage 2B LanguageIdentifier ready")


Model loaded from /kaggle/working/lid.176.bin
Stage 2B LanguageIdentifier ready


In [13]:
# Test cases: label -> sample text
test_cases_2b = {
    "Hindi Native": "नमस्ते आज का मौसम कैसे है",
    "Kannada Native": "ನಮಸ್ಕಾರ ಇಂದಿನ ಹವಾಮಾನ ಎನ್ನ",
    "Telugu Native": "నమస్కారం ఈ రోజు వాతావరణం ఎలా",
    "English": "Good morning, how are you today?"
}

# Language code each sample should be detected as.
expected_lang_2b = {
    "Hindi Native": "hi",
    "Kannada Native": "kn",
    "Telugu Native": "te",
    "English": "en"
}

print("\nSTAGE 2B TEST ON INDIAN LANGUAGES")
print("=" * 70)

for name, text in test_cases_2b.items():
    result = lid_detector.detect(text)
    expected_code = expected_lang_2b.get(name)
    if expected_code is None:
        # No expectation recorded for this label: only check that detection ran.
        passed = result.lang_code != "error"
    else:
        # A wrong language must show as a failure, not just a detection error.
        passed = result.lang_code == expected_code
    status = "✓" if passed else "✗"
    print(f"{status} {name:20}: {result.lang_code:2} ({result.lang_name:10}) | Conf: {result.confidence:.3f} | Route: {result.route_key}")

print("=" * 70)



STAGE 2B TEST ON INDIAN LANGUAGES
✓ Hindi Native        : hi (Hindi     ) | Conf: 1.000 | Route: nlu_hi
✓ Kannada Native      : kn (Kannada   ) | Conf: 1.000 | Route: nlu_kn
✓ Telugu Native       : te (Telugu    ) | Conf: 1.000 | Route: nlu_te
✓ English             : en (Other     ) | Conf: 0.988 | Route: nlu_other


In [14]:
def load_wiki_data(files=None, max_samples=5000):
    """Load Wikipedia datasets.

    Args:
        files: optional mapping of language code -> text file path. Defaults
            to the three files written under /kaggle/working.
        max_samples: maximum number of lines kept per language
            (``None`` keeps every qualifying line).

    Returns:
        dict mapping each language code to a list of stripped lines longer
        than 10 characters. A language whose file cannot be read maps to an
        empty list and the error is printed.
    """
    if files is None:
        files = {
            "hi": "/kaggle/working/hindi.txt",
            "kn": "/kaggle/working/kannada.txt",
            "te": "/kaggle/working/telugu.txt"
        }

    loaded = {}
    for lang, filepath in files.items():
        try:
            lines = []
            with open(filepath, "r", encoding="utf-8") as f:
                # Stream the file and stop once enough lines are collected
                # instead of reading everything into memory first.
                for raw_line in f:
                    if max_samples is not None and len(lines) >= max_samples:
                        break
                    line = raw_line.strip()
                    if len(line) > 10:
                        lines.append(line)
            loaded[lang] = lines
            print(f"{lang}: {len(loaded[lang])} sentences loaded")
        except Exception as e:
            print(f"Error loading {lang}: {e}")
            loaded[lang] = []

    return loaded

wiki_datasets = load_wiki_data()
total_samples = sum(len(v) for v in wiki_datasets.values())
print(f"Total test samples: {total_samples}")


hi: 5000 sentences loaded
kn: 5000 sentences loaded
te: 5000 sentences loaded
Total test samples: 15000


In [15]:
def batch_process_file_corrected(pipeline, filepath, lang_code, max_samples=5000, detector=None):
    """Run Stage 2B detection and Stage 1B cleaning over the lines of a text file.

    Args:
        pipeline: object with ``process(text)`` used for Stage 1B cleaning.
            When ``None``, a ``Stage1BTextValidation(lang_code)`` is created
            once for the whole file.
        filepath: UTF-8 text file, one sample per line.
        lang_code: language the file is expected to contain.
        max_samples: maximum number of qualifying lines to process.
        detector: object with ``detect(text)`` returning a result that has
            ``lang_code`` and ``confidence``. Defaults to the notebook-level
            ``lid_detector``.

    Returns:
        A list of dicts with the keys ``original``, ``cleaned``,
        ``detected_lang``, ``expected_lang`` and ``confidence``; an empty list
        when ``filepath`` does not exist.
    """
    if not os.path.exists(filepath):
        return []

    # Build the collaborators once; they do not change from line to line.
    if pipeline is None:
        pipeline = Stage1BTextValidation(lang_code)
    if detector is None:
        detector = lid_detector

    lines = []
    with open(filepath, "r", encoding="utf-8") as f:
        for raw_line in f:
            if len(lines) >= max_samples:
                break
            line = raw_line.strip()
            if len(line) > 10:
                lines.append(line)

    results = []
    for text in lines:
        # Stage 2B: Detect language
        lid_result = detector.detect(text)

        # Stage 1B: Clean text
        cleaned = pipeline.process(text)

        results.append({
            'original': text,
            'cleaned': cleaned,
            'detected_lang': lid_result.lang_code,
            'expected_lang': lang_code,
            'confidence': lid_result.confidence
        })

    return results

# Process each language: display name -> (text file, expected language code)
files = {
    "Hindi": ("/kaggle/working/hindi.txt", "hi"),
    "Kannada": ("/kaggle/working/kannada.txt", "kn"),
    "Telugu": ("/kaggle/working/telugu.txt", "te")
}

# Per-language detection metrics, read by the summary cell.
accuracy_results = {}

for lang_name, (filepath, expected_lang) in files.items():
    # NOTE: `results` is also the name the Stage 1B report cell assigns, so
    # re-running that display cell after this one needs the report recomputed.
    results = batch_process_file_corrected(None, filepath, expected_lang, 5000)

    if not results:
        # Say so explicitly: a language dropped here is also absent from the
        # overall average computed later.
        print(f"{lang_name} ({expected_lang}): no samples processed from {filepath}")
        print()
        continue

    # COUNT CORRECT DETECTIONS
    correct = sum(1 for r in results if r['detected_lang'] == expected_lang)
    avg_conf = sum(r['confidence'] for r in results) / len(results)
    accuracy = 100 * correct / len(results)

    accuracy_results[lang_name] = {
        'samples': len(results),
        'correct': correct,
        'accuracy': accuracy,
        'avg_confidence': avg_conf
    }

    print(f"{lang_name} ({expected_lang}):")
    print(f"  Samples: {len(results)}")
    print(f"  Correct Detection: {correct}/{len(results)} ({accuracy:.1f}%)")
    print(f"  Avg Confidence: {avg_conf:.3f}")
    print()



Hindi (hi):
  Samples: 5000
  Correct Detection: 4574/5000 (91.5%)
  Avg Confidence: 0.883

Kannada (kn):
  Samples: 5000
  Correct Detection: 4886/5000 (97.7%)
  Avg Confidence: 0.985

Telugu (te):
  Samples: 5000
  Correct Detection: 4877/5000 (97.5%)
  Avg Confidence: 0.985



In [16]:
print("STAGE 2 INTEGRATION TEST RESULTS:")
print("-" * 80)

for lang_name, metrics in accuracy_results.items():
    print(f"{lang_name}:")
    print(f"  Accuracy: {metrics['correct']}/{metrics['samples']} ({metrics['accuracy']:.1f}%)")
    print(f"  Avg Confidence: {metrics['avg_confidence']:.3f}")

if accuracy_results:
    # Unweighted mean of the per-language percentages (each language counts
    # equally regardless of how many samples it contributed).
    overall_accuracy = sum(m['accuracy'] for m in accuracy_results.values()) / len(accuracy_results)
    print(f"\nOVERALL AVERAGE ACCURACY: {overall_accuracy:.1f}%")
else:
    # Nothing was processed; avoid dividing by zero and say so instead.
    overall_accuracy = float("nan")
    print("\nOVERALL AVERAGE ACCURACY: n/a (no language produced results)")



STAGE 2 INTEGRATION TEST RESULTS:
--------------------------------------------------------------------------------
Hindi:
  Accuracy: 4574/5000 (91.5%)
  Avg Confidence: 0.883
Kannada:
  Accuracy: 4886/5000 (97.7%)
  Avg Confidence: 0.985
Telugu:
  Accuracy: 4877/5000 (97.5%)
  Avg Confidence: 0.985

OVERALL AVERAGE ACCURACY: 95.6%
