In [20]:
from datasets import load_dataset
import re

# Configuration for different Indian languages supported by IndicCorpV2
LANGUAGE_CONFIG = {
    # Hindi
    "hin_Deva": {"name": "Hindi", "script": "Devanagari", "unicode_range": "\\u0900-\\u097F"},
    # Marathi  
    "mar_Deva": {"name": "Marathi", "script": "Devanagari", "unicode_range": "\\u0900-\\u097F"},
    # Gujarati
    "guj_Gujr": {"name": "Gujarati", "script": "Gujarati", "unicode_range": "\\u0A80-\\u0AFF"},
    # Bengali
    "ben_Beng": {"name": "Bengali", "script": "Bengali", "unicode_range": "\\u0980-\\u09FF"},
    # Tamil
    "tam_Taml": {"name": "Tamil", "script": "Tamil", "unicode_range": "\\u0B80-\\u0BFF"},
    # Telugu
    "tel_Telu": {"name": "Telugu", "script": "Telugu", "unicode_range": "\\u0C00-\\u0C7F"},
    # Kannada
    "kan_Knda": {"name": "Kannada", "script": "Kannada", "unicode_range": "\\u0C80-\\u0CFF"},
    # Malayalam
    "mal_Mlym": {"name": "Malayalam", "script": "Malayalam", "unicode_range": "\\u0D00-\\u0D7F"},
    # Punjabi
    "pan_Guru": {"name": "Punjabi", "script": "Gurmukhi", "unicode_range": "\\u0A00-\\u0A7F"},
    # Oriya/Odia
    "ori_Orya": {"name": "Odia", "script": "Odia", "unicode_range": "\\u0B00-\\u0B7F"},
    # Assamese
    "asm_Beng": {"name": "Assamese", "script": "Bengali", "unicode_range": "\\u0980-\\u09FF"},
    # Urdu
    "urd_Arab": {"name": "Urdu", "script": "Arabic", "unicode_range": "\\u0600-\\u06FF"},
}

# Configure the language you want to process
SELECTED_LANGUAGE = "mar_Deva"  # Change this to any supported language

# Load data from IndicCorpV2 dataset
print(f"Loading {LANGUAGE_CONFIG[SELECTED_LANGUAGE]['name']} data...")
dataset = load_dataset("ai4bharat/IndicCorpV2", "indiccorp_v2", split=SELECTED_LANGUAGE, streaming=True)

# Convert streaming dataset to list of texts (first 1000 samples)
texts = []
count = 0
for sample in dataset:
    texts.append(sample['text'])
    count += 1
    if count >= 1000:  # Limit to avoid memory issues
        break

# Preview a few paragraphs
print(f"Loaded {len(texts)} {LANGUAGE_CONFIG[SELECTED_LANGUAGE]['name']} text samples")
print(f"Script: {LANGUAGE_CONFIG[SELECTED_LANGUAGE]['script']}")
for i, text in enumerate(texts[:3]):
    print(f"Sample {i+1}: {text[:100]}...")

Loading Marathi data...
Loaded 1000 Marathi text samples
Script: Devanagari
Sample 1: ऊती संवर्धन तंत्राचे अनेक उपयोग आहेत. या तंत्राचा उपयोग विशेषकरून जीवशास्त्र व वैद्यकशास्त्रात होतो....
Sample 2: ...
Sample 3: शहरातील माध्यमिक विभागाच्या शाळा ३ जानेवारीपर्यंत विद्यार्थ्यांसाठी बंद ठेवण्यात येणार आहेत. मात्र, ...
Loaded 1000 Marathi text samples
Script: Devanagari
Sample 1: ऊती संवर्धन तंत्राचे अनेक उपयोग आहेत. या तंत्राचा उपयोग विशेषकरून जीवशास्त्र व वैद्यकशास्त्रात होतो....
Sample 2: ...
Sample 3: शहरातील माध्यमिक विभागाच्या शाळा ३ जानेवारीपर्यंत विद्यार्थ्यांसाठी बंद ठेवण्यात येणार आहेत. मात्र, ...


In [21]:
def detect_language_from_split(split_name):
    """
    Automatically detect language configuration based on split name
    """
    if split_name in LANGUAGE_CONFIG:
        return LANGUAGE_CONFIG[split_name]
    else:
        # If split not in config, try to infer basic info
        parts = split_name.split('_')
        if len(parts) == 2:
            lang_code, script_code = parts
            return {
                "name": lang_code.upper(),
                "script": script_code,
                "unicode_range": "\\u0000-\\uFFFF"  # Default to full unicode
            }
        else:
            return {
                "name": "Unknown",
                "script": "Unknown", 
                "unicode_range": "\\u0000-\\uFFFF"
            }

def load_language_data(language_split, num_samples=1000):
    """
    Load data for any language split from IndicCorpV2
    """
    # Get language info
    lang_info = detect_language_from_split(language_split)
    
    print(f"Loading {lang_info['name']} data from split: {language_split}")
    print(f"Script: {lang_info['script']}")
    
    try:
        # Load dataset
        dataset = load_dataset("ai4bharat/IndicCorpV2", "indiccorp_v2", split=language_split, streaming=True)
        
        # Convert streaming dataset to list of texts
        texts = []
        count = 0
        for sample in dataset:
            texts.append(sample['text'])
            count += 1
            if count >= num_samples:
                break
        
        print(f"Successfully loaded {len(texts)} text samples")
        
        # Preview samples
        for i, text in enumerate(texts[:3]):
            print(f"Sample {i+1}: {text[:100]}...")
            
        return texts, lang_info
        
    except Exception as e:
        print(f"Error loading data for {language_split}: {e}")
        print("Available splits might include:")
        for split in LANGUAGE_CONFIG.keys():
            print(f"  - {split} ({LANGUAGE_CONFIG[split]['name']})")
        return [], lang_info

# Example usage - change this split to any language you want
TARGET_LANGUAGE_SPLIT = "mar_Deva"  # Change this to test different languages
texts, current_lang_info = load_language_data(TARGET_LANGUAGE_SPLIT)

Loading Marathi data from split: mar_Deva
Script: Devanagari
Successfully loaded 1000 text samples
Sample 1: ऊती संवर्धन तंत्राचे अनेक उपयोग आहेत. या तंत्राचा उपयोग विशेषकरून जीवशास्त्र व वैद्यकशास्त्रात होतो....
Sample 2: ...
Sample 3: शहरातील माध्यमिक विभागाच्या शाळा ३ जानेवारीपर्यंत विद्यार्थ्यांसाठी बंद ठेवण्यात येणार आहेत. मात्र, ...
Successfully loaded 1000 text samples
Sample 1: ऊती संवर्धन तंत्राचे अनेक उपयोग आहेत. या तंत्राचा उपयोग विशेषकरून जीवशास्त्र व वैद्यकशास्त्रात होतो....
Sample 2: ...
Sample 3: शहरातील माध्यमिक विभागाच्या शाळा ३ जानेवारीपर्यंत विद्यार्थ्यांसाठी बंद ठेवण्यात येणार आहेत. मात्र, ...


In [15]:
import re

# Sentence tokenizer remains the same for all languages
def sentence_tokenizer(text):
    """
    Universal sentence tokenizer that works for most languages
    """
    # Use multiple sentence ending patterns for different scripts
    sentence_patterns = [
        r'(?<=[.!?])\s+',     # English punctuation
        r'(?<=[।॥])\s+',      # Devanagari punctuation (Hindi, Marathi, etc.)
        r'(?<=[؟۔])\s+',       # Arabic/Urdu punctuation
        r'(?<=[។៕])\s+',      # Khmer punctuation
        r'(?<=[။၊])\s+',       # Myanmar punctuation
    ]
    
    # Combine all patterns
    combined_pattern = '|'.join(sentence_patterns)
    return re.split(combined_pattern, text.strip())

def create_language_aware_word_tokenizer(lang_info):
    """
    Create a word tokenizer optimized for the specific language
    """
    def word_tokenizer(sentence):
        # Common patterns for all languages
        url_pattern = r'https?://\S+|www\.\S+'
        email_pattern = r'\S+@\S+\.\S+'
        date_pattern = r'\b\d{1,2}[-/\.]\d{1,2}[-/\.]\d{2,4}\b'
        number_pattern = r'\b\d+(?:\.\d+)?\b'
        
        # Language-specific punctuation patterns
        if lang_info['script'] == 'Devanagari':
            punctuation_pattern = r'[।॥!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]'
        elif lang_info['script'] == 'Arabic':
            punctuation_pattern = r'[؟۔!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]'
        elif lang_info['script'] == 'Bengali':
            punctuation_pattern = r'[।॥!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]'
        else:
            punctuation_pattern = r'[।॥!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]'
        
        # Comprehensive Unicode word patterns for all major scripts
        unicode_word_patterns = [
            r'[\u0900-\u097F]+',  # Devanagari (Hindi, Marathi, Sanskrit, etc.)
            r'[\u0980-\u09FF]+',  # Bengali
            r'[\u0A00-\u0A7F]+',  # Gurmukhi (Punjabi)
            r'[\u0A80-\u0AFF]+',  # Gujarati
            r'[\u0B00-\u0B7F]+',  # Oriya
            r'[\u0B80-\u0BFF]+',  # Tamil
            r'[\u0C00-\u0C7F]+',  # Telugu
            r'[\u0C80-\u0CFF]+',  # Kannada
            r'[\u0D00-\u0D7F]+',  # Malayalam
            r'[\u0D80-\u0DFF]+',  # Sinhala
            r'[\u0E00-\u0E7F]+',  # Thai
            r'[\u0E80-\u0EFF]+',  # Lao
            r'[\u1000-\u109F]+', # Myanmar
            r'[\u1780-\u17FF]+', # Khmer
            r'[\u0600-\u06FF]+', # Arabic
            r'[\u0590-\u05FF]+', # Hebrew
            r'[\u0400-\u04FF]+', # Cyrillic (Russian, etc.)
            r'[\u3040-\u309F]+', # Hiragana (Japanese)
            r'[\u30A0-\u30FF]+', # Katakana (Japanese)
            r'[\u4E00-\u9FFF]+', # CJK Unified Ideographs (Chinese, Japanese, Korean)
            r'[\uAC00-\uD7AF]+', # Hangul (Korean)
            r'[\u1100-\u11FF]+', # Hangul Jamo (Korean)
            r'[\u0370-\u03FF]+', # Greek
            r'[\u1F00-\u1FFF]+', # Greek Extended
            r'[\u0100-\u017F]+', # Latin Extended-A
            r'[\u0180-\u024F]+', # Latin Extended-B
            r'[\u1E00-\u1EFF]+', # Latin Extended Additional
            r'[a-zA-Z]+',        # Basic Latin
        ]

        # Combine all Unicode patterns
        multilingual_word_pattern = '|'.join(unicode_word_patterns)

        # Combined pattern for tokenization
        combined_pattern = f'({url_pattern}|{email_pattern}|{date_pattern}|{number_pattern}|{multilingual_word_pattern}|{punctuation_pattern})'

        return re.findall(combined_pattern, sentence)
    
    return word_tokenizer

# Create language-specific tokenizer
word_tokenizer = create_language_aware_word_tokenizer(current_lang_info)

print(f"Created tokenizer for {current_lang_info['name']} ({current_lang_info['script']} script)")
print(f"Unicode range: {current_lang_info['unicode_range']}")

Created tokenizer for Marathi (Devanagari script)
Unicode range: \u0900-\u097F


In [22]:
from tqdm import tqdm

# Tokenize the loaded data
tokenized_data = []
LIMIT = min(len(texts), 1000)  # Use actual loaded data length

print(f"Tokenizing {LIMIT} {current_lang_info['name']} text samples...")

for paragraph in tqdm(texts[:LIMIT]):
    sentences = sentence_tokenizer(paragraph)
    tokenized_paragraph = [word_tokenizer(sentence) for sentence in sentences]
    tokenized_data.append(tokenized_paragraph)

print(f"Tokenization completed for {current_lang_info['name']} language")

Tokenizing 1000 Marathi text samples...


100%|██████████| 1000/1000 [00:00<00:00, 12728.22it/s]

Tokenization completed for Marathi language





In [23]:
import json

# Save tokenized data in sentence form
sentence_data = []

for para_idx, paragraph in enumerate(tokenized_data):
    paragraph_sentences = []
    for sentence in paragraph:
        # Join tokens back into sentence
        sentence_text = " ".join(sentence)
        paragraph_sentences.append(sentence_text)
    sentence_data.append(paragraph_sentences)

# Create filename based on current language
filename = f"tokenized_{current_lang_info['name'].lower()}_sentences.json"

# Save to JSON file
with open(filename, "w", encoding="utf-8") as f:
    json.dump(sentence_data, f, ensure_ascii=False, indent=2)

print(f"Tokenized sentences saved to '{filename}'")

Tokenized sentences saved to 'tokenized_marathi_sentences.json'


In [24]:
total_sentences = 0
total_words = 0
total_chars = 0
all_words = []

for para in tokenized_data:
    total_sentences += len(para)
    for sentence in para:
        total_words += len(sentence)
        total_chars += sum(len(word) for word in sentence)
        all_words.extend(sentence)

avg_sentence_length = total_words / total_sentences
avg_word_length = total_chars / total_words
ttr = len(set(all_words)) / len(all_words)

print("Total Sentences:", total_sentences)
print("Total Words:", total_words)
print("Total Characters:", total_chars)
print("Avg. Sentence Length (words):", round(avg_sentence_length, 2))
print("Avg. Word Length (chars):", round(avg_word_length, 2))
print("Type-Token Ratio (TTR):", round(ttr, 4))


Total Sentences: 2117
Total Words: 18911
Total Characters: 108152
Avg. Sentence Length (words): 8.93
Avg. Word Length (chars): 5.72
Type-Token Ratio (TTR): 0.4526


In [25]:
# ===== EASY LANGUAGE SWITCHING DEMO =====
# To process a different language, simply change this variable and run all cells:

def process_language(language_split, num_samples=100):
    """
    Complete pipeline to process any language from IndicCorpV2
    """
    print(f"\n{'='*50}")
    print(f"PROCESSING LANGUAGE: {language_split}")
    print(f"{'='*50}")
    
    # Load data
    texts_new, lang_info_new = load_language_data(language_split, num_samples)
    
    if not texts_new:
        print("Failed to load data. Skipping this language.")
        return
    
    # Create tokenizer
    word_tokenizer_new = create_language_aware_word_tokenizer(lang_info_new)
    
    # Tokenize
    tokenized_data_new = []
    for paragraph in tqdm(texts_new[:num_samples], desc=f"Tokenizing {lang_info_new['name']}"):
        sentences = sentence_tokenizer(paragraph)
        tokenized_paragraph = [word_tokenizer_new(sentence) for sentence in sentences]
        tokenized_data_new.append(tokenized_paragraph)
    
    # Calculate statistics
    total_sentences = sum(len(para) for para in tokenized_data_new)
    total_words = sum(len(sentence) for para in tokenized_data_new for sentence in para)
    total_chars = sum(len(word) for para in tokenized_data_new for sentence in para for word in sentence)
    
    print(f"\n{lang_info_new['name']} Language Statistics:")
    print(f"- Total Sentences: {total_sentences}")
    print(f"- Total Words: {total_words}")
    print(f"- Total Characters: {total_chars}")
    print(f"- Average Sentence Length: {total_words/total_sentences:.2f} words")
    print(f"- Average Word Length: {total_chars/total_words:.2f} characters")
    
    # Save tokenized data
    filename_new = f"tokenized_{lang_info_new['name'].lower()}_sentences.json"
    sentence_data_new = []
    for paragraph in tokenized_data_new:
        paragraph_sentences = [" ".join(sentence) for sentence in paragraph]
        sentence_data_new.append(paragraph_sentences)
    
    with open(filename_new, "w", encoding="utf-8") as f:
        json.dump(sentence_data_new, f, ensure_ascii=False, indent=2)
    
    print(f"- Saved to: {filename_new}")
    return tokenized_data_new, lang_info_new

# Demo: Process multiple languages easily
print("Available languages:")
for split, info in LANGUAGE_CONFIG.items():
    print(f"  {split} - {info['name']} ({info['script']})")

# Uncomment any of these to process different languages:
# process_language("hin_Deva", 50)  # Hindi
# process_language("ben_Beng", 50)  # Bengali  
# process_language("tam_Taml", 50)  # Tamil
# process_language("tel_Telu", 50)  # Telugu
# process_language("guj_Gujr", 50)  # Gujarati

print("\nTo process a new language, call: process_language('language_split', num_samples)")

Available languages:
  hin_Deva - Hindi (Devanagari)
  mar_Deva - Marathi (Devanagari)
  guj_Gujr - Gujarati (Gujarati)
  ben_Beng - Bengali (Bengali)
  tam_Taml - Tamil (Tamil)
  tel_Telu - Telugu (Telugu)
  kan_Knda - Kannada (Kannada)
  mal_Mlym - Malayalam (Malayalam)
  pan_Guru - Punjabi (Gurmukhi)
  ori_Orya - Odia (Odia)
  asm_Beng - Assamese (Bengali)
  urd_Arab - Urdu (Arabic)

To process a new language, call: process_language('language_split', num_samples)


In [None]:
hindi_data, hindi_info = process_language("hin_Deva", 25)


PROCESSING LANGUAGE: hin_Deva
Loading Hindi data from split: hin_Deva
Script: Devanagari
Successfully loaded 25 text samples
Sample 1: लोगों को बिलों संबंधी सुविधा देना ही उनका काम...
Sample 2: ...
Sample 3: इनेलो 1987 में उस वक्त ऐसे ही दोराहे पर खड़ी थी, जब पूर्व उपप्रधानमंत्री देवीलाल ने अपने पुत्र ओमप्र...
Successfully loaded 25 text samples
Sample 1: लोगों को बिलों संबंधी सुविधा देना ही उनका काम...
Sample 2: ...
Sample 3: इनेलो 1987 में उस वक्त ऐसे ही दोराहे पर खड़ी थी, जब पूर्व उपप्रधानमंत्री देवीलाल ने अपने पुत्र ओमप्र...


Tokenizing Hindi: 100%|██████████| 25/25 [00:00<00:00, 13347.45it/s]


Hindi Language Statistics:
- Total Sentences: 43
- Total Words: 662
- Total Characters: 2714
- Average Sentence Length: 15.40 words
- Average Word Length: 4.10 characters
- Saved to: tokenized_hindi_sentences.json



