In [7]:
from datasets import load_dataset
import re
import json
from tqdm import tqdm

# Simple language configuration - just the basics
LANGUAGES = {
    "hin_Deva": "Hindi",
    "mar_Deva": "Marathi"
}

def load_data(language="hin_Deva", num_samples=100):
    """Load text data for the specified language"""
    print(f"Loading {LANGUAGES[language]} data...")
    try:
        dataset = load_dataset("ai4bharat/IndicCorpV2", "indiccorp_v2", split=language, streaming=True)
        
        texts = []
        for i, sample in enumerate(dataset):
            if i >= num_samples:
                break
            if sample.get('text'):  # Validate text exists
                texts.append(sample['text'])
        
        print(f"Loaded {len(texts)} text samples")
        return texts
    except Exception as e:
        print(f"Error loading data: {e}")
        return []

# Load Hindi data
texts = load_data("hin_Deva", 50)

Loading Hindi data...
Loaded 25 text samples


In [8]:
def sentence_tokenizer(text):
    """Simple sentence tokenizer"""
    if not text or not text.strip():
        return []
    
    # Split on common sentence endings (including Devanagari punctuation)
    sentences = re.split(r'[.!?।॥]\s*', text.strip())
    return [s.strip() for s in sentences if s.strip()]

def word_tokenizer(sentence):
    """Enhanced word tokenizer for Indian languages with URL, email, date detection"""
    if not sentence or not sentence.strip():
        return []
    
    # Special patterns for structured data (most specific first)
    patterns = [
        r'https?://\S+',           # URLs with http/https
        r'www\.\S+',               # URLs with www
        r'\S+@\S+\.\S+',          # Email addresses
        r'\b\d{1,2}[-/\.]\d{1,2}[-/\.]\d{2,4}\b',  # Dates
        r'\b\d+(?:\.\d+)?\b',      # Numbers (including decimals)
        
        # Indian language scripts (extended for conjuncts and diacritics)
        r'[\u0900-\u097F]+',       # Devanagari (Hindi, Marathi, Sanskrit)
        r'[\u0A80-\u0AFF]+',       # Gujarati
        r'[\u0980-\u09FF]+',       # Bengali, Assamese
        r'[\u0B80-\u0BFF]+',       # Tamil
        r'[\u0C00-\u0C7F]+',       # Telugu
        r'[\u0C80-\u0CFF]+',       # Kannada
        r'[\u0D00-\u0D7F]+',       # Malayalam
        r'[\u0A00-\u0A7F]+',       # Punjabi (Gurmukhi)
        r'[\u0B00-\u0B7F]+',       # Odia
        r'[\u0600-\u06FF]+',       # Arabic/Urdu
        
        r'[a-zA-Z]+',              # English words
        r'[।॥!"#$%&\'()*+,\-./:;<=>?@\[\\\]^_`{|}~]',  # Punctuation (individual)
    ]
    
    # Combine all patterns
    combined_pattern = '|'.join(f'({pattern})' for pattern in patterns)
    
    # Find all matches
    matches = re.findall(combined_pattern, sentence)
    
    # Flatten the tuple results and filter out empty strings
    tokens = []
    for match in matches:
        for group in match:
            if group:  # Only add non-empty groups
                tokens.append(group)
    
    return tokens

print("Tokenization functions ready")

Tokenization functions ready


In [9]:
def tokenize_texts(texts):
    """Tokenize all texts into sentences and words"""
    if not texts:
        return []
    
    tokenized_data = []
    
    for text in tqdm(texts, desc="Tokenizing"):
        try:
            sentences = sentence_tokenizer(text)
            tokenized_sentences = []
            
            for sentence in sentences:
                words = word_tokenizer(sentence)
                if words:  # Only add non-empty sentences
                    tokenized_sentences.append(words)
            
            if tokenized_sentences:  # Only add non-empty paragraphs
                tokenized_data.append(tokenized_sentences)
        except Exception as e:
            print(f"Error tokenizing text: {e}")
            continue
    
    return tokenized_data

# Tokenize the loaded texts
tokenized_data = tokenize_texts(texts)
print(f"Tokenized {len(tokenized_data)} paragraphs")

Tokenizing: 100%|██████████| 25/25 [00:00<00:00, 4866.46it/s]

Tokenized 25 paragraphs





In [None]:
def save_tokenized_tokens(data, filename="tokenized_tokens.json"):
    """Save tokenized data as individual tokens (preserving token structure)"""
    if not data:
        print("No data to save")
        return
    
    token_data = []
    
    for paragraph in data:
        paragraph_tokens = []
        for sentence_tokens in paragraph:
            # Keep tokens as list instead of joining them
            paragraph_tokens.append(sentence_tokens)
        token_data.append(paragraph_tokens)
    
    # Save to JSON
    try:
        with open(filename, "w", encoding="utf-8") as f:
            json.dump(token_data, f, ensure_ascii=False, indent=2)
        print(f"Saved tokenized data to '{filename}'")
    except Exception as e:
        print(f"Error saving file: {e}")

# Save the tokenized data as tokens
save_tokenized_tokens(tokenized_data, "tokenized_hindi_tokens.json")

Saved tokenized data to 'tokenized_hindi_tokens.json'
Saved tokenized data to 'tokenized_marathi_tokens.json'


In [11]:
# def save_tokenized_data(data, filename="tokenized_sentences.json"):
#     """Save tokenized data as sentences"""
#     if not data:
#         print("No data to save")
#         return
    
#     sentence_data = []
    
#     for paragraph in data:
#         paragraph_sentences = []
#         for sentence_tokens in paragraph:
#             # Join tokens back into sentence
#             sentence_text = " ".join(sentence_tokens)
#             paragraph_sentences.append(sentence_text)
#         sentence_data.append(paragraph_sentences)
    
#     # Save to JSON
#     try:
#         with open(filename, "w", encoding="utf-8") as f:
#             json.dump(sentence_data, f, ensure_ascii=False, indent=2)
#         print(f"Saved to '{filename}'")
#     except Exception as e:
#         print(f"Error saving file: {e}")

# # Save the tokenized data
# save_tokenized_data(tokenized_data, "tokenized_hindi_sentences.json")

In [12]:
def calculate_stats(tokenized_data):
    """Calculate basic statistics"""
    if not tokenized_data:
        print("No data to analyze")
        return
    
    total_sentences = 0
    total_words = 0
    all_words = []
    
    for paragraph in tokenized_data:
        total_sentences += len(paragraph)
        for sentence in paragraph:
            total_words += len(sentence)
            all_words.extend(sentence)
    
    unique_words = len(set(all_words))
    avg_sentence_length = total_words / total_sentences if total_sentences > 0 else 0
    type_token_ratio = unique_words / total_words if total_words > 0 else 0
    
    print("=== STATISTICS ===")
    print(f"Total Paragraphs: {len(tokenized_data)}")
    print(f"Total Sentences: {total_sentences}")
    print(f"Total Words: {total_words}")
    print(f"Unique Words: {unique_words}")
    print(f"Average Sentence Length: {avg_sentence_length:.2f} words")
    print(f"Type-Token Ratio: {type_token_ratio:.4f}")
    
    # Show some example tokens
    if all_words:
        print(f"\nSample tokens: {all_words[:10]}")

# Calculate and display statistics
calculate_stats(tokenized_data)

=== STATISTICS ===
Total Paragraphs: 25
Total Sentences: 66
Total Words: 1300
Unique Words: 652
Average Sentence Length: 19.70 words
Type-Token Ratio: 0.5015

Sample tokens: ['लोगों', 'को', 'बिलों', 'संबंधी', 'सुविधा', 'देना', 'ही', 'उनका', 'काम', 'इनेलो']


In [13]:
# Example: Process Marathi data
print("\n" + "="*40)
print("Processing Marathi data...")
print("="*40)

marathi_texts = load_data("mar_Deva", 25)
if marathi_texts:
    marathi_tokenized = tokenize_texts(marathi_texts)
    save_tokenized_data(marathi_tokenized, "tokenized_marathi_sentences.json")
    calculate_stats(marathi_tokenized)
else:
    print("No Marathi data loaded")


Processing Marathi data...
Loading Marathi data...
Loaded 13 text samples


Tokenizing: 100%|██████████| 13/13 [00:00<00:00, 10532.35it/s]

Saved to 'tokenized_marathi_sentences.json'
=== STATISTICS ===
Total Paragraphs: 13
Total Sentences: 39
Total Words: 427
Unique Words: 338
Average Sentence Length: 10.95 words
Type-Token Ratio: 0.7916

Sample tokens: ['ऊती', 'संवर्धन', 'तंत्राचे', 'अनेक', 'उपयोग', 'आहेत', 'या', 'तंत्राचा', 'उपयोग', 'विशेषकरून']



