In [1]:
from datasets import load_dataset
import re
import json
from tqdm import tqdm

# Simple language configuration - just the basics
LANGUAGES = {
    "hin_Deva": "Hindi",
    "mar_Deva": "Marathi"
}

def load_data(language="hin_Deva", num_samples=100):
    """Load text data for the specified language"""
    print(f"Loading {LANGUAGES[language]} data...")
    dataset = load_dataset("ai4bharat/IndicCorpV2", "indiccorp_v2", split=language, streaming=True)
    
    texts = []
    for i, sample in enumerate(dataset):
        if i >= num_samples:
            break
        texts.append(sample['text'])
    
    print(f"Loaded {len(texts)} text samples")
    return texts

# Load Hindi data
texts = load_data("hin_Deva", 50)

  from .autonotebook import tqdm as notebook_tqdm


Loading Hindi data...
Loaded 50 text samples


In [2]:
def sentence_tokenizer(text):
    """Simple sentence tokenizer"""
    # Split on common sentence endings
    sentences = re.split(r'[.!?।॥]\s+', text.strip())
    return [s.strip() for s in sentences if s.strip()]

def word_tokenizer(sentence):
    """Simple word tokenizer for Indian languages"""
    # Pattern for Indian language scripts and basic punctuation
    pattern = r'[\u0900-\u097F\u0980-\u09FF\u0A00-\u0A7F\u0A80-\u0AFF\u0B00-\u0B7F\u0B80-\u0BFF\u0C00-\u0C7F\u0C80-\u0CFF\u0D00-\u0D7F\u0600-\u06FF]+|[a-zA-Z]+|\d+|[^\w\s]'
    return re.findall(pattern, sentence)

print("Tokenization functions ready")

Tokenization functions ready


In [3]:
def tokenize_texts(texts):
    """Tokenize all texts into sentences and words"""
    tokenized_data = []
    
    for text in tqdm(texts, desc="Tokenizing"):
        sentences = sentence_tokenizer(text)
        tokenized_sentences = []
        
        for sentence in sentences:
            words = word_tokenizer(sentence)
            if words:  # Only add non-empty sentences
                tokenized_sentences.append(words)
        
        if tokenized_sentences:  # Only add non-empty paragraphs
            tokenized_data.append(tokenized_sentences)
    
    return tokenized_data

# Tokenize the loaded texts
tokenized_data = tokenize_texts(texts)
print(f"Tokenized {len(tokenized_data)} paragraphs")

Tokenizing: 100%|██████████| 50/50 [00:00<00:00, 20532.13it/s]

Tokenized 25 paragraphs





In [4]:
def save_tokenized_data(data, filename="tokenized_sentences.json"):
    """Save tokenized data as sentences"""
    sentence_data = []
    
    for paragraph in data:
        paragraph_sentences = []
        for sentence_tokens in paragraph:
            # Join tokens back into sentence
            sentence_text = " ".join(sentence_tokens)
            paragraph_sentences.append(sentence_text)
        sentence_data.append(paragraph_sentences)
    
    # Save to JSON
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(sentence_data, f, ensure_ascii=False, indent=2)
    
    print(f"Saved to '{filename}'")

# Save the tokenized data
save_tokenized_data(tokenized_data, "tokenized_hindi_sentences.json")

Saved to 'tokenized_hindi_sentences.json'


In [5]:
def calculate_stats(tokenized_data):
    """Calculate basic statistics"""
    total_sentences = 0
    total_words = 0
    all_words = []
    
    for paragraph in tokenized_data:
        total_sentences += len(paragraph)
        for sentence in paragraph:
            total_words += len(sentence)
            all_words.extend(sentence)
    
    unique_words = len(set(all_words))
    avg_sentence_length = total_words / total_sentences if total_sentences > 0 else 0
    type_token_ratio = unique_words / total_words if total_words > 0 else 0
    
    print("=== STATISTICS ===")
    print(f"Total Sentences: {total_sentences}")
    print(f"Total Words: {total_words}")
    print(f"Unique Words: {unique_words}")
    print(f"Average Sentence Length: {avg_sentence_length:.2f} words")
    print(f"Type-Token Ratio: {type_token_ratio:.4f}")

# Calculate and display statistics
calculate_stats(tokenized_data)

=== STATISTICS ===
Total Sentences: 63
Total Words: 1310
Unique Words: 661
Average Sentence Length: 20.79 words
Type-Token Ratio: 0.5046


In [6]:
# Example: Process Marathi data
print("\n" + "="*40)
print("Processing Marathi data...")
print("="*40)

marathi_texts = load_data("mar_Deva", 25)
marathi_tokenized = tokenize_texts(marathi_texts)
save_tokenized_data(marathi_tokenized, "tokenized_marathi_sentences.json")
calculate_stats(marathi_tokenized)


Processing Marathi data...
Loading Marathi data...
Loaded 25 text samples


Tokenizing: 100%|██████████| 25/25 [00:00<00:00, 40407.55it/s]

Saved to 'tokenized_marathi_sentences.json'
=== STATISTICS ===
Total Sentences: 33
Total Words: 452
Unique Words: 344
Average Sentence Length: 13.70 words
Type-Token Ratio: 0.7611



