In [1]:
from datasets import load_dataset

dataset = load_dataset("ai4bharat/IndicCorpV2", "indiccorp_v2", split="guj_Gujr", streaming =True)

In [7]:
# Convert streaming dataset to list of texts (first 1000 samples)
texts = []
count = 0
for sample in dataset:
    texts.append(sample['text'])
    count += 1
    if count >= 1000:  # Limit to avoid memory issues
        break

# Preview a few paragraphs
print(f"Loaded {len(texts)} text samples")
for i, text in enumerate(texts[:3]):
    print(f"Sample {i+1}: {text[:100]}...")

Loaded 1000 text samples
Sample 1: આ વીડિયો જુઓ: ઊંઝા માર્કેટયાર્ડ આજથી 25 જુલાઈ સુધી બંધ...
Sample 2: ...
Sample 3: મિથેનોલ આવ્યો ક્યાંથી?...


In [8]:
import re

# Sentence tokenizer remains the same
def sentence_tokenizer(text):
    return re.split(r'(?<=[.!?])\s+', text.strip())

# Improved word tokenizer that handles Devanagari + matras
def word_tokenizer(sentence):
    url_pattern = r'https?://\S+|www\.\S+'
    email_pattern = r'\S+@\S+\.\S+'
    date_pattern = r'\b\d{1,2}[-/\.]\d{1,2}[-/\.]\d{2,4}\b'
    number_pattern = r'\b\d+(?:\.\d+)?\b'
    punctuation_pattern = r'[।॥!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]'
    
    # Unicode range for Devanagari letters and matras
    gujarati_word_pattern = r'[\u0A81-\u0AFF]+'

    # Combined pattern for tokenization
    combined_pattern = f'({url_pattern}|{email_pattern}|{date_pattern}|{number_pattern}|{gujarati_word_pattern}|{punctuation_pattern})'

    return re.findall(combined_pattern, sentence)

In [9]:
from tqdm import tqdm

tokenized_data = []
LIMIT = 1000  # Increase if system allows

for paragraph in tqdm(texts[:LIMIT]):
    sentences = sentence_tokenizer(paragraph)
    tokenized_paragraph = [word_tokenizer(sentence) for sentence in sentences]
    tokenized_data.append(tokenized_paragraph)

100%|██████████| 1000/1000 [00:00<00:00, 20074.30it/s]


In [10]:
import json

# Save tokenized data in sentence form
sentence_data = []

for para_idx, paragraph in enumerate(tokenized_data):
    paragraph_sentences = []
    for sentence in paragraph:
        # Join tokens back into sentence
        sentence_text = " ".join(sentence)
        paragraph_sentences.append(sentence_text)
    sentence_data.append(paragraph_sentences)

# Save to JSON file
with open("tokenized_gujarati_sentences.json", "w", encoding="utf-8") as f:
    json.dump(sentence_data, f, ensure_ascii=False, indent=2)

print("Tokenized sentences saved to 'tokenized_gujarati_sentences.json'")


Tokenized sentences saved to 'tokenized_gujarati_sentences.json'


In [11]:
total_sentences = 0
total_words = 0
total_chars = 0
all_words = []

for para in tokenized_data:
    total_sentences += len(para)
    for sentence in para:
        total_words += len(sentence)
        total_chars += sum(len(word) for word in sentence)
        all_words.extend(sentence)

avg_sentence_length = total_words / total_sentences
avg_word_length = total_chars / total_words
ttr = len(set(all_words)) / len(all_words)

print("Total Sentences:", total_sentences)
print("Total Words:", total_words)
print("Total Characters:", total_chars)
print("Avg. Sentence Length (words):", round(avg_sentence_length, 2))
print("Avg. Word Length (chars):", round(avg_word_length, 2))
print("Type-Token Ratio (TTR):", round(ttr, 4))


Total Sentences: 1946
Total Words: 21977
Total Characters: 106987
Avg. Sentence Length (words): 11.29
Avg. Word Length (chars): 4.87
Type-Token Ratio (TTR): 0.381
