In [6]:
# ASSIGNMENT-1: Tokenized Sentences to Parquet Format
# This notebook processes Telugu text and saves tokenized sentences to parquet format

import re
import os
import pandas as pd
import pyarrow.parquet as pq

# Load Telugu dataset from local file
dataset_file = 'telugu_dataset.txt'
print(f"Loading Telugu dataset from {dataset_file}...")

# Read all paragraphs from the file (each line is a paragraph)
with open(dataset_file, 'r', encoding='utf-8') as f:
    paragraphs = [line.strip() for line in f if line.strip()]

print(f"Dataset loaded successfully! Total paragraphs: {len(paragraphs):,}")


Loading Telugu dataset from telugu_dataset.txt...
Dataset loaded successfully! Total paragraphs: 25,001


In [7]:
# Define tokenization patterns

# Sentence tokenization pattern - splits on punctuation followed by whitespace
sentence_pattern = re.compile(r'(?<=[.!?])\s+')

# Word tokenization pattern - handles URLs, emails, dates, numbers, Telugu script, English, punctuation
token_pattern = re.compile(
    r'\bhttps?://\S+|'                  # URLs
    r'\b[\w\.-]+@[\w\.-]+\.\w{2,4}\b|'  # email addresses
    r'\d{1,2}[/-]\d{1,2}[/-]\d{2,4}|'   # dates (DD/MM/YYYY or DD-MM-YYYY)
    r'\d+\.\d+|\d+|'                    # numbers (decimals and integers)
    r'[\u0C00-\u0C7F]+|'                # Telugu script
    r'[^\s\w\u0C00-\u0C7F]'             # punctuation
)

def tokenize_sentence(sentence):
    """
    Tokenize a sentence into words/tokens and return as space-separated string.
    
    Args:
        sentence: Input sentence string
        
    Returns:
        Space-separated tokenized sentence
    """
    tokens = token_pattern.findall(sentence)
    return ' '.join(tokens)

print("Tokenization patterns defined!")


Tokenization patterns defined!


In [8]:
# ============================================================================
# CONFIGURATION: Set processing limit
# ============================================================================
# Change MAX_PARAGRAPHS to control how many paragraphs to process:
#   - None: Process entire dataset
#   - 1000: Quick test (~1-2 minutes)
#   - 10000: Small sample (~5-10 minutes)
#   - 25000: Full dataset (based on telugu_dataset.txt size)
# ============================================================================
MAX_PARAGRAPHS = None  # Set to None to process all, or a number to limit

# Process dataset: Split paragraphs into sentences, tokenize each sentence, and collect results

tokenized_sentences = []
num_paragraphs = 0
batch_size = 1000  # Process in batches for progress tracking

# Determine how many paragraphs to process
total_paragraphs = len(paragraphs)
if MAX_PARAGRAPHS:
    paragraphs_to_process = paragraphs[:MAX_PARAGRAPHS]
    print(f"Processing {MAX_PARAGRAPHS:,} paragraphs (out of {total_paragraphs:,} total)")
else:
    paragraphs_to_process = paragraphs
    print(f"Processing entire dataset: {total_paragraphs:,} paragraphs")

print("=" * 60)

for paragraph in paragraphs_to_process:
    if paragraph.strip():
        # Step 1: Split paragraph into sentences
        sentences = sentence_pattern.split(paragraph)
        
        # Step 2: Tokenize each sentence and join tokens with spaces
        for sentence in sentences:
            sentence = sentence.strip()
            if sentence:  # Skip empty sentences
                tokenized = tokenize_sentence(sentence)
                if tokenized:  # Only add non-empty tokenized sentences
                    tokenized_sentences.append(tokenized)
        
        num_paragraphs += 1
        
        # Progress update
        if num_paragraphs % batch_size == 0:
            print(f"Processed {num_paragraphs:,} paragraphs, {len(tokenized_sentences):,} tokenized sentences")

print("=" * 60)
print(f"\nProcessing complete!")
print(f"Total paragraphs processed: {num_paragraphs:,}")
print(f"Total tokenized sentences: {len(tokenized_sentences):,}")


Processing dataset...
Limit: 50,000 paragraphs
Processed 1,000 paragraphs, 3,659 tokenized sentences
Processed 2,000 paragraphs, 7,246 tokenized sentences
Processed 3,000 paragraphs, 10,905 tokenized sentences
Processed 4,000 paragraphs, 14,838 tokenized sentences
Processed 5,000 paragraphs, 18,345 tokenized sentences
Processed 6,000 paragraphs, 22,270 tokenized sentences
Processed 7,000 paragraphs, 26,858 tokenized sentences
Processed 8,000 paragraphs, 30,450 tokenized sentences
Processed 9,000 paragraphs, 33,828 tokenized sentences
Processed 10,000 paragraphs, 37,437 tokenized sentences
Processed 11,000 paragraphs, 41,449 tokenized sentences
Processed 12,000 paragraphs, 44,698 tokenized sentences
Processed 13,000 paragraphs, 48,345 tokenized sentences
Processed 14,000 paragraphs, 52,187 tokenized sentences
Processed 15,000 paragraphs, 55,816 tokenized sentences
Processed 16,000 paragraphs, 59,375 tokenized sentences
Processed 17,000 paragraphs, 63,084 tokenized sentences
Processed 18

In [9]:
# Create DataFrame and save to Parquet format with compression

# Create DataFrame with tokenized sentences
df = pd.DataFrame({
    'tokenized_sentence': tokenized_sentences
})

# Output file name
output_file = 'telugu_tokenized_sentences.parquet'

print(f"Saving to {output_file}...")

# Save to parquet with compression
df.to_parquet(
    output_file,
    engine='pyarrow',           # Use PyArrow engine for better performance
    compression='snappy',       # Snappy compression: fast, good compression ratio
    index=False                 # Don't save DataFrame index
)

# Get file size
file_size_mb = os.path.getsize(output_file) / (1024 * 1024)

print(f"✓ Successfully saved {len(tokenized_sentences)} tokenized sentences")
print(f"✓ File: {output_file}")
print(f"✓ File size: {file_size_mb:.2f} MB")
print(f"✓ Compression: Snappy")


Saving to telugu_tokenized_sentences.parquet...
✓ Successfully saved 182239 tokenized sentences
✓ File: telugu_tokenized_sentences.parquet
✓ File size: 14.92 MB
✓ Compression: Snappy


In [10]:
# Verify the parquet file - Read back and display sample

print("Verifying parquet file...")
print("=" * 60)

# Read the parquet file
df_read = pd.read_parquet(output_file)

print(f"Total sentences in parquet file: {len(df_read)}")
print(f"\nFirst 10 tokenized sentences:")
print("-" * 60)

for idx, row in df_read.head(10).iterrows():
    print(f"{idx + 1}. {row['tokenized_sentence'][:100]}...")  # Show first 100 chars

print("\n" + "=" * 60)
print("✓ Parquet file verified successfully!")


Verifying parquet file...
Total sentences in parquet file: 182239

First 10 tokenized sentences:
------------------------------------------------------------
1. అమెరికా అధ్యక్షుడు డొనాల్డ్ ట్రంప్ కు రాష్ట్రపతి భవన్ వద్ద ఘనస్వాగతం లభించింది ....
2. ఆయనకు రాష్ట్రపతి రామ్ నాథ్ కోవింద్ దంపతులు , ప్రధాని మోదీ సాదరంగా ఆహ్వానం పలకడంతో పాటు సైనికులు గౌరవ...
3. ఇటు తెలంగాణలో కరోనా వైరస్ కారణంగా అన్ని దేవాలయాల్లో ముందస్తు చర్యలు చేపట్టారు ....
4. భద్రాద్రి రాముడికి కరోనా ఎఫెక్ట్ తగిలింది ....
5. ఏప్రిల్ 2 న భద్రాద్రిలో జరగనున్న శ్రీరామ నవమి వేడుకలను వెంటాడుతోంది కరోనా ....
6. రాష్ట్రంలో కొనసాగుతున్న కరోనా అలర్ట్ నేపథ్యంలో భక్తులు లేకుండానే శ్రీరామనవమి జరుపుతామని మంత్రి పువ్వ...
7. ప్రత్యేక మీడియా సమావేశం ఏర్పాటు చేసిన మంత్రి పువ్వాడ ఈ మేరకు స్పష్టం చేశారు ....
8. శ్రీ రాములవారి కల్యాణం టికెట్లు రద్దు చేస్తున్నామని ప్రకటించారు . ....
9. టికెట్ ‌ డబ్బు తిరిగి ఆలయ అధికారులు చెల్లిస్తారన్నారు ....
10. కరోనాపై ప్రజలు భయభ్రాంతులకు గురికావొదని , మరింత అప్రమత్తంగా ఉండాలని ఉండాలని సూచించారు ....

✓ Parqu

In [11]:
# Optional: Calculate and display statistics

if len(tokenized_sentences) > 0:
    # Calculate statistics
    all_tokens = []
    for sent in tokenized_sentences:
        all_tokens.extend(sent.split())
    
    total_tokens = len(all_tokens)
    unique_tokens = len(set(all_tokens))
    avg_sentence_length = total_tokens / len(tokenized_sentences) if tokenized_sentences else 0
    ttr = unique_tokens / total_tokens if total_tokens > 0 else 0
    
    print("Corpus Statistics:")
    print("=" * 60)
    print(f"Total sentences: {len(tokenized_sentences):,}")
    print(f"Total tokens: {total_tokens:,}")
    print(f"Unique tokens: {unique_tokens:,}")
    print(f"Average sentence length: {avg_sentence_length:.2f} tokens")
    print(f"Type-Token Ratio (TTR): {ttr:.4f}")
    print("=" * 60)


Corpus Statistics:
Total sentences: 182,239
Total tokens: 2,242,620
Unique tokens: 200,756
Average sentence length: 12.31 tokens
Type-Token Ratio (TTR): 0.0895
