In [1]:
from datasets import load_dataset
import re
#Load Telugu dataset
dataset = load_dataset("ai4bharat/IndicCorpV2", name="indiccorp_v2", split="tel_Telu", streaming=True)

# Small Telugu stopword list (expand if needed)
telugu_stopwords = {
    "మరియు", "కాని", "లేక", "ఇది", "అది", "ఒక",
    "లో", "పై", "తో", "కి", "వంటి", "అని"
}

# Function to clean text
def clean_text(text):
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    # Tokenize
    words = text.split(" ")
    # Remove stopwords
    words = [w for w in words if w not in telugu_stopwords]
    return " ".join(words)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
#Save cleaned dataset to file
with open("telugu_dataset.txt", "w", encoding="utf-8") as f:
    for i, example in enumerate(dataset):
        if example["text"].strip():
            cleaned = clean_text(example["text"])
            if cleaned:
                f.write(cleaned + "\n")
        # ⚠️ To avoid huge file, limit to first 50k sentences
        if i >= 50000:
            break

In [3]:
#Read back the file
with open("telugu_dataset.txt", "r", encoding="utf-8") as f:
    text_data = f.read()


In [4]:
#Sentence Splitting
sentence_pattern = re.compile(r'(?<=[.!?])\s+')
all_sentences = sentence_pattern.split(text_data)
no_of_sentences = len(all_sentences)


In [5]:
#Tokenization
token_pattern = re.compile(
    r'\bhttps?://\S+|'                  # URLs
    r'\b[\w\.-]+@[\w\.-]+\.\w{2,4}\b|'  # email addresses
    r'\d{1,2}[/-]\d{1,2}[/-]\d{2,4}|'   # dates
    r'\d+\.\d+|\d+|'                    # numbers
    r'[\u0C00-\u0C7F]+|'                # Telugu words
    r'[^\s\w\u0C00-\u0C7F]'             # punctuation
)

all_tokens = token_pattern.findall(text_data)
no_of_words = len(all_tokens)
unique_tokens = len(set(all_tokens))


In [6]:

#Character-level Analysis
char_pattern = re.compile(r'[\u0C00-\u0C7F0-9.,!?;:"\'()\[\]{}\-—–…\s]')
all_chars = char_pattern.findall(text_data)
no_of_chars = len(all_chars)


In [7]:
Average_word_length = no_of_chars / no_of_words if no_of_words else 0
Average_sentence_length = no_of_words / no_of_sentences if no_of_sentences else 0
TTR = unique_tokens / no_of_words if no_of_words else 0

print("Number of sentences:", no_of_sentences)
print("Total words/tokens:", no_of_words)
print("Unique tokens:", unique_tokens)
print("Number of characters:", no_of_chars)
print("Average word length:", Average_word_length)
print("Average sentence length:", Average_sentence_length)
print("TTR:", TTR)

Number of sentences: 85870
Total words/tokens: 1114662
Unique tokens: 130760
Number of characters: 6827880
Average word length: 6.125516075725198
Average sentence length: 12.980808198439501
TTR: 0.11730910356682116
