In [1]:
import re
import os

In [2]:


# Gujarati-aware regex patterns
# E-mail addresses: word characters, dots and hyphens around a single '@',
# ending in a dot-separated suffix.
EMAIL_PATTERN = r'\b[\w.-]+@[\w.-]+\.\w+\b'
# URLs starting with http://, https:// or www. and ending in an alphabetic suffix.
URL_PATTERN = r'https?://\S+\.[a-zA-Z]+|www\.\S+\.[a-zA-Z]+'
# Decimal numbers written with ASCII digits, e.g. 3.14.
DECIMAL_PATTERN = r'\d+\.\d+'
# Dates written with ASCII or Gujarati digits (U+0AE6-U+0AEF), separated by
# '-' or '/': either d/m/y (1-2, 1-2, 2-4 digits) or y/m/d (4, 1-2, 1-2 digits).
# The pattern deliberately contains no capturing group: re.findall() returns
# the group contents instead of the whole match as soon as one is present.
DATE = r'(?:[0-9\u0AE6-\u0AEF]{1,2}[-/]){2}[0-9\u0AE6-\u0AEF]{2,4}|[0-9\u0AE6-\u0AEF]{4}[-/][0-9\u0AE6-\u0AEF]{1,2}[-/][0-9\u0AE6-\u0AEF]{1,2}'
# Any single code point in the Gujarati Unicode block.
GUJARATI_LETTER = r'[\u0A80-\u0AFF]'
# One token is a run of Gujarati code points, a run of ASCII letters/digits,
# or a single punctuation mark from the listed set.
WORD_PATTERN = r'[\u0A80-\u0AFF]+|[a-zA-Z0-9]+|[.,!?;:"\'“”‘’…—–-]'

# File paths
input_file = "gu.txt"
sentence_file = "sentences.txt"
word_file = "words.txt"
recombined_file = "recombined.txt"
email_file = "emails.txt"
url_file = "url.txt"


# Batch size: number of input lines handed to one processing call.
BATCH_SIZE = 50000

def process_batch(batch_lines, batch_num):
    """Extract e-mails, URLs, sentences and tokens from one batch of lines.

    The lines are joined with newlines into a single text. E-mail and URL
    matches are appended to ``email_file`` and ``url_file``. The text is then
    split after ``.``, ``!``, ``?`` or ``।`` when followed by whitespace; each
    non-blank piece is tokenized with ``WORD_PATTERN``. Results are appended
    to ``sentence_file`` (space-joined tokens, one sentence per line),
    ``word_file`` (one token per line) and ``recombined_file`` (the same
    lines as ``sentence_file``).

    Args:
        batch_lines: Sequence of text lines that make up this batch.
        batch_num: Batch number; used only in the progress message.
    """
    def append_lines(path, entries):
        # Append mode: the file is created if missing, even for no entries.
        with open(path, 'a', encoding='utf-8') as out:
            for entry in entries:
                out.write(entry + "\n")

    text = "\n".join(batch_lines)

    # Both searches run before anything is written; e-mails go out first.
    found_emails = re.findall(EMAIL_PATTERN, text)
    found_urls = re.findall(URL_PATTERN, text)
    append_lines(email_file, found_emails)
    append_lines(url_file, found_urls)

    # Tokenize every non-blank sentence-like piece of the text.
    token_lists = [
        re.findall(WORD_PATTERN, piece)
        for piece in re.split(r'(?<=[.!?।])\s+', text)
        if piece.strip()
    ]
    sentence_lines = [" ".join(tokens).strip() for tokens in token_lists]
    word_lines = [token.strip() for tokens in token_lists for token in tokens]

    # Output files, written in this order: sentences, words, recombined.
    append_lines(sentence_file, sentence_lines)
    append_lines(word_file, word_lines)
    append_lines(recombined_file, sentence_lines)

    print(f"Processed Batch {batch_num} - {len(batch_lines)} lines")

# Process file in batches
with open(input_file, 'r', encoding='utf-8') as f:
    # process_batch() only ever appends, so start each run from empty output
    # files; otherwise re-running this cell stacks another copy of the corpus
    # onto the previous results and inflates any statistics read from them.
    # Truncation happens only after the input opened successfully, so a
    # missing input file does not wipe earlier results.
    for output_path in (email_file, url_file, sentence_file, word_file, recombined_file):
        with open(output_path, 'w', encoding='utf-8'):
            pass

    batch = []
    batch_num = 1

    for line_num, line in enumerate(f, 1):
        batch.append(line)
        # Hand off a full batch every BATCH_SIZE lines to bound memory use.
        if line_num % BATCH_SIZE == 0:
            process_batch(batch, batch_num)
            batch = []
            batch_num += 1

    # Process remaining lines
    if batch:
        process_batch(batch, batch_num)



Processed Batch 1 - 50000 lines
Processed Batch 2 - 50000 lines
Processed Batch 3 - 50000 lines
Processed Batch 4 - 50000 lines
Processed Batch 5 - 50000 lines
Processed Batch 6 - 50000 lines
Processed Batch 7 - 50000 lines
Processed Batch 8 - 50000 lines
Processed Batch 9 - 50000 lines
Processed Batch 10 - 50000 lines
Processed Batch 11 - 50000 lines
Processed Batch 12 - 50000 lines
Processed Batch 13 - 50000 lines
Processed Batch 14 - 50000 lines
Processed Batch 15 - 50000 lines
Processed Batch 16 - 50000 lines
Processed Batch 17 - 50000 lines
Processed Batch 18 - 50000 lines
Processed Batch 19 - 50000 lines
Processed Batch 20 - 50000 lines
Processed Batch 21 - 50000 lines
Processed Batch 22 - 50000 lines
Processed Batch 23 - 50000 lines
Processed Batch 24 - 50000 lines
Processed Batch 25 - 50000 lines
Processed Batch 26 - 50000 lines
Processed Batch 27 - 50000 lines
Processed Batch 28 - 50000 lines
Processed Batch 29 - 50000 lines
Processed Batch 30 - 50000 lines
Processed Batch 31 

In [None]:
# Corpus Statistics Calculation

# Load the tokenizer output, stripping each line and dropping blank ones.
with open("sentences.txt", 'r', encoding='utf-8') as f:
    tokenized_sentences = list(filter(None, map(str.strip, f)))

with open("words.txt", 'r', encoding='utf-8') as f:
    tokenized_words = list(filter(None, map(str.strip, f)))


# i. / ii. Sentence and word counts are the number of non-blank lines.
total_sentences = len(tokenized_sentences)
total_words = len(tokenized_words)

# iii. Characters are counted over tokens only, so whitespace is excluded.
total_characters = sum(map(len, tokenized_words))

# iv. Average sentence length in words (0 when there are no sentences).
avg_sentence_length = 0
if total_sentences:
    avg_sentence_length = total_words / total_sentences

# v. / vi. Average word length and type/token ratio (0 when there are no words).
unique_tokens = len(set(tokenized_words))
avg_word_length = 0
ttr = 0
if total_words:
    avg_word_length = total_characters / total_words
    ttr = unique_tokens / total_words

# Print the report, one line per statistic.
print(
    " Statistics",
    f"Total Sentences           : {total_sentences}",
    f"Total Words               : {total_words}",
    f"Total Characters          : {total_characters}",
    f"Average Sentence Length   : {avg_sentence_length:.2f} words/sentence",
    f"Average Word Length       : {avg_word_length:.2f} chars/word",
    f"Type/Token Ratio (TTR)    : {ttr:.4f}",
    sep="\n",
)


