In [1]:
import re
import os

In [2]:

# Gujarati-aware regex patterns, kept here as plain pattern strings.
# NOTE: EMAIL_PATTERN, URL_PATTERN and WORD_PATTERN are rebound to compiled
# patterns by a later cell in this notebook.
EMAIL_PATTERN = r'\b[\w.-]+@[\w.-]+\.\w+\b'
URL_PATTERN = r'https?://\S+\.[a-zA-Z]+|www\.\S+\.[a-zA-Z]+'
DECIMAL_PATTERN = r'\d+\.\d+'
# Two 1-2 digit fields followed by a 2-4 digit field, or a 4-digit field
# followed by two 1-2 digit fields; separators are '-' or '/', digits may be
# ASCII or Gujarati (U+0AE6-U+0AEF).
# The pattern deliberately contains no capturing group, so re.findall()
# returns the whole matched date rather than a fragment of it.
DATE = (
    r'(?:[0-9\u0AE6-\u0AEF]{1,2}[-/]){2}[0-9\u0AE6-\u0AEF]{2,4}'
    r'|[0-9\u0AE6-\u0AEF]{4}[-/][0-9\u0AE6-\u0AEF]{1,2}[-/][0-9\u0AE6-\u0AEF]{1,2}'
)
# Any single code point of the Gujarati Unicode block.
GUJARATI_LETTER = r'[\u0A80-\u0AFF]'
# Token = run of Gujarati-block characters | run of ASCII alphanumerics |
# one punctuation mark.
WORD_PATTERN = r'[\u0A80-\u0AFF]+|[a-zA-Z0-9]+|[.,!?;:"\'“”‘’…—–-]'

In [7]:
# # File names
# # input_file = "gu_meta_part_1.txt"
# # sentence_file = "gu_meta_part_1.txt_sentences.txt"
# # word_file = "gu_meta_part_1.txt_words.txt"
# # recombined_file = "gu_meta_part_1.txt_recombined.txt"
# # email_file = "gu_meta_part_1.txt_emails.txt"
# # url_file = "gu_meta_part_1.txt_url.txt"
# Empty placeholders for the six path variables that process() reads;
# the run cells further down assign the real file names.
input_file = sentence_file = word_file = recombined_file = email_file = url_file = ""
# # Batch size (smaller = faster regex)
# BATCH_SIZE = 10000

# # Gujarati-aware regex patterns (compiled for speed)
# EMAIL_PATTERN = re.compile(r'\b[\w.-]+@[\w.-]+\.\w+\b')
# URL_PATTERN = re.compile(r'https?://\S+\.[a-zA-Z]+|www\.\S+\.[a-zA-Z]+')
# DECIMAL_PATTERN = re.compile(r'\d+\.\d+')
# DATE_PATTERN = re.compile(
#     r'(?:[0-9\u0AE6-\u0AEF]{1,2}[-/]){2}[0-9\u0AE6-\u0AEF]{2,4}'
#     r'|(?:[0-9\u0AE6-\u0AEF]{4}[-/])([0-9\u0AE6-\u0AEF]{1,2})[-/][0-9\u0AE6-\u0AEF]{1,2}'
# )
# GUJARATI_LETTER = re.compile(r'[\u0A80-\u0AFF]')
# WORD_PATTERN = re.compile(r'[\u0A80-\u0AFF]+|[a-zA-Z0-9]+|[.,!?;:"\'“”‘’…—–-]')

# def process():
#     with open(input_file, 'r', encoding='utf-8') as fin, \
#          open(sentence_file, 'w', encoding='utf-8') as f_sent, \
#          open(word_file, 'w', encoding='utf-8') as f_word, \
#          open(recombined_file, 'w', encoding='utf-8') as f_recombined, \
#          open(email_file, 'w', encoding='utf-8') as f_email, \
#          open(url_file, 'w', encoding='utf-8') as f_url:

#         batch = []
#         batch_num = 1

#         for line_num, line in enumerate(fin, 1):
#             batch.append(line)

#             if line_num % BATCH_SIZE == 0:
#                 process_batch(batch, batch_num, f_sent, f_word, f_recombined, f_email, f_url)
#                 batch.clear()
#                 batch_num += 1

#         if batch:
#             process_batch(batch, batch_num, f_sent, f_word, f_recombined, f_email, f_url)

# def process_batch(batch_lines, batch_num, f_sent, f_word, f_recombined, f_email, f_url):
#     # Email & URL extraction line-by-line
#     for line in batch_lines:
#         for e in EMAIL_PATTERN.findall(line):
#             f_email.write(e + "\n")
#         for u in URL_PATTERN.findall(line):
#             f_url.write(u + "\n")

#     # Sentence splitting
#     batch_text = " ".join(batch_lines)
#     sentences = re.split(r'(?<=[.!?।])\s+', batch_text)

#     all_tokens = []
#     for sent in sentences:
#         if not sent.strip():
#             continue
#         tokens = WORD_PATTERN.findall(sent)
#         f_sent.write(" ".join(tokens) + "\n")
#         f_recombined.write(" ".join(tokens) + "\n")
#         all_tokens.extend(tokens)

#     # Write words
#     for word in all_tokens:
#         f_word.write(word + "\n")

#     print(f"✅ Processed Batch {batch_num} - {len(batch_lines)} lines")

# # Run processing
# # process()


# import re

# Files
# input_file = "gu_meta_part_3.txt"
# sentence_file = "gu_meta_part_3_sentences.txt"
# word_file = "gu_meta_part_3_words.txt"
# recombined_file = "gu_meta_part_3_recombined.txt"
# email_file = "gu_meta_part_3_emails.txt"
# url_file = "gu_meta_part_3_url.txt"

BATCH_SIZE = 10000  # smaller batches for regex speed

# Gujarati-aware regex patterns (compiled once)
EMAIL_PATTERN = re.compile(r'\b[\w.-]+@[\w.-]+\.\w+\b')
URL_PATTERN = re.compile(r'https?://\S+\.[a-zA-Z]+|www\.\S+\.[a-zA-Z]+')
# Token = run of Gujarati-block characters | run of ASCII alphanumerics |
# one punctuation mark. The danda (।) is part of the punctuation class because
# the sentence splitter in process_batch treats it as a terminator alongside
# '.', '!' and '?'; without it, danda-terminated sentences would lose their
# final mark while the others keep theirs.
WORD_PATTERN = re.compile(r'[\u0A80-\u0AFF]+|[a-zA-Z0-9]+|[.,!?।;:"\'“”‘’…—–-]')

def process():
    """Stream the input file through ``process_batch`` in fixed-size batches.

    Reads the module-level path variables ``input_file``, ``sentence_file``,
    ``word_file``, ``recombined_file``, ``email_file`` and ``url_file`` at call
    time. The five output files are opened in write mode, so any existing
    content is replaced. Every ``BATCH_SIZE`` input lines are handed to
    ``process_batch`` together with a running batch number.
    """
    with open(input_file, 'r', encoding='utf-8') as fin, \
         open(sentence_file, 'w', encoding='utf-8') as f_sent, \
         open(word_file, 'w', encoding='utf-8') as f_word, \
         open(recombined_file, 'w', encoding='utf-8') as f_recombined, \
         open(email_file, 'w', encoding='utf-8') as f_email, \
         open(url_file, 'w', encoding='utf-8') as f_url:

        sinks = (f_sent, f_word, f_recombined, f_email, f_url)
        pending = []
        batch_index = 1

        for count, text_line in enumerate(fin, start=1):
            pending.append(text_line)
            if count % BATCH_SIZE != 0:
                continue
            # A full batch has accumulated: process it and start a new one.
            process_batch(pending, batch_index, *sinks)
            pending.clear()
            batch_index += 1

        # Flush the last, partially filled batch (if any lines remain).
        if pending:
            process_batch(pending, batch_index, *sinks)

def process_batch(batch_lines, batch_num, f_sent, f_word, f_recombined, f_email, f_url):
    """Extract emails, URLs and tokenized sentences from one batch of lines.

    Args:
        batch_lines: list of raw text lines from the input file.
        batch_num: batch counter; used only in the progress message.
        f_sent: writable text file; receives one line per sentence, tokens
            separated by single spaces.
        f_word: writable text file; receives one token per line.
        f_recombined: writable text file; receives the same lines as f_sent.
        f_email: writable text file; receives one email match per line.
        f_url: writable text file; receives one URL match per line.

    Notes:
        Sentences are split on the joined text of this batch only, so a
        sentence that continues into the next batch is emitted in two pieces.
        Sentences that produce no tokens are skipped, so no blank lines are
        written to the sentence/recombined files.
    """
    # Emails and URLs are matched per raw input line.
    emails = []
    urls = []
    for line in batch_lines:
        emails.extend(EMAIL_PATTERN.findall(line))
        urls.extend(URL_PATTERN.findall(line))

    # Split after a terminator (. ! ? ।) that is followed by whitespace.
    batch_text = " ".join(batch_lines)
    sentences = re.split(r'(?<=[.!?।])\s+', batch_text)

    sentence_lines = []
    word_lines = []
    for sent in sentences:
        tokens = WORD_PATTERN.findall(sent)
        if not tokens:
            # Covers whitespace-only fragments and fragments made solely of
            # characters the tokenizer does not recognize.
            continue
        sentence_lines.append(" ".join(tokens) + "\n")
        word_lines.extend(token + "\n" for token in tokens)

    # Write buffered results (one write call per output file).
    if emails:
        f_email.write("\n".join(emails) + "\n")
    if urls:
        f_url.write("\n".join(urls) + "\n")
    if sentence_lines:
        # The sentence and recombined outputs carry identical content.
        f_sent.writelines(sentence_lines)
        f_recombined.writelines(sentence_lines)
    if word_lines:
        f_word.writelines(word_lines)

    print(f"✅ Processed Batch {batch_num} - {len(batch_lines)} lines")

# Run
# process()


In [5]:

# input_file = "gu_meta_part_1.txt"
# sentence_file = "gu_meta_part_1.txt_sentences.txt"
# word_file = "gu_meta_part_1.txt_words.txt"
# recombined_file = "gu_meta_part_1.txt_recombined.txt"
# email_file = "gu_meta_part_1.txt_emails.txt"
# url_file="gu_meta_part_1.txt_url.txt"

# process()

In [6]:
# File paths for part 2. process() reads these module-level names when called.
input_file = "gu_meta_part_2.txt"
# Uses the same "<part>_<kind>.txt" naming as every other output file
# (the underscore before "sentences" was previously missing).
sentence_file = "gu_meta_part_2_sentences.txt"
word_file = "gu_meta_part_2_words.txt"
recombined_file = "gu_meta_part_2_recombined.txt"
email_file = "gu_meta_part_2_emails.txt"
url_file = "gu_meta_part_2_url.txt"

process()

✅ Processed Batch 1 - 10000 lines
✅ Processed Batch 2 - 10000 lines
✅ Processed Batch 3 - 10000 lines
✅ Processed Batch 4 - 10000 lines
✅ Processed Batch 5 - 10000 lines
✅ Processed Batch 6 - 10000 lines
✅ Processed Batch 7 - 4992 lines


In [8]:
# Part 3: bind the six module-level path names that process() reads, then run.
(
    input_file,
    sentence_file,
    word_file,
    recombined_file,
    email_file,
    url_file,
) = (
    "gu_meta_part_3.txt",
    "gu_meta_part_3_sentences.txt",
    "gu_meta_part_3_words.txt",
    "gu_meta_part_3_recombined.txt",
    "gu_meta_part_3_emails.txt",
    "gu_meta_part_3_url.txt",
)

process()

✅ Processed Batch 1 - 10000 lines
✅ Processed Batch 2 - 10000 lines
✅ Processed Batch 3 - 10000 lines
✅ Processed Batch 4 - 10000 lines
✅ Processed Batch 5 - 10000 lines
✅ Processed Batch 6 - 10000 lines
✅ Processed Batch 7 - 2890 lines


In [6]:
# Part 4: bind the six module-level path names that process() reads, then run.
(
    input_file,
    sentence_file,
    word_file,
    recombined_file,
    email_file,
    url_file,
) = (
    "gu_meta_part_4.txt",
    "gu_meta_part_4_sentences.txt",
    "gu_meta_part_4_words.txt",
    "gu_meta_part_4_recombined.txt",
    "gu_meta_part_4_emails.txt",
    "gu_meta_part_4_url.txt",
)

process()

✅ Processed Batch 1 - 10000 lines
✅ Processed Batch 2 - 10000 lines
✅ Processed Batch 3 - 1109 lines


In [None]:
# # Corpus Statistics Calculation

# # Read tokenized files
# with open("sentences.txt", 'r', encoding='utf-8') as f:
#     tokenized_sentences = [line.strip() for line in f if line.strip()]

# with open("words.txt", 'r', encoding='utf-8') as f:
#     tokenized_words = [line.strip() for line in f if line.strip()]

# with open("emails.txt", 'r', encoding='utf-8') as f:
#     email = [line.strip() for line in f if line.strip()]

# with open("url.txt", 'r', encoding='utf-8') as f:
#     url = [line.strip() for line in f if line.strip()]

# # i. Total number of sentences
# total_sentences = len(tokenized_sentences)

# # ii. Total number of words
# total_words = len(tokenized_words)

# # iii. Total number of characters (excluding whitespace)
# total_characters = sum(len(token) for token in tokenized_words)

# # iv. Average Sentence Length
# avg_sentence_length = total_words / total_sentences if total_sentences else 0

# # v. Average Word Length
# avg_word_length = total_characters / total_words if total_words else 0

# # vi. Type/Token Ratio
# unique_tokens = len(set(tokenized_words))
# ttr = unique_tokens / total_words if total_words else 0

# # 📊 Print Results
# print(" Statistics")
# print(f"Total Sentences           : {total_sentences}")
# print(f"Total Words               : {total_words}")
# print(f"Total Emails              : {len(email)}")
# print(f"Total URLs                : {len(url)}")
# print(f"Total Characters          : {total_characters}")
# print(f"Average Sentence Length   : {avg_sentence_length:.2f} words/sentence")
# print(f"Average Word Length       : {avg_word_length:.2f} chars/word")
# print(f"Type/Token Ratio (TTR)    : {ttr:.4f}")


In [None]:
from pathlib import Path

# NOTE(review): this rebinds the same module-level BATCH_SIZE that process()
# (defined in an earlier cell) reads at call time, so running this cell also
# changes the batch size used by any later process() call.
BATCH_SIZE = 50000  # Adjust based on memory and speed needs

def process_file_stats(file_path, count_chars=False, collect_unique=False):
    """Stream a one-token-per-line text file and accumulate simple statistics.

    Each line is stripped of surrounding whitespace; lines that are empty after
    stripping are ignored, every other line counts as one token.

    Args:
        file_path: path of a UTF-8 encoded text file.
        count_chars: when true, sum the length of every stripped token.
        collect_unique: when true, collect the distinct stripped tokens.

    Returns:
        A tuple ``(total_lines, total_chars, unique_tokens)``:
        the number of non-blank lines, the summed token length (0 unless
        ``count_chars``), and the set of distinct tokens (``None`` unless
        ``collect_unique``).
    """
    total_lines = 0
    total_chars = 0
    unique_tokens = set() if collect_unique else None

    # The file object is iterated lazily line by line, so no explicit batch
    # buffer is needed to keep memory bounded.
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            token = line.strip()
            if not token:
                continue
            total_lines += 1
            if count_chars:
                total_chars += len(token)
            if collect_unique:
                unique_tokens.add(token)

    return total_lines, total_chars, unique_tokens


def count_non_empty_lines(file_path):
    """Return how many lines of a UTF-8 text file contain non-whitespace text."""
    with open(file_path, 'r', encoding='utf-8') as handle:
        return sum(1 for text in handle if text.strip())


# --- Sentences: one non-empty line per sentence ---
total_sentences = count_non_empty_lines("sentences.txt")

# --- Words: token count, character total and distinct tokens ---
total_words, total_chars, unique_tokens = process_file_stats(
    "words.txt",
    count_chars=True,
    collect_unique=True,
)

# --- Emails & URLs: one non-empty line per match ---
total_emails = count_non_empty_lines("emails.txt")
total_urls = count_non_empty_lines("url.txt")

# --- Derived ratios (0 whenever the denominator is 0) ---
avg_sentence_len = total_words / total_sentences if total_sentences else 0
if total_words:
    avg_word_len = total_chars / total_words
    ttr = len(unique_tokens) / total_words
else:
    avg_word_len = 0
    ttr = 0

# --- Report: a single print call, one statistic per output line ---
print(
    "📊 Corpus Statistics (Batch Mode)",
    f"Total Sentences         : {total_sentences}",
    f"Total Words             : {total_words}",
    f"Total Emails            : {total_emails}",
    f"Total URLs              : {total_urls}",
    f"Total Characters        : {total_chars}",
    f"Average Sentence Length : {avg_sentence_len:.2f} words/sentence",
    f"Average Word Length     : {avg_word_len:.2f} chars/word",
    f"Type/Token Ratio (TTR)  : {ttr:.4f}",
    sep="\n",
)


KeyboardInterrupt: 