In [1]:
pip install datasets 

Collecting datasets
  Downloading datasets-4.0.0-py3-none-any.whl.metadata (19 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp313-cp313-win_amd64.whl.metadata (13 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py312-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting huggingface-hub>=0.24.0 (from datasets)
  Downloading huggingface_hub-0.34.4-py3-none-any.whl.metadata (14 kB)
Downloading datasets-4.0.0-py3-none-any.whl (494 kB)
Downloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
Downloading multiprocess-0.70.16-py312-none-any.whl (146 kB)
Downloading huggingface_hub-0.34.4-py3-none-any.whl (561 kB)
   ---------------------------------------- 0.0/561.5 kB ? eta -:--:--
   ---------------------------------------- 561.5/561.5 kB 4.8 MB/s eta 0:00:00
Downloading xxhash-3.5.0-cp313-cp313-

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
s3fs 2025.3.2 requires fsspec==2025.3.2.*, but you have fsspec 2025.3.0 which is incompatible.


In [2]:
import re
def sentence_tokenizer(paragraph):
  pattern='[\u0964!?.][^0-9.0-9]'
  sentences=re.split(pattern,paragraph)
  ends=re.findall(pattern,paragraph)
  res=[]
  for sentence in sentences:
    if ends==[]:
      break
    end=ends.pop(0)
    res.append(sentence.strip() + end)
  return res


def word_tokenizer(sentence):
    url_pattern = r'https?://(?:www\.)?[a-zA-Z0-9-]+\.[a-zA-Z]{2,}'
    email_pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'

    # Phone numbers (with optional +91 or +९१)
    phone_pattern = r'(?:\+?[0-9\u0966-\u096F]{1,3}[\s-]?)?[0-9\u0966-\u096F]{10}'

    # Time format: 10:30 or १०:३०
    time_pattern = r'[0-9\u0966-\u096F]{1,2}:[0-9\u0966-\u096F]{2}'

    # Decimal and whole numbers
    latin_number = r'[0-9]+(?:\.[0-9]+)?'
    devnag_number = r'[\u0966-\u096F]+(?:\.[\u0966-\u096F]+)?'

    # Punctuation
    punctuation = r'[!?\.\u0964\,"\'\+-]'

    # Hindi / Devanagari words
    hindi_word = fr'[\u0900-\u0963\u0965-\u096F]+[^!?\.\u0964\,"\'\+-]'
    english_word = r'[a-zA-Z]+'


    # Emojis (basic unicode emoji range)
    emoji_pattern = r'[\U0001F300-\U0001FAFF\U00002700-\U000027BF]'

    # Final combined pattern
    pattern = fr'{email_pattern}|{url_pattern}|{phone_pattern}|{time_pattern}|{latin_number}|{devnag_number}|{punctuation}|{hindi_word}|{emoji_pattern}|{english_word}'

    return re.findall(pattern, sentence)




In [3]:
from datasets import load_dataset
from tqdm import tqdm
import re
import csv

In [6]:
dataset = load_dataset("ai4bharat/IndicCorpV2", "indiccorp_v2", split="hin_Deva", streaming=True)

In [7]:
dataset

IterableDataset({
    features: Unknown,
    num_shards: 3
})

In [8]:
tokenized_words_csv = "hindi_tokenized_words.csv"           
reconstructed_csv = "hindi_reconstructed_sentences.csv"     
re_sentence_csv = "hindi_re_sentence_tokenized.csv" 

In [9]:
max_lines = 10_000_000   
log_every = 100_000      
flush_every = 10_000 

In [None]:
with open(tokenized_words_csv, "w", encoding="utf-8", newline='') as words_file, \
     open(reconstructed_csv, "w", encoding="utf-8", newline='') as recon_file, \
     open(re_sentence_csv, "w", encoding="utf-8", newline='') as re_sent_file:

    words_writer = csv.writer(words_file)
    recon_writer = csv.writer(recon_file)
    re_sent_writer = csv.writer(re_sent_file)

    # Headers
    words_writer.writerow(["tokens"])
    recon_writer.writerow(["reconstructed_sentence"])
    re_sent_writer.writerow(["re_sentence"])

    count_original_sentences = 0
    count_re_sentences = 0

    pbar = tqdm(total=max_lines, desc="Processed original sentences", unit="sent")

    for row in dataset:                       # streaming dataset
        text = row.get("text", "") or ""
        sentences = sentence_tokenizer(text)  # step 1: sentence-tokenize paragraph

        for sent in sentences:
            # --- step 2: word-tokenize this sentence
            tokens = word_tokenizer(sent)

            # --- step 2a: save tokenized words (space-joined)
            token_line = " ".join(tokens)
            words_writer.writerow([token_line])

            # --- step 3: reconstruct sentence from tokens and save
            reconstructed = token_line  # joining tokens with space
            recon_writer.writerow([reconstructed])

            # --- step 4: sentence-tokenize the reconstructed string and save each re-sentence
            re_sentences = sentence_tokenizer(reconstructed)
            if re_sentences:
                for rs in re_sentences:
                    re_sent_writer.writerow([rs])
                    count_re_sentences += 1
            else:
                # If sentence_tokenizer returns empty, still store reconstructed as fallback
                re_sent_writer.writerow([reconstructed])
                count_re_sentences += 1

            # --- counting, progress, and flushing
            count_original_sentences += 1
            pbar.update(1)

            if count_original_sentences % flush_every == 0:
                words_file.flush(); recon_file.flush(); re_sent_file.flush()

            if count_original_sentences % log_every == 0:
                print(f"[LOG] original sentences processed: {count_original_sentences:,} | re-sentences written: {count_re_sentences:,}")

            # stop condition (based on original sentences)
            if count_original_sentences >= max_lines:
                break

        if count_original_sentences >= max_lines:
            break

    pbar.close()


Processed original sentences:   1%|▍                                    | 103222/10000000 [00:45<16:29, 10003.54sent/s]

[LOG] original sentences processed: 100,000 | re-sentences written: 101,167


Processed original sentences:   2%|▋                                    | 169015/10000000 [01:00<10:36, 15439.52sent/s]