In [30]:
from datasets import load_dataset
from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)

In [31]:
from build_tokenizer import *
from config import *

In [32]:
# Project configuration; later cells index it with 'lang_src' and 'lang_tgt'.
# `get_config` is not defined in this notebook — it arrives through one of the
# star imports above.
config = get_config()

In [33]:
# Load the 'train' split of the cfilt/iitb-english-hindi corpus.
# Each row is {'translation': {'en': ..., 'hi': ...}} (see the sample printed below).
ds_raw = load_dataset("cfilt/iitb-english-hindi", split= 'train')

In [29]:
# Source-language tokenizer, obtained from the project helper using the config,
# the raw dataset and the source-language key.
# Presumably loads a cached tokenizer or trains one on ds_raw — verify in build_tokenizer.
# NOTE(review): this cell's execution count (29) is lower than the cells above it,
# so it was last run in an earlier pass; re-run top to bottom to confirm it still works.
tokenizer_src = get_or_build_tokenizer(config, ds_raw, config['lang_src'])

In [36]:
# Target-language tokenizer, loaded from a pre-existing file in the working directory.
# NOTE(review): the source tokenizer goes through get_or_build_tokenizer, while this
# one depends on a hard-coded "tokenizer_hi.json" already being on disk. Confirm
# whether get_or_build_tokenizer(config, ds_raw, config['lang_tgt']) was intended.
tokenizer_tgt = Tokenizer.from_file("tokenizer_hi.json")

In [38]:
# Walk the whole raw corpus once and record the longest tokenised sentence
# (in token ids) seen on the source side and on the target side.
max_len_src, max_len_tgt = 0, 0
for item in ds_raw:
    pair = item['translation']
    src_ids = tokenizer_src.encode(pair[config['lang_src']]).ids
    tgt_ids = tokenizer_tgt.encode(pair[config['lang_tgt']]).ids
    if len(src_ids) > max_len_src:
        max_len_src = len(src_ids)
    if len(tgt_ids) > max_len_tgt:
        max_len_tgt = len(tgt_ids)

print(f'Max length of source sentence: {max_len_src}')
print(f'Max length of target sentence: {max_len_tgt}')

Max length of source sentence: 2176
Max length of target sentence: 2068


In [41]:
# Show the dataset summary followed by its first row, one per line.
print(ds_raw, ds_raw[0], sep='\n')

Dataset({
    features: ['translation'],
    num_rows: 1659083
})
{'translation': {'en': 'Give your application an accessibility workout', 'hi': 'अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें'}}


In [42]:
from datasets import Dataset  # not used by the filter below; kept so `Dataset` stays in scope for other cells

# Largest number of tokens (inclusive) a sentence may have on either side
# for its row to be kept.
max_length_threshold = 340


def _fits_threshold(item):
    """Return True when both sides of a translation pair are short enough.

    Args:
        item: one dataset row of the form {'translation': {<lang>: <text>, ...}}.

    Returns:
        bool: True if the tokenised source AND target sentences each contain
        at most ``max_length_threshold`` token ids.
    """
    pair = item['translation']
    src_ids = tokenizer_src.encode(pair[config['lang_src']]).ids
    tgt_ids = tokenizer_tgt.encode(pair[config['lang_tgt']]).ids
    return len(src_ids) <= max_length_threshold and len(tgt_ids) <= max_length_threshold


# Let the datasets library do the row selection instead of accumulating every
# surviving row in a Python list and rebuilding the dataset from a dict: this
# avoids holding ~1.6M row dicts in memory and keeps the original feature schema
# and row order.
ds_truncated = ds_raw.filter(_fits_threshold)

In [43]:
# Show the filtered dataset summary followed by its first row, one per line.
print(ds_truncated, ds_truncated[0], sep='\n')

Dataset({
    features: ['translation'],
    num_rows: 1658990
})
{'translation': {'en': 'Give your application an accessibility workout', 'hi': 'अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें'}}


In [44]:
# Re-measure the longest tokenised sentence on each side, this time over the
# filtered dataset, to check the length cap took effect.
max_len_src = max_len_tgt = 0
for item in ds_truncated:
    pair = item['translation']
    src_ids = tokenizer_src.encode(pair[config['lang_src']]).ids
    tgt_ids = tokenizer_tgt.encode(pair[config['lang_tgt']]).ids
    max_len_src, max_len_tgt = (
        max(max_len_src, len(src_ids)),
        max(max_len_tgt, len(tgt_ids)),
    )

print(f'Max length of source sentence: {max_len_src}')
print(f'Max length of target sentence: {max_len_tgt}')

Max length of source sentence: 335
Max length of target sentence: 333


In [45]:
# Persist the filtered dataset locally.
# NOTE(review): despite the ".json" suffix, save_to_disk writes a dataset directory
# of Arrow shards (see the "shards" progress output), not a JSON file; it is read
# back with datasets.load_from_disk. Consider a less misleading path name.
ds_truncated.save_to_disk("ds_truncated.json")


Saving the dataset (1/1 shards): 100%|██████████| 1658990/1658990 [00:00<00:00, 1836628.69 examples/s]


In [62]:
# Load the truncated dataset from the Hub, rebinding `ds_raw` (which previously
# held the unfiltered corpus).
# NOTE(review): this cell sits above the push_to_hub cell and was executed as
# In [62], before the push (In [64]); the identical load is repeated after the push.
# In a top-to-bottom run it only works if the dataset is already published, and
# re-running the earlier filtering cells afterwards would start from truncated data.
ds_raw = load_dataset("adilgupta/cfilt-iitb-en-hi-truncated", split= 'train')

In [53]:
# Re-inspect the filtered dataset: summary first, then the first row.
print(ds_truncated, ds_truncated[0], sep='\n')

Dataset({
    features: ['translation'],
    num_rows: 1658990
})
{'translation': {'en': 'Give your application an accessibility workout', 'hi': 'अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें'}}


In [61]:
# Spot-check one row of the filtered dataset by index; shows its en/hi sentence pair.
ds_truncated[98789]['translation']

{'en': 'Certificate Authority Trust', 'hi': 'प्रमाणपत्र प्राधिकार ट्रस्ट'}

In [63]:
# Display the filtered dataset's summary (features and row count) before uploading.
ds_truncated

Dataset({
    features: ['translation'],
    num_rows: 1658990
})

In [64]:
# Upload the filtered dataset to the Hugging Face Hub. Only the repo name is given;
# the commit URL in the output shows it resolved to the adilgupta/ namespace.
# Presumably requires an authenticated session with write access — TODO confirm
# how credentials are provided (none are set in this notebook).
ds_truncated.push_to_hub("cfilt-iitb-en-hi-truncated")

Creating parquet from Arrow format: 100%|██████████| 1659/1659 [00:03<00:00, 528.42ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:30<00:00, 30.55s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/adilgupta/cfilt-iitb-en-hi-truncated/commit/2c708567441fa8cf8cf905709c69cbde85323b1c', commit_message='Upload dataset', commit_description='', oid='2c708567441fa8cf8cf905709c69cbde85323b1c', pr_url=None, pr_revision=None, pr_num=None)

In [65]:
# Round-trip check: download the just-published truncated dataset from the Hub.
# Note that this rebinds `ds_raw`, so the name no longer refers to the unfiltered corpus.
ds_raw = load_dataset("adilgupta/cfilt-iitb-en-hi-truncated", split= 'train')

Downloading readme: 100%|██████████| 354/354 [00:00<?, ?B/s] 
Downloading data: 100%|██████████| 199M/199M [00:28<00:00, 6.95MB/s] 
Generating train split: 100%|██████████| 1658990/1658990 [00:05<00:00, 284943.27 examples/s]


In [67]:
# Display the reloaded dataset; its row count should equal ds_truncated's (1658990).
ds_raw

Dataset({
    features: ['translation'],
    num_rows: 1658990
})