In [23]:
#!pip install transformers datasets sentencepiece

### loading & preprocessing of the dataset

In [2]:
from datasets import load_dataset

# Pull the first 23% of the train split of the English-Hindi parallel corpus.
raw_dataset = load_dataset(
    "Aarif1430/english-to-hindi",
    split="train[:23%]",
)
raw_dataset  # bare last expression -> notebook displays the Dataset summary

README.md:   0%|          | 0.00/2.23k [00:00<?, ?B/s]

(…)-00000-of-00001-71c2cec7402cd444.parquet:   0%|          | 0.00/21.7M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/127705 [00:00<?, ? examples/s]

Dataset({
    features: ['english_sentence', 'hindi_sentence'],
    num_rows: 29372
})

In [4]:
# Peek at one record to see how an English/Hindi sentence pair is stored.
raw_dataset[100]

{'english_sentence': 'politicians do not have permission to do what needs to be done.',
 'hindi_sentence': 'राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह करने कि अनुमति नहीं है .'}

### tokenizer

In [5]:
from transformers import AutoTokenizer

# Fetch the tokenizer stored with the "barghavani/English_to_Hindi" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("barghavani/English_to_Hindi")

# Smoke test: encode a single word and show the resulting ids / attention mask.
sample_encoding = tokenizer("Paneer")
print(sample_encoding)

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

tokenizer_config.json:   0%|          | 0.00/818 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/812k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/1.07M [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/74.0 [00:00<?, ?B/s]

{'input_ids': [44, 24847, 581, 0], 'attention_mask': [1, 1, 1, 1]}




In [6]:
max_length = 128  # cap (in tokens) applied to both source and target sequences


def preprocess(example):
  """Tokenize a batch of English/Hindi sentence pairs for seq2seq training.

  Args:
    example: A batch as passed by ``Dataset.map(..., batched=True)``,
      exposing the ``"english_sentence"`` and ``"hindi_sentence"`` columns
      as sequences of strings.

  Returns:
    The tokenizer output for the English text (``input_ids``,
    ``attention_mask``) plus a ``"labels"`` entry holding the token ids of
    the Hindi text.
  """
  sources = list(example["english_sentence"])
  targets = list(example["hindi_sentence"])

  # `text_target` encodes the targets with the target-side vocabulary and
  # stores their ids under "labels"; it replaces the deprecated
  # `as_target_tokenizer()` context manager.
  # `truncation=True` makes `max_length` an explicit cap instead of relying on
  # the implicit fallback that emits a warning.
  return tokenizer(
      sources,
      text_target=targets,
      max_length=max_length,
      truncation=True,
  )

# Tokenize the whole dataset batch-wise; the raw text columns are kept
# alongside the new input_ids / attention_mask / labels columns.
tokenized_dataset = raw_dataset.map(function=preprocess, batched=True)
print(tokenized_dataset)

Map:   0%|          | 0/29372 [00:00<?, ? examples/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Dataset({
    features: ['english_sentence', 'hindi_sentence', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 29372
})


### train-test split

In [7]:
# Hold out 10% of the tokenized examples for evaluation. A fixed seed keeps the
# shuffle-and-split identical across kernel restarts, so results are comparable.
SPLIT_SEED = 42
dataset = tokenized_dataset.train_test_split(test_size=0.1, seed=SPLIT_SEED)
train_dataset = dataset["train"]
test_dataset = dataset["test"]
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['english_sentence', 'hindi_sentence', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 26434
    })
    test: Dataset({
        features: ['english_sentence', 'hindi_sentence', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 2938
    })
})


### getting the base model

In [8]:
from transformers import AutoModelForSeq2SeqLM

# Start from the pretrained checkpoint (same repo id the tokenizer was loaded from).
model = AutoModelForSeq2SeqLM.from_pretrained("barghavani/English_to_Hindi")

config.json:   0%|          | 0.00/1.41k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/304M [00:00<?, ?B/s]

### setting up the training arguments

In [9]:
from transformers import Seq2SeqTrainingArguments

# Hyperparameters for fine-tuning.
batch_size = 16
learning_rate = 2e-5
weight_decay = 0.01
number_of_train_epochs = 3

training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    # `eval_strategy` is the current name of the deprecated
    # `evaluation_strategy` argument; run evaluation once per epoch.
    eval_strategy="epoch",
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=number_of_train_epochs,
    weight_decay=weight_decay,
    # Use `generate()` during evaluation/prediction instead of teacher forcing.
    predict_with_generate=True,
)



In [10]:
from transformers import DataCollatorForSeq2Seq

# Collator that pads inputs and labels per batch and returns PyTorch tensors.
# Padded lengths are rounded up to a multiple of 128.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    return_tensors="pt",
    pad_to_multiple_of=128,
)

In [11]:
from transformers import Seq2SeqTrainer

# Wire the model, arguments, data splits and collator into the trainer.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    # `processing_class` supersedes the deprecated `tokenizer` argument.
    processing_class=tokenizer,
    data_collator=data_collator,
)

  trainer = Seq2SeqTrainer(


In [12]:
# Fine-tune the model. In the recorded run this step also prompted for a
# Weights & Biases API key before training started.
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss
1,3.0189,2.819427
2,2.789,2.787112
3,2.6872,2.780374




TrainOutput(global_step=4959, training_loss=2.8487232759802255, metrics={'train_runtime': 1998.6552, 'train_samples_per_second': 39.678, 'train_steps_per_second': 2.481, 'total_flos': 2688208135520256.0, 'train_loss': 2.8487232759802255, 'epoch': 3.0})

### function to translate new input text

In [15]:
def translate(input_text):
  """Translate English text with the fine-tuned model using beam search.

  Args:
    input_text: The English text to translate. If a list of strings is
      passed, only the translation of the first item is returned.

  Returns:
    The decoded translation of the first generated sequence, with special
    tokens removed.
  """
  # Tokenize and move the tensors to whatever device the model lives on,
  # instead of assuming a CUDA device is available.
  inputs = tokenizer(
      input_text,
      return_tensors="pt",
      padding=True,
      truncation=True,
      max_length=128,
  ).to(model.device)

  # Generate with a 4-beam search, capped at 128 tokens.
  output_sequences = model.generate(
      inputs["input_ids"],
      attention_mask=inputs["attention_mask"],
      max_length=128,
      num_beams=4,
      early_stopping=True,
  )

  # Decode only the first sequence of the batch.
  return tokenizer.decode(output_sequences[0], skip_special_tokens=True)

In [16]:
# Try the fine-tuned model on a short sentence and print the translation.
input_text = "finally i did it"

translated_text = translate(input_text)

print("Translated text:", translated_text)

Translated text: अंत में मैंने ऐसा किया


## pushing the model to hub

In [17]:
from huggingface_hub import notebook_login

# Authenticate with the Hugging Face Hub through the interactive login widget
# so the uploads that follow are authorized.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [18]:
# Upload the fine-tuned model weights to the "eng2hindi" repository on the Hub.
model.push_to_hub("eng2hindi")

model.safetensors:   0%|          | 0.00/304M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Swekerr/eng2hindi/commit/2ef416f96f6e432299ce028dfee6da98106faa08', commit_message='Upload model', commit_description='', oid='2ef416f96f6e432299ce028dfee6da98106faa08', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Swekerr/eng2hindi', endpoint='https://huggingface.co', repo_type='model', repo_id='Swekerr/eng2hindi'), pr_revision=None, pr_num=None)

In [19]:
# Upload the tokenizer files to the same "eng2hindi" repository so the
# checkpoint can be loaded together with its tokenizer.
tokenizer.push_to_hub("eng2hindi")

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/1.07M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Swekerr/eng2hindi/commit/f6cb737899d3dd675f9e8751608e8b94c236b5ac', commit_message='Upload tokenizer', commit_description='', oid='f6cb737899d3dd675f9e8751608e8b94c236b5ac', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Swekerr/eng2hindi', endpoint='https://huggingface.co', repo_type='model', repo_id='Swekerr/eng2hindi'), pr_revision=None, pr_num=None)

In [27]:
from transformers import pipeline

# Reload the uploaded checkpoint from the Hub through a translation pipeline
# and run it on a sample sentence.
# NOTE: this rebinds `translate`, shadowing the function of the same name
# defined earlier in the notebook.
checkpoint = "Swekerr/eng2hindi"
translate = pipeline(task="translation", model=checkpoint)
translate("Failure is the pillar of success")

Device set to use cuda:0


[{'translation_text': 'असफलता सफलता का स्तम्भ है।'}]