In [1]:
# Transformers installation
!pip install transformers datasets
!pip install accelerate -U
!pip install transformers[torch]



# Translation

In [2]:
!pip install huggingface_hub



In [3]:

from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("huggingface-write")

from huggingface_hub import login
login(secret_value_0)

Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [4]:
from datasets import load_dataset
books = load_dataset("CohleM/english-to-nepali")

Downloading readme:   0%|          | 0.00/328 [00:00<?, ?B/s]

Downloading data: 100%|██████████| 28.9M/28.9M [00:00<00:00, 75.3MB/s]


Generating train split:   0%|          | 0/177334 [00:00<?, ? examples/s]

In [5]:
development=True
EPOCHS=2

Split the dataset into a train and test set with the [train_test_split](https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.train_test_split) method:

In [6]:
if development==True:
    print("Development")
    books = books["train"].train_test_split(train_size=5000,test_size=1000)
else:
    print("Not Development")
    books = books["train"].train_test_split(test_size=0.2)
    

Development


In [7]:
books["train"][0]

{'en': 'Source does not support proxy lookups',
 'ne': 'भण्डारले प्राप्ति मञ्जूषा लाई समर्थन गर्दैन'}

In [8]:
books=books.map(lambda x: {"translation":x})

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

## Preprocess

In [9]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer,NllbTokenizerFast,DataCollatorForSeq2Seq

tokenizer = NllbTokenizerFast.from_pretrained(
    "facebook/nllb-200-distilled-600M", src_lang="eng_Latn", tgt_lang="npi_Deva"
)

2024-04-22 17:10:41.841730: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-22 17:10:41.841832: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-22 17:10:41.981675: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


tokenizer_config.json:   0%|          | 0.00/564 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/3.55k [00:00<?, ?B/s]

In [10]:
source_lang = "en"
target_lang = "ne"
prefix = ""


def preprocess_function(examples):
    inputs = [prefix + example[source_lang] for example in examples["translation"]]
    targets = [example[target_lang] for example in examples["translation"]]
    model_inputs = tokenizer(inputs, text_target=targets, max_length=128, truncation=True)
    return model_inputs

In [11]:
tokenized_books = books.map(preprocess_function, batched=True)

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

## Evaluate

In [12]:
!pip install evaluate sacrebleu

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl.metadata (9.4 kB)
Collecting sacrebleu
  Downloading sacrebleu-2.4.2-py3-none-any.whl.metadata (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.0/58.0 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Collecting responses<0.19 (from evaluate)
  Downloading responses-0.18.0-py3-none-any.whl.metadata (29 kB)
Collecting portalocker (from sacrebleu)
  Downloading portalocker-2.8.2-py3-none-any.whl.metadata (8.5 kB)
Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading sacrebleu-2.4.2-py3-none-any.whl (106 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.7/106.7 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading responses-0.18.0-py3-none-any.whl (38 kB)
Downloading portalocker-2.8.2-py3-none-any.whl (17 kB)
Installing collected packages: p

In [13]:
import evaluate

metric = evaluate.load("sacrebleu")

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

In [14]:
import numpy as np


def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]

    return preds, labels


def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

## Train

In [15]:
import os
os.environ['WANDB_DISABLED'] = 'true'

In [16]:
tokenizer = NllbTokenizerFast.from_pretrained(
    "facebook/nllb-200-distilled-600M", src_lang="eng_Latn", tgt_lang="npi_Deva"
)
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

training_args = Seq2SeqTrainingArguments(
    output_dir="LMPT_project",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=EPOCHS,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_books["train"],
    eval_dataset=tokenized_books["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

config.json:   0%|          | 0.00/846 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,2.1397,1.909169,17.03,33.143
2,1.956,1.88459,17.7229,32.719


Non-default generation parameters: {'max_length': 200}
Non-default generation parameters: {'max_length': 200}


TrainOutput(global_step=1250, training_loss=2.0179735107421877, metrics={'train_runtime': 1282.1064, 'train_samples_per_second': 7.8, 'train_steps_per_second': 0.975, 'total_flos': 1330890044276736.0, 'train_loss': 2.0179735107421877, 'epoch': 2.0})

In [17]:
trainer.push_to_hub()

Non-default generation parameters: {'max_length': 200}


model.safetensors:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

events.out.tfevents.1713805882.571da347b2d2.34.0:   0%|          | 0.00/6.50k [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/rujengelal/LMPT_project/commit/940a35ea30fdc108e8dabe88eba7a39353d475e6', commit_message='End of training', commit_description='', oid='940a35ea30fdc108e8dabe88eba7a39353d475e6', pr_url=None, pr_revision=None, pr_num=None)

## Inference

In [23]:
text = "Hello good morning. How are you?"

Simple way for inference is to use pipeline functionality 

In [24]:
from transformers import pipeline

translator = pipeline("translation", model="LMPT_project", src_lang="eng_Latn", tgt_lang="npi_Deva")
translator(text)

[{'translation_text': 'नमस्कार शुभ प्रभात । तपाईँ कसरी हुनुहुन्छ ?'}]

In [25]:
translator(text)[0]['translation_text']

'नमस्कार शुभ प्रभात । तपाईँ कसरी हुनुहुन्छ ?'

we can do it iteritively

In [26]:
from transformers import AutoTokenizer,AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("LMPT_project", src_lang="eng_Latn", tgt_lang="npi_Deva")
inputs = tokenizer(text, return_tensors="pt").input_ids
model = AutoModelForSeq2SeqLM.from_pretrained("LMPT_project")
outputs = model.generate(inputs, max_new_tokens=40, do_sample=True, top_k=30, top_p=0.95)

Decode the generated token ids back into text:

In [27]:
tokenizer.decode(outputs[0], skip_special_tokens=True)

'नमस्कार शुभ प्रभात । तपाईँ कसरी हुनुहुन्छ?'