# Installing Library

In [1]:
!pip install transformers datasets evaluate sacrebleu numpy 

Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl.metadata (9.3 kB)
Collecting sacrebleu
  Downloading sacrebleu-2.4.3-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m761.4 kB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
Collecting portalocker (from sacrebleu)
  Downloading portalocker-2.10.1-py3-none-any.whl.metadata (8.5 kB)
Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading sacrebleu-2.4.3-py3-none-any.whl (103 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.0/104.0 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading portalocker-2.10.1-py3-none-any.whl (18 kB)
Installing collected packages: portalocker, sacrebleu, evaluate
Successfully installed evaluate-0.4.2 portalocker-2.10.1 sacrebleu-2.4.3


# Loading Library

In [2]:
from transformers import MarianTokenizer, MarianMTModel, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
from datasets import load_dataset
import evaluate
import numpy

# Loading Data


#### Data: https://huggingface.co/datasets/Helsinki-NLP/news_commentary
#### Model: https://huggingface.co/Helsinki-NLP/opus-mt-ar-it

In [3]:
ds = load_dataset("Helsinki-NLP/news_commentary", "ar-it")
ds = ds.remove_columns('id')
ds

Downloading readme:   0%|          | 0.00/26.9k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/9.19M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/17227 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 17227
    })
})

In [4]:
ds = ds['train'].train_test_split(train_size=0.8)
ds

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 13781
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 3446
    })
})

# Loading Model & Tokenizer

In [5]:
checkpoints = 'Helsinki-NLP/opus-mt-ar-it'
model = MarianMTModel.from_pretrained(checkpoints)
tokenizer = MarianTokenizer.from_pretrained(checkpoints)

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/309M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/899k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/824k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]



# Make Preprocessing In Text

In [6]:
ds

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 13781
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 3446
    })
})

In [7]:
ds['train'][0]

{'translation': {'ar': 'وفي التحدي الأكثر إلحاحاً الذي يواجه الاتحاد الأوروبي، والذي تفرضه روسيا، سوف يضطر تاسك إلى التوسط في القرارات مع القادة من البلدان التي تشعر بأنها مهددة بشكل مباشر (مثل بلده) وتلك التي تفوق أهمية علاقاتها الاقتصادية مع روسيا أي تهديد لأمن الأوروبي، والذي تشعر بأنه بعيد عنها على أية حال. وعلى جبهة الاقتصاد، يتعين عليه أن يوفق بين أولويات ألمانيا حيث التشغيل الكامل للعمالة وأولويات بلدان مثل اليونان وإيطاليا التي لا تزال واقعة في قبضة الركود ومعدلات البطالة المرتفعة إلى عنان السماء. وقد تكون القدرة على التحدث بشكل مباشر مع أعضاء المجلس، باللغة الإنجليزية في الأغلب، بمثابة التحدي المباشر الأعظم كما اعترف هو شخصيا.',
  'it': 'Per quanto riguarda la sfida più immediata dell’Ue, posta dalla Russia, Tusk dovrà mediare le decisioni con i leader provenienti dai Paesi che si sentono immediatamente minacciati (come il suo) e con quelli per cui i legami economici con la Russia superano qualsiasi minaccia alla sicurezza europea, fatto che sentono essere remoto. Sul fronte e

In [8]:
source_lang = 'ar'
target_lang = 'it'

def preprocessing(batch):
    inputs = [example[source_lang] for example in batch['translation']]
    targets = [example[target_lang] for example in batch['translation']]
    model_inputs = tokenizer(inputs, text_target=targets, padding=True,
                             return_tensors='pt', truncation=True)
    
    return model_inputs

In [9]:
ds = ds.map(preprocessing, batched=True)

Map:   0%|          | 0/13781 [00:00<?, ? examples/s]

Map:   0%|          | 0/3446 [00:00<?, ? examples/s]

In [10]:
ds

DatasetDict({
    train: Dataset({
        features: ['translation', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 13781
    })
    test: Dataset({
        features: ['translation', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 3446
    })
})

# DataCollatorForSeq2Seq

In [11]:
data_collector = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoints)

# Create Compute Metrics

In [12]:
metric = evaluate.load("sacrebleu")

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

In [13]:
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]

    return preds, labels


def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

# Create Arguments

In [21]:
model_args = Seq2SeqTrainingArguments(
    output_dir="./Helsinki-mt-ar-it",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    num_train_epochs=3,
    fp16=True,
    warmup_steps=100,
    logging_steps=100, save_steps=4000
)

In [22]:
trainer = Seq2SeqTrainer(
    model=model,
    args=model_args,
    tokenizer=tokenizer,
    data_collator=data_collector,
    compute_metrics=compute_metrics,
    train_dataset=ds['train'],
    eval_dataset=ds['test']
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


# Training Model

In [23]:
trainer.train()

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Step,Training Loss
100,0.7442
200,0.7278
300,0.7259
400,0.8303
500,0.8125
600,0.8122
700,0.8244
800,0.8223
900,0.7772
1000,0.7522


Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[63293]], 'forced_eos_token_id': 0}


TrainOutput(global_step=2586, training_loss=0.7518522479417983, metrics={'train_runtime': 1060.5428, 'train_samples_per_second': 38.983, 'train_steps_per_second': 2.438, 'total_flos': 2422072866963456.0, 'train_loss': 0.7518522479417983, 'epoch': 3.0})

# Saving Model

In [24]:
trainer.save_model(r'/kaggle/working/model')

Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[63293]], 'forced_eos_token_id': 0}


# Make Prediction

In [28]:
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM

In [29]:
def predict(text, model_checkpoint):
    print('Input: ', text, '\n\n')
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    inputs = tokenizer(text, return_tensors="pt").input_ids
    model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
    outputs = model.generate(inputs, max_new_tokens=300, do_sample=True, top_k=30, top_p=0.95)
    return "Output:  " + tokenizer.decode(outputs[0], skip_special_tokens=True)

In [30]:
text = 'مرحبا'
predict(text, '/kaggle/working/Helsinki-mt-ar-it/checkpoint-2586')

Input:  مرحبا 






"Output:  C'e' nessuno?"

# Create Pipeline

In [32]:
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM

model_path = r'/kaggle/working/model'
# model_path = r'/kaggle/working/Helsinki-mt-ar-it/checkpoint-2586'
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)

# Create a translation pipeline
translation_pipeline = pipeline('translation', model=model, tokenizer=tokenizer)

# Example usage
input_text = "مرحبا"
translated_text = translation_pipeline(input_text)[0]['translation_text']

# Output the translated text
print(translated_text)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


C'e' nessuno?


# Deploy Model

In [37]:
!pip install streamlit

  pid, fd = os.forkpty()


Collecting streamlit
  Downloading streamlit-1.38.0-py2.py3-none-any.whl.metadata (8.5 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Collecting watchdog<5,>=2.1.5 (from streamlit)
  Downloading watchdog-4.0.2-py3-none-manylinux2014_x86_64.whl.metadata (38 kB)
Downloading streamlit-1.38.0-py2.py3-none-any.whl (8.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.7/8.7 MB[0m [31m28.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m73.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hDownloading watchdog-4.0.2-py3-none-manylinux2014_x86_64.whl (82 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m82.9/82.9 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: watchdog, pydeck, streamlit
Successfully install

In [44]:
import streamlit as st
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM

model_path = r'/kaggle/working/model'
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)

translation_pipeline = pipeline('translation', model=model, tokenizer=tokenizer)

st.title("Translation Model")

input_text = st.text_area("Enter text to translate:")

if st.button("Translate"):
    if input_text:
        translated_text = translation_pipeline(input_text)
        st.write("Translation:", translated_text[0]['translation_text'])
    else:
        st.write("Please enter some text.")