<a href="https://colab.research.google.com/github/Saputoa21/Machine-Translation/blob/main/finetune_MarianMT_BasicsMT2025S.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers[sentencepiece] #==4.51.3
!pip install -U datasets #==3.6.0
!pip install evaluate #==0.4.3
!pip install accelerate #==1.6.0
!pip install sacrebleu #==2.5.1
#datasets depends on fsspec==2025.3.0

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m15.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, datasets
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.2
    Uninstalling fsspec-2025.3.2:
      Successfully uninstalled fsspec-2025.3.2
  Attempting uninstall: datasets
    Found existing installation: datasets 2.14.4
    Uninstalling datasets-2.14.4:
      Successfully uninstalled datasets-2.14.4
[31mERROR: pip's dependency re

In [14]:
#txt to json
import sys
import json
import re
import codecs

# json from huggingface
#{ "translation": { "en": "Others have dismissed him as a joke.", "ro": "Alții l-au numit o glumă." } }
#{ "translation": { "en": "And some are holding out for an implosion.", "ro": "Iar alții așteaptă implozia." } }

def txt2json(src_trg, src_file, trg_file, out_file):

  src = codecs.open(src_file, 'r', encoding="utf-8")
  trg = codecs.open(trg_file, 'r', encoding="utf-8")
  out_json = codecs.open(out_file, 'w', encoding="utf-8")
  src_id, trg_id = src_trg.split('-')

  src_lines = []
  trg_lines = []
  for line_s, line_t in zip(src, trg):
      line_s = line_s.strip()
      line_t = line_t.strip()
      src_lines.append(line_s)
      trg_lines.append(line_t)
  recs = [src_lines, trg_lines]
  for src, tgt in zip(*recs):
      out = {"translation": { src_id: src, trg_id: tgt } }
      x = json.dumps(out, indent=0, ensure_ascii=False)
      x = re.sub(r'\n', ' ', x, 0, re.M)
      out_json.write(x + "\n")
  out_json.close()
  return

In [None]:
lang_pair = "en-de"

train_src = "Vienna_Environmental.en-de.train.en"
train_trg = "Vienna_Environmental.en-de.train.de"
train_json = "Vienna_Environmental.en-de.train.json"

txt2json(lang_pair, train_src, train_trg, train_json) #create a json for training

#valid_src = "Vienna_Environmental.en-de.valid.en.txt"
#valid_trg = "Vienna_Environmental.en-de.valid.de.txt"
#valid_json = "Vienna_Environmental.en-de.valid.json"

#txt2json(lang_pair, valid_src, valid_trg, valid_json)

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM,  AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments
from datasets import load_dataset, DatasetDict
import os
import evaluate
import numpy as np


def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

def main():

    model_id = "Helsinki-NLP/opus-mt-en-de" #trained with crawled sentences
    max_length = 250 #look up in Hugging Face

    code2lang = {
    "de": "German",
    "fr": "French",
    "en": "English",
    "nl": "Dutch",
    "pt": "Portuguese",
    "ru": "Russian",
    "zh": "Chinese",
    "ro": "Romanian",
    "es": "Spanish"
    }
    source_code = 'en'
    target_code = 'de'
    data_files = "Vienna_Environmental.en-de.train.json"
    output_dir = 'models/opus-finetune'
    train_bs = 6 #batch_size
    grad_acc = 1
    lr = 5e-5
    w_steps = 0.03
    n_epoch = 5  #number of epochs
    lr_scheduler_type = "linear" #how learning rate will change
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    tokenizer.src_lang = source_code  #for multilingual models ??
    tokenizer.tgt_lang = target_code
    model = AutoModelForSeq2SeqLM.from_pretrained(model_id,
                                                  device_map={"": 0})


    print_trainable_parameters(model)
    print(model)

    metric = evaluate.load("sacrebleu")

    def preprocess_parallel_function(examples):
      inputs = [ex[source_code] for ex in examples["translation"]] #take out only source lines
      targets = [ex[target_code] for ex in examples["translation"]] #take out only target lines
      #inputs = [prefix + inp for inp in inputs]
      model_inputs = tokenizer(inputs, max_length=max_length, padding=False, truncation=True) #for optimisation

      # Setup the tokenizer for targets
      with tokenizer.as_target_tokenizer():
          labels = tokenizer(targets, max_length=max_length, padding=False, truncation=True)


      model_inputs["labels"] = labels["input_ids"] #numbers from tokenizer

      return model_inputs

    def postprocess_text(preds, labels):
      preds = [pred.strip() for pred in preds]
      labels = [[label.strip()] for label in labels]

      return preds, labels

    def compute_metrics(eval_preds, ignore_pad_token_for_loss=False):
      preds, labels = eval_preds
      if isinstance(preds, tuple):
        preds = preds[0]
      decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
      # Replace -100 in the labels as we can't decode them.
      labels = np.where(labels != -100, labels, tokenizer.pad_token_id) #remove such labels
      decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
      # Some simple post-processing
      decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels) # to create a friendly input for BLEU scores
      result = metric.compute(predictions=decoded_preds, references=decoded_labels)
      prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
      result = {'bleu' : result['score']}
      result["gen_len"] = np.mean(prediction_lens)
      result = {k: round(v, 4) for k, v in result.items()}
      return result

    data = load_dataset("json", data_files=data_files)
    data_split = data['train'].train_test_split(test_size=0.1, seed=42)
    data = DatasetDict({'train': data_split['train'],
                        'valid': data_split['test']})

    column_names = data["train"].column_names
    #print(column_names)
    data = data.map(preprocess_parallel_function,
                    batched=True)
    label_pad_token_id = -100

    trainer = Seq2SeqTrainer(
        model=model,
        train_dataset=data["train"],
        eval_dataset=data["valid"],
        args=Seq2SeqTrainingArguments(
            per_device_train_batch_size=train_bs,
            gradient_accumulation_steps=grad_acc,
            per_device_eval_batch_size=2,
            eval_accumulation_steps=2,
            warmup_ratio=w_steps,
            lr_scheduler_type=lr_scheduler_type,
            num_train_epochs=n_epoch,
            predict_with_generate=True,
            metric_for_best_model='bleu',
            load_best_model_at_end=True,
            learning_rate=lr,
            save_total_limit=2,
            generation_num_beams=5,
            save_strategy="epoch",
            eval_strategy="epoch",
            output_dir=output_dir,
            report_to="none",
        ),
        data_collator=DataCollatorForSeq2Seq(tokenizer, label_pad_token_id=label_pad_token_id, model=model),
        compute_metrics=compute_metrics,
        tokenizer=tokenizer,
    )
    model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
    trainer.train()

    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)

In [None]:
!ls
!head -n 200 Vienna_Environmental.en-de.train.de
!tail -n 200 Vienna_Environmental.en-de.train.de
#!paste
#!shuf
#!cut
#!grep

sample_data			    Vienna_Environmental.en-de.train.de
Vienna_Environmental.en-de.test.de  Vienna_Environmental.en-de.train.en
Vienna_Environmental.en-de.test.en  Vienna_Environmental.en-de.train.json
Dies ist auch der Grund, warum der Ausbau der Fernwärme eine wichtige Maßnahme sowohl im „Klimaschutzprogramm der Stadt Wien“ (KliP) als auch bei der „Urbanen Luft Initiative“ (ULI) zur Reduktion der Luftschadstoffe und beim „Städtischen Energieeffizienzprogramm“ (STEP) zur Steigerung der Energieeffizienz darstellt. 
Derzeit speisen 15 Erzeugungsanlagen an zehn Standorten Heißwasser in das Fernwärmeverbundnetz ein. 
Die Abwärme aus den Hausmüllverbrennungsanlagen Flötzersteig und Spittelau sowie der Sonderabfall- und Klärschlammverbrennungsanlage Simmeringer Haide wird als Grundlast ganzjährig verwendet. 
Zur Deckung der Mittellast werden die KWK der WIEN ENERGIE-Kraftwerke Donaustadt, Simmering und Leopoldau sowie die zwei Kraftwerke in der OMV-Raffinerie in Schwechat eingesetzt. 
Erst we

In [None]:
#fine-tune
main()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.33k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/768k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/797k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.27M [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/298M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/298M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

trainable params: 74410496 || all params: 74410496 || trainable%: 100.0
MarianMTModel(
  (model): MarianModel(
    (shared): Embedding(58101, 512, padding_idx=58100)
    (encoder): MarianEncoder(
      (embed_tokens): Embedding(58101, 512, padding_idx=58100)
      (embed_positions): MarianSinusoidalPositionalEmbedding(512, 512)
      (layers): ModuleList(
        (0-5): 6 x MarianEncoderLayer(
          (self_attn): MarianAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation_fn): SiLU()
          (fc1): Linear(in_features=512, out_features=2048, bias=True)
          (fc2): Linear(in_features=2048, out_features=5

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/900 [00:00<?, ? examples/s]



Map:   0%|          | 0/100 [00:00<?, ? examples/s]

  trainer = Seq2SeqTrainer(


Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,No log,1.22206,26.2248,33.98
2,No log,1.167482,30.0918,29.44
3,No log,1.201287,26.5937,34.19
4,0.897800,1.216067,23.7727,38.93
5,0.897800,1.231448,25.7446,34.37


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.encoder.embed_positions.weight', 'model.decoder.embed_tokens.weight', 'model.decoder.embed_positions.weight', 'lm_head.weight'].


In [None]:
#use models
#import models and tokenizer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("models/opus-finetune") #path of your directory with the finetuned

model = AutoModelForSeq2SeqLM.from_pretrained("models/opus-finetune") #NOTE load in gpu!!!!

#upload source file and read
import codecs
file_name = "Vienna_Environmental.en-de.test.en"
mt_output = []
with codecs.open(file_name, 'r', 'utf-8') as src:
  for line in src:
    line = line.strip()
    encoded = tokenizer(line, return_tensors="pt")              #tokenize note add to gpu!!!
    generated_tokens = model.generate(**encoded) #add beam search 6
    translation = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True) #de tokenize
    print(translation[0])



Abfallwirtschaft
» Abfallbilanz
» Abfallsammlung
» Abfallentsorgung
» Abfallvermeidung
» Kontaminierte Standorte
Grundlagen der Wiener Abfallwirtschaft sind Abfallvermeidung, Abfallzerlegung und Abfallverwertung
Abfallbilanz
Zusätzlich zum Bundesabfallwirtschaftsgesetz wird die Abfallwirtschaft in Wien durch das Wiener Abfallwirtschaftsgesetz (Wiener AWG) geregelt.
Für die Abfallwirtschaft sind in Wien zwei städtische Abteilungen zuständig, die MA 48 – Abfallwirtschaft, Straßenreinigung und Fahrzeugflotte sowie die MA 22 – Umweltschutz.
Während die MA 48 für die gemeinsame Sammlung und Behandlung von Abfällen von privaten Haushalten und Betrieben verantwortlich ist, hat die MA 22 die Aufgabe, die Umsetzung der Abfallvorschriften zu überwachen.
Auf strategischer Ebene arbeiten sie z. B. an der Umsetzung der „Strategischen Umweltprüfung (SEA) für den Wiener Abfallwirtschaftsplan oder an der Initiierung von Abfallvermeidungsprojekten zusammen.
Die Grundprinzipien der Wiener Abfallwirtscha

In [None]:
!pip freeze > requirements.txt

In [None]:
!cat requirements.txt

absl-py==1.4.0
accelerate==1.7.0
aiofiles==24.1.0
aiohappyeyeballs==2.6.1
aiohttp==3.11.15
aiosignal==1.3.2
alabaster==1.0.0
albucore==0.0.24
albumentations==2.0.8
ale-py==0.11.1
altair==5.5.0
annotated-types==0.7.0
antlr4-python3-runtime==4.9.3
anyio==4.9.0
argon2-cffi==25.1.0
argon2-cffi-bindings==21.2.0
array_record==0.7.2
arviz==0.21.0
astropy==7.1.0
astropy-iers-data==0.2025.6.2.0.38.23
astunparse==1.6.3
atpublic==5.1
attrs==25.3.0
audioread==3.0.1
autograd==1.8.0
babel==2.17.0
backcall==0.2.0
backports.tarfile==1.2.0
beautifulsoup4==4.13.4
betterproto==2.0.0b6
bigframes==2.5.0
bigquery-magics==0.9.0
bleach==6.2.0
blinker==1.9.0
blis==1.3.0
blobfile==3.0.0
blosc2==3.3.4
bokeh==3.7.3
Bottleneck==1.4.2
bqplot==0.12.45
branca==0.8.1
build==1.2.2.post1
CacheControl==0.14.3
cachetools==5.5.2
catalogue==2.0.10
certifi==2025.4.26
cffi==1.17.1
chardet==5.2.0
charset-normalizer==3.4.2
chex==0.1.89
clarabel==0.11.0
click==8.2.1
cloudpathlib==0.21.1
cloudpickle==3.1.1
cmake==3.31.6
cmdstanpy

# TODO

1. Select a corpus from ([opus](https://opus.nlpl.eu/results/en&de/corpus-result-table))
2. sample 20k segments
3. clean corpus (optional)
4. fine-tune for 5 epochs
5. translate with vienna test file

# Files

1. de translation marianft with greedy search
2. de translation marianft with beam search  
3. de translation marianft similar corpus with beam search

## Creating two files with beam search and greedy search with the  fine-tuned model (opus-finetune)

In [None]:
#use models
#import models and tokenizer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("models/opus-finetune") #path of your directory with the finetuned

model = AutoModelForSeq2SeqLM.from_pretrained("models/opus-finetune") #NOTE load in gpu!!!!

#upload source file and read
import codecs
file_name = "Vienna_Environmental.en-de.test.en"
beam_output = open("Vienna_Environmental.en-de.test.marian.de.beam", 'w', encoding='utf-8')
greedy_output = open("Vienna_Environmental.en-de.test.marian.de.greedy", 'w', encoding='utf-8')

with codecs.open(file_name, 'r', 'utf-8') as src:
    for line in src:
        line = line.strip()
        encoded = tokenizer(line, return_tensors="pt")
        # Beam decoding
        generated_beam = model.generate(**encoded, num_beams=6)
        translation_beam = tokenizer.batch_decode(generated_beam, skip_special_tokens=True)
        print(translation_beam[0], file=beam_output)

        # Greedy decoding
        generated_greedy = model.generate(**encoded)
        translation_greedy = tokenizer.batch_decode(generated_greedy, skip_special_tokens=True)
        print(translation_greedy[0], file=greedy_output)

beam_output.close()
greedy_output.close()

## Finetung with another corpus

* I have chosen this one: europarl-v7-en-de from the MTcorpora folder on Moodle
* Link: https://www2.statmt.org/wmt24/mtdata/

In [23]:
lang_pair = "en-de"

train_src = "europarl-v7.de-en.en"
train_trg = "europarl-v7.de-en.de"
train_json_europarl = "europarl-v7.de-en.train.json"

txt2json(lang_pair, train_src, train_trg, train_json_europarl) #create a json for training

In [17]:
train_src

'europarl-v7.de-en.en'

In [25]:
with open("europarl-v7.de-en.train.json", "r", encoding="utf-8") as f:
    data = [json.loads(line) for line in f]
    print(f"number of samples: {len(data)}")
    print("first sample:", data[0])

number of samples: 251489
first sample: {'translation': {'en': 'Resumption of the session', 'de': 'Wiederaufnahme der Sitzungsperiode'}}


In [32]:
subset = data[:20000]

with open("europarl-v7.de-en.train.20k.json", "w", encoding="utf-8") as f:
    for item in subset:
        json.dump(item, f, ensure_ascii=False)
        f.write("\n")

print("Saved subset to europarl-v7.de-en.train.20k.json")

Saved subset to europarl-v7.de-en.train.20k.json


In [33]:
with open("europarl-v7.de-en.train.20k.json", "r", encoding="utf-8") as f:
    data = [json.loads(line) for line in f]
    print(f"number of samples: {len(data)}")
    print("first sample:", data[0])

number of samples: 20000
first sample: {'translation': {'en': 'Resumption of the session', 'de': 'Wiederaufnahme der Sitzungsperiode'}}


In [34]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM,  AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments
from datasets import load_dataset, DatasetDict # the first reads the files, the second - loads to memory
import os
import evaluate
import numpy as np


def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

def main():

    model_id = "Helsinki-NLP/opus-mt-en-de" #trained with crawled sentences
    max_length = 250 #look up in Hugging Face

    code2lang = {
    "de": "German",
    "fr": "French",
    "en": "English",
    "nl": "Dutch",
    "pt": "Portuguese",
    "ru": "Russian",
    "zh": "Chinese",
    "ro": "Romanian",
    "es": "Spanish"
    }
    source_code = 'en'
    target_code = 'de'
    data_files = "europarl-v7.de-en.train.20k.json"
    output_dir = 'models/opus-finetune-europarl'
    train_bs = 6 #batch_size
    grad_acc = 1 #gadient accumulation
    lr = 5e-5 #learning rate
    w_steps = 0.03 #warm up steps
    n_epoch = 5  #number of epochs
    lr_scheduler_type = "linear" #how the learning rate will change over time
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    tokenizer.src_lang = source_code  #for multilingual models ??
    tokenizer.tgt_lang = target_code
    model = AutoModelForSeq2SeqLM.from_pretrained(model_id,
                                                  device_map={"": 0}) #put the mode lto th gpu


    print_trainable_parameters(model)
    print(model)

    metric = evaluate.load("sacrebleu")

    def preprocess_parallel_function(examples):
      inputs = [ex[source_code] for ex in examples["translation"]] #take out only source lines
      targets = [ex[target_code] for ex in examples["translation"]] #take out only target lines
      #inputs = [prefix + inp for inp in inputs]
      model_inputs = tokenizer(inputs, max_length=max_length, padding=False, truncation=True) #for optimisation

      # Setup the tokenizer for targets
      with tokenizer.as_target_tokenizer():
          labels = tokenizer(targets, max_length=max_length, padding=False, truncation=True)


      model_inputs["labels"] = labels["input_ids"] #numbers from tokenizer

      return model_inputs

    def postprocess_text(preds, labels):
      preds = [pred.strip() for pred in preds]
      labels = [[label.strip()] for label in labels]

      return preds, labels

    def compute_metrics(eval_preds, ignore_pad_token_for_loss=False):
      preds, labels = eval_preds
      if isinstance(preds, tuple):
        preds = preds[0]
      decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
      # Replace -100 in the labels as we can't decode them.
      labels = np.where(labels != -100, labels, tokenizer.pad_token_id) #remove such labels
      decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
      # Some simple post-processing
      decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels) # to create a friendly input for BLEU scores
      result = metric.compute(predictions=decoded_preds, references=decoded_labels)
      prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
      result = {'bleu' : result['score']}
      result["gen_len"] = np.mean(prediction_lens)
      result = {k: round(v, 4) for k, v in result.items()}
      return result

    data = load_dataset("json", data_files=data_files)
    data_split = data['train'].train_test_split(test_size=0.1, seed=42)
    data = DatasetDict({'train': data_split['train'],
                        'valid': data_split['test']})

    column_names = data["train"].column_names
    #print(column_names)
    data = data.map(preprocess_parallel_function,
                    batched=True)
    label_pad_token_id = -100

    trainer = Seq2SeqTrainer(
        model=model,
        train_dataset=data["train"],
        eval_dataset=data["valid"],
        args=Seq2SeqTrainingArguments(
            per_device_train_batch_size=train_bs,
            gradient_accumulation_steps=grad_acc,
            per_device_eval_batch_size=2,
            eval_accumulation_steps=2,
            warmup_ratio=w_steps,
            lr_scheduler_type=lr_scheduler_type,
            num_train_epochs=n_epoch,
            predict_with_generate=True,
            metric_for_best_model='bleu',
            load_best_model_at_end=True,
            learning_rate=lr,
            save_total_limit=2,
            generation_num_beams=5,
            save_strategy="epoch",
            eval_strategy="epoch",
            output_dir=output_dir,
            report_to="none",
        ),
        data_collator=DataCollatorForSeq2Seq(tokenizer, label_pad_token_id=label_pad_token_id, model=model),
        compute_metrics=compute_metrics,
        tokenizer=tokenizer,
    )
    model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
    trainer.train()

    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)

In [30]:
!ls
!head -n 200 europarl-v7.de-en.en
!tail -n 200 europarl-v7.de-en.de
#!paste
#!shuf
#!cut
#!grep

europarl-v7.de-en.de	      Vienna_Environmental.en-de.test.de
europarl-v7.de-en.en	      Vienna_Environmental.en-de.test.en
europarl-v7.de-en.train.json  Vienna_Environmental.en-de.test.marian.de.beam
models			      Vienna_Environmental.en-de.test.marian.de.greedy
requirements.txt	      Vienna_Environmental.en-de.train.de
sample_data		      Vienna_Environmental.en-de.train.en
Resumption of the session
I declare resumed the session of the European Parliament adjourned on Friday 17 December 1999, and I would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period.
Although, as you will have seen, the dreaded 'millennium bug' failed to materialise, still the people in a number of countries suffered a series of natural disasters that truly were dreadful.
You have requested a debate on this subject in the course of the next few days, during this part-session.
In the meantime, I should like to observe a minute' s silence, as a number of Members hav

In [35]:
#fine-tune
main()



trainable params: 74410496 || all params: 74410496 || trainable%: 100.0
MarianMTModel(
  (model): MarianModel(
    (shared): Embedding(58101, 512, padding_idx=58100)
    (encoder): MarianEncoder(
      (embed_tokens): Embedding(58101, 512, padding_idx=58100)
      (embed_positions): MarianSinusoidalPositionalEmbedding(512, 512)
      (layers): ModuleList(
        (0-5): 6 x MarianEncoderLayer(
          (self_attn): MarianAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation_fn): SiLU()
          (fc1): Linear(in_features=512, out_features=2048, bias=True)
          (fc2): Linear(in_features=2048, out_features=5

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/18000 [00:00<?, ? examples/s]



Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

  trainer = Seq2SeqTrainer(


Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,1.6056,1.515861,24.844,29.926
2,1.3107,1.527837,25.001,30.443
3,1.048,1.559236,25.0928,30.192
4,0.9056,1.584722,24.7874,30.2415
5,0.7626,1.60558,24.7874,30.307


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.encoder.embed_positions.weight', 'model.decoder.embed_tokens.weight', 'model.decoder.embed_positions.weight', 'lm_head.weight'].


## Creating two files with beam search and greedy search with the  fine-tuned model (opus-finetune-europarl)

In [36]:
#use models
#import models and tokenizer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("models/opus-finetune-europarl") #path of your directory with the finetuned

model = AutoModelForSeq2SeqLM.from_pretrained("models/opus-finetune-europarl") #NOTE load in gpu!!!!

#upload source file and read
import codecs
file_name = "Vienna_Environmental.en-de.test.en"
beam_output = open("Vienna_Environmental.en-de.test.marian.de.beam.similar.corpus", 'w', encoding='utf-8')

with codecs.open(file_name, 'r', 'utf-8') as src:
    for line in src:
        line = line.strip()
        encoded = tokenizer(line, return_tensors="pt")
        generated_beam = model.generate(**encoded, num_beams=6)
        translation_beam = tokenizer.batch_decode(generated_beam, skip_special_tokens=True)
        print(translation_beam[0], file=beam_output)

beam_output.close()
greedy_output.close()

