In [None]:
from transformers import pipeline

# Hub checkpoint used for the German -> English translation demo.
model_checkpoint = "Helsinki-NLP/opus-mt-de-en"

# Wrap the checkpoint in a ready-to-call translation pipeline.
translator = pipeline(task="translation", model=model_checkpoint)


Downloading:   0%|          | 0.00/1.11k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/284M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/778k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/750k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.21M [00:00<?, ?B/s]

In [None]:
# Smoke-test the pretrained pipeline on a single German sentence.
translator("Ihr naht euch wieder, schwankende Gestalten!")

[{'translation_text': "You're approaching again, fluctuating figures!"}]

In [None]:
# Checkpoint used for the open-ended text-generation demo.
model_textgen = "distilgpt2"

# Build the generation pipeline, then let it continue a short English prompt.
generator = pipeline(task="text-generation", model=model_textgen)
generator("The rest of this assignment will be")

Downloading:   0%|          | 0.00/762 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/336M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/0.99M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'The rest of this assignment will be provided on the basis of results.'}]

# Fine-tuning Model 

In [None]:
from datasets import load_dataset

# Load the German/English configuration of the KDE4 corpus.
raw_datasets = load_dataset(path="kde4", lang1="de", lang2="en")  # STUDENT

Downloading builder script:   0%|          | 0.00/1.89k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

Using custom data configuration de-en-lang1=de,lang2=en


Downloading and preparing dataset kde4/de-en to /root/.cache/huggingface/datasets/kde4/de-en-lang1=de,lang2=en/0.0.0/243129fb2398d5b0b4f7f6831ab27ad84774b7ce374cf10f60f6e1ff331648ac...


Downloading data:   0%|          | 0.00/7.62M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset kde4 downloaded and prepared to /root/.cache/huggingface/datasets/kde4/de-en-lang1=de,lang2=en/0.0.0/243129fb2398d5b0b4f7f6831ab27ad84774b7ce374cf10f60f6e1ff331648ac. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# Display the loaded dataset's splits, features and row counts.
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 224035
    })
})

In [None]:
# Hold out a fixed number of examples from the single "train" split for validation.
# An absolute count is passed as test_size so the validation size does not depend
# on a hand-typed row count for the dataset.
NUM_VAL_EXAMPLES = 3000
SPLIT_SEED = 42  # fixed seed so the same split is produced after a kernel restart

# Guard against re-running the cell: once "validation" exists, splitting again
# would carve a second hold-out set from the already reduced training data.
if "validation" not in raw_datasets:
    # Fraction of the data held out; kept for reference only.
    perc_val = NUM_VAL_EXAMPLES / len(raw_datasets["train"])
    raw_datasets = raw_datasets["train"].train_test_split(
        test_size=NUM_VAL_EXAMPLES, seed=SPLIT_SEED
    )
    # train_test_split names the held-out part "test"; rename it to "validation".
    raw_datasets["validation"] = raw_datasets.pop("test")

In [None]:
# Display the dataset again to confirm the train/validation split sizes.
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 217419
    })
    validation: Dataset({
        features: ['id', 'translation'],
        num_rows: 6616
    })
})

# Data preprocessing 

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer that belongs to the pretrained German -> English checkpoint.
model_checkpoint = "Helsinki-NLP/opus-mt-de-en"
# "pt" matches the PyTorch-based Seq2SeqTrainer used for fine-tuning later in
# this notebook.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt")

In [None]:
# Example German/English sentence pair used to inspect the tokenizer's output.
de_sentence = (
    "Sie müssen nur die richtige Taste im richtigen Moment berühren, "
    "und das Instrument selbst ertönt"
)
en_sentence = (
    "All one has to do is hit the right keys at the right time "
    "and the instrument plays itself."
)

In [None]:
# Show the subword pieces of the German sentence on one line, space-separated.
print(*tokenizer.tokenize(de_sentence))

▁Sie ▁müssen ▁nur ▁die ▁richtige ▁Taste ▁im ▁richtigen ▁Moment ▁berühren , ▁und ▁das ▁Instrument ▁selbst ▁er tönt


In [None]:
# Tokenize the English sentence inside the tokenizer's target-side context,
# then print the resulting list of pieces.
with tokenizer.as_target_tokenizer():
    en_tokens = tokenizer.tokenize(en_sentence)
print(en_tokens)

['▁All', '▁one', '▁has', '▁to', '▁do', '▁is', '▁hit', '▁the', '▁right', '▁keys', '▁at', '▁the', '▁right', '▁time', '▁and', '▁the', '▁instrument', '▁plays', '▁itself', '.']


In [None]:
# Truncation limits (in tokens) for source inputs and target labels.
max_input_length = max_target_length = 128

def preprocess_function(examples):
    """Tokenize a batch of translation pairs for seq2seq training.

    Args:
        examples: a batch whose "translation" entry is a sequence of dicts
            with a German text under "de" and an English text under "en".

    Returns:
        The tokenizer output for the German texts, with the token ids of the
        English texts attached under "labels". Both sides are truncated to
        ``max_input_length`` / ``max_target_length`` tokens respectively.
    """
    source_texts, target_texts = [], []
    for pair in examples["translation"]:
        source_texts.append(pair["de"])
        target_texts.append(pair["en"])

    encoded = tokenizer(source_texts, truncation=True, max_length=max_input_length)

    # The English side is encoded inside the tokenizer's target-side context.
    with tokenizer.as_target_tokenizer():
        target_encoding = tokenizer(
            target_texts, truncation=True, max_length=max_target_length
        )

    encoded["labels"] = target_encoding["input_ids"]
    return encoded

# Tokenize every split in batches and drop the original raw columns, so only
# the fields produced by preprocess_function remain.
original_columns = raw_datasets["train"].column_names
proc_datasets = raw_datasets.map(
    function=preprocess_function,
    batched=True,
    remove_columns=original_columns,
)

  0%|          | 0/218 [00:00<?, ?ba/s]

  0%|          | 0/7 [00:00<?, ?ba/s]

In [None]:
from transformers import AutoModelForSeq2SeqLM

# Load the pretrained seq2seq model from the same checkpoint name as the tokenizer.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [None]:
from transformers import DataCollatorForSeq2Seq

# Collator that assembles batches for the seq2seq model; it is handed to the
# trainer below.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [None]:
from transformers import Seq2SeqTrainingArguments

# Training configuration for the fine-tuning run.
args = Seq2SeqTrainingArguments(
    output_dir="de-en",
    # Run length: max_steps caps training regardless of the epoch count.
    max_steps=2000,
    # Optimisation settings.
    learning_rate=2e-5,
    weight_decay=0.01,
    fp16=True,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    # No automatic evaluation during training; evaluation is triggered manually.
    evaluation_strategy="no",
    # Checkpointing.
    save_strategy="epoch",
    save_total_limit=3,
    # Evaluate with generated sequences, limited to the target length cap.
    predict_with_generate=True,
    generation_max_length=max_target_length,
)

In [None]:
import numpy as np

from datasets import load_metric

# Load the SacreBLEU metric; compute_metrics below reports its "score" field.
metric = load_metric("sacrebleu")

def compute_metrics(eval_preds):
    """Turn evaluation predictions into a BLEU score.

    Args:
        eval_preds: a ``(predictions, labels)`` pair. ``predictions`` may be a
            tuple, in which case only its first element is used.

    Returns:
        A dict ``{"bleu": score}`` where ``score`` is the "score" field of the
        sacrebleu metric result.
    """
    predictions, label_ids = eval_preds
    # The model may return more than the predictions; keep only the first item.
    predictions = predictions[0] if isinstance(predictions, tuple) else predictions

    hypotheses = [
        text.strip()
        for text in tokenizer.batch_decode(predictions, skip_special_tokens=True)
    ]

    # Replace -100 entries in the labels with the pad token id before decoding.
    label_ids = np.where(label_ids != -100, label_ids, tokenizer.pad_token_id)
    # Each reference is wrapped in its own list, the format passed to the metric.
    references = [
        [text.strip()]
        for text in tokenizer.batch_decode(label_ids, skip_special_tokens=True)
    ]

    scores = metric.compute(predictions=hypotheses, references=references)
    return {"bleu": scores["score"]}

Downloading builder script:   0%|          | 0.00/2.85k [00:00<?, ?B/s]

In [None]:
from transformers import Seq2SeqTrainer

# Wire the model, data, collator and metric function into a single trainer.
trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=proc_datasets["train"],
    eval_dataset=proc_datasets["validation"],
)

max_steps is given, it will override any value given in num_train_epochs
Using amp half precision backend


In [None]:
# Baseline: score the pretrained checkpoint on the validation split before fine-tuning.
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 6616
  Batch size = 64


{'eval_bleu': 33.60437607306265,
 'eval_loss': 1.82985258102417,
 'eval_runtime': 1025.2617,
 'eval_samples_per_second': 6.453,
 'eval_steps_per_second': 0.101}

In [None]:
# Fine-tune the model; the run length is capped by max_steps in the training arguments.
trainer.train()

***** Running training *****
  Num examples = 217419
  Num Epochs = 1
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 2000


Step,Training Loss
500,1.7146
1000,1.6246
1500,1.5801
2000,1.5776


Saving model checkpoint to de-en/checkpoint-2000
Configuration saved in de-en/checkpoint-2000/config.json
Model weights saved in de-en/checkpoint-2000/pytorch_model.bin
tokenizer config file saved in de-en/checkpoint-2000/tokenizer_config.json
Special tokens file saved in de-en/checkpoint-2000/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=2000, training_loss=1.624211639404297, metrics={'train_runtime': 4054.1957, 'train_samples_per_second': 15.786, 'train_steps_per_second': 0.493, 'total_flos': 1365392240934912.0, 'train_loss': 1.624211639404297, 'epoch': 0.29})

In [None]:
# Re-score on the validation split after fine-tuning, for comparison with the baseline.
trainer.evaluate() 

***** Running Evaluation *****
  Num examples = 6616
  Batch size = 64


{'epoch': 0.29,
 'eval_bleu': 36.52019995504327,
 'eval_loss': 1.4945979118347168,
 'eval_runtime': 1057.0316,
 'eval_samples_per_second': 6.259,
 'eval_steps_per_second': 0.098}

In [None]:
# Provide the OpenAI API key without writing it into the notebook source.
# If OPENAI_API_KEY is already set in the environment it is used as is;
# otherwise the key is requested through a hidden prompt.
import os
from getpass import getpass

if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

In [None]:
import os
import openai

# Hand the key from the environment to the openai client (None when the variable is unset).
openai.api_key = os.environ.get("OPENAI_API_KEY")

def print_gpt3_response(prompt, engine="text-davinci-002", temperature=0.7, max_tokens=256):
    """Request a completion for ``prompt`` and print its text.

    Args:
        prompt: text sent to the completion endpoint.
        engine: name of the completion engine to query.
        temperature: sampling temperature forwarded to the API.
        max_tokens: ``max_tokens`` value forwarded to the API.

    Returns:
        None. The whitespace-stripped text of the first returned choice is
        printed, not returned.
    """
    response = openai.Completion.create(
        engine=engine,
        prompt=prompt,
        temperature=temperature,
        max_tokens=max_tokens,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    )
    # Only the first choice is used.
    text = response["choices"][0]["text"].strip()
    print(text)

In [None]:
# Prompt with a bare German sentence and no instruction about what to do with it.
print_gpt3_response("Das Studium und allgemein das Streben nach Wahrheit und Schönheit ist ein Gebiet, auf dem wir das ganze Leben lang Kinder bleiben dürfen.") # STUDENT 

«

Albert Einstein



Anfang der 18. Jahrhundert waren es nur wenige ausgewählte Leute, die Zugang zu Schulen und Universitäten hatten. Die Mehrheit der Menschen konnte nach der Schule nicht weiter studieren. Die Schulen und Universitäten waren auch meistens von Katholiken und Protestanten besucht.



Nach der Aufklärung änderte sich dies. Es entstanden neue Universitäten und Schulen. Diese waren jetzt auch für finanzschwächere Menschen zugänglich.



Nach der Französischen Revolution wurden in Deutschland viele Universitäten gegründet. Die Universitäten waren jetzt auch für Juden und Mitglieder von kleinen Konfessionen geöffnet.



In der Weimarer Republik gab es ganz besondere Universitäten. Es


In [None]:
# Translation by instruction: the German sentence is followed by "means in English".
print_gpt3_response("Der Mensch kann zwar tun, was er will, aber er kann nicht wollen, was er will means in English")

<blockquote>
<p>A person can do what he wants, but he cannot want what he wants.</p>
</blockquote>


In [None]:
# Sentence completion: the prompt is an unfinished English clause for the model to continue.
print_gpt3_response("Although the weather was fine in the morning, ") 

the forecast called for rain in the afternoon.


In [None]:
print_gpt3_response("The exchange rate between USD and RMB today is") 
# Observation: the model is unreliable for time-sensitive facts such as current exchange rates.
# For comparison, the USD-to-RMB rate was 6.55 as of Apr 27, 2022 01:40.

about 7.3 yuan to the dollar.

7.3 yuan to the dollar.
