#### Set up the environment and load the model for generation

In [None]:
!pip install nltk --quiet
!pip install transformers --quiet
!pip install datasets --quiet
!pip install evaluate --quiet
!pip install sentencepiece --quiet
!pip install accelerate --quiet
!pip install rouge_score --quiet
!pip install bert_score --quiet
!pip install torchvision --quiet
!pip install tensorboard --quiet
!pip install bertviz --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.8/5.8 MB[0m [31m80.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m182.4/182.4 KB[0m [31m19.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m114.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m452.9/452.9 KB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m213.0/213.0 KB[0m [31m24.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.0/132.0 KB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m140.6/140.6 KB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 KB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import os
import pickle
import pandas as pd
import numpy as np
import re
import torch 
import nltk

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, pipeline,  EarlyStoppingCallback, DataCollatorForSeq2Seq, Trainer
from transformers import MT5ForConditionalGeneration, T5Tokenizer, T5ForConditionalGeneration, MT5TokenizerFast
import datasets
import evaluate
import accelerate

import gc
import json
from ast import literal_eval

from tqdm import tqdm

#### Prep BertViz

In [None]:
def run_bertViz_model_view(model, tokenizer, tekst_input, tekst_output, view_type):
  """Show a BertViz attention visualisation for one input/output text pair.

  The input text is tokenized for the encoder and the output text for the
  decoder, one forward pass is run, and the resulting encoder, decoder and
  cross attentions are handed to BertViz.

  Args:
    model: Encoder-decoder model, called as
      ``model(input_ids=..., decoder_input_ids=...)``.
    tokenizer: Tokenizer used for both texts and to map ids back to tokens.
    tekst_input: Source text fed to the encoder.
    tekst_output: Target text fed to the decoder.
    view_type: Any string containing ``"head"`` selects ``head_view``;
      every other value selects ``model_view``.

  Note:
    ``head_view`` and ``model_view`` must already be imported from
    ``bertviz`` when this function is called.
  """
  encoder_input_ids = tokenizer(str(tekst_input), return_tensors="pt", add_special_tokens=True).input_ids

  # `text_target=` tokenizes in target mode; it replaces the deprecated
  # `with tokenizer.as_target_tokenizer():` context manager.
  decoder_input_ids = tokenizer(text_target=str(tekst_output), return_tensors="pt", add_special_tokens=True).input_ids

  # Visualisation only: no gradients are needed. Attentions are requested
  # explicitly so the call does not depend on how the model was loaded.
  with torch.no_grad():
    outputs = model(
      input_ids=encoder_input_ids,
      decoder_input_ids=decoder_input_ids,
      output_attentions=True,
    )

  encoder_text = tokenizer.convert_ids_to_tokens(encoder_input_ids[0])
  decoder_text = tokenizer.convert_ids_to_tokens(decoder_input_ids[0])

  # Both BertViz views take the same keyword arguments.
  render = head_view if "head" in view_type else model_view
  render(
    encoder_attention=outputs.encoder_attentions,
    decoder_attention=outputs.decoder_attentions,
    cross_attention=outputs.cross_attentions,
    encoder_tokens=encoder_text,
    decoder_tokens=decoder_text,
  )

In [None]:
from bertviz import model_view, head_view
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModel, AutoModelForSeq2SeqLM

# Directory of the fine-tuned mT5-base checkpoint used throughout this notebook.
model_path = '/content/drive/MyDrive/MscThesis/Results/google/mt5-base/checkpoint-12428 (model)/'

# output_attentions=True is set at load time so that forward passes expose the
# attention tensors read by run_bertViz_model_view.
model = AutoModelForSeq2SeqLM.from_pretrained(model_path, output_attentions=True)
tokenizer = AutoTokenizer.from_pretrained(model_path)

#### Experiment 1: Check poor generations

##### Experiment 1: Generate the text

In [None]:
## Stocks generations (no max_length set)
generator = pipeline(
    task='text2text-generation',
    model="/content/drive/MyDrive/MscThesis/Results/google/mt5-base/checkpoint-12428 (model)/",
    tokenizer="/content/drive/MyDrive/MscThesis/Results/google/mt5-base/checkpoint-12428 (model)/",
    do_sample=True,
    num_beams=5,
    top_p=0.7,
    repetition_penalty=1.3,
    num_return_sequences=5,
)

generator([
    "LOC | Egyptian, stockChange | rise, moneyAmount | 24 of 30",
    "companyName | ECB, incidentType | besluit nemen, timePoint | april, companyName | Analisten,  incidentType | hadden verwacht",
    "locationName | Duitse , CompanyName | tienjaarsrente, amountNumber | laagste niveau ooit",
    "ORG | Beyond Meat, timepoint | live, incidentType | trade",
])


[[{'generated_text': 'The Egyptian stock market is set to rise 24 of 30 points, while the Egyptian'},
  {'generated_text': 'The Egyptian stock market is expected to rise 24 of 30.'},
  {'generated_text': 'The Egyptian stock index, which is set to rise 24 of 30 points, '},
  {'generated_text': 'The Egyptian stock market is expected to rise 24 of 30.'},
  {'generated_text': 'The Egyptian stock market, which has a rise of 24 of 30 points, is'}],
 [{'generated_text': 'Analisten, die in april een besluit nemen, hadden verwacht dat ECB het'},
  {'generated_text': 'Analisten, die het besluit nemen, hadden verwacht dat de ECB in april'},
  {'generated_text': 'Analisten, die het besluit nemen, hadden verwacht dat de ECB in april'},
  {'generated_text': 'Analisten, die het besluit nemen, hadden verwacht dat de ECB in april'},
  {'generated_text': 'Analisten, die het besluit nemen, hadden verwacht dat de ECB in april'}],
 [{'generated_text': 'De Duitse tienjaarsrente ligt op het laagste niveau oo

In [None]:
## Stocks generations (same inputs, max_length raised to 150)
generator = pipeline(
    task='text2text-generation',
    model="/content/drive/MyDrive/MscThesis/Results/google/mt5-base/checkpoint-12428 (model)/",
    tokenizer="/content/drive/MyDrive/MscThesis/Results/google/mt5-base/checkpoint-12428 (model)/",
    max_length=150,
    do_sample=True,
    num_beams=5,
    top_p=0.7,
    repetition_penalty=1.3,
    num_return_sequences=5,
)

generator([
    "LOC | Egyptian, stockChange | rise, moneyAmount | 24 of 30",
    "companyName | ECB, incidentType | besluit nemen, timePoint | april, companyName | Analisten,  incidentType | hadden verwacht",
    "locationName | Duitse , CompanyName | tienjaarsrente, amountNumber | laagste niveau ooit",
    "ORG | Beyond Meat, timepoint | live, incidentType | trade",
])


[[{'generated_text': 'The Egyptian stock market is expected to rise 24 of 30.'},
  {'generated_text': 'The Egyptian stock market is expected to rise 24 of 30.'},
  {'generated_text': 'The Egyptian stock market is expected to rise 24 of 30.'},
  {'generated_text': 'The Egyptian stock market is expected to rise 24 of 30.'},
  {'generated_text': 'The Egyptian stock market is expected to rise 24 of 30.'}],
 [{'generated_text': 'Analisten, die in april hadden verwacht dat ECB een besluit nemen, hadden verwacht.'},
  {'generated_text': 'Analisten, die in april hadden verwacht dat ECB een besluit nemen, hadden verwacht.'},
  {'generated_text': 'Analisten, die in april hadden verwacht dat ECB een besluit nemen, hadden verwacht.'},
  {'generated_text': 'Analisten, die in april hadden verwacht dat ECB een besluit nemen, hadden verwacht.'},
  {'generated_text': 'Analisten, die in april hadden verwacht dat ECB een besluit nemen, hadden verwacht.'}],
 [{'generated_text': 'De Duitse tienjaarsrente s

In [None]:
## Incident generations
generator = pipeline(
    task='text2text-generation',
    model="/content/drive/MyDrive/MscThesis/Results/google/mt5-base/checkpoint-12428 (model)/",
    tokenizer="/content/drive/MyDrive/MscThesis/Results/google/mt5-base/checkpoint-12428 (model)/",
    max_length=150,
    do_sample=True,
    num_beams=5,
    top_p=0.7,
    repetition_penalty=1.3,
    num_return_sequences=5,
)

generator([
    "victimAmount | drietal , victimVehicle | auto, incidentCause | slingeren, victimStatus | tot stilstand kwam tegen een brugpijler",
    "victimVehicle | auto, incidentType | boven water gehaald, incidentType | op de kade getild",
    "suspectVehicle | automobilist, incidentCause | te hard reed",
])


[[{'generated_text': 'Een drietal, een auto, die tot stilstand kwam tegen een brugpijler, kwam met een slingeren.'},
  {'generated_text': 'Een drietal, een auto, die tot stilstand kwam tegen een brugpijler, kwam met een slingeren.'},
  {'generated_text': 'Een drietal, een auto, die tot stilstand kwam tegen een brugpijler, kwam met een slingeren.'},
  {'generated_text': 'Een drietal, een auto, die tot stilstand kwam tegen een brugpijler, kwam met een slingeren.'},
  {'generated_text': 'Een drietal, een auto, die tot stilstand kwam tegen een brugpijler, kwam met een slingeren.'}],
 [{'generated_text': 'De auto, die boven water gehaald is, is op de kade getild.'},
  {'generated_text': 'De auto, die boven water gehaald werd, is op de kade getild.'},
  {'generated_text': 'De auto, die boven water gehaald is, is op de kade getild.'},
  {'generated_text': 'De auto, die boven water gehaald is, is op de kade getild.'},
  {'generated_text': 'De auto, die boven water gehaald is, is op de kade get

In [None]:
## Weather generations
generator = pipeline(
    task='text2text-generation',
    model="/content/drive/MyDrive/MscThesis/Results/google/mt5-base/checkpoint-12428 (model)/",
    tokenizer="/content/drive/MyDrive/MscThesis/Results/google/mt5-base/checkpoint-12428 (model)/",
    do_sample=True,
    num_beams=5,
    top_p=0.7,
    repetition_penalty=1.3,
    num_return_sequences=5,
)

generator([
    "weatherFrequency | even , weatherType | buien, weatherChange | breekt de zon door",
    "temperatureCelsius | above average, timePoint | this summer",
])




[[{'generated_text': 'De zon breekt de zon door en komen er even buien voor.'},
  {'generated_text': 'De zon breekt de zon door en komen er even buien voor.'},
  {'generated_text': 'De zon breekt de zon door en komen er even buien voor.'},
  {'generated_text': 'De zon breekt de zon door en trekken er even buien voor.'},
  {'generated_text': 'De zon breekt de zon door en komen er even buien voor.'}],
 [{'generated_text': 'Temperatures will be above average this summer.'},
  {'generated_text': 'Temperatures will be above average this summer.'},
  {'generated_text': 'Temperatures will be above average this summer.'},
  {'generated_text': 'Temperatures will be above average this summer.'},
  {'generated_text': 'Temperatures will be above average this summer.'}]]

In [None]:
## Weather generations (same attributes in different orders, plus a subset)
generator = pipeline(
    task='text2text-generation',
    model="/content/drive/MyDrive/MscThesis/Results/google/mt5-base/checkpoint-12428 (model)/",
    tokenizer="/content/drive/MyDrive/MscThesis/Results/google/mt5-base/checkpoint-12428 (model)/",
    do_sample=True,
    num_beams=5,
    top_p=0.7,
    repetition_penalty=1.3,
    num_return_sequences=5,
)

generator([
    "weatherChange | breekt de zon door, weatherFrequency | even , weatherType | buien",
    " weatherType | buien, weatherChange | breekt de zon door, weatherFrequency | even",
    "weatherFrequency | even , weatherType | buien",
])

[[{'generated_text': 'De zon breekt de zon door, maar het blijft even droog.'},
  {'generated_text': 'De zon breekt de zon door, maar komt er even buien voor.'},
  {'generated_text': 'De zon breekt de zon door, maar het blijft even droog.'},
  {'generated_text': 'De zon breekt de zon door, maar het blijft even droog.'},
  {'generated_text': 'De zon breekt de zon door, maar het blijft even droog.'}],
 [{'generated_text': 'De zon breekt de zon door en trekken er even buien voor.'},
  {'generated_text': 'De zon breekt de zon door en trekken er even buien voor.'},
  {'generated_text': 'De zon breekt de zon door en trekken er even buien voor.'},
  {'generated_text': 'De zon breekt de zon door en trekken er even buien voor.'},
  {'generated_text': 'De zon breekt de zon door en trekken er even buien voor.'}],
 [{'generated_text': 'De buien zijn even , maar het blijft droog.'},
  {'generated_text': 'De buien zijn even , maar het blijft droog.'},
  {'generated_text': 'De buien zijn even , maar 

In [None]:
## Sports generations
generator = pipeline(
    task='text2text-generation',
    model="/content/drive/MyDrive/MscThesis/Results/google/mt5-base/checkpoint-12428 (model)/",
    tokenizer="/content/drive/MyDrive/MscThesis/Results/google/mt5-base/checkpoint-12428 (model)/",
    max_length=150,
    do_sample=True,
    num_beams=5,
    top_p=0.7,
    repetition_penalty=1.3,
    num_return_sequences=5,
)

generator([
    "batterName | Cabrera , teamName | Detroit, injuryType | MRI",
    "tackleRecipientName | Kamohelo Mokotjo, chanceForType | op de stip ging, tackleGiverName | Hij",
    "goalName | Jeff Stans, goalName | Tom van Weert",
    "playerName | van Ajax overgekomen, positionOfPlayer | verdediger, matchTime | vroeg , injuryType | in de fout gaan",
    "teamName | Ajax, teamName | Vitesse, IncidentType | slordig spel",
])


[[{'generated_text': 'Cabrera , who had a MRI, is expected to return to Detroit.'},
  {'generated_text': 'Cabrera , who had a MRI, is expected to return to Detroit.'},
  {'generated_text': 'Cabrera , a Detroit veteran, is expected to survive with a MRI.'},
  {'generated_text': 'Cabrera , who had a MRI, is expected to return to Detroit.'},
  {'generated_text': 'Cabrera , who had an MRI, is expected to return to Detroit.'}],
 [{'generated_text': 'Hij zette Kamohelo Mokotjo op de stip, waarna hij op de stip ging.'},
  {'generated_text': 'Hij zette Kamohelo Mokotjo op de stip, waarna hij op de stip ging.'},
  {'generated_text': 'Hij moest op de stip gaan, maar Kamohelo Mokotjo, die op de stip ging.'},
  {'generated_text': 'Hij zette Kamohelo Mokotjo op de stip, waarna hij op de stip ging.'},
  {'generated_text': 'Hij zette Kamohelo Mokotjo op de stip, waarna hij op de stip ging.'}],
 [{'generated_text': 'Tom van Weert en Jeff Stans, waren de beste kansen.'},
  {'generated_text': 'Tom van W

##### Experiment 1: Apply BertViz test

Stock example

In [None]:
# Head view: full stock attribute string against its generated sentence.
run_bertViz_model_view(
    model,
    tokenizer,
    tekst_input="locationName | Duitse , CompanyName | tienjaarsrente, amountNumber | laagste niveau ooit",
    tekst_output="De Duitse tienjaarsrente ligt op het laagste niveau ooit.",
    view_type="head_view",
)

In [None]:
# Head view: single-attribute stock input.
run_bertViz_model_view(
    model,
    tokenizer,
    tekst_input="locationName | Duitse",
    tekst_output="De Duitse aandelenbeurzen waren beter dan verwacht.",
    view_type="head_view",
)

Sport example

In [None]:
# Head view: four-attribute sport input.
run_bertViz_model_view(
    model,
    tokenizer,
    tekst_input="playerName | van Ajax overgekomen, positionOfPlayer | verdediger, matchTime | vroeg , injuryType | in de fout gaan",
    tekst_output="De verdediger, die vroeg in de fout gaat, is van Ajax overgekomen.",
    view_type="head_view",
)

In [None]:
# Head view: two-attribute sport input.
# The stray leading apostrophe in the input string has been dropped so the
# encoder sees the same "attribute | value" format as the other examples.
run_bertViz_model_view(model, tokenizer, "matchTime | vroeg, teamName | Ajax", "Ajax heeft vroeg een overwinning geboekt.", "head_view")

In [None]:
# Head view: baseball input including the injuryType attribute.
run_bertViz_model_view(
    model,
    tokenizer,
    tekst_input="batterName | Cabrera , teamName | Detroit, injuryType | MRI",
    tekst_output="Cabrera , who had a MRI, is expected to return to Detroit.",
    view_type="head_view",
)

In [None]:
# Head view: baseball input without the injuryType attribute.
# The input string was missing its closing quote (SyntaxError); it now ends
# after "Detroit".
run_bertViz_model_view(model, tokenizer, "batterName | Cabrera , teamName | Detroit", "Cabrera took the lead for Detroit.", "head_view")

Incident example

In [None]:
# Head view: vehicle plus two incidentType attributes.
run_bertViz_model_view(
    model,
    tokenizer,
    tekst_input="victimVehicle | auto, incidentType | boven water gehaald, incidentType | op de kade getild",
    tekst_output="De auto, die boven water gehaald werd, is op de kade getild.",
    view_type="head_view",
)

In [None]:
# Head view: vehicle attribute only.
run_bertViz_model_view(
    model,
    tokenizer,
    tekst_input="victimVehicle | auto",
    tekst_output="De auto reed tegen een boom en botste tegen een boom.",
    view_type="head_view",
)

In [None]:
# Head view: suspect vehicle plus incident cause.
run_bertViz_model_view(
    model,
    tokenizer,
    tekst_input="suspectVehicle | automobilist, incidentCause | te hard reed",
    tekst_output="De automobilist, die te hard reed, reed tegen een boom.",
    view_type="head_view",
)

In [None]:
# Head view: suspect vehicle attribute only.
run_bertViz_model_view(
    model,
    tokenizer,
    tekst_input="suspectVehicle | automobilist",
    tekst_output="De automobilist reed op de verkeerde weghelft en belandde",
    view_type="head_view",
)

#### Experiment 2: Check good generations


##### Experiment 2: Apply Generations test

In [None]:
## Weather generations
# The stray no-op `()` statement that sat between the two calls is removed.
generator = pipeline(
    task='text2text-generation',
    model="/content/drive/MyDrive/MscThesis/Results/google/mt5-base/checkpoint-12428 (model)/",
    tokenizer="/content/drive/MyDrive/MscThesis/Results/google/mt5-base/checkpoint-12428 (model)/",
    do_sample=True,
    num_beams=5,
    top_p=0.7,
    repetition_penalty=1.3,
    num_return_sequences=5,
)

generator([
    "timePoint | In de avond , windChange | af",
    "timePoint | In de middag , weatherType | zonnige",
])


[[{'generated_text': 'In de avond neemt de wind af.'},
  {'generated_text': 'In de avond neemt de wind af.'},
  {'generated_text': 'In de avond neemt de wind af.'},
  {'generated_text': 'In de avond neemt de wind af.'},
  {'generated_text': 'In de avond neemt de wind af.'}],
 [{'generated_text': 'In de middag , zijn er zonnige perioden.'},
  {'generated_text': 'In de middag , zijn er zonnige perioden.'},
  {'generated_text': 'In de middag , zijn er zonnige perioden.'},
  {'generated_text': 'In de middag , zijn er zonnige perioden.'},
  {'generated_text': 'In de middag , zijn er zonnige perioden.'}]]

In [None]:
## Stock generations
# The stray no-op `()` statement that sat between the two calls is removed.
generator = pipeline(
    task='text2text-generation',
    model="/content/drive/MyDrive/MscThesis/Results/google/mt5-base/checkpoint-12428 (model)/",
    tokenizer="/content/drive/MyDrive/MscThesis/Results/google/mt5-base/checkpoint-12428 (model)/",
    do_sample=True,
    max_length=150,
    num_beams=5,
    top_p=0.7,
    repetition_penalty=1.3,
    num_return_sequences=5,
)

generator([
    "companyName | bank UBS , locationName | Zwitserse , stockChange | zakte , stockChangePercentage | 2,1 procent",
    "companyName | fabrikant van elektrische auto\'s Tesla , stockChangePercentage | 5,4 procent",
])


[[{'generated_text': 'De Zwitserse bank UBS zakte 2,1 procent na een adviesverlaging door de Zwitserse bank UBS.'},
  {'generated_text': 'De Zwitserse bank UBS zakte 2,1 procent na een adviesverlaging door de Zwitserse bank UBS.'},
  {'generated_text': 'De Zwitserse bank UBS zakte 2,1 procent na een adviesverlaging door de Zwitserse bank UBS.'},
  {'generated_text': 'De Zwitserse bank UBS zakte 2,1 procent na een adviesverlaging door de Zwitserse bank UBS.'},
  {'generated_text': 'De Zwitserse bank UBS zakte 2,1 procent na een adviesverlaging door de Zwitserse bank UBS.'}],
 [{'generated_text': "De fabrikant van elektrische auto's Tesla sloot 5,4 procent na een adviesverlaging door de fabrikant van elektrische auto's Tesla."},
  {'generated_text': "De fabrikant van elektrische auto's Tesla , sloot 5,4 procent na een adviesverlaging door de fabrikant van elektrische auto's Tesla."},
  {'generated_text': "De fabrikant van elektrische auto's Tesla sloot 5,4 procent na een adviesverlaging 

In [None]:
## Incident generations (no max_length set)
# The stray no-op `()` statement that sat between the two calls is removed.
generator = pipeline(
    task='text2text-generation',
    model="/content/drive/MyDrive/MscThesis/Results/google/mt5-base/checkpoint-12428 (model)/",
    tokenizer="/content/drive/MyDrive/MscThesis/Results/google/mt5-base/checkpoint-12428 (model)/",
    do_sample=True,
    num_beams=5,
    top_p=0.7,
    repetition_penalty=1.3,
    num_return_sequences=5,
)

generator([
    "accidentAddress | Carver Houses on E. 104th St. , accidentDate | around 2 a.m. Sunday. , victimNumber | Three , victimStatus | shot",
    "accidentAddress | 14th Street near Broadway , accidentDate | just after midnight , shootingType | shooting",
])

[[{'generated_text': 'Three people were shot at Carver Houses on E. 104th St. around 2 a'},
  {'generated_text': 'Three people were shot at Carver Houses around 2 a.m. Sunday.'},
  {'generated_text': 'Three people were shot at Carver Houses around 2 a.m. Sunday.'},
  {'generated_text': 'Three people were shot at Carver Houses around 2 a.m. Sunday.'},
  {'generated_text': 'Three people were shot at Carver Houses around 2 a.m. Sunday.'}],
 [{'generated_text': 'The shooting happened just after midnight on 14th Street near Broadway.'},
  {'generated_text': 'The shooting happened just after midnight on 14th Street near Broadway.'},
  {'generated_text': 'The shooting happened just after midnight on 14th Street near Broadway.'},
  {'generated_text': 'The shooting happened just after midnight on 14th Street near Broadway.'},
  {'generated_text': 'The shooting happened just after midnight on 14th Street near Broadway.'}]]

In [None]:
## Incident generations (max_length raised to 150)
generator = pipeline(
    task='text2text-generation',
    model="/content/drive/MyDrive/MscThesis/Results/google/mt5-base/checkpoint-12428 (model)/",
    tokenizer="/content/drive/MyDrive/MscThesis/Results/google/mt5-base/checkpoint-12428 (model)/",
    max_length=150,
    do_sample=True,
    num_beams=5,
    top_p=0.7,
    repetition_penalty=1.3,
    num_return_sequences=5,
)

# Second input: the stray apostrophe before `victimGender` is removed. It
# leaked into the stored generations as "Three 'victims", so re-run this cell
# to refresh the recorded output.
generator([
    "accidentAddress | Carver Houses on E. 104th St. , accidentDate | around 2 a.m. Sunday. , victimNumber | Three , victimStatus | shot",
    "accidentAddress | Carver Houses on E. 104th St. , accidentDate | around 2 a.m. Sunday. , victimNumber | Three , victimGender | men",
    "accidentAddress | 14th Street near Broadway , shootingType | shooting",
])

[[{'generated_text': 'Three people were shot at Carver Houses around 2 a.m. Sunday.'},
  {'generated_text': 'Three people were shot at Carver Houses on E. 104th St. around 2 a.m. Sunday.'},
  {'generated_text': 'Three people were shot at Carver Houses around 2 a.m. Sunday.'},
  {'generated_text': 'Three people were shot at Carver Houses around 2 a.m. Sunday.'},
  {'generated_text': 'Three people were shot at Carver Houses around 2 a.m. Sunday.'}],
 [{'generated_text': "Three 'victims were shot at the Carver Houses on E. 104th St. around 2 a.m. Sunday."},
  {'generated_text': "Three 'victims were shot at the Carver Houses on E. 104th St. around 2 a.m. Sunday."},
  {'generated_text': "Three 'victims were shot at the Carver Houses on E. 104th St. around 2 a.m. Sunday."},
  {'generated_text': "Three 'victims were shot at the Carver Houses on E. 104th St. around 2 a.m. Sunday."},
  {'generated_text': "Three 'victims were shot at the Carver Houses on E. 104th St. around 2 a.m. Sunday."}],
 [

##### Experiment 2: Apply BertViz test

Weather examples

In [None]:
# Head view: evening wind input.
run_bertViz_model_view(
    model,
    tokenizer,
    tekst_input="timePoint | In de avond , windChange | af",
    tekst_output="In de avond neemt de wind af.",
    view_type="head_view",
)

In [None]:
# Head view: evening wind input with an added locationArea attribute.
run_bertViz_model_view(
    model,
    tokenizer,
    tekst_input="timePoint | In de avond , windChange | af, locationArea | overal",
    tekst_output="In de avond neemt de wind overal af.",
    view_type="head_view",
)

In [None]:
# Head view: afternoon weather input.
run_bertViz_model_view(
    model,
    tokenizer,
    tekst_input="timePoint | In de middag , weatherType | zonnige",
    tekst_output="In de middag , zijn er zonnige perioden.",
    view_type="head_view",
)

In [None]:
# Head view: afternoon weather input with an added weatherFrequency attribute.
run_bertViz_model_view(
    model,
    tokenizer,
    tekst_input="timePoint | In de middag ,  weatherFrequency | perioden , weatherType | zonnige",
    tekst_output="In de middag zijn er zonnige perioden.",
    view_type="head_view",
)

Stock Examples

In [None]:
# Head view: UBS stock input against the sentence with the repeated company name.
run_bertViz_model_view(
    model,
    tokenizer,
    tekst_input="companyName | bank UBS , locationName | Zwitserse , stockChange | zakte , stockChangePercentage | 2,1 procent",
    tekst_output="De Zwitserse bank UBS zakte 2,1 procent na een adviesverlaging door de Zwitserse bank UBS.",
    view_type="head_view",
)

In [None]:
# Head view: UBS stock input with a second locationName attribute.
run_bertViz_model_view(
    model,
    tokenizer,
    tekst_input="companyName | bank UBS , locationName | Zwitserse , locationName | Zürich , stockChange | zakte , stockChangePercentage | 2,1 procent",
    tekst_output="De Zwitserse bank UBS zakte 2,1 procent in Zürich.",
    view_type="head_view",
)

In [None]:
# Head view: Tesla stock input without a stockChange attribute.
run_bertViz_model_view(
    model,
    tokenizer,
    tekst_input="companyName | fabrikant van elektrische auto\'s Tesla , stockChangePercentage | 5,4 procent",
    tekst_output="De fabrikant van elektrische auto's Tesla , dat 5,4 procent omzet heeft geboekt.",
    view_type="head_view",
)

In [None]:
# Head view: Tesla stock input with an added stockChange attribute.
# The stray apostrophe before `stockChange` has been dropped so the key matches
# the "attribute | value" format of the other attributes.
run_bertViz_model_view(model, tokenizer, "companyName | fabrikant van elektrische auto\'s Tesla , stockChangePercentage | 5,4 procent, stockChange | hoger", 
                       "De fabrikant van elektrische auto's Tesla sloot 5,4 procent hoger.", "head_view")

Incident examples

In [None]:
# Head view: shooting input with address and date.
run_bertViz_model_view(
    model,
    tokenizer,
    tekst_input="accidentAddress | 14th Street near Broadway , accidentDate | just after midnight , shootingType | shooting",
    tekst_output="The shooting happened just after midnight on 14th Street near Broadway.",
    view_type="head_view",
)

In [None]:
# Head view: shooting input without the date attribute.
run_bertViz_model_view(
    model,
    tokenizer,
    tekst_input="accidentAddress | 14th Street near Broadway , shootingType | shooting",
    tekst_output="The shooting happened on 14th Street near Broadway , police said.",
    view_type="head_view",
)

In [None]:
# Head view: Carver Houses input with victim number and status.
run_bertViz_model_view(
    model,
    tokenizer,
    tekst_input="accidentAddress | Carver Houses on E. 104th St. , accidentDate | around 2 a.m. Sunday. , victimNumber | Three , victimStatus | shot",
    tekst_output="Three people were shot at Carver Houses on E. 104th St. around 2 a.m. Sunday.",
    view_type="head_view",
)

In [None]:
# Head view: Carver Houses input with an added victimGender attribute.
run_bertViz_model_view(
    model,
    tokenizer,
    tekst_input="accidentAddress | Carver Houses on E. 104th St. , accidentDate | around 2 a.m. Sunday. , victimNumber | Three , victimStatus | shot, victimGender | men",
    tekst_output="Three men were shot at Carver Houses on E. 104th St. around 2 a.m. Sunday.",
    view_type="head_view",
)

#### Experiment 3: Check poor generations by adding unknown attributes


In [None]:
## Stocks generations
# The stray no-op `()` statement that sat between the two calls is removed.
generator = pipeline(
    task='text2text-generation',
    model="/content/drive/MyDrive/MscThesis/Results/google/mt5-base/checkpoint-12428 (model)/",
    tokenizer="/content/drive/MyDrive/MscThesis/Results/google/mt5-base/checkpoint-12428 (model)/",
    do_sample=True,
    num_beams=5,
    top_p=0.7,
    repetition_penalty=1.3,
    num_return_sequences=5,
)

generator([
    "DATE | today, timePoint | historical precedent, feeling | shouldn’t fear, event | political drama, ORG | today’s traders",
    "companyName | rivaal Sandd, event | overname, event | resultaten, ORG | postbezorger",
    "companyName | ECB, timePoint | april, companyName | Analisten, event | juist verwacht",
    "stockChange | min , stockChangePercentage | meer dan 12%, event | Na hevige koersschommelingen sloot",
    "LOC | Qatar, event | closed, timePoint | public holiday",
])

[[{'generated_text': 'The traders shouldn’t fear today’s political drama, but they'},
  {'generated_text': 'The traders shouldn’t fear today’s political drama, but they'},
  {'generated_text': 'The traders shouldn’t fear today’s political drama, as they'},
  {'generated_text': 'The traders shouldn’t fear today’s political drama, but they'},
  {'generated_text': 'The traders shouldn’t fear today’s political drama, but they'}],
 [{'generated_text': 'De postbezorger maakte een overname van het rivaal Sandd, dat'},
  {'generated_text': 'De postbezorger heeft een overname van de rivaal Sandd, die'},
  {'generated_text': 'De postbezorger heeft een overname van de rivaal Sandd, die'},
  {'generated_text': 'De postbezorger heeft een overname van de rivaal Sandd, die'},
  {'generated_text': 'De postbezorger heeft een overname van de rivaal Sandd, die'}],
 [{'generated_text': 'Analisten, die juist verwachten dat ECB in april een renteverlaging zou'},
  {'generated_text': 'Analisten, die juist ve

In [None]:
## Sport generations
# The stray no-op `()` statement that sat between the two calls is removed.
generator = pipeline(
    task='text2text-generation',
    model="/content/drive/MyDrive/MscThesis/Results/google/mt5-base/checkpoint-12428 (model)/",
    tokenizer="/content/drive/MyDrive/MscThesis/Results/google/mt5-base/checkpoint-12428 (model)/",
    do_sample=True,
    num_beams=5,
    top_p=0.7,
    repetition_penalty=1.3,
    num_return_sequences=5,
)

generator([
    "teamName | Ajax ,teamName | Vitesse, eventType | spel, eventQuality | zeer slordig",
    "locationPlayed | Target Field, teamName | his teammates",
])

[[{'generated_text': 'Vitesse heeft een zeer slechter spel geboekt, nadat Ajax de'},
  {'generated_text': 'Vitesse, dat zeer slechter was dan Ajax, scoorde in het'},
  {'generated_text': 'Vitesse, dat een zeer slechter spel heeft geboekt, was het '},
  {'generated_text': 'Vitesse, dat een zeer slechter spel heeft geboekt, was Ajax'},
  {'generated_text': 'Vitesse, dat zeer slechter was dan Ajax, werd getrakteerd op een'}],
 [{'generated_text': 'Target Field, his teammates were sitting in a row as they '},
  {'generated_text': "Target Field, his teammates couldn't get a chance to play."},
  {'generated_text': "Target Field, his teammates hadn't been able to stay in the game"},
  {'generated_text': "Target Field, his teammates couldn't get a chance to escape"},
  {'generated_text': "Target Field, his teammates couldn't make it easy for him."}]]

#### Experiment 4: Check poor generations by adding attributes from a different subject

In [None]:
## Incident generations
generator = pipeline(
    task='text2text-generation',
    model="/content/drive/MyDrive/MscThesis/Results/google/mt5-base/checkpoint-12428 (model)/",
    tokenizer="/content/drive/MyDrive/MscThesis/Results/google/mt5-base/checkpoint-12428 (model)/",
    max_length=150,
    do_sample=True,
    num_beams=5,
    top_p=0.7,
    repetition_penalty=1.3,
    num_return_sequences=5,
)

generator([
    "incidentType | boven water gehaald , incidentType | op de kade getild , ORG | brandweer , ORG | duikers, victimVehicle | auto",
])

[[{'generated_text': 'De auto werd boven water gehaald, de brandweer heeft de auto op de kade getild en de duikers, die op de kade kwamen.'},
  {'generated_text': 'De auto werd boven water gehaald, de brandweer heeft de auto op de kade getild en de duikers zijn op de kade getild.'},
  {'generated_text': 'De auto werd boven water gehaald, de brandweer heeft de auto op de kade getild en de duikers werden op de kade getild.'},
  {'generated_text': 'De auto werd boven water gehaald, de brandweer heeft de auto op de kade getild en de duikers werden op de kade getild.'},
  {'generated_text': 'De auto werd boven water gehaald door brandweer, de duikers, die op de kade getild is.'}]]

In [None]:
## Weather generations
generator = pipeline(
    task='text2text-generation',
    model="/content/drive/MyDrive/MscThesis/Results/google/mt5-base/checkpoint-12428 (model)/",
    tokenizer="/content/drive/MyDrive/MscThesis/Results/google/mt5-base/checkpoint-12428 (model)/",
    max_length=150,
    do_sample=True,
    num_beams=5,
    top_p=0.7,
    repetition_penalty=1.3,
    num_return_sequences=5,
)

generator([
    "temperatureCelsius | above average, timePoint | this summer , ORG | the department, weatherArea | the province",
])

[[{'generated_text': 'The department said the temperature will remain above average this summer, according to the department.'},
  {'generated_text': 'The department said the temperature will remain above average this summer, according to the department.'},
  {'generated_text': 'The department said temperatures will be above average this summer, according to the department.'},
  {'generated_text': 'The department said temperatures will be above average this summer, according to the department.'},
  {'generated_text': 'The department said the temperature will remain above average this summer, according to the department.'}]]

In [None]:
## Sport generations
generator = pipeline(
    task='text2text-generation',
    model="/content/drive/MyDrive/MscThesis/Results/google/mt5-base/checkpoint-12428 (model)/",
    tokenizer="/content/drive/MyDrive/MscThesis/Results/google/mt5-base/checkpoint-12428 (model)/",
    do_sample=True,
    num_beams=5,
    top_p=0.7,
    repetition_penalty=1.3,
    num_return_sequences=5,
)

# First input: the attribute key was misspelled `tiemPoint`; it is corrected to
# `timePoint` to match the key used everywhere else. Re-run this cell to
# refresh the recorded output.
generator([
    "goalName | Jeff Stans, goalName | Tom van Weert, timePoint | eerst, timePoint | daarna",
    "locationPlayed | Target Field, teamName | his teammates, incidentType | cheering, timePoint | at the end",
    "batterName | Cabrera , teamName | Detroit, incidentType | MRI",
])

[[{'generated_text': 'Tom van Weert, Jeff Stans, die daarna nog een doelpunt maakte,'},
  {'generated_text': 'Tom van Weert, Jeff Stans, die daarna de bal in het doel zette'},
  {'generated_text': 'Tom van Weert, Jeff Stans, die daarna de eerste treffer van het'},
  {'generated_text': 'Tom van Weert, Jeff Stans, kwam daarna in de verre hoek en'},
  {'generated_text': 'Tom van Weert, Jeff Stans, die daarna de eerste treffer van het'}],
 [{'generated_text': 'Target Field, where his teammates were cheering at the end of the incident,'},
  {'generated_text': 'When he arrived at Target Field, he was greeted by a che'},
  {'generated_text': 'Target Field, where his teammates were cheering at the end of the incident,'},
  {'generated_text': 'Target Field, where his teammates were cheering at the end of the incident,'},
  {'generated_text': 'Target Field, where his teammates were cheering at the end of the incident,'}],
 [{'generated_text': 'Cabrera , a Detroit native, was treated for a MRI.'}

#### Experiment 5: Order of attributes

In [None]:
## Incident and weather generations: each attribute set is given in two different orders
generator = pipeline(
    task='text2text-generation',
    model="/content/drive/MyDrive/MscThesis/Results/google/mt5-base/checkpoint-12428 (model)/",
    tokenizer="/content/drive/MyDrive/MscThesis/Results/google/mt5-base/checkpoint-12428 (model)/",
    max_length=150,
    do_sample=True,
    num_beams=5,
    top_p=0.7,
    repetition_penalty=1.3,
    num_return_sequences=5,
)

generator([
    "incidentType | boven water gehaald , incidentType | op de kade getild , ORG | brandweer , ORG | duikers, victimVehicle | auto",
    "victimVehicle | auto, incidentType | boven water gehaald , ORG | brandweer , ORG | duikers , incidentType | op de kade getild",
    "temperatureCelsius | above average, timePoint | this summer , ORG | the department, weatherArea | the province",
    "ORG | the department, weatherArea | the province, temperatureCelsius | above average, timePoint | this summer",
])

[[{'generated_text': 'De auto werd boven water gehaald, de brandweer is op de kade getild en de duikers zijn op de kade getild.'},
  {'generated_text': 'De auto werd boven water gehaald, de brandweer heeft de auto op de kade getild en de duikers werden op de kade getild.'},
  {'generated_text': 'De auto werd boven water gehaald, de brandweer is op de kade getild en de duikers zijn op de kade getild.'},
  {'generated_text': 'De auto werd boven water gehaald, de brandweer heeft de auto op de kade getild en de duikers zijn op de kade getild.'},
  {'generated_text': 'De auto werd boven water gehaald door brandweer en duikers, die op de kade getild waren.'}],
 [{'generated_text': 'De auto, die boven water gehaald is door brandweer, is op de kade getild.'},
  {'generated_text': 'De auto, die boven water gehaald is door brandweer, is op de kade getild.'},
  {'generated_text': 'De auto, die op de kade getild is, werd boven water gehaald door brandweer en duikers.'},
  {'generated_text': 'De au

In [None]:
# Attribute-order probe (company / stock-market style inputs).
# Four pairs of inputs; within each pair the attribute/value entries are the
# same and only their order differs. Five candidates are sampled per input.
_ckpt_dir = "/content/drive/MyDrive/MscThesis/Results/google/mt5-base/checkpoint-12428 (model)/"
generator = pipeline(
    task='text2text-generation',
    model=_ckpt_dir,
    tokenizer=_ckpt_dir,
    max_length=200,
    do_sample=True,
    num_beams=5,
    top_p=0.7,
    repetition_penalty=1.3,
    num_return_sequences=5,
)
_order_probe_inputs = [
    # Pair 1
    "companyName | ECB, timePoint | april, companyName | Analisten, event | juist verwacht",
    "companyName | Analisten, event | juist verwacht, companyName | ECB, timePoint | april ",
    # Pair 2
    "stockChange | min , stockChangePercentage | meer dan 12%, event | Na hevige koersschommelingen sloot",
    "event | Na hevige koersschommelingen sloot, stockChange | min , stockChangePercentage | meer dan 12%",
    # Pair 3 (English)
    "DATE | today, timePoint | historical precedent, feeling | shouldn’t fear, event | political drama, ORG | today’s traders",
    "feeling | shouldn’t fear, event | political drama, ORG | today’s traders, DATE | today, timePoint | historical precedent ",
    # Pair 4
    "companyName | rivaal Sandd, event | overname, event | resultaten, ORG | postbezorger",
    "event | resultaten, ORG | postbezorger, companyName | rivaal Sandd, event | overname",
]
# Last expression of the cell, so the nested list of generations is displayed.
generator(_order_probe_inputs)




[[{'generated_text': 'Analisten, die juist verwachten dat ECB in april een handelsdeal met de centrale bank gaat sluiten.'},
  {'generated_text': 'Analisten, die in april juist verwachten dat ECB een nieuwe renteverlaging door de centrale bank gaat doen.'},
  {'generated_text': 'Analisten, die in april juist verwachten dat ECB een renteverlaging in april zou verwachten.'},
  {'generated_text': 'Analisten, die in april juist verwachten dat ECB een nieuwe renteverlaging door de centrale bank gaat doen.'},
  {'generated_text': 'Analisten, die in april juist verwachten dat ECB een handelsdeal met de centrale bank gaat sluiten.'}],
 [{'generated_text': 'Analisten, die in april juist verwachten dat ECB een nieuwe renteverlaging in april heeft geboekt.'},
  {'generated_text': 'Analisten, die in april juist verwachten dat ECB een renteverlaging in april zou verwachten.'},
  {'generated_text': 'Analisten, die in april juist verwachten dat ECB een nieuwe renteverlaging in april zou kunnen incass

In [None]:
# Single-input probe: "rivaal Sandd" is tagged companyName and "postbezorger"
# is tagged ORG; the second event value is the phrase "drukte op de resultaten".
_ckpt_dir = "/content/drive/MyDrive/MscThesis/Results/google/mt5-base/checkpoint-12428 (model)/"
generator = pipeline(
    task='text2text-generation',
    model=_ckpt_dir,
    tokenizer=_ckpt_dir,
    max_length=200,
    do_sample=True,
    num_beams=5,
    top_p=0.7,
    repetition_penalty=1.3,
    num_return_sequences=5,
)
_sandd_inputs = [
    "companyName | rivaal Sandd, event | overname, event | drukte op de resultaten, ORG | postbezorger",
]
# Last expression of the cell, so the five sampled candidates are displayed.
generator(_sandd_inputs)


[[{'generated_text': 'De postbezorger drukte op de resultaten van de overname van rivaal Sandd.'},
  {'generated_text': 'De postbezorger drukte op de resultaten van de overname van rivaal Sandd.'},
  {'generated_text': 'De postbezorger drukte op de resultaten van de overname van rivaal Sandd.'},
  {'generated_text': 'De postbezorger drukte op de resultaten van de overname van rivaal Sandd.'},
  {'generated_text': 'De postbezorger drukte op de resultaten, nadat de overname van rivaal Sandd, een overname van de hoofdfondsen.'}]]

In [None]:
# Single-input probe: here "rivaal Sandd" is tagged ORG and "postbezorger" is
# tagged companyName; the event values are "overname" and "drukte op de resultaten".
_ckpt_dir = "/content/drive/MyDrive/MscThesis/Results/google/mt5-base/checkpoint-12428 (model)/"
generator = pipeline(
    task='text2text-generation',
    model=_ckpt_dir,
    tokenizer=_ckpt_dir,
    max_length=200,
    do_sample=True,
    num_beams=5,
    top_p=0.7,
    repetition_penalty=1.3,
    num_return_sequences=5,
)
_sandd_inputs = [
    "ORG | rivaal Sandd, event | overname, event | drukte op de resultaten, companyName | postbezorger",
]
# Last expression of the cell, so the five sampled candidates are displayed.
generator(_sandd_inputs)

[[{'generated_text': 'De postbezorger drukte op de resultaten, nadat de overname van de rivaal Sandd, een overname van de hoofdfondsen.'},
  {'generated_text': 'De postbezorger drukte op de resultaten van de overname van rivaal Sandd.'},
  {'generated_text': 'De postbezorger drukte op de resultaten, nadat de overname van de rivaal Sandd, een overname van de hoofdfondsen.'},
  {'generated_text': 'De postbezorger drukte op de resultaten van de overname van rivaal Sandd.'},
  {'generated_text': 'De postbezorger drukte op de resultaten, na een overname van de rivaal Sandd.'}]]

In [None]:
# Single-input probe: "rivaal Sandd" tagged ORG, "postbezorger" tagged
# companyName, and the first event value spelled out as "de overname van".
_ckpt_dir = "/content/drive/MyDrive/MscThesis/Results/google/mt5-base/checkpoint-12428 (model)/"
generator = pipeline(
    task='text2text-generation',
    model=_ckpt_dir,
    tokenizer=_ckpt_dir,
    max_length=200,
    do_sample=True,
    num_beams=5,
    top_p=0.7,
    repetition_penalty=1.3,
    num_return_sequences=5,
)
_sandd_inputs = [
    "ORG | rivaal Sandd, event | de overname van, event | drukte op de resultaten, companyName | postbezorger",
]
# Last expression of the cell, so the five sampled candidates are displayed.
generator(_sandd_inputs)

[[{'generated_text': 'De postbezorger drukte op de resultaten van de overname van de rivaal Sandd.'},
  {'generated_text': 'De postbezorger drukte op de resultaten van de overname van, de rivaal Sandd.'},
  {'generated_text': 'De postbezorger drukte op de resultaten van de overname van, de rivaal Sandd.'},
  {'generated_text': 'De postbezorger drukte op de resultaten van de overname van, de rivaal Sandd.'},
  {'generated_text': 'De postbezorger drukte op de resultaten, na een overname van de rivaal Sandd.'}]]

In [None]:
# Two-input probe: both entities are tagged companyName and the two inputs
# differ only in which entity comes first ("rivaal Sandd" vs "postbezorger").
_ckpt_dir = "/content/drive/MyDrive/MscThesis/Results/google/mt5-base/checkpoint-12428 (model)/"
generator = pipeline(
    task='text2text-generation',
    model=_ckpt_dir,
    tokenizer=_ckpt_dir,
    max_length=200,
    do_sample=True,
    num_beams=5,
    top_p=0.7,
    repetition_penalty=1.3,
    num_return_sequences=5,
)
_sandd_inputs = [
    "companyName | rivaal Sandd, event | de overname van, event | drukte op de resultaten, companyName | postbezorger",
    "companyName | postbezorger, event | de overname van, event | drukte op de resultaten, companyName | rivaal Sandd",
]
# Last expression of the cell, so both lists of sampled candidates are displayed.
generator(_sandd_inputs)

[[{'generated_text': 'De postbezorger drukte op de resultaten van de overname van rivaal Sandd.'},
  {'generated_text': 'De postbezorger drukte op de resultaten van de overname van rivaal Sandd.'},
  {'generated_text': 'De postbezorger drukte op de resultaten van de overname van rivaal Sandd.'},
  {'generated_text': 'De postbezorger drukte op de resultaten van de overname van rivaal Sandd.'},
  {'generated_text': 'De postbezorger drukte op de resultaten, na de overname van rivaal Sandd.'}],
 [{'generated_text': 'De postbezorger drukte op de resultaten van de overname van rivaal Sandd.'},
  {'generated_text': 'De postbezorger drukte op de resultaten van de overname van rivaal Sandd.'},
  {'generated_text': 'De postbezorger drukte op de resultaten van de overname van rivaal Sandd.'},
  {'generated_text': 'De postbezorger drukte op de resultaten van de overname van rivaal Sandd.'},
  {'generated_text': 'De postbezorger drukte op de resultaten van de overname van, de rivaal Sandd drukte op

## Experiment Data Augmentation

In [None]:
# Data-augmentation probe, loaded from the Models/google_mt5-base directory.
# Author's notes on earlier generations for these inputs (Dutch sentences kept
# verbatim; "Was" presumably marks what a previous model produced -- confirm):
#   Was: Kamohelo Mokotjo scoorde.
#   Was: Amin Younes schoot na een krap hafut uit.
#   Was: Klaassen scoorde in de verre hoek
_aug_model_dir = "/content/drive/MyDrive/MscThesis/Models/google_mt5-base/"
generator = pipeline(
    task='text2text-generation',
    model=_aug_model_dir,
    tokenizer=_aug_model_dir,
    max_length=100,
    do_sample=True,
    num_beams=5,
    top_p=0.7,
    repetition_penalty=1.3,
    num_return_sequences=5,
)
_augmentation_inputs = [
    'tackleRecipientName | Kamohelo Mokotjo',
    # NOTE(review): "hafuur" is kept exactly as written; it looks like an
    # intentional misspelling used as a noisy input -- confirm.
    "assistName | Amin Younes , goalType | schoot , matchTime | na een krap hafuur",
    'chanceForName | Klaassen',
]
# Last expression of the cell, so the sampled candidates are displayed.
generator(_augmentation_inputs)

[[{'generated_text': 'Kamohelo Mokotjo kreeg geel nadat hij de bal over de aarzelende doelman neerhaalde.'},
  {'generated_text': 'Kamohelo Mokotjo kreeg geel nadat hij de bal over de aarzelende doelman neerhaalde.'},
  {'generated_text': 'Kamohelo Mokotjo kreeg geel nadat hij de bal over de aarzelende doelman neerhaalde.'},
  {'generated_text': 'Kamohelo Mokotjo kreeg rood nadat hij de bal over de aarzelende doelman neerhaalde.'},
  {'generated_text': 'Kamohelo Mokotjo kreeg rood nadat hij de bal over de aarzelende doelman neerhaalde.'}],
 [{'generated_text': 'Amin Younes schoot na een krap halfuur in de verre hoek.'},
  {'generated_text': 'Amin Younes schoot na een krap halfuur via een doelpunt van Amin Younes.'},
  {'generated_text': 'Amin Younes schoot na een krap halfuur in de verre hoek.'},
  {'generated_text': 'Amin Younes schoot na een krap halfuur in de verre hoek.'},
  {'generated_text': 'Amin Younes schoot na een krap halfuur via een doelpunt van Amin Younes.'}],
 [{'generat

In [None]:
# Data-augmentation probe, second batch, loaded from the Models/google_mt5-base directory.
# Author's notes for these six inputs, in input order (Dutch sentences kept
# verbatim; "Was" presumably marks what a previous model produced -- confirm;
# the last three notes carry no "Was" marker in the original):
#   Was: De ploeg van Sam Larsson schoot de bal in de handen van kamphuis.
#   Was: Keeper Eloy Room scoorde.
#   Was: Vincent Vermeij en Kostas Lamprou maakten tweemaal de gelijkmaker.
#   Navarone Foor scoorde voor Yeboah.
#   Linssen scoorde voor het eerst in de Eredivisie.
#   De ploeg van Kluivert zette de AZ-defensie op voorsprong.
_aug_model_dir = "/content/drive/MyDrive/MscThesis/Models/google_mt5-base/"
generator = pipeline(
    task='text2text-generation',
    model=_aug_model_dir,
    tokenizer=_aug_model_dir,
    max_length=100,
    do_sample=True,
    num_beams=5,
    top_p=0.7,
    repetition_penalty=1.3,
    num_return_sequences=5,
)
_augmentation_inputs = [
    "redCardName | Sam Larsson, refereeName | Kamphuis",
    "goalkeeperName | Eloy Room, positionOfPlayer | Keeper",
    # NOTE(review): "tweemaa" is kept exactly as written; it looks like an
    # intentional misspelling used as a noisy input -- confirm.
    "chanceForName | Vincent Vermeij, chanceForNumber | tweemaa, coachName | John Stegeman, goalScore | gelijkmaker, goalkeeperName | Kostas Lamprou, positionOfPlayer | spits",
    "tackleGiverName | Navarone Foor, tackleRecipientName | Yeboah",
    'redCardName | Linssen',
    "playerName | Kluivert, teamName | AZ-defensie",
]
# Last expression of the cell, so the sampled candidates are displayed.
generator(_augmentation_inputs)

[[{'generated_text': 'Scheidsrechter Serdar Gözübüyük wees na een rode kaart nadat hij de doorgebroken Sam Larsson neerhaalde.'},
  {'generated_text': 'Scheidsrechter Serdar Gözübüyük wees na een rode kaart nadat hij de doorgebroken Sam Larsson neerhaalde.'},
  {'generated_text': 'Scheidsrechter Serdar Gözübüyük wees na een rode kaart nadat hij de doorgebroken Sam Larsson neerhaalde.'},
  {'generated_text': 'Scheidsrechter Serdar Gözübüyük wees na een rode kaart nadat hij de doorgebroken Sam Larsson neerhaalde.'},
  {'generated_text': 'Scheidsrechter Serdar Gözübüyük wees na een rode kaart nadat hij de doorgebroken Sam Larsson neerhaalde.'}],
 [{'generated_text': 'Keeper Eloy Room kreeg de bal op de stip.'},
  {'generated_text': 'Keeper Eloy Room kreeg de bal op de stip.'},
  {'generated_text': 'Keeper Eloy Room schoot de bal op de stip.'},
  {'generated_text': 'Keeper Eloy Room stond op de stip.'},
  {'generated_text': 'Keeper Eloy Room scoorde de bal in de verre hoek.'}],
 [{'generat