In [None]:
# Papermill parameter cell: tag it "parameters" (jupyter-notebook) so that
# papermill can inject different values.
from datetime import date

# Index of the grid-search configuration used by this run.
it = 9

# Run date (YYYY-MM-DD) used to keep all outputs of one day in one folder;
# will be replaced by papermill.
today = date.today()
rdate = today.strftime("%Y-%m-%d")

In [None]:
# Detect Google Colab: the flag decides the working directory and whether
# dependencies get installed further down.
g_colab = 'google.colab' in str(get_ipython())
print('Running on CoLab' if g_colab else 'Not running on CoLab')

In [None]:
# connect to drive
if g_colab:
    # On Colab only: mount Google Drive and change into the project folder,
    # so that the relative dataset and output paths used later resolve there.
    from google.colab import drive
    drive.mount('/gdrive')
    %cd "/gdrive/MyDrive/1 Job/Product and Code/CogAlex 2.0/"

# Choose model
# Grid-search parameters: every entry of `pg` pairs one checkpoint name with
# one list of dataset switches; `it` selects the entry used by this run.
from sklearn.model_selection import ParameterGrid

# Original results with "xlm_roberta_base": all dataset combinations.
xlmr_base_datasets = [
    ["old"],
    ["de_train_new", "de_val_new"],
    ["de_train_new"],
    ["de_val_new"],
    ["en_train_new", "en_val_new"],
    ["en_train_new"],
    ["en_val_new"],
    ["de_train_new", "de_val_new","en_train_new", "en_val_new"],
]

# Remaining checkpoints: only the fully old and the fully new data.
other_model_names = [
    "xlm-roberta-large",
    "distilbert-base-multilingual-cased",
    "bert-base-multilingual-uncased",
    "bert-base-multilingual-cased",
    "roberta-base",
]
other_datasets = [
    ["old"],
    ["de_train_new", "de_val_new","en_train_new", "en_val_new"],
]

grid = [
    {"model_name": ["xlm-roberta-base"], "datasets": xlmr_base_datasets},
    {"model_name": other_model_names, "datasets": other_datasets},
]

pg = list(ParameterGrid(grid))
print(len(pg))

selected_params = pg[it]
model_name = selected_params["model_name"]
datasets = selected_params["datasets"]
print(selected_params)

# Manually set parameters

In [None]:
# Training parameters
best_runs = []        # best-epoch record of each completed run
training_stats = {}   # per-epoch statistics, keyed by run label
epochs = 15
loops = 5

# Output directories
import os

savedir = "./{}/averages/{}_{}_{}".format(rdate, model_name, it, datasets)

# Outside Colab the working directory is mapped from /home/ to /binfl/.
workdir = os.getcwd() if g_colab else os.getcwd().replace("/home/","/binfl/")
model_dir = "{}/saved_models/{}/".format(workdir, rdate)

# Libraries

In [None]:
# Install the Hugging Face libraries only when running on Colab.
# NOTE(review): versions are not pinned, so the installed releases may differ
# between runs -- consider pinning.
if g_colab:
    !pip install transformers
    !pip install sentencepiece

In [None]:
import torch                                              #for training the model
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from torch.utils.data import TensorDataset
import pandas as pd                                       #for handling the data
from transformers import XLMRobertaTokenizer, AutoTokenizer              #for loading the pretrained model and tokenizer
from transformers import XLMRobertaForSequenceClassification, AutoModelForSequenceClassification
from transformers import AdamW                            
from transformers import get_linear_schedule_with_warmup
from transformers import pipeline
from sklearn import preprocessing                         #for label encoding
from sklearn.metrics import classification_report         #for showing performance on validation/test sets
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import ParameterGrid         #for grid search
from sklearn.model_selection import ParameterSampler      #for random search
from sklearn.utils.fixes import loguniform
import sentencepiece
import matplotlib.pyplot as plt
import time
import datetime
import random
import numpy as np
%matplotlib inline
import seaborn as sns
# For file saving etc.
import os
import shutil

# Load Data

Load data from text files 

In [None]:
# scikit-learn label encoder that maps the relation labels to integers
labels = ["ANT", "HYP", "RANDOM", "SYN"]
le = preprocessing.LabelEncoder()
le.fit(labels)


def _load_word_pairs(path, new_format=False):
    """Read one tab-separated word-pair file into a DataFrame.

    Args:
        path: file to read (no header row).
        new_format: when true, only the first three columns are read, as is
            done for the "*_new.txt" files.

    Returns:
        DataFrame with columns Word1, Word2, Label, where Label has been
        converted to its integer code by the label encoder ``le``.
    """
    read_kwargs = {"usecols": [0, 1, 2]} if new_format else {}
    frame = pd.read_csv(path, sep="\t", header=None, **read_kwargs)
    frame.columns = ["Word1", "Word2", "Label"]
    frame["Label"] = le.transform(frame["Label"])
    return frame


def _load_split(tag, use_new, new_path, old_path):
    """Announce which variant of a split is used, then load it."""
    print(f"{tag}: {'NEW' if use_new else 'OLD'}")
    return _load_word_pairs(new_path if use_new else old_path, new_format=use_new)


# chinese (only one version of the data exists)
data_train_zh = _load_word_pairs('datasets/train_chinese_data.txt')
data_valid_zh = _load_word_pairs('datasets/validgold_chinese_data.txt')

# english
data_train_en = _load_split("EN Train", "en_train_new" in datasets,
                            'datasets/train_english_data_new.txt',
                            'datasets/train_english_data.txt')
# The grid spells this switch "en_val_new". The previously tested key
# "en_valid_new" never occurs in the grid, so the new English validation data
# was silently never used. The old spelling is still accepted.
data_valid_en = _load_split("EN Val",
                            "en_val_new" in datasets or "en_valid_new" in datasets,
                            'datasets/validgold_english_data_new.txt',
                            'datasets/validgold_english_data.txt')

# german (the "new" files are the Cogalex 2.0 data without duplicates)
data_train_de = _load_split("DE Train", "de_train_new" in datasets,
                            'datasets/train_german_data_new.txt',
                            'datasets/train_german_data.txt')
data_valid_de = _load_split("DE Val", "de_val_new" in datasets,
                            'datasets/validgold_german_data_new.txt',
                            'datasets/validgold_german_data.txt')

# all languages together
data_train_all = pd.concat([data_train_zh, data_train_en, data_train_de]).reset_index(drop=True)
data_valid_all = pd.concat([data_valid_zh, data_valid_en, data_valid_de]).reset_index(drop=True)

In [None]:
# Show the English training frame (Word1, Word2, integer-encoded Label) as the cell output.
data_train_en

In [None]:
# print class distribution 
# Bar chart of the number of training samples per (integer-encoded) label.
data_train_all["Label"].value_counts().plot(kind='bar', title='Count (target)')

# Tokenize

Tokenize Data

- Encode in the right format for XLM-RoBERTa (<s\> = beginning of sentence, </s\> = end of sentence / sentence separator)

- Truncate/Padding so everything has the same length

In [None]:
# Load the tokenizer that belongs to the selected checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Checkpoints with "gpt2" in their name get an explicit [PAD] token registered.
needs_pad_token = "gpt2" in model_name
if needs_pad_token:
    tokenizer.add_special_tokens(dict(pad_token='[PAD]'))

# Cell output: id of the padding token.
tokenizer.pad_token_id

In [None]:
max_len = 64


def tokenizer_xlm(data, max_len):
    """Encode every (Word1, Word2) pair of ``data`` as one padded sequence pair.

    Uses the notebook-level ``tokenizer``. Each pair is padded or truncated to
    ``max_len`` tokens.

    Args:
        data: DataFrame with the columns Word1, Word2 and Label.
        max_len: sequence length every encoded pair is padded/truncated to.

    Returns:
        Tuple ``(input_ids, attention_masks, labels)``; the first two are the
        per-sample encodings concatenated along dimension 0, the last is a
        tensor built from the Label column.
    """
    # One encode_plus call per row; each returns tensors for a single sample.
    encoded_rows = [
        tokenizer.encode_plus(first_word, second_word,
                              max_length=max_len,
                              padding='max_length',
                              truncation=True,
                              return_tensors='pt')
        for first_word, second_word in zip(data['Word1'], data['Word2'])
    ]

    # Stack the per-sample tensors into 2D matrices.
    input_ids_ = torch.cat([enc['input_ids'] for enc in encoded_rows], dim=0)
    attn_masks_ = torch.cat([enc['attention_mask'] for enc in encoded_rows], dim=0)

    # Labels as a tensor
    labels_ = torch.tensor(data['Label'].tolist())

    print('Encoder finished. {:,} examples.'.format(len(labels_)))
    return input_ids_, attn_masks_, labels_

In [None]:
# Small check of how the tokenizer encodes a word pair
w1, w2 = "Tiger", "Animal"

# Encode both words together as one pair.
encoded = tokenizer.encode_plus(w1, w2)
pair_token_ids = encoded['input_ids']

# Ids of the resulting tokens
print ("Input IDs:      ", pair_token_ids)

# The same ids converted back to token strings for inspection
print ("Tokens:         ", tokenizer.convert_ids_to_tokens(pair_token_ids))

# No padding is requested here, so the attention mask is expected to be all 1s.
print ("\nAttention Mask: ", encoded['attention_mask'])


In [None]:
# True if any cell of the combined training frame is missing.
data_train_all.isnull().values.any()

In [None]:
# True if any cell of the combined validation frame is missing.
data_valid_all.isnull().values.any()

In [None]:
# Tokenize the train and validation frame of every language group.

def _encode_split(tag, train_frame, valid_frame):
    """Print ``tag``, then encode the train frame followed by the validation frame.

    Returns the six tensors as one flat tuple:
    (train ids, train masks, train labels, valid ids, valid masks, valid labels).
    """
    print(tag)
    train_encoded = tokenizer_xlm(train_frame, max_len)
    valid_encoded = tokenizer_xlm(valid_frame, max_len)
    return train_encoded + valid_encoded


# all
(input_ids_train_all, attn_masks_train_all, labels_train_all,
 input_ids_valid_all, attn_masks_valid_all, labels_valid_all) = _encode_split("All", data_train_all, data_valid_all)

# zh
(input_ids_train_zh, attn_masks_train_zh, labels_train_zh,
 input_ids_valid_zh, attn_masks_valid_zh, labels_valid_zh) = _encode_split("zh", data_train_zh, data_valid_zh)

# en
(input_ids_train_en, attn_masks_train_en, labels_train_en,
 input_ids_valid_en, attn_masks_valid_en, labels_valid_en) = _encode_split("en", data_train_en, data_valid_en)

# de
(input_ids_train_de, attn_masks_train_de, labels_train_de,
 input_ids_valid_de, attn_masks_valid_de, labels_valid_de) = _encode_split("de", data_train_de, data_valid_de)

In [None]:
# Wrap ids, attention masks and labels of every split in a TensorDataset.

def _dataset_pair(train_tensors, valid_tensors):
    """Build the (train, validation) TensorDatasets from two tensor triples."""
    return TensorDataset(*train_tensors), TensorDataset(*valid_tensors)


# all
tensor_data_train_all, tensor_data_valid_all = _dataset_pair(
    (input_ids_train_all, attn_masks_train_all, labels_train_all),
    (input_ids_valid_all, attn_masks_valid_all, labels_valid_all),
)
# zh
tensor_data_train_zh, tensor_data_valid_zh = _dataset_pair(
    (input_ids_train_zh, attn_masks_train_zh, labels_train_zh),
    (input_ids_valid_zh, attn_masks_valid_zh, labels_valid_zh),
)
# en
tensor_data_train_en, tensor_data_valid_en = _dataset_pair(
    (input_ids_train_en, attn_masks_train_en, labels_train_en),
    (input_ids_valid_en, attn_masks_valid_en, labels_valid_en),
)
# de
tensor_data_train_de, tensor_data_valid_de = _dataset_pair(
    (input_ids_train_de, attn_masks_train_de, labels_train_de),
    (input_ids_valid_de, attn_masks_valid_de, labels_valid_de),
)

In [None]:
# Prepare the PyTorch dataloaders

batch_size = 32


def _loader_pair(train_set, valid_set):
    """Return (train loader, validation loader) for one language group.

    Training batches are drawn with a RandomSampler, validation batches with a
    SequentialSampler; both use the notebook-level ``batch_size``.
    """
    train_loader = DataLoader(train_set, sampler=RandomSampler(train_set), batch_size=batch_size)
    valid_loader = DataLoader(valid_set, sampler=SequentialSampler(valid_set), batch_size=batch_size)
    return train_loader, valid_loader


# all
train_dataloader_all, validation_dataloader_all = _loader_pair(tensor_data_train_all, tensor_data_valid_all)
# zh
train_dataloader_zh, validation_dataloader_zh = _loader_pair(tensor_data_train_zh, tensor_data_valid_zh)
# en
train_dataloader_en, validation_dataloader_en = _loader_pair(tensor_data_train_en, tensor_data_valid_en)
# de
train_dataloader_de, validation_dataloader_de = _loader_pair(tensor_data_train_de, tensor_data_valid_de)

# Training the Classifier (Finetuning the Model)

In [None]:
#load the pretrained model provided by HuggingFace with an added untrained classification head for 4 classes
#model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=4)
#model = XLMRobertaForSequenceClassification.from_pretrained("xlm-roberta-base", num_labels=4)

In [None]:
# Training set used for this training run: the combined data of all languages
# (original training data for the final model).
train_dataloader = train_dataloader_all
n_training_samples = len(train_dataloader.dataset)
print("Training Samples:", n_training_samples)

Set parameters (learning rate & epochs)

Formatting functions

In [None]:
def format_time(elapsed):
    """Render a duration given in seconds as an ``h:mm:ss`` string.

    The value is first rounded to a whole number of seconds; the result is the
    ``str()`` form of the corresponding ``datetime.timedelta``.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)


Validation function

In [None]:
def validate(validation_dataloader, model, verbose):
    """Evaluate ``model`` on one dataloader and return loss and classification metrics.

    Relies on the notebook-level ``device``, ``labels`` and ``le``.

    Args:
        validation_dataloader: iterable of batches indexed as
            (input ids, attention mask, labels).
        model: sequence-classification model; it is called with ``labels`` so
            that its output carries both ``loss`` and ``logits``.
        verbose: when true, print the classification report and the confusion
            matrix. The returned metrics do not depend on this flag.

    Returns:
        Tuple ``(avg_val_loss, val_accuracy, macroF1, weightedF1, precision,
        recall, weightedF1_no_random)``. ``precision`` and ``recall`` are the
        "weighted avg" entries of the classification report.
        ``weightedF1_no_random`` is the weighted F1 over ANT/HYP/SYN after
        dropping every sample whose gold label is RANDOM.
    """
    # Evaluation mode
    model.eval()

    total_eval_loss = 0
    predictions, true_labels = [], []

    for batch in validation_dataloader:
        # Unpack the batch and copy the tensors to the device
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # No backprop needed
        with torch.no_grad():
            # The distilbert checkpoints are called without token_type_ids.
            if "distilbert" in model.name_or_path:
                output = model(b_input_ids,
                               attention_mask=b_input_mask,
                               labels=b_labels)
            else:
                output = model(b_input_ids,
                               token_type_ids=None,
                               attention_mask=b_input_mask,
                               labels=b_labels)

        total_eval_loss += output.loss.item()

        # Keep logits and gold labels on the CPU as numpy arrays
        predictions.append(output.logits.detach().cpu().numpy())
        true_labels.append(b_labels.to('cpu').numpy())

    # Results over the whole validation set
    flat_predictions = np.concatenate(predictions, axis=0)
    flat_true_labels = np.concatenate(true_labels, axis=0)

    # Logits -> predicted class index
    predicted_labels = np.argmax(flat_predictions, axis=1).flatten()

    # The report dict is always needed for precision/recall. Building it only
    # when verbose raised a NameError for verbose=False.
    dict_report = classification_report(flat_true_labels, predicted_labels, target_names=labels, output_dict=True)
    if verbose:
        str_report = classification_report(flat_true_labels, predicted_labels, target_names=labels)
        print(str_report)

    precision = dict_report["weighted avg"]["precision"]
    recall = dict_report["weighted avg"]["recall"]

    # Validation accuracy, macro F1 and weighted F1
    val_accuracy = (predicted_labels == flat_true_labels).mean()
    macroF1 = f1_score(flat_true_labels, predicted_labels, average='macro')
    weightedF1 = f1_score(flat_true_labels, predicted_labels, average='weighted')

    # Drop every sample whose gold label is RANDOM. This includes the sample
    # at index 0, which the earlier backwards loop never inspected.
    str_true = le.inverse_transform(flat_true_labels).tolist()
    str_pred = le.inverse_transform(predicted_labels).tolist()
    kept_pairs = [(gold, pred) for gold, pred in zip(str_true, str_pred) if gold != 'RANDOM']
    str_true = [gold for gold, _ in kept_pairs]
    str_pred = [pred for _, pred in kept_pairs]
    lbls = ("ANT", "HYP", "SYN")
    weightedF1_no_random = f1_score(str_true, str_pred, labels=lbls, average='weighted')
    print("\t Weighted F1 (no random):", weightedF1_no_random)

    # Average loss over all batches
    avg_val_loss = total_eval_loss / len(validation_dataloader)

    # Confusion matrix
    if verbose:
        print(confusion_matrix(flat_true_labels, predicted_labels, labels=[0,1,2,3]))

    return avg_val_loss, val_accuracy, macroF1, weightedF1, precision, recall, weightedF1_no_random

Training

In [None]:
def train_model(epochs, model, train_dataloader, validation_dataloader_set, random_seed, verbose, optimizer, scheduler, save_best=False, iter=1):
    """Fine-tune ``model`` and validate it after every epoch.

    Relies on the notebook-level ``device``, ``g_colab``, ``rdate``,
    ``model_name`` and ``it``.

    Args:
        epochs: number of training epochs.
        model: sequence-classification model to train.
        train_dataloader: batches indexed as (input ids, attention mask, labels).
        validation_dataloader_set: validation loaders in the order
            [all, zh, en, de].
        random_seed: seed applied to ``random``, numpy and torch before training.
        verbose: passed on to ``validate``.
        optimizer: optimizer stepped after every batch.
        scheduler: learning-rate scheduler stepped after every batch.
        save_best: when true, the model is saved whenever an epoch reaches a
            new best weighted F1 on the "all" validation set, starting with
            the first epoch.
        iter: run label; part of the directory the best model is saved to.

    Returns:
        ``(training_stats, maxscore)``. ``training_stats`` is the list of
        per-epoch statistics dicts. ``maxscore`` is the statistics dict of the
        best epoch (highest 'Weigh_F1_all'), or ``False`` when training was
        aborted because the model was not learning.
    """
    seed_val = random_seed

    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    torch.cuda.manual_seed_all(seed_val)

    # Scores describing how training went, one dict per epoch
    training_stats = []
    # Best epoch so far. It is tracked independently of save_best, so the
    # return value is always defined (previously `maxscore` could be unbound).
    best_stats = None
    not_learning = False

    # Total training time
    total_t0 = time.time()

    print('\033[1m'+"================ Model Training ================"+'\033[0m')

    for epoch_i in range(0, epochs):

        print("")
        print('\033[1m'+'======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs)+'\033[0m')

        t0 = time.time()

        # Summed training loss of the epoch
        total_train_loss = 0

        # Training mode (dropout etc. behave differently than at evaluation time)
        model.train()

        for batch in train_dataloader:

            # Unpack the batch and copy it to the device
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)

            # Clear old gradients
            model.zero_grad()

            # Forward pass; the distilbert checkpoints are called without token_type_ids
            if "distilbert" in model.name_or_path:
                output = model(b_input_ids,
                               attention_mask=b_input_mask,
                               labels=b_labels)
            else:
                output = model(b_input_ids,
                               token_type_ids=None,
                               attention_mask=b_input_mask,
                               labels=b_labels)

            loss = output.loss
            total_train_loss += loss.item()

            # Backward pass
            loss.backward()

            # Clip the gradient norm to 1.0
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # Weight update followed by the learning-rate update
            optimizer.step()
            scheduler.step()

        # Average loss over all batches
        avg_train_loss = total_train_loss / len(train_dataloader)

        # Training time of this epoch
        training_time = format_time(time.time() - t0)

        print("")
        print("  Average training loss: {0:.2f}".format(avg_train_loss))
        print("  Training epoch took: {:}".format(training_time))

        # VALIDATION on all / zh / en / de, in the order of validation_dataloader_set
        metrics = {}
        for position, lang in enumerate(("all", "zh", "en", "de")):
            print("evaluate on " + lang)
            metrics[lang] = validate(validation_dataloader_set[position], model, verbose)

        (avg_val_loss_all, val_accuracy_all, macroF1_all, weightedF1_all,
         precision_all, recall_all, weightedF1_nr_all) = metrics["all"]
        _, _, _, weightedF1_zh, precision_zh, recall_zh, weightedF1_nr_zh = metrics["zh"]
        _, _, _, weightedF1_en, precision_en, recall_en, weightedF1_nr_en = metrics["en"]
        _, _, _, weightedF1_de, precision_de, recall_de, weightedF1_nr_de = metrics["de"]

        print('\033[1m'+ "  Validation Loss All: {0:.2f}".format(avg_val_loss_all) + '\033[0m')

        epoch_stats = {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss all': avg_val_loss_all,
            'Valid. Accur. all': val_accuracy_all,
            'Precision_all': precision_all,
            'Recall_all': recall_all,
            'Weigh_F1_all': weightedF1_all,
            'Weigh_F1_nr_all': weightedF1_nr_all,
            'Macro F1_all': macroF1_all,
            'Precision_en': precision_en,
            'Recall_en': recall_en,
            'Weigh_F1_en': weightedF1_en,
            'Weigh_F1_nr_en': weightedF1_nr_en,
            'Precision_de': precision_de,
            'Recall_de': recall_de,
            'Weigh_F1_de': weightedF1_de,
            'Weigh_F1_nr_de': weightedF1_nr_de,
            'Precision_zh': precision_zh,
            'Recall_zh': recall_zh,
            'Weigh_F1_zh': weightedF1_zh,
            'Weigh_F1_nr_zh': weightedF1_nr_zh,
            'Training Time': training_time,
        }

        # A strictly better weighted F1 on "all" makes this epoch the new best.
        # The first epoch always counts; previously it was never saved, so no
        # checkpoint existed when epoch 1 turned out to be the best.
        if best_stats is None or best_stats["Weigh_F1_all"] < weightedF1_all:
            best_stats = epoch_stats
            if save_best:
                print(f"Saving Epoch {epoch_i +1} model and overwriting previous best model...")
                if g_colab:
                    workdir = os.getcwd()
                else:
                    workdir = os.getcwd().replace("/home/","/binfl/")
                PATH = f"{workdir}/saved_models/{rdate}/{iter}/{model_name}_{it}"
                os.makedirs(PATH, exist_ok=True)
                model.save_pretrained(PATH)
                with open(PATH +"/best_epoch.txt", "w", encoding="utf-8") as f:
                    f.write(str(epoch_i + 1))

        training_stats.append(epoch_stats)

        # From the fourth epoch on, a no-RANDOM weighted F1 of at most 0.05 is
        # treated as "not learning" and aborts the run.
        if epoch_i >= 3 and weightedF1_nr_all <= 0.05:
            not_learning = True
            break

    if not_learning:
        print("Not learning... Restarting Run.")
        return training_stats, False

    print("\n\nTraining complete!")
    print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))

    return training_stats, best_stats


In [None]:
# Validation loaders in the order train_model indexes them: all, zh, en, de.
validation_dataloader_set = [
    validation_dataloader_all,
    validation_dataloader_zh,
    validation_dataloader_en,
    validation_dataloader_de,
]

# Start Training

In [None]:
# Train `loops` independent runs. A run that train_model reports as not
# learning (maxscore is False) is restarted with a freshly loaded model.
# The previous guard `while loop_count <= loops` was always true, so restarts
# were unbounded; they are now capped per run.
max_restarts_per_run = 10
loop_count = 0  # number of successfully completed runs

for i in [f"Run {j + 1}" for j in range(loops)]:
    restarts = 0
    while True:
        print("Starting {} \n\n".format(i))
        # Load or re-load the pretrained model with a fresh 4-class head
        model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=4)
        # Use the GPU when one is available; otherwise fall back to the CPU
        # instead of failing on torch.device('cuda').
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        # Copy the weights onto the device
        desc = model.to(device)
        if device.type == 'cuda':
            print('Connected to GPU:', torch.cuda.get_device_name(0))
        else:
            print('CUDA not available, training on CPU')
        # Optimizer
        optimizer = AdamW(model.parameters(),
                      lr = 2e-5,   #do work well: 2e-5 with 5-7 epochs for trainall, 1e-5
                      eps = 1e-8   # 1e-8.
                      # weight_decay = 0
                    )
        # Checkpoints with "roberta" in their name get 600 warm-up steps
        if "roberta" in model_name:
            num_warmup_steps = 600
        else:
            num_warmup_steps = 0

        # Number of batches x epochs
        total_steps = len(train_dataloader) * epochs
        print("total steps:", total_steps)

        # Learning-rate scheduler: start low and increase during the warm-up steps
        scheduler = get_linear_schedule_with_warmup(optimizer,
                                                    num_warmup_steps = num_warmup_steps,
                                                    num_training_steps = total_steps)

        training_stats[i], maxscore = train_model(epochs=epochs,
                           model=model,
                           train_dataloader=train_dataloader,
                           validation_dataloader_set=validation_dataloader_set,
                           random_seed=42,
                           verbose=True,
                           scheduler=scheduler,
                           optimizer=optimizer,
                           save_best=True,
                           iter=i)
        if maxscore is False:
            restarts += 1
            if restarts > max_restarts_per_run:
                raise RuntimeError(
                    f"{i}: model did not start learning after {max_restarts_per_run} restarts")
            continue
        best_runs.append(maxscore)
        loop_count += 1
        break

# Evaluate Validation

In [None]:
# Release the training model before evaluation: move it to the CPU, drop both
# references to it, then collect garbage and empty the CUDA cache.
import gc

model.to('cpu')
del model, desc
gc.collect()
torch.cuda.empty_cache()
# Short pause before the evaluation cells start.
time.sleep(3)

In [None]:
# Pick the device index for inference: the current CUDA device if one is
# available, otherwise -1 (CPU).
import torch

if not torch.cuda.is_available():
    print("Using CPU for inference")
    device = -1
else:
    print("Using GPU for inference")
    device = torch.cuda.current_device()
    print(torch.cuda.get_device_name(device))

In [None]:
# Map every class index to its label name (index = position in `labels`).
label_dict = dict(enumerate(labels))
print(label_dict)

In [None]:
# Score every saved run on the validation gold files, per language and on all
# languages combined, once with and once without the RANDOM class.

def _read_pair_lines(path):
    """Return the non-blank lines of a tab-separated gold file.

    The file is read as UTF-8 and closed again. Blank lines are skipped because
    they cannot be split into word1 / word2 / label.
    """
    with open(path, encoding="utf-8") as handle:
        return [line for line in handle if line.strip()]


results = {}
results_no_random = {}
weighted_F1_scores = {"chinese":0, "german":0, "english":0, "all":0}
preds_per_lang = {"chinese":"", "german":"", "english":"", "all":""}

# The gold files do not depend on the run, so they are read once.
validgold_zh, validgold_de, validgold_en = [_read_pair_lines(f"./datasets/validgold_{l}_data.txt") for l in ["chinese", "german", "english"]]
validgold_all = validgold_zh + validgold_de + validgold_en

for run in os.listdir(model_dir):
    print(run + "\n")
    results[run] = {}
    results_no_random[run] = {}
    model = AutoModelForSequenceClassification.from_pretrained(f"{model_dir}/{run}/{model_name}_{it}",
                                                               num_labels=4,
                                                               id2label=label_dict)
    model.eval()
    annotate = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, device=device)
    for data, read_lang in zip([validgold_zh, validgold_de, validgold_en, validgold_all], ["chinese", "german", "english", "all"]):
        test_set = [line.split("\t") for line in data]
        # Both words are joined with two separator tokens into one input string.
        # NOTE(review): presumably this mirrors the pair layout the tokenizer
        # produced during training -- confirm for the non-RoBERTa checkpoints.
        pred_input = [line[0].strip() + tokenizer.sep_token * 2 + line[1].strip() for line in test_set]
        gold_labels = [line[2].strip() for line in test_set]
        pred_list = annotate(pred_input)
        pred_labels = [entry["label"] for entry in pred_list]
        report = classification_report(gold_labels, pred_labels)
        report_dict = classification_report(gold_labels, pred_labels, output_dict=True)
        print(f"Results for {read_lang}: \n")
        print(report, "\n\n")
        results[run][read_lang]= report_dict

        # Keep the predictions of the run with the best weighted F1 per language
        weighted_F1 = f1_score(gold_labels, pred_labels, average='weighted')
        if weighted_F1 > weighted_F1_scores[read_lang]:
            preds_per_lang[read_lang] = [
                "\t".join(line_words[:2]) + "\t" + str(line_pred)
                for line_words, line_pred in zip(test_set, pred_labels)
            ]
            weighted_F1_scores[read_lang] = weighted_F1

        # No RANDOM scores, the CogALex way: drop every sample whose gold label is RANDOM
        kept_pairs = [(gold, pred) for gold, pred in zip(gold_labels, pred_labels) if gold != 'RANDOM']
        gold_labels = [gold for gold, _ in kept_pairs]
        pred_labels = [pred for _, pred in kept_pairs]
        lbls = ("ANT", "HYP", "SYN")
        report_no_random = classification_report(gold_labels, pred_labels, labels=lbls)
        report_no_random_dict = classification_report(gold_labels, pred_labels, labels=lbls, output_dict=True)
        print("\nNo RANDOM:\n")
        print(report_no_random)
        results_no_random[run][read_lang] = report_no_random_dict

In [None]:
# Average the "weighted avg" report rows over all runs, per language.
results_by_lang = {}

# Regroup results[run][language] into results_by_lang[language][run].
for run_name, per_language in results.items():
    for language, report in per_language.items():
        results_by_lang.setdefault(language, {})[run_name] = report["weighted avg"]

columns = list(results_by_lang.keys())
avg_val_results = pd.DataFrame(columns=["precision", "recall", "f1-score", "support"])
for language in columns:
    per_run_scores = pd.DataFrame(results_by_lang[language]).transpose()
    avg_val_results.loc[language] = per_run_scores.mean()

# Express every score except "support" as a percentage with one decimal.
score_columns = [name for name in avg_val_results.keys() if name != "support"]
avg_val_results.update(avg_val_results.loc[:, score_columns].apply(lambda column: round(column * 100, 1)))
avg_val_results

In [None]:
# Average the "weighted avg" rows of the no-RANDOM reports over all runs, per language.
results_by_lang = {}

# Regroup results_no_random[run][language] into results_by_lang[language][run].
for k1 in results_no_random.keys():
    for k2 in results_no_random[k1].keys():
        results_by_lang.setdefault(k2, {})[k1] = results_no_random[k1][k2]["weighted avg"]

columns = [i for i in results_by_lang.keys()]
avg_val_results_no_random = pd.DataFrame(columns=["precision", "recall", "f1-score", "support"])
for k in columns:
    temp_df = pd.DataFrame(results_by_lang[k]).transpose()
    avg_val_results_no_random.loc[k] = temp_df.mean()

# Express every score except "support" as a percentage with one decimal.
# The column list comes from this frame itself, not from avg_val_results, so
# the cell no longer depends on the with-RANDOM cell having been run.
score_columns = [i for i in avg_val_results_no_random.keys() if i != "support"]
avg_val_results_no_random.update(avg_val_results_no_random.loc[:, score_columns].apply(lambda x: round(x * 100, 1)))
avg_val_results_no_random

In [None]:
# Write the averaged validation scores and the best predictions per language.
os.makedirs(savedir, exist_ok=True)
val_preds_savedir = savedir + "/val_preds"
os.makedirs(val_preds_savedir, exist_ok=True)

avg_val_results.to_csv(f"{savedir}/val_avg_{model_name}_{it}.csv")
avg_val_results_no_random.to_csv(f"{savedir}/val_no_random_avg_{model_name}_{it}.csv")

# One prediction file per language, one "word1<TAB>word2<TAB>label" line per pair.
for key, prediction_lines in preds_per_lang.items():
    with open(val_preds_savedir + f"/{key}-predictions.txt", "w", encoding="utf-8") as f:
        f.writelines(f"{line}\n" for line in prediction_lines)

# Evaluate on Gold

In [None]:
# Score every saved run on the gold test files of each language, once with and
# once without the RANDOM class.

def _read_pair_lines(path):
    """Return the non-blank lines of a tab-separated gold file.

    The file is read as UTF-8 and closed again. Blank lines are skipped because
    they cannot be split into word1 / word2 / label.
    """
    with open(path, encoding="utf-8") as handle:
        return [line for line in handle if line.strip()]


label_dict = {i : l for i,l in enumerate(labels)}
if g_colab:
    workdir = os.getcwd()
else:
    workdir = os.getcwd().replace("/home/","/binfl/")
model_dir = f"{workdir}/saved_models/{rdate}/"
results = {}
results_no_random = {}
weighted_F1_scores = {"chinese":0, "german":0, "english":0, "italian":0, "german NEW":0, "english NEW": 0}
preds_per_lang = {"chinese":"", "german":"", "english":"", "italian":"", "german NEW":"", "english NEW": ""}

# Device index for the pipeline: the current CUDA device, or -1 for the CPU.
# This replaces the hard-coded device=0, which fails on machines without a GPU.
inference_device = torch.cuda.current_device() if torch.cuda.is_available() else -1

# The gold files do not depend on the run, so they are read once.
gold_zh, gold_de, gold_en, gold_it = [_read_pair_lines(f"./datasets/gold_{l}_data.txt") for l in ["chinese", "german", "english", "italian"]]
gold_de_new, gold_en_new = [_read_pair_lines(f"./datasets/gold_{l}_data_new.txt") for l in ["german", "english"]]

for run in os.listdir(model_dir):
    print(run + "\n")
    results[run] = {}
    results_no_random[run] = {}
    model = AutoModelForSequenceClassification.from_pretrained(f"{model_dir}/{run}/{model_name}_{it}",  id2label=label_dict)
    model.eval()
    annotate = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, device=inference_device)
    for data, read_lang in zip([gold_zh, gold_de, gold_en, gold_it, gold_de_new, gold_en_new], ["chinese", "german", "english", "italian", "german NEW", "english NEW"]):
        # Load data for prediction/scoring
        test_set = [line.split("\t") for line in data]
        # Both words are joined with two separator tokens into one input string.
        # NOTE(review): presumably this mirrors the pair layout the tokenizer
        # produced during training -- confirm for the non-RoBERTa checkpoints.
        pred_input = [line[0].strip() + tokenizer.sep_token * 2 + line[1].strip() for line in test_set]
        gold_labels = [line[2].strip() for line in test_set]

        # Predict labels on test_set
        pred_list = annotate(pred_input)
        pred_labels = [entry["label"] for entry in pred_list]
        report = classification_report(gold_labels, pred_labels)
        report_dict = classification_report(gold_labels, pred_labels, output_dict=True)
        print(f"Results for {read_lang}: \n")
        print(report, "\n\n")
        results[run][read_lang]= report_dict

        # Keep the predictions of the run with the best weighted F1 per language
        weighted_F1 = f1_score(gold_labels, pred_labels, average='weighted')
        if weighted_F1 > weighted_F1_scores[read_lang]:
            preds_per_lang[read_lang] = [
                "\t".join(line_words[:2]) + "\t" + str(line_pred)
                for line_words, line_pred in zip(test_set, pred_labels)
            ]
            weighted_F1_scores[read_lang] = weighted_F1

        # No RANDOM scores, the CogALex way: drop every sample whose gold label is RANDOM
        kept_pairs = [(gold, pred) for gold, pred in zip(gold_labels, pred_labels) if gold != 'RANDOM']
        gold_labels = [gold for gold, _ in kept_pairs]
        pred_labels = [pred for _, pred in kept_pairs]
        lbls = ("ANT", "HYP", "SYN")
        report_no_random = classification_report(gold_labels, pred_labels, labels=lbls)
        report_no_random_dict = classification_report(gold_labels, pred_labels, labels=lbls, output_dict=True)
        print("\nNo RANDOM:\n")
        print(report_no_random)
        results_no_random[run][read_lang] = report_no_random_dict

In [None]:
# Average the "weighted avg" report rows of the gold evaluation over all runs, per language.
results_by_lang = {}

# Regroup results[run][language] into results_by_lang[language][run].
for run_name, per_language in results.items():
    for language, report in per_language.items():
        results_by_lang.setdefault(language, {})[run_name] = report["weighted avg"]

columns = list(results_by_lang.keys())
avg_test_results = pd.DataFrame(columns=["precision", "recall", "f1-score", "support"])
for language in columns:
    per_run_scores = pd.DataFrame(results_by_lang[language]).transpose()
    avg_test_results.loc[language] = per_run_scores.mean()

# Express every score except "support" as a percentage with one decimal.
score_columns = [name for name in avg_test_results.keys() if name != "support"]
avg_test_results.update(avg_test_results.loc[:, score_columns].apply(lambda column: round(column * 100, 1)))
avg_test_results

In [None]:
# Average the "weighted avg" rows of the no-RANDOM gold reports over all runs, per language.
results_by_lang = {}

# Regroup results_no_random[run][language] into results_by_lang[language][run].
for k1 in results_no_random.keys():
    for k2 in results_no_random[k1].keys():
        results_by_lang.setdefault(k2, {})[k1] = results_no_random[k1][k2]["weighted avg"]

columns = [i for i in results_by_lang.keys()]
avg_test_results_no_random = pd.DataFrame(columns=["precision", "recall", "f1-score", "support"])
for k in columns:
    temp_df = pd.DataFrame(results_by_lang[k]).transpose()
    avg_test_results_no_random.loc[k] = temp_df.mean()

# Express every score except "support" as a percentage with one decimal.
# The column list comes from this frame itself, not from avg_test_results, so
# the cell no longer depends on the with-RANDOM cell having been run.
score_columns = [i for i in avg_test_results_no_random.keys() if i != "support"]
avg_test_results_no_random.update(avg_test_results_no_random.loc[:, score_columns].apply(lambda x: round(x * 100, 1)))
avg_test_results_no_random

In [None]:
# Write the averaged gold-test scores and the best predictions per language.
os.makedirs(savedir, exist_ok=True)
preds_savedir = savedir + "/preds"
os.makedirs(preds_savedir, exist_ok=True)

avg_test_results.to_csv(f"{savedir}/test_avg_{model_name}_{it}.csv")
avg_test_results_no_random.to_csv(f"{savedir}/test_no_random_avg_{model_name}_{it}.csv")

# One prediction file per language, one "word1<TAB>word2<TAB>label" line per pair.
for key, prediction_lines in preds_per_lang.items():
    with open(preds_savedir + f"/{key}-predictions.txt", "w", encoding="utf-8") as f:
        f.writelines(f"{line}\n" for line in prediction_lines)