In [None]:
# Add "parameters" (jupyter-notebook) tag to this cell, to allow papermill to inject different parameters
from datetime import date

# Grid-search iteration handled by this notebook run.
it = 0

# All outputs of one run are grouped in a folder named after a date.
# The date is first taken from the system clock and then pinned to a fixed
# value; papermill is expected to replace these values on injection.
today = date.today()
rdate = today.strftime("%Y-%m-%d")
rdate = "2021-08-31"

# Training params
best_runs = list()
training_stats = dict()
epochs = 15
loops = 5

In [None]:
# Detect whether the notebook runs on Google Colab; g_colab later decides
# whether Drive is mounted and whether the dependencies get installed.
try:
  shell_description = str(get_ipython())
except NameError:
  # get_ipython() only exists inside an IPython kernel. When the code runs as
  # a plain Python script there is no shell to inspect, so this is not Colab.
  shell_description = ""

g_colab = 'google.colab' in shell_description
if g_colab:
  print('Running on CoLab')
else:
  print('Not running on CoLab')

In [None]:
# connect to drive
# Only on Colab: mount Google Drive and change into the project folder, so that
# the relative paths used further down (e.g. "./datasets/...") resolve inside it.
if g_colab:
    from google.colab import drive
    drive.mount('/gdrive')
    # NOTE(review): hard-coded, account-specific Drive path - adjust when the
    # notebook is run from a different Drive layout.
    %cd "/gdrive/MyDrive/1 Job/Product and Code/CogAlex 2.0/"

# Choose model
# Gridsearch parameters
from sklearn.model_selection import ParameterGrid

# Two sub-grids:
#   1. the baseline checkpoint, evaluated on every combination of training sets;
#   2. the remaining checkpoints, evaluated on the old data and on all new data.
grid = [
    {
        "model_name": ["xlm-roberta-base"],
        "datasets": [
            ["old"],
            ["de_train_new", "de_val_new"],
            ["de_train_new"],
            ["de_val_new"],
            ["en_train_new", "en_val_new"],
            ["en_train_new"],
            ["en_val_new"],
            ["de_train_new", "de_val_new","en_train_new", "en_val_new"],
        ],
    },
    {
        "model_name": [
            "xlm-roberta-large",
            "distilbert-base-multilingual-cased",
            "bert-base-multilingual-uncased",
            "bert-base-multilingual-cased",
            "roberta-base",
        ],
        "datasets": [
            ["old"],
            ["de_train_new", "de_val_new","en_train_new", "en_val_new"],
        ],
    },
]

# Expand the grid and pick the configuration addressed by the iteration index.
pg = list(ParameterGrid(grid))
print(len(pg))
selected_params = pg[it]
model_name = selected_params["model_name"]
datasets = selected_params["datasets"]
print(selected_params)

# Manually set parameters

In [None]:
import os

# Folder (relative to the working directory) that receives the averaged scores
# and predictions of this grid-search iteration.
savedir = f"./{rdate}/averages/{model_name}_{it}_{datasets}"

# On Colab the current directory is used unchanged; elsewhere every "/home/"
# in the path is replaced by "/binfl/".
current_dir = os.getcwd()
workdir = current_dir if g_colab else current_dir.replace("/home/","/binfl/")

# Saved models are looked up per run date below the working directory.
model_dir = f"{workdir}/saved_models/{rdate}"

# Select the inference device: the current CUDA device index when a GPU is
# available, otherwise -1 (CPU).
import torch

if not torch.cuda.is_available():
    print("Using CPU for inference")
    device = -1
else:
    print("Using GPU for inference")
    print(torch.cuda.get_device_name(torch.cuda.current_device()))
    device = torch.cuda.current_device()

In [None]:
# Only on Colab: install the two packages that are imported further down
# (transformers and sentencepiece) through shell calls to pip.
# NOTE(review): versions are not pinned, so results may vary with whatever
# release pip resolves at run time - consider pinning.
if g_colab:
    !pip install transformers
    !pip install sentencepiece

# Libraries

In [None]:
import torch                                              #for training the model
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from torch.utils.data import TensorDataset
import pandas as pd                                       #for handling the data
from transformers import XLMRobertaTokenizer, AutoTokenizer              #for loading the pretrained model and tokenizer
from transformers import XLMRobertaForSequenceClassification, AutoModelForSequenceClassification
from transformers import AdamW                            
from transformers import get_linear_schedule_with_warmup
from transformers import pipeline
from sklearn import preprocessing                         #for label encoding
from sklearn.metrics import classification_report         #for showing performance on validation/test sets
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import ParameterGrid         #for grid search
from sklearn.model_selection import ParameterSampler      #for random search
from sklearn.utils.fixes import loguniform
import sentencepiece
import matplotlib.pyplot as plt
import time
import datetime
import random
import numpy as np
%matplotlib inline
import seaborn as sns
# For file saving etc.
import os
import shutil

# Load Labels and Tokenizer

In [None]:
# Relation labels of the task; the scikit-learn LabelEncoder fitted on them
# converts between label strings and integer ids.
labels = ["ANT", "HYP", "RANDOM", "SYN"]
le = preprocessing.LabelEncoder().fit(labels)
print(labels)

In [None]:
# Load the tokenizer matching the selected checkpoint
# (formerly hard-coded: XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")).
tokenizer = AutoTokenizer.from_pretrained(model_name)

# For gpt2 checkpoints a '[PAD]' token is registered as the padding token.
uses_gpt2 = "gpt2" in model_name
if uses_gpt2:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Show the padding token id as the cell output.
tokenizer.pad_token_id

# Evaluate Validation

In [None]:
# Map class index -> label string, following the order of `labels`; passed to
# the models below as id2label so predictions come back as label strings.
label_dict = dict(enumerate(labels))
print(label_dict)

In [None]:
# Evaluate every saved run on the validation sets (per language and combined).
# Produces:
#   results[run][language]            classification report (dict) over all labels
#   results_no_random[run][language]  the same after dropping pairs whose gold label is RANDOM
#   preds_per_lang[language]          predictions of the run with the best weighted F1
#   weighted_F1_scores[language]      that best weighted F1
results = {}
results_no_random = {}
weighted_F1_scores = {"chinese":0, "german":0, "english":0, "all":0}
preds_per_lang = {"chinese":"", "german":"", "english":"", "all":""}

# The validation files are identical for every run, so they are read once up
# front. They are opened as UTF-8 (matching how the predictions are written
# later) inside `with`, so the handles are closed again.
val_lines = {}
for val_lang in ["chinese", "german", "english"]:
    with open(f"./datasets/validgold_{val_lang}_data.txt", encoding="utf-8") as f:
        val_lines[val_lang] = f.readlines()
validgold_zh = val_lines["chinese"]
validgold_de = val_lines["german"]
validgold_en = val_lines["english"]
validgold_all = validgold_zh + validgold_de + validgold_en

# Labels scored in the "no RANDOM" evaluation.
lbls = ("ANT", "HYP", "SYN")

# os.listdir returns entries in arbitrary order and also lists plain files;
# sort for a reproducible run order and skip anything that is not a run folder.
for run in sorted(os.listdir(model_dir)):
    if not os.path.isdir(os.path.join(model_dir, run)):
        continue
    print(run + "\n")
    results[run] = {}
    results_no_random[run] = {}
    model = AutoModelForSequenceClassification.from_pretrained(f"{model_dir}/{run}/{model_name}_{it}",
                                                               num_labels=4,
                                                               id2label=label_dict)
    model.eval()
    annotate = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, device=device)
    for data, read_lang in zip([validgold_zh, validgold_de, validgold_en, validgold_all], ["chinese", "german", "english", "all"]):
        # Each line holds "word1<TAB>word2<TAB>label"; blank lines are skipped
        # because they would otherwise fail on the column access below.
        test_set = [line.split("\t") for line in data if line.strip()]
        pred_input = [line[0].strip() + tokenizer.sep_token * 2 + line[1].strip() for line in test_set]
        gold_labels = [line[2].strip() for line in test_set]

        # Predict and score on all labels.
        pred_list = annotate(pred_input)
        pred_labels = [entry["label"] for entry in pred_list]
        report = classification_report(gold_labels, pred_labels)
        report_dict = classification_report(gold_labels, pred_labels, output_dict=True)
        print(f"Results for {read_lang}: \n")
        print(report, "\n\n")
        results[run][read_lang] = report_dict

        # Keep the predictions of the best run (by weighted F1) per language.
        weighted_F1 = f1_score(gold_labels, pred_labels, average='weighted')
        if weighted_F1 > weighted_F1_scores[read_lang]:
            preds_per_lang[read_lang] = [
                "\t".join(line_words[:2]) + "\t" + str(line_pred)
                for line_words, line_pred in zip(test_set, pred_labels)
            ]
            weighted_F1_scores[read_lang] = weighted_F1

        # No RANDOM scores, the CogALex way: drop every pair whose gold label
        # is RANDOM and score only the remaining relation labels.
        kept_pairs = [(gold, pred) for gold, pred in zip(gold_labels, pred_labels) if gold != 'RANDOM']
        gold_no_random = [gold for gold, _ in kept_pairs]
        pred_no_random = [pred for _, pred in kept_pairs]
        report_no_random = classification_report(gold_no_random, pred_no_random, labels=lbls)
        report_no_random_dict = classification_report(gold_no_random, pred_no_random, labels=lbls, output_dict=True)
        print("\nNo RANDOM:\n")
        print(report_no_random)
        results_no_random[run][read_lang] = report_no_random_dict

In [None]:
# Regroup the per-run validation reports by language, keeping only the
# "weighted avg" entry of each report: {language: {run: weighted averages}}.
results_by_lang = {}
for run_name, reports in results.items():
    for lang, report in reports.items():
        results_by_lang.setdefault(lang, {})[run_name] = report["weighted avg"]

# One table row per language holding the mean over all runs.
columns = list(results_by_lang)
avg_val_results = pd.DataFrame(columns=["precision", "recall", "f1-score", "support"])
for lang in columns:
    temp_df = pd.DataFrame(results_by_lang[lang]).transpose()
    avg_val_results.loc[lang] = temp_df.mean()

# Every column except "support" is shown as a percentage with one decimal.
metric_columns = [name for name in avg_val_results.keys() if name != "support"]
as_percent = avg_val_results.loc[:, metric_columns].apply(lambda col: round(col * 100, 1))
avg_val_results.update(as_percent)
avg_val_results

In [None]:
# Regroup the per-run "no RANDOM" validation reports by language, keeping only
# the "weighted avg" entry of each report: {language: {run: weighted averages}}.
results_by_lang = {}
for run_name, reports in results_no_random.items():
    for lang, report in reports.items():
        results_by_lang.setdefault(lang, {})[run_name] = report["weighted avg"]

# One table row per language holding the mean over all runs.
columns = list(results_by_lang)
avg_val_results_no_random = pd.DataFrame(columns=["precision", "recall", "f1-score", "support"])
for lang in columns:
    temp_df = pd.DataFrame(results_by_lang[lang]).transpose()
    avg_val_results_no_random.loc[lang] = temp_df.mean()

# Every column except "support" is shown as a percentage with one decimal.
# The columns are taken from this table itself, so the cell does not depend on
# a different result table having been built beforehand.
metric_columns = [name for name in avg_val_results_no_random.keys() if name != "support"]
as_percent = avg_val_results_no_random.loc[:, metric_columns].apply(lambda col: round(col * 100, 1))
avg_val_results_no_random.update(as_percent)
avg_val_results_no_random

In [None]:
# Write the averaged validation scores as CSV files and the stored validation
# predictions as one text file per language.
val_preds_savedir = savedir + "/val_preds"
for target_dir in (savedir, val_preds_savedir):
    os.makedirs(target_dir, exist_ok=True)

avg_val_results.to_csv(f"{savedir}/val_avg_{model_name}_{it}.csv")
avg_val_results_no_random.to_csv(f"{savedir}/val_no_random_avg_{model_name}_{it}.csv")

for key, pred_lines in preds_per_lang.items():
    with open(val_preds_savedir + f"/{key}-predictions.txt", "w", encoding="utf-8") as f:
        # One prediction per line; a language without stored predictions
        # yields an empty file.
        f.writelines(f"{line}\n" for line in pred_lines)

# Evaluate on Gold

In [None]:
# Evaluate every saved run on the gold (test) sets.
# Produces:
#   results[run][language]            classification report (dict) over all labels
#   results_no_random[run][language]  the same after dropping pairs whose gold label is RANDOM
#   preds_per_lang[language]          predictions of the run with the best weighted F1
#   weighted_F1_scores[language]      that best weighted F1
results = {}
results_no_random = {}
weighted_F1_scores = {"chinese":0, "german":0, "english":0, "italian":0, "german NEW":0, "english NEW": 0}
preds_per_lang = {"chinese":"", "german":"", "english":"", "italian":"", "german NEW":"", "english NEW": ""}

# The gold files are identical for every run, so they are read once up front.
# They are opened as UTF-8 (matching how the predictions are written later)
# inside `with`, so the handles are closed again.
gold_lines = {}
for gold_lang in ["chinese", "german", "english", "italian"]:
    with open(f"./datasets/gold_{gold_lang}_data.txt", encoding="utf-8") as f:
        gold_lines[gold_lang] = f.readlines()
for gold_lang in ["german", "english"]:
    with open(f"./datasets/gold_{gold_lang}_data_new.txt", encoding="utf-8") as f:
        gold_lines[gold_lang + " NEW"] = f.readlines()
gold_zh = gold_lines["chinese"]
gold_de = gold_lines["german"]
gold_en = gold_lines["english"]
gold_it = gold_lines["italian"]
gold_de_new = gold_lines["german NEW"]
gold_en_new = gold_lines["english NEW"]

# Labels scored in the "no RANDOM" evaluation.
lbls = ("ANT", "HYP", "SYN")

# os.listdir returns entries in arbitrary order and also lists plain files;
# sort for a reproducible run order and skip anything that is not a run folder.
for run in sorted(os.listdir(model_dir)):
    if not os.path.isdir(os.path.join(model_dir, run)):
        continue
    results[run] = {}
    results_no_random[run] = {}
    model = AutoModelForSequenceClassification.from_pretrained(f"{model_dir}/{run}/{model_name}_{it}",  id2label=label_dict)
    model.eval()
    annotate = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, device=device)
    for data, read_lang in zip([gold_zh, gold_de, gold_en, gold_it, gold_de_new, gold_en_new], ["chinese", "german", "english", "italian", "german NEW", "english NEW"]):
        # Load data for prediction/scoring. Each line holds
        # "word1<TAB>word2<TAB>label"; blank lines are skipped because they
        # would otherwise fail on the column access below.
        test_set = [line.split("\t") for line in data if line.strip()]
        pred_input = [line[0].strip() + tokenizer.sep_token * 2 + line[1].strip() for line in test_set]
        gold_labels = [line[2].strip() for line in test_set]

        # Predict labels on test_set and score on all labels.
        pred_list = annotate(pred_input)
        pred_labels = [entry["label"] for entry in pred_list]
        report = classification_report(gold_labels, pred_labels)
        report_dict = classification_report(gold_labels, pred_labels, output_dict=True)
        print(f"Results for {read_lang}: \n")
        print(report, "\n\n")
        results[run][read_lang] = report_dict

        # Keep the predictions of the best run (by weighted F1) per language.
        weighted_F1 = f1_score(gold_labels, pred_labels, average='weighted')
        if weighted_F1 > weighted_F1_scores[read_lang]:
            preds_per_lang[read_lang] = [
                "\t".join(line_words[:2]) + "\t" + str(line_pred)
                for line_words, line_pred in zip(test_set, pred_labels)
            ]
            weighted_F1_scores[read_lang] = weighted_F1

        # No RANDOM scores, the CogALex way: drop every pair whose gold label
        # is RANDOM and score only the remaining relation labels.
        kept_pairs = [(gold, pred) for gold, pred in zip(gold_labels, pred_labels) if gold != 'RANDOM']
        gold_no_random = [gold for gold, _ in kept_pairs]
        pred_no_random = [pred for _, pred in kept_pairs]
        report_no_random = classification_report(gold_no_random, pred_no_random, labels=lbls)
        report_no_random_dict = classification_report(gold_no_random, pred_no_random, labels=lbls, output_dict=True)
        print("\nNo RANDOM:\n")
        print(report_no_random)
        results_no_random[run][read_lang] = report_no_random_dict

In [None]:
# Regroup the per-run gold-set reports by language, keeping only the
# "weighted avg" entry of each report: {language: {run: weighted averages}}.
results_by_lang = {}
for run_name, reports in results.items():
    for lang, report in reports.items():
        results_by_lang.setdefault(lang, {})[run_name] = report["weighted avg"]

# One table row per language holding the mean over all runs.
columns = list(results_by_lang)
avg_test_results = pd.DataFrame(columns=["precision", "recall", "f1-score", "support"])
for lang in columns:
    temp_df = pd.DataFrame(results_by_lang[lang]).transpose()
    avg_test_results.loc[lang] = temp_df.mean()

# Every column except "support" is shown as a percentage with one decimal.
metric_columns = [name for name in avg_test_results.keys() if name != "support"]
as_percent = avg_test_results.loc[:, metric_columns].apply(lambda col: round(col * 100, 1))
avg_test_results.update(as_percent)
avg_test_results

In [None]:
# Regroup the per-run "no RANDOM" gold-set reports by language, keeping only
# the "weighted avg" entry of each report: {language: {run: weighted averages}}.
results_by_lang = {}
for run_name, reports in results_no_random.items():
    for lang, report in reports.items():
        results_by_lang.setdefault(lang, {})[run_name] = report["weighted avg"]

# One table row per language holding the mean over all runs.
columns = list(results_by_lang)
avg_test_results_no_random = pd.DataFrame(columns=["precision", "recall", "f1-score", "support"])
for lang in columns:
    temp_df = pd.DataFrame(results_by_lang[lang]).transpose()
    avg_test_results_no_random.loc[lang] = temp_df.mean()

# Every column except "support" is shown as a percentage with one decimal.
# The columns are taken from this table itself, so the cell does not depend on
# a different result table having been built beforehand.
metric_columns = [name for name in avg_test_results_no_random.keys() if name != "support"]
as_percent = avg_test_results_no_random.loc[:, metric_columns].apply(lambda col: round(col * 100, 1))
avg_test_results_no_random.update(as_percent)
avg_test_results_no_random

In [None]:
# Write the averaged gold-set scores as CSV files and the stored gold-set
# predictions as one text file per language.
preds_savedir = savedir + "/preds"
for target_dir in (savedir, preds_savedir):
    os.makedirs(target_dir, exist_ok=True)

avg_test_results.to_csv(f"{savedir}/test_avg_{model_name}_{it}.csv")
avg_test_results_no_random.to_csv(f"{savedir}/test_no_random_avg_{model_name}_{it}.csv")

for key, pred_lines in preds_per_lang.items():
    with open(preds_savedir + f"/{key}-predictions.txt", "w", encoding="utf-8") as f:
        # One prediction per line; a language without stored predictions
        # yields an empty file.
        f.writelines(f"{line}\n" for line in pred_lines)