# GLUE logs processing

In [None]:
!pip install transformers==4.5.0
!pip install datasets==1.6.2

In [None]:
import json
import os

import numpy as np
import pandas as pd
from datasets import load_metric, load_from_disk, load_dataset, inspect_metric
from transformers import TrainingArguments, Trainer
from transformers import DistilBertTokenizer, BertTokenizer, RobertaTokenizer, AlbertTokenizer,\
T5Tokenizer, DebertaTokenizer, GPT2Tokenizer
from transformers import DistilBertForSequenceClassification, BertForSequenceClassification,\
RobertaForSequenceClassification, AlbertForSequenceClassification, T5ForConditionalGeneration,\
DebertaForSequenceClassification, GPT2ForSequenceClassification

In [None]:
# GLUE task identifiers handled by this notebook.
# "mnli-mm" has no dataset config of its own: the download loop resolves it to "mnli".
GLUE_TASKS = [
    "cola",
    "mnli",
    "mnli-mm",
    "mrpc",
    "qnli",
    "qqp",
    "rte",
    "sst2",
    "stsb",
    "wnli",
]

## Saving datasets and metrics

In [None]:
# Save every GLUE dataset under datasets/glue/<config> and load each metric config once.
# "mnli-mm" resolves to the "mnli" config, so de-duplicate (keeping first-seen order)
# to avoid downloading and saving MNLI twice.
glue_configs = list(dict.fromkeys("mnli" if task == "mnli-mm" else task for task in GLUE_TASKS))

for actual_task in glue_configs:
    load_dataset("glue", actual_task).save_to_disk("datasets/glue/" + actual_task)
    # The returned metric object is discarded on purpose; presumably the call is only
    # needed for its download/caching side effect -- TODO confirm.
    load_metric("glue", actual_task)

# The arguments of this call do not depend on the task, so it runs once instead of once per task.
inspect_metric("glue", "metrics/glue")

In [None]:
# Smoke test: load the GLUE metric from the local "metrics/glue" copy and score random
# binary predictions against random binary labels.
metric = load_metric("metrics/glue", "cola")

# Seeded generator so the displayed score is the same on every Restart & Run All.
rng = np.random.RandomState(0)
fake_preds = rng.randint(0, 2, size=(64,))
fake_labels = rng.randint(0, 2, size=(64,))

metric.compute(predictions=fake_preds, references=fake_labels)

## Saving tokenizers and models

In [None]:
# (checkpoint name, model class, tokenizer class) -- one row per supported checkpoint.
# The DeBERTa names carry no organisation prefix here; the export cells map them to
# their "microsoft/..." identifiers before downloading.
_checkpoint_table = [
    ("distilbert-base-uncased", DistilBertForSequenceClassification, DistilBertTokenizer),
    ("bert-base-uncased", BertForSequenceClassification, BertTokenizer),
    ("bert-large-uncased", BertForSequenceClassification, BertTokenizer),
    ("roberta-base", RobertaForSequenceClassification, RobertaTokenizer),
    ("roberta-large", RobertaForSequenceClassification, RobertaTokenizer),
    ("distilroberta-base", RobertaForSequenceClassification, RobertaTokenizer),
    ("albert-base-v2", AlbertForSequenceClassification, AlbertTokenizer),
    ("albert-xxlarge-v2", AlbertForSequenceClassification, AlbertTokenizer),
    ("t5-base", T5ForConditionalGeneration, T5Tokenizer),
    ("deberta-base", DebertaForSequenceClassification, DebertaTokenizer),
    ("deberta-large", DebertaForSequenceClassification, DebertaTokenizer),
    ("gpt2", GPT2ForSequenceClassification, GPT2Tokenizer),
    ("distilgpt2", GPT2ForSequenceClassification, GPT2Tokenizer),
]

# Checkpoint name -> {"model": <model class>, "tokenizer": <tokenizer class>},
# in the same order as the table above.
name_to_model = {
    checkpoint: {"model": model_cls, "tokenizer": tokenizer_cls}
    for checkpoint, model_cls, tokenizer_cls in _checkpoint_table
}

In [None]:
# Download every tokenizer and store it under tokenizers/<model_name>.
# Only the DeBERTa entries need a different download identifier; every other key of
# name_to_model is used as the identifier unchanged.
hub_ids = {
    'deberta-base': 'microsoft/deberta-base',
    'deberta-large': 'microsoft/deberta-large',
}

for model_name, classes in name_to_model.items():
    model_id = hub_ids.get(model_name, model_name)
    model_tokenizer = classes["tokenizer"].from_pretrained(model_id)

    # Fall back to the EOS token only when the tokenizer has no padding token at all.
    # The previous unconditional assignment also replaced padding tokens that were
    # already defined, and set pad_token to None whenever eos_token was None.
    if model_tokenizer.pad_token is None:
        model_tokenizer.pad_token = model_tokenizer.eos_token

    save_dir = "tokenizers/{}".format(model_name)
    model_tokenizer.save_pretrained(save_dir)

    # Rename tokenizer_config.json to config.json (pure-Python equivalent of the former
    # `mv` shell call; like `mv`, os.replace overwrites an existing destination).
    # NOTE(review): the reason for the rename is not visible in this notebook.
    os.replace(
        os.path.join(save_dir, "tokenizer_config.json"),
        os.path.join(save_dir, "config.json"),
    )

In [None]:
# Download every model and store one copy per head size under models/<model_name>/<num_labels>.
# Only the DeBERTa entries need a different download identifier; every other key of
# name_to_model is used as the identifier unchanged.
hub_ids = {
    'deberta-base': 'microsoft/deberta-base',
    'deberta-large': 'microsoft/deberta-large',
}

for model_name, classes in name_to_model.items():
    model_id = hub_ids.get(model_name, model_name)
    # Presumably 1 = regression, 2 = binary and 3 = three-way GLUE tasks -- TODO confirm.
    for num_labels in [1, 2, 3]:
        model = classes["model"].from_pretrained(model_id, num_labels=num_labels)

        # Fall back to the EOS id only when the config defines no padding id.
        # The previous unconditional assignment also replaced padding ids that were
        # already defined, and set pad_token_id to None whenever eos_token_id was None.
        if model.config.pad_token_id is None:
            model.config.pad_token_id = model.config.eos_token_id

        model.save_pretrained("models/{}/{}".format(model_name, num_labels))

## Calculate metrics

In [None]:
DIR = 'results'

# Bookkeeping entries that sit next to the scores on an evaluation line; everything
# else on such a line is treated as a task metric.
NON_METRIC_KEYS = (
    'epoch',
    'eval_steps_per_second',
    'eval_samples_per_second',
    'eval_runtime',
    'eval_loss',
)

# results[model_name][task_name][metric_name][seed] -> list of logged values;
# the lists are collapsed to a single best score at the end of this cell.
results = {}

# Sorted so that the parsing order does not depend on the file system's listing order.
for log in sorted(os.listdir(DIR)):
    if log.endswith('err'):
        continue

    # Run context is reset for every file so that a log without a header line can
    # never be attributed to the run parsed from the previous file.
    model_name = task_name = seed = None
    task_scores = None

    with open(os.path.join(DIR, log)) as f:
        for raw_line in f:
            if not raw_line.strip():
                continue

            # Each line is parsed as JSON after swapping single for double quotes.
            # NOTE(review): this breaks on values that themselves contain a quote character.
            record = json.loads(raw_line.replace("'", "\""))

            if "task_name" in record:
                # Header line: identifies the run that the following evaluation lines belong to.
                task_name = record["task_name"]
                model_name = record["model_name"]
                seed = str(record["random_seed"])
                task_scores = results.setdefault(model_name, {}).setdefault(task_name, {})
            elif "eval_loss" in record:
                if task_scores is None:
                    raise ValueError(
                        "{}/{}: evaluation line found before any task_name header".format(DIR, log)
                    )
                # pop() instead of del: not every log is guaranteed to contain every bookkeeping key.
                for key in NON_METRIC_KEYS:
                    record.pop(key, None)
                for metric_name, value in record.items():
                    task_scores.setdefault(metric_name, {}).setdefault(seed, []).append(value)

# Collapse every per-seed list to its maximum, expressed as a percentage.
# Replacing the value of an existing key while iterating is safe (the dict size is unchanged).
for per_task in results.values():
    for per_metric in per_task.values():
        for per_seed in per_metric.values():
            for run_seed, values in per_seed.items():
                per_seed[run_seed] = max(values) * 100

In [None]:
if not results:
    raise ValueError("no evaluation results were parsed; run the log-parsing cell first")

# Mean over seeds of the per-seed scores held in `results`, rounded to one decimal:
# mean_results[model_name][task][metric_name] -> float
mean_results = {
    model_name: {
        task: {
            metric_name: round(np.mean(list(per_seed.values())), 1)
            for metric_name, per_seed in per_metric.items()
        }
        for task, per_metric in per_task.items()
    }
    for model_name, per_task in results.items()
}

# json_normalize flattens {task: {metric: value}} into one row with "<task>.<metric>" columns.
# The first parsed model fixes the leading column order; columns seen only for other
# models are appended after it.
template_columns = list(pd.json_normalize(next(iter(mean_results.values()))).columns)

# Rows follow the order of name_to_model; models that have no parsed results are left
# out, and parsed models that are not listed in name_to_model are ignored.
models = name_to_model.keys()
actual_models = [name for name in models if name in mean_results]
rows = [pd.json_normalize(mean_results[name]) for name in actual_models]

if rows:
    # Single concat instead of the removed DataFrame.append inside a loop.
    df = pd.concat(rows, sort=False)
    ordered_columns = template_columns + [col for col in df.columns if col not in template_columns]
    df = df.reindex(columns=ordered_columns)
else:
    df = pd.DataFrame(columns=template_columns)

df['model_name'] = actual_models
df = df.set_index('model_name')

In [None]:
# Write the summary table to disk. The target folder is created first because
# to_csv does not create missing parent directories.
output_path = "../tables/case_study_4/glue_hpc.csv"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_csv(output_path)