In [32]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from coral_pytorch.dataset import corn_label_from_logits

# Hugging Face Hub id of the tokenizer used for every essay scored in this notebook.
# (Plain string: the original f-prefix had no placeholders to interpolate.)
TOKENIZER_NAME = "neuralmind/bert-base-portuguese-cased"
# Loaded once at notebook level; testar() calls this object for each essay.
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME, use_fast=True)
def get_model_instance(model_path, objective, competence):
    """Load the sequence-classification checkpoint for one objective/competence pair.

    The checkpoint id is built as ``f"{model_path}{objective}-{competence}"``.

    Args:
        model_path: Prefix of the checkpoint id.
        objective: "Regression" (loaded with ``num_labels=1``) or
            "Classification" / "Ordinal" (loaded with ``num_labels=6``).
        competence: Suffix of the checkpoint id (e.g. "C1").

    Returns:
        The loaded model, or ``None`` when ``objective`` is none of the three
        names above.
    """
    # NOTE(review): the two branches cache to different directories
    # ("/tmp/" vs "/tmp/aes_enem2") — confirm this split is intentional.
    if objective == "Regression":
        cache_dir, num_labels = "/tmp/", 1
    elif objective in ("Classification", "Ordinal"):
        cache_dir, num_labels = "/tmp/aes_enem2", 6
    else:
        # Unknown objective: nothing is loaded.
        return None

    return AutoModelForSequenceClassification.from_pretrained(
        f"{model_path}{objective}-{competence}",
        cache_dir=cache_dir,
        num_labels=num_labels,
    )


# Maps each ENEM competence tag to the dataset column that testar() reads as
# the reference score for that competence.
pairs = {
    'C1': 'formal_register',
    'C2': 'thematic_coherence',
    'C3': 'narrative_rhetorical_structure',
    'C4': 'cohesion',
}



In [14]:
import pandas as pd
# Load the CSV that the evaluation loop passes to testar(); the path is
# relative to the notebook's working directory.
# NOTE(review): the path contains a doubled separator ("//") — presumably
# harmless, but confirm it is not a typo.
test = pd.read_csv("dataset//testNarrativo.csv")

In [16]:
from sklearn.metrics import cohen_kappa_score, f1_score

def testar(model, dataset, OBJECTIVE, competence=None):
    """Run ``model`` over every essay in ``dataset`` and score its predictions.

    Args:
        model: Callable invoked as ``model(**tokenizer_output)``; its result is
            indexed with ``['logits']``.
        dataset: DataFrame with an ``'essay'`` column and the score column
            named by ``pairs[competence]``.
        OBJECTIVE: 'Regression', 'Classification' or 'Ordinal'; selects how the
            logits are turned into a grade.
        competence: Key into ``pairs`` (e.g. 'C1'). When omitted, the
            notebook-level variable ``enem`` is used, which is what this
            function always read before the parameter existed.

    Returns:
        The value returned by ``calcular_metricas`` for the collected
        reference scores and predictions, or -100 when ``OBJECTIVE`` is not
        one of the three supported names.
    """
    if competence is None:
        # Backward-compatible fallback to the global set by the caller's loop.
        competence = enem
    coluna = pairs[competence]
    print(f"A competencia Narrativa é: {coluna}")

    # Reject an unsupported objective up front instead of discovering it only
    # after a tokenizer call and a forward pass on the first essay.
    if OBJECTIVE not in ('Regression', 'Classification', 'Ordinal'):
        return -100

    y = []
    y_hat = []
    for _, instancia in dataset.iterrows():
        essay = retirar_marcações(instancia['essay'])
        tokenizado = tokenizer(essay, padding=True, truncation=True, max_length=256, return_tensors="pt")
        # Inference only: no gradients are needed.
        with torch.no_grad():
            r = model(**tokenizado)['logits'].squeeze()
        y.append(instancia[coluna])
        if OBJECTIVE == 'Regression':
            # Single output: keep the raw float, rounded later by arrumar_notas.
            y_hat.append(r.item())
        elif OBJECTIVE == 'Classification':
            # Keep the per-class scores; arrumar_notas picks the arg-max.
            y_hat.append(r.tolist())
        else:
            # 'Ordinal': restore the batch dimension removed by squeeze()
            # before decoding the label.
            y_hat.append(corn_label_from_logits(r.unsqueeze(dim=0)).item())

    y_hat = arrumar_notas(y_hat, OBJECTIVE)
    soma = calcular_metricas(y, y_hat)
    print(f"Respostas: {y[:5]}")
    print(f"Depois do arrumar: {y_hat[:5]}")
    return soma

def arredondar_notas(notas):
    """Return a new list with every value of ``notas`` rounded to an ``int``.

    Rounding uses the built-in ``round``, so exact halves go to the nearest
    even integer (e.g. 2.5 -> 2).
    """
    return [int(round(nota)) for nota in notas]

def arrumar_notas(notas, OBJECTIVE):
    """Convert raw per-essay model outputs into integer grades.

    * 'Regression' / 'Ordinal': each value is rounded with ``round`` and then
      clamped to the inclusive range 0..5.
    * 'Classification': each item is a list of scores; the grade is the index
      of the first occurrence of its maximum.
    * Any other objective: ``notas`` is printed and an empty list is returned.
    """
    if OBJECTIVE in ('Regression', 'Ordinal'):
        # Round first, then clamp into the valid grade range.
        return [min(5, max(0, int(round(valor)))) for valor in notas]
    if OBJECTIVE == 'Classification':
        return [pontuacoes.index(max(pontuacoes)) for pontuacoes in notas]
    print(notas)
    return []

In [18]:
def retirar_marcações(texto):
    """Strip the annotation markers listed below from ``texto``.

    Markers are removed one after another, in the order listed, using plain
    ``str.replace`` (exact, case-sensitive matches). Whitespace around a
    removed marker is left in place.
    """
    marcadores = (
        "[P]", "[ P]", "[P}", "[p]", "{p}",
        "[S]", "[s]",
        "[T]", "[t]", "{t}",
        "[R]", "[X]", "[X~]", "[r]", "[x]", "{x}",
        "[?]", "{?}", "[?}", "{?]",
        "[LC]", "[LT]", "[lt]",
    )
    limpo = texto
    for marcador in marcadores:
        limpo = limpo.replace(marcador, "")
    return limpo

def calcular_metricas(y, y_hat):
    """Print agreement metrics for ``y_hat`` against ``y`` and return Kappa + F1.

    Args:
        y: Reference grades.
        y_hat: Predicted grades; must have the same length as ``y``.

    Returns:
        The sum of the unweighted Cohen's kappa and the weighted F1 score.

    Raises:
        AssertionError: If ``y`` and ``y_hat`` differ in length.
    """
    assert len(y) == len(y_hat)
    # Label set passed to every kappa computation below.
    notas_possiveis = [0, 1, 2, 3, 4, 5]
    QWK = cohen_kappa_score(y, y_hat, labels=notas_possiveis, weights='quadratic')
    LWK = cohen_kappa_score(y, y_hat, labels=notas_possiveis, weights='linear')
    Kappa = cohen_kappa_score(y, y_hat, labels=notas_possiveis)
    F1 = f1_score(y, y_hat, average='weighted')
    print(f'Kappa: {Kappa}')
    print(f"F1: {F1}")
    # The weighted kappas used to be computed and then discarded; report them
    # so the work is not wasted. The return value is unchanged.
    print(f"QWK: {QWK}")
    print(f"LWK: {LWK}")
    return Kappa + F1

# Quick manual check: the "[X~]" and "[P}" markers are removed while the
# whitespace around them is kept.
retirar_marcações("[X~]daew [P}  ")

'daew   '

In [34]:
# Evaluate every (training objective, competence) checkpoint on the test set.
for objective in ['Classification', 'Regression', 'Ordinal']:
    for i in range(1, 5):
        # `enem` must remain a notebook-level name: testar() reads it to pick
        # the reference-score column.
        enem = f'C{i}'
        print(f"Competencia ENEM: {enem}")
        print(f"Com treinamento do tipo: {objective}")
        model = get_model_instance("igorcs/", objective, enem)
        # f-prefix added: without it the literal text "{i}" was printed.
        print(f"   Performance {i}: ", testar(model, test, objective))

A competencia Narrativa é: formal_register
Kappa: 0.2867469879518072
F1: 0.6217760617760618
Respostas: [4, 4, 3, 4, 3]
Depois do arrumar: [3, 3, 3, 3, 3]
   Performance {i}:  0.908523049727869


config.json:   0%|          | 0.00/1.22k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

A competencia Narrativa é: thematic_coherence
Kappa: -0.001183832234060267
F1: 0.09849422555564843
Respostas: [3, 3, 1, 4, 4]
Depois do arrumar: [2, 2, 2, 2, 2]
   Performance {i}:  0.09731039332158817


config.json:   0%|          | 0.00/1.22k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

A competencia Narrativa é: narrative_rhetorical_structure
Kappa: 0.16905350865772695
F1: 0.5555935228409096
Respostas: [4, 5, 5, 4, 4]
Depois do arrumar: [3, 4, 4, 4, 4]
   Performance {i}:  0.7246470314986365


config.json:   0%|          | 0.00/1.22k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

A competencia Narrativa é: cohesion
Kappa: 0.0
F1: 0.5694124746756325
Respostas: [4, 5, 3, 3, 4]
Depois do arrumar: [3, 3, 3, 3, 3]
   Performance {i}:  0.5694124746756325
