In [38]:
import pandas as pd
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from coral_pytorch.dataset import corn_label_from_logits
from coral_pytorch.losses import corn_loss, CornLoss
from datasets import Dataset

In [40]:
test = pd.read_csv("dataset//testNarrativo.csv")
validation = pd.read_csv("dataset//validationNarrativo.csv")
train = pd.read_csv("dataset//trainNarrativo.csv")

In [42]:
def medir_distribuicao(notas):
    distribuicao = [0]*6
    for n in notas:
        distribuicao[n] += 1 
    return distribuicao

medir_distribuicao(test['formal_register'])

[0, 10, 54, 249, 53, 4]

## Treinando o modelo

In [44]:
TOKENIZER_NAME = f"neuralmind/bert-base-portuguese-cased"
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME, use_fast=True)
def get_model_instance(model_path, objective):
    model = None
    if objective == "regression":
        model = AutoModelForSequenceClassification.from_pretrained(
                model_path, 
                cache_dir="/tmp/", 
                num_labels=1,
            )
    elif objective == "classification" or objective == "ordinal":
        model = AutoModelForSequenceClassification.from_pretrained(
                model_path, 
                cache_dir="/tmp/aes_enem2", 
                num_labels=6,
            )
    return model



In [None]:
from sklearn.metrics import cohen_kappa_score, f1_score

def calcular_metricas(y, y_hat):
    assert len(y) == len(y_hat)
    QWK = cohen_kappa_score(y, y_hat, labels=[0, 1, 2, 3, 4, 5], weights='quadratic')
    LWK = cohen_kappa_score(y, y_hat, labels=[0, 1, 2, 3, 4, 5], weights='linear')
    Kappa = cohen_kappa_score(y, y_hat, labels=[0, 1, 2, 3, 4, 5])
    F1 = f1_score(y, y_hat, average='weighted')
    print(f'Kappa: {Kappa}')
    print(f"F1: {F1}")
    return Kappa + F1

def arredondar_notas(notas):
    novas_notas = []
    for n in notas:
        novas_notas.append( int(round(n)) )
    return novas_notas

def arrumar_notas(notas, OBJECTIVE):
    novas_notas = []
    if OBJECTIVE == 'regression' or OBJECTIVE == 'ordinal':
        for n in notas:
            arredondada = int(round(n))
            if arredondada > 5:
                arredondada = 5
            if arredondada < 0:
                arredondada = 0
            novas_notas.append(arredondada)
    elif OBJECTIVE == 'classification':
        for n in notas:
            maior = max(n)
            novas_notas.append( n.index(maior) )
    else:
        print(notas)
    return novas_notas

In [49]:
pairs = {'C1': 'formal_register', 'C2': 'thematic_coherence', 'C3': 'narrative_rhetorical_structure', 'C4': 'cohesion'}
OBJECTIVES = ['ordinal', 'classification', 'regression']

In [5]:
def retirar_marcações(texto):
    lista_marcações = ["[P]", "[ P]", "[P}", "[p]", "{p}","[S]", "[s]", "[T]", "[t]", "{t}", "[R]", "[X]", "[X~]", "[r]", "[x]","{x}", "[?]", "{?}", "[?}", "{?]", "[LC]", "[LT]", "[lt]"]
    for lm in lista_marcações:
        texto = texto.replace(lm, "")
    return texto

retirar_marcações("[X~]teste [P}  ")

'teste   '

In [50]:
OBJECTIVE = 'ordinal'
enem='C1'
model = get_model_instance(f"neuralmind/bert-base-portuguese-cased", OBJECTIVE)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [52]:
print(f"{enem}: {pairs[enem]}")

C1: formal_register


In [None]:
from transformers import TrainingArguments
from transformers import Trainer
import torch.nn as nn
import torch
from sklearn.metrics import accuracy_score, cohen_kappa_score, mean_squared_error, f1_score


def tokenize_function(example):
    return tokenizer(example["essay"], max_length=256, padding=True, truncation=True, return_tensors='pt')

def testar(model, dataset, OBJECTIVE):
    y = []
    y_hat = []
    for index, instancia in dataset.iterrows():
        essay = retirar_marcações(instancia['essay'])
        tokenizado = tokenizer(essay, padding=True, truncation=True, max_length=256,return_tensors="pt")
        with torch.no_grad():
            r = model(**tokenizado)['logits'].squeeze()
        y.append(instancia[pairs[enem]])
        if OBJECTIVE == 'regression':
            y_hat.append(r.item())
        elif OBJECTIVE == 'classification':
            y_hat.append(r.tolist())
        elif OBJECTIVE == 'ordinal':
            y_hat.append(corn_label_from_logits(r.unsqueeze(dim=0)).item())
        else:
            return -100
    y_hat = arrumar_notas(y_hat, OBJECTIVE)
    soma = calcular_metricas(y, y_hat)
    print(f"Respostas: {y[:5]}")
    print(f"Depois do arrumar: {y_hat[:5]}")
    return soma

def treinar(model, treino, validacao, teste, OBJECTIVE, nome):
    performance_anterior = -1 
    chances = 3
    iteracao = 1
    optimizer = torch.optim.SGD(model.parameters(), lr=0.00001, momentum=0.9)
    optimizer.zero_grad()
    if OBJECTIVE == 'regression':
        loss_fn = nn.MSELoss()
        tensor_type = torch.float
    elif OBJECTIVE == 'classification':
        loss_fn = nn.CrossEntropyLoss()
        tensor_type = torch.long
    elif OBJECTIVE == 'ordinal':
        loss_fn = CornLoss(num_classes=6)
        tensor_type = torch.long
    else:
        return 0
    while chances > 0:
        print(f"Iteração {iteracao}")
        iteracao += 1
        loss_total = []
        for index, instancia in treino.iterrows():
            essay = retirar_marcações(instancia['essay'])
            labels = torch.tensor(instancia[pairs[enem]], dtype=tensor_type)
            tokenizado = tokenizer(essay, padding=True, truncation=True, max_length=256,return_tensors="pt")
            r = model(**tokenizado)['logits'].squeeze()
            loss = loss_fn(r, labels)
            loss.backward()
            optimizer.step()
            loss_total.append(loss.item())
            optimizer.zero_grad()
        print(f"Loss média do treinamento: {sum(loss_total)/len(loss_total)}")
        print("-- Validação:")
        performance = testar(model, validacao, OBJECTIVE)
        if performance > performance_anterior:
            print("Performance melhor, vou salvar o modelo <<")
            print(f"Foi de {performance_anterior} para {performance}")
            torch.save(model.state_dict(), nome)
            performance_anterior = performance
            chances = 3
        elif performance == performance_anterior:
            chances = 3
        else:
            chances = chances-1
            print(f"Performance pior, só tem mais {chances} chances")
            if chances == 0:
                print("Acabou o treinamento")
                break
        print("-- Teste:")
        testar(model, teste, OBJECTIVE)

In [None]:
treinar(model, train, validation, test, OBJECTIVE, f"modelo-{OBJECTIVE}-{pairs[enem]}.pt")

Iteração 1
Loss média do treinamento: 0.5551151847114434
-- Validação:
Kappa: 0.0014735535908174668
F1: 0.06135384615384616
Respostas: [3, 3, 3, 3, 4]
Depois do arrumar: [2, 2, 2, 2, 2]
Performance melhor, vou salvar o modelo <<
Foi de -1 para 0.06282739974466363
-- Teste:
Kappa: 0.005613199523728407
F1: 0.03998678384986454
Respostas: [4, 4, 3, 4, 3]
Depois do arrumar: [2, 2, 1, 2, 2]
Iteração 2
Loss média do treinamento: 0.39921598402229513
-- Validação:
Kappa: 0.29824561403508776
F1: 0.5718327699697562
Respostas: [3, 3, 3, 3, 4]
Depois do arrumar: [2, 2, 3, 3, 3]
Performance melhor, vou salvar o modelo <<
Foi de 0.06282739974466363 para 0.8700783840048439
-- Teste:
Kappa: 0.16230090825325783
F1: 0.5100277847941399
Respostas: [4, 4, 3, 4, 3]
Depois do arrumar: [3, 3, 2, 2, 3]
Iteração 3
Loss média do treinamento: 0.34050727696032135
-- Validação:
Kappa: 0.0
F1: 0.4894901960784313
Respostas: [3, 3, 3, 3, 4]
Depois do arrumar: [3, 3, 3, 3, 3]
Performance pior, só tem mais 2 chances
-- T

## Para salvar os modelos de modo a por no HuggingFace

In [87]:
pairs = {'C1': 'formal_register', 'C2': 'thematic_coherence', 'C3': 'narrative_rhetorical_structure', 'C4': 'cohesion'}
OBJECTIVES = ['ordinal', 'classification', 'regression']

In [101]:
OBJECTIVE = 'ordinal'
enem = 'C4'
ModeloTeste = get_model_instance(f"kamel-usp/aes_enem_models-sourceA-{OBJECTIVE}-from-bertimbau-base-{enem}", OBJECTIVE)
ModeloTeste.load_state_dict(torch.load(f"modelo-{OBJECTIVE}-{pairs[enem]}.pt"))
ModeloTeste.save_pretrained(f"Tensores-{enem}-{OBJECTIVE}")

<All keys matched successfully>

In [55]:
#Testa o modelo para ter certeza
for i in range(1,5):
    enem=f'C{i}'
    modelo3 = get_model_instance(f"kamel-usp/aes_enem_models-sourceA-{OBJECTIVE}-from-bertimbau-base-{enem}", OBJECTIVE)
    modelo3.load_state_dict(torch.load(f"modelo-{OBJECTIVE}-{pairs[enem]}.pt"))
    print(f"----- C{i}: {pairs[enem]} - {OBJECTIVE}")
    print("   Performance: ", testar(modelo3, test, OBJECTIVE))

----- C1: formal_register - classification
Kappa: 0.3195681069172084
F1: 0.6658291185922766
Respostas: [4, 4, 3, 4, 3]
Depois do arrumar: [3, 4, 3, 3, 3]
   Performance:  0.9853972255094849
----- C2: thematic_coherence - classification
Kappa: 0.4772701378603168
F1: 0.6035778359217905
Respostas: [3, 3, 1, 4, 4]
Depois do arrumar: [2, 3, 1, 2, 1]
   Performance:  1.0808479737821073
----- C3: narrative_rhetorical_structure - classification
Kappa: 0.2791010285610718
F1: 0.6231348381348382
Respostas: [4, 5, 5, 4, 4]
Depois do arrumar: [4, 5, 4, 4, 4]
   Performance:  0.9022358666959099
----- C4: cohesion - classification
Kappa: 0.34546373515289375
F1: 0.6778667953667953
Respostas: [4, 5, 3, 3, 4]
Depois do arrumar: [3, 3, 3, 3, 3]
   Performance:  1.0233305305196891


## Gera .csvs com as respostas dos modelos

In [25]:
def gerar_novo_csv(modelo, dataset, nome, OBJECTIVE):
    dic = []
    for index, instancia in dataset.iterrows():
        essay = retirar_marcações(instancia['essay'])
        tokenizado = tokenizer(essay, padding=True, truncation=True, max_length=256,return_tensors="pt")
        with torch.no_grad():
            r = modelo(**tokenizado)['logits'].squeeze()
        if OBJECTIVE == "regression":
            dic.append({'index_text': index, 'grade_BERT': r.item()})
        else:
            r = nn.functional.softmax(r,dim=0).tolist()
            dic.append({'index_text': index, 'max_score': r.index(max(r)), 'confidence_0': r[0], 'confidence_1': r[1], 'confidence_2': r[2], 'confidence_3': r[3], 'confidence_4': r[4], 'confidence_5': r[5]})
    df = pd.DataFrame.from_dict(dic) 
    df.to_csv(nome, encoding='utf-8', index=False)

In [27]:
OBJECTIVE = 'classification'
for i in range(1,5):
    enem=f'C{i}'
    nome_csv = f"Bert-{OBJECTIVE}-{pairs[enem]}--Narrativo.csv"
    modelo3 = get_model_instance(f"kamel-usp/aes_enem_models-sourceA-{OBJECTIVE}-from-bertimbau-base-{enem}", OBJECTIVE)
    modelo3.load_state_dict(torch.load(f"modelo-{OBJECTIVE}-{pairs[enem]}.pt"))
    gerar_novo_csv(modelo3, test, f"test{nome_csv}", OBJECTIVE)
    gerar_novo_csv(modelo3, train, f"train{nome_csv}", OBJECTIVE)
    gerar_novo_csv(modelo3, validation, f"validation{nome_csv}", OBJECTIVE)