### Aplicação da estratégia com LLM para a tarefa de correção automática de redações *

In [None]:
import pandas as pd
import numpy as np
import torch
from scipy.stats import spearmanr
from sklearn.metrics import mean_absolute_error, cohen_kappa_score
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    TrainingArguments, Trainer, EvalPrediction
)

# GENERAL SETTINGS
# Pretrained checkpoint used for both the tokenizer and every model.
MODEL_NAME = 'neuralmind/bert-base-portuguese-cased'
# One model is trained per attribute column listed here.
ATTRIBUTES = [
    'formal_register',
    'thematic_coherence',
    'narrative_rhetorical_structure',
    'cohesion',
]
# Possible scores, 1 through 5.
LABELS_LIST = list(range(1, 6))

# DATA LOADING
# Read the three splits in the order train, validation, test.
df_train, df_val, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('train.csv', 'validation.csv', 'test.csv')
)

# TAG REMOVAL
# Strip short marker tags from the essay text. Each alternative below matches
# one family of tags; the whole match is replaced by the empty string.
tag_regex =  r'(\[[PpSsTtXx?]\])'   # [P] [p] [S] [s] [T] [t] [X] [x] [?]
tag_regex += r'|(\{[ptx?]\})'       # {p} {t} {x} {?}
tag_regex += r'|(\[L[TC]\])'        # [LT] [LC]
tag_regex += r'|(\[l[tc]\])'        # [lt] [lc]
tag_regex += r'|(\[ P\])'           # "[ P]" (stray space inside the tag)
tag_regex += r'|(\[[PX?]\})'        # mis-closed tags: [P} [X} [?}
# Mis-opened tag "{?]". The previous pattern r'\{?\]' parsed as "optional '{'
# followed by ']'": it deleted every ']' in the text and, on "{?]", removed
# only the ']' and left "{?" behind.
tag_regex += r'|(\{\?\])'

# Apply the same cleaning to every split (same order as before: train, test, val).
for essays_df in (df_train, df_test, df_val):
    essays_df['essay'] = essays_df['essay'].str.replace(tag_regex, '', regex=True)

# Load the tokenizer published with the MODEL_NAME checkpoint. The same
# instance is used by tokenize_function and handed to every Trainer below.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize_function(examples):
    """Tokenize a batch of essay/prompt pairs with the module-level tokenizer.

    Args:
        examples: Mapping with 'essay' and 'prompt' entries, as supplied by
            ``Dataset.map(..., batched=True)``.

    Returns:
        Whatever the tokenizer returns for the pair encoding, padded to and
        truncated at 512 tokens.
    """
    encode_options = {
        'padding': 'max_length',
        'truncation': True,
        'max_length': 512,
    }
    essays = examples['essay']
    prompts = examples['prompt']
    # Essay is the first segment, prompt the second.
    return tokenizer(essays, prompts, **encode_options)

# METRIC COMPUTATION
def compute_metrics(p: 'EvalPrediction'):
    """Compute quadratic weighted kappa (QWK) for the Trainer's evaluation loop.

    The continuous model outputs are rounded to the nearest integer and
    clipped to the score range given by LABELS_LIST before being compared
    with the gold scores.

    Args:
        p: Object exposing ``predictions`` and ``label_ids`` arrays.

    Returns:
        dict with a single key ``"qwk"``.
    """
    preds = np.asarray(p.predictions).flatten()
    # Gold scores are stored as floats; go back to integer classes.
    labels = np.rint(np.asarray(p.label_ids).flatten()).astype(int)
    scores = np.clip(np.round(preds), min(LABELS_LIST), max(LABELS_LIST)).astype(int)
    # Pass the full score scale explicitly. Without `labels=`, scikit-learn
    # builds the quadratic weights from the index positions of the classes
    # that happen to be present, so a missing intermediate score would shrink
    # the distances between the remaining ones.
    qwk = cohen_kappa_score(labels, scores, labels=LABELS_LIST, weights='quadratic')
    return {"qwk": qwk}

# FINAL REPORT: metrics and predictions per attribute, keyed by attribute name
relatorio = {}

# Seed for the regression-head initialisation and for the Trainer.
SEED = 42

# The essay/prompt text is the same for every attribute, so it is tokenized
# once here instead of once per attribute inside the loop.
split_frames = {'train': df_train, 'validation': df_val, 'test': df_test}
tokenized_text = DatasetDict({
    split: Dataset.from_pandas(frame[['essay', 'prompt']])
    for split, frame in split_frames.items()
}).map(tokenize_function, batched=True)

# ATTRIBUTE LOOP: one independent model is fine-tuned per attribute
for atributo in ATTRIBUTES:
    print(f"\n{'='*20} Treinando modelo para: {atributo} {'='*20}")

    # Make sure the labels are float (the model has a single output, num_labels=1)
    for df in [df_train, df_val, df_test]:
        df[atributo] = df[atributo].astype(float)

    # Attach this attribute's scores as the 'labels' column of each split.
    tokenized = DatasetDict({
        split: tokenized_text[split].add_column('labels', frame[atributo].tolist())
        for split, frame in split_frames.items()
    })

    # Seed before the model is built: the classifier weights are newly
    # initialised at load time, before the Trainer applies its own seed.
    torch.manual_seed(SEED)
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=1)

    args = TrainingArguments(
        output_dir=f'./results_{atributo}',
        num_train_epochs=3,
        learning_rate=2e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        weight_decay=0.01,
        eval_strategy='epoch',
        save_strategy='epoch',
        # Log once per epoch so the training-loss column is populated; the
        # default step interval is longer than this whole run ("No log").
        logging_strategy='epoch',
        load_best_model_at_end=True,
        metric_for_best_model='qwk',
        greater_is_better=True,
        fp16=torch.cuda.is_available(),
        logging_dir=f'./logs_{atributo}',
        seed=SEED,
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=tokenized['train'],
        eval_dataset=tokenized['validation'],
        # `tokenizer=` is deprecated in favour of `processing_class=`.
        processing_class=tokenizer,
        compute_metrics=compute_metrics,
    )

    trainer.train()

    print(f"Treinamento concluído para {atributo}. Avaliando...")

    # NOTE(review): the final metrics are computed on the validation split,
    # which is also the split used to pick the best checkpoint, so they are
    # likely optimistic. The 'test' split is prepared but never scored. This
    # is kept as-is because the classification-report cell reads its gold
    # labels from df_val.
    results = trainer.predict(tokenized['validation'])

    preds = results.predictions.flatten()
    labels = np.rint(results.label_ids.flatten()).astype(int)
    rounded_preds = np.clip(
        np.round(preds), min(LABELS_LIST), max(LABELS_LIST)
    ).astype(int)

    exact_match = np.mean(rounded_preds == labels)
    within_1 = np.mean(np.abs(rounded_preds - labels) <= 1)

    spearman_corr, _ = spearmanr(labels, rounded_preds)

    relatorio[atributo] = {
        # MAE uses the raw (unrounded) model outputs.
        'MAE': mean_absolute_error(labels, preds),
        'Spearmans rank correlation': spearman_corr,
        # Explicit `labels=` keeps the quadratic weights tied to the real
        # 1-5 scale even when some score never occurs in this split.
        'QWK': cohen_kappa_score(
            labels, rounded_preds, labels=LABELS_LIST, weights='quadratic'
        ),
        'Acurácia Exata (%)': round(exact_match * 100, 2),
        'Acurácia ±1 (%)': round(within_1 * 100, 2),
        'Previsões': rounded_preds,
    }




Map:   0%|          | 0/740 [00:00<?, ? examples/s]

Map:   0%|          | 0/125 [00:00<?, ? examples/s]

Map:   0%|          | 0/370 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Qwk
1,No log,0.543063,0.40586
2,No log,0.351376,0.556675
3,No log,0.271347,0.612892


Treinamento concluído para formal_register. Avaliando...





Map:   0%|          | 0/740 [00:00<?, ? examples/s]

Map:   0%|          | 0/125 [00:00<?, ? examples/s]

Map:   0%|          | 0/370 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Qwk
1,No log,0.376028,0.762244
2,No log,0.356104,0.785031
3,No log,0.348499,0.797364


Treinamento concluído para thematic_coherence. Avaliando...





Map:   0%|          | 0/740 [00:00<?, ? examples/s]

Map:   0%|          | 0/125 [00:00<?, ? examples/s]

Map:   0%|          | 0/370 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Qwk
1,No log,0.674098,0.225503
2,No log,0.654186,0.401037
3,No log,0.487153,0.436351


Treinamento concluído para narrative_rhetorical_structure. Avaliando...





Map:   0%|          | 0/740 [00:00<?, ? examples/s]

Map:   0%|          | 0/125 [00:00<?, ? examples/s]

Map:   0%|          | 0/370 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Qwk
1,No log,0.338953,0.259363
2,No log,0.350455,0.42933
3,No log,0.329738,0.508605


Treinamento concluído para cohesion. Avaliando...


In [None]:
from sklearn.metrics import classification_report

print("\n\nRELATÓRIO DE CLASSIFICAÇÃO POR ATRIBUTO:")
for atributo in ATTRIBUTES:
    print(f"\n{'='*20} Relatório de Classificação para: {atributo} {'='*20}")

    # Gold scores come from this attribute's column in df_val; the predictions
    # were stored under 'Previsões' in `relatorio` by the training loop.
    y_true = df_val[atributo].astype(int).tolist()
    y_pred = relatorio[atributo]['Previsões'].tolist()

    # Report over the full 1-5 score scale; classes with no predictions get
    # 0 instead of a warning (zero_division=0).
    report_text = classification_report(
        y_true,
        y_pred,
        labels=LABELS_LIST,
        zero_division=0,
    )
    print(report_text)
    print(f"Métrica Qwk: {relatorio[atributo]['QWK']}")
    print(f"Métrica Spearman's: {relatorio[atributo]['Spearmans rank correlation']}\n")



RELATÓRIO DE CLASSIFICAÇÃO POR ATRIBUTO:

              precision    recall  f1-score   support

           1       1.00      0.20      0.33         5
           2       0.69      0.39      0.50        23
           3       0.77      0.84      0.80        79
           4       0.48      0.75      0.59        16
           5       0.00      0.00      0.00         2

    accuracy                           0.70       125
   macro avg       0.59      0.44      0.44       125
weighted avg       0.71      0.70      0.69       125

Métrica Qwk: 0.6128916096507022
Métrica Spearman's: 0.6506043597594393


              precision    recall  f1-score   support

           1       0.88      0.97      0.93        39
           2       0.65      0.37      0.47        35
           3       0.60      0.86      0.70        43
           4       0.00      0.00      0.00         5
           5       0.00      0.00      0.00         3

    accuracy                           0.70       125
   macro avg  