In [16]:
!pip install datasets
!pip install evaluate
!pip install transformers
!pip install accelerate



In [1]:
#

In [17]:
import json
import os

import numpy as np
import pandas as pd
import datasets
import evaluate
import seaborn as sns
import matplotlib.pyplot as plt
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import classification_report
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split

In [18]:
# Read the annotated corpus and expose the `polarization` column under the name `label`.
annotated_texts_df = (
    pd.read_csv("annotated_texts_repr.csv", sep=",", encoding="utf-8")
    .rename(columns={"polarization": "label"})
)

In [19]:
# Show the first row to inspect the columns available after loading.
annotated_texts_df.head(1)

Unnamed: 0,id,text,pop_sum,manichean,peoplecentrism,antielitism,emotional,label,tfidf,doc_embedding,doc_embedding_pos,linguistic_profile
0,ParlaMint-IT_2013-08-01-LEG17-Senato-sed-86.u153,"PETROCELLI . Signor Presidente, senatrici e se...",4,1,1,1,1,1,"[0.5361957907801886, 0.049413195954373046, 0.0...","[0.009776607354980394, 0.04375904489842546, -0...","[0.0025272382080579183, 0.002842237250819832, ...","[47.0, 1831.0, 38.95744680851064, 4.6773997569..."


In [20]:
# Stratified 75/25 train/test split on `label`, then keep only the id, text and label columns.
df_pol_train, df_pol_test = train_test_split(
    annotated_texts_df,
    test_size=0.25,
    random_state=42,
    stratify=annotated_texts_df['label'],
)
kept_columns = ["id", "text", "label"]
df_pol_train = df_pol_train[kept_columns]
df_pol_test = df_pol_test[kept_columns]


In [21]:
# Convert the pandas splits to Hugging Face datasets and encode `label` as a class-label column.
train = datasets.Dataset.from_pandas(pd.DataFrame(data=df_pol_train)).class_encode_column('label')
test = datasets.Dataset.from_pandas(pd.DataFrame(data=df_pol_test)).class_encode_column('label')

# Carve a stratified validation set (20%) out of the training data.
train_val = train.train_test_split(test_size=0.2, seed=8, stratify_by_column='label')
train, val = train_val["train"], train_val["test"]

Stringifying the column:   0%|          | 0/8130 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/8130 [00:00<?, ? examples/s]

Stringifying the column:   0%|          | 0/2710 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/2710 [00:00<?, ? examples/s]

In [22]:
# Identifier of the pretrained checkpoint to fine-tune.
model_name = 'dbmdz/bert-base-italian-xxl-cased'

# Tokenizer that belongs to the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Checkpoint loaded with a sequence-classification head for 3 labels.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-italian-xxl-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
def tokenize(batch):
    """Tokenize the ``'text'`` entry of a batch with the notebook-level ``tokenizer``.

    Sequences are truncated to at most 512 tokens and padded (``padding=True``).

    Args:
        batch: Mapping with a ``'text'`` entry (used below via ``Dataset.map(..., batched=True)``).

    Returns:
        The tokenizer's output for the batch.
    """
    encoding_options = dict(padding=True, truncation=True, max_length=512)
    return tokenizer(batch['text'], **encoding_options)

# Tokenize every split in batches.
train = train.map(tokenize, batched=True)
val = val.map(tokenize, batched=True)
test = test.map(tokenize, batched=True)

# Return only the model inputs and the target, as torch tensors, from each split.
torch_columns = ['input_ids', 'attention_mask', 'label']
for split in (train, val, test):
    # set_format mutates the dataset in place; each split gets its own copy of the list
    split.set_format('torch', columns=list(torch_columns))

Map:   0%|          | 0/6504 [00:00<?, ? examples/s]

Map:   0%|          | 0/1626 [00:00<?, ? examples/s]

Map:   0%|          | 0/2710 [00:00<?, ? examples/s]

In [31]:
# Number of passes over the training split.
num_epochs = 1

# Hyper-parameters for fine-tuning; evaluation, logging and checkpointing all happen once per epoch.
training_args = TrainingArguments(
    output_dir=f"{model_name}-ACTI_finetuned",
    num_train_epochs=num_epochs,
    learning_rate=2e-5,
    weight_decay=0,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    logging_steps=10,
    load_best_model_at_end=True,
)

In [32]:
def acc_metric(eval_pred):
    """Compute classification accuracy for the Trainer's evaluation loop.

    Args:
        eval_pred: Pair ``(predictions, labels)``. ``predictions`` is reduced with
            ``argmax`` over axis 1 (assumed to be a 2-D array of per-class scores);
            ``labels`` holds the reference class ids.

    Returns:
        dict: ``{"accuracy": <float>}`` — the fraction of rows whose arg-max class
        equals the reference label.
    """
    predictions, labels = eval_pred
    predicted_classes = np.argmax(predictions, axis=1)
    # Computed directly with numpy: re-loading the `evaluate` accuracy metric on
    # every evaluation pass costs a hub/cache lookup each time for the same result.
    accuracy = float(np.mean(predicted_classes == np.asarray(labels)))
    return {"accuracy": accuracy}

In [33]:
# Fine-tune the model on the training split, evaluating on the validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train,
    eval_dataset=val,
    # `tokenizer=` is deprecated in recent transformers releases (it triggers a
    # warning on this call); `processing_class` is its replacement.
    # NOTE(review): requires a transformers version that accepts `processing_class`.
    processing_class=tokenizer,
    compute_metrics=acc_metric,
)

trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,0.3981,0.835635,0.740467


TrainOutput(global_step=813, training_loss=0.398126656132284, metrics={'train_runtime': 695.4972, 'train_samples_per_second': 9.352, 'train_steps_per_second': 1.169, 'total_flos': 1711289668902912.0, 'train_loss': 0.398126656132284, 'epoch': 1.0})

In [34]:
# Run the fine-tuned model on the validation split.
predictions = trainer.predict(val)
logits, true_labels = predictions.predictions, predictions.label_ids

# Per-class precision/recall/F1 on the validation split.
predicted_labels = np.asarray(logits).argmax(axis=1)
print(classification_report(true_labels, predicted_labels))

              precision    recall  f1-score   support

           0       0.58      0.57      0.57       248
           1       0.76      0.74      0.75       743
           2       0.78      0.81      0.79       635

    accuracy                           0.74      1626
   macro avg       0.71      0.71      0.71      1626
weighted avg       0.74      0.74      0.74      1626



In [35]:
# Write the fine-tuned model to the FINETUNED_MODEL directory.
trainer.save_model(output_dir="FINETUNED_MODEL")

In [36]:
# Persist the Trainer's log history as JSON.
# `json` is imported in the notebook's import cell rather than mid-notebook, and the
# file encoding is explicit so the output does not depend on the platform default.
log_history = trainer.state.log_history
with open('log.json', 'w', encoding='utf-8') as f:
    json.dump(log_history, f)

In [38]:
# Run the fine-tuned model on the held-out test split.
predictions = trainer.predict(test)
logits, true_labels = predictions.predictions, predictions.label_ids

# Per-class precision/recall/F1 on the test split.
predicted_labels = np.asarray(logits).argmax(axis=1)
print(classification_report(true_labels, predicted_labels))

              precision    recall  f1-score   support

           0       0.64      0.62      0.62       413
           1       0.76      0.74      0.75      1239
           2       0.75      0.79      0.77      1058

    accuracy                           0.74      2710
   macro avg       0.72      0.71      0.72      2710
weighted avg       0.74      0.74      0.74      2710

