### Finetuning Longformer Binario

In [1]:
import torch
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from datasets import load_dataset
import os
import logging
from transformers import LongformerTokenizerFast, LongformerForSequenceClassification, Trainer, TrainingArguments, LongformerConfig

In [2]:
import wandb

In [3]:
wandb.login()
wandb.init(project="binaryclass_long", entity="raffalo8888")

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mraffalo8888[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [5]:
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

In [5]:
df = pd.read_csv("/kaggle/input/mergereddata/generated_pairs.csv")

In [6]:
df["Same_Category"] = df["Same_Category"].astype(float)

### Fine Tuning modello 

In [6]:
from torch.nn.functional import cosine_similarity
from torch import nn
from transformers.models.longformer.modeling_longformer import LongformerPreTrainedModel, LongformerClassificationHead
from transformers import LongformerModel

class ModifiedModelForBinaryClassification(LongformerPreTrainedModel):
    def __init__(self, config):
        super(ModifiedModelForBinaryClassification, self).__init__(config)
        self.longformer = LongformerModel(config)
        self.init_weights()
    
    def forward(self, input_ids_1, attention_mask_1, input_ids_2, attention_mask_2, labels=None):
       # global attention mask con attenzione solo sul primo token 
        global_attention_mask_1 = torch.zeros_like(input_ids_1)
        global_attention_mask_1[:, 0] = 1 

        global_attention_mask_2 = torch.zeros_like(input_ids_2)
        global_attention_mask_2[:, 0] = 1  

        outputs_1 = self.longformer(input_ids_1, attention_mask=attention_mask_1, 
                                    global_attention_mask=global_attention_mask_1)
        sequence_output_1 = outputs_1['last_hidden_state']
        cls_token_1 = sequence_output_1[:, 0, :]  

        outputs_2 = self.longformer(input_ids_2, attention_mask=attention_mask_2, 
                                    global_attention_mask=global_attention_mask_2)
        sequence_output_2 = outputs_2['last_hidden_state']
        cls_token_2 = sequence_output_2[:, 0, :]  
        #distanza del coseno tra cls1 e cls2
        logits = cosine_similarity(cls_token_1, cls_token_2).unsqueeze(-1)  # Assicurati che i logits siano della forma [batch_size, 1]

        loss = None
        if labels is not None:
                loss_fct = nn.BCEWithLogitsLoss()
                labels = labels.view(-1, 1).float()  # Ridimensiona le etichette per batch_size =1
                loss = loss_fct(logits, labels)

        return (loss, logits) if loss is not None else logits


### Dataloader

In [10]:
class TextPairDataset(Dataset):
    def __init__(self, text_pairs, labels, tokenizer, max_length=1024):
        self.text_pairs = text_pairs
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

  
    def __getitem__(self, idx):
        text1, text2 = self.text_pairs[idx]
        tokenized_input1 = self.tokenizer(text1, return_tensors="pt", max_length=self.max_length, truncation=True, padding='max_length')
        tokenized_input2 = self.tokenizer(text2, return_tensors="pt", max_length=self.max_length, truncation=True, padding='max_length')

        return {
        'input_ids_1': tokenized_input1['input_ids'].squeeze(0),
        'attention_mask_1': tokenized_input1['attention_mask'].squeeze(0),
        'input_ids_2': tokenized_input2['input_ids'].squeeze(0),
        'attention_mask_2': tokenized_input2['attention_mask'].squeeze(0),
        'labels': torch.tensor(self.labels[idx], dtype=torch.float)  
        }

    def __len__(self):
        return len(self.text_pairs)



### split dataset, instanziazione tokenizer 

In [None]:
df = df.sample(15000) 
train_texts, test_texts, train_labels, test_labels = train_test_split(df[['Text1','Text2']], df['Same_Category'], test_size=0.2)
train_texts.to_csv("train_text.csv")
test_texts.to_csv("test_text.csv")
train_labels.to_csv("train_labels.csv")
test_labels.to_csv("test_labels.csv")

tokenizer = LongformerTokenizerFast.from_pretrained('allenai/longformer-base-4096',
                                                    padding = 'max_length',
                                                    truncation=True,max_length = 1024,
                                                   )

def tokenize_function(text1, text2):
    return tokenizer(text1, text2, padding='max_length', truncation=True, max_length=1024)

train_texts = [(row['Text1'], row['Text2']) for index, row in df.iterrows()]
train_labels = df['Same_Category'].values

test_texts = [(row['Text1'], row['Text2']) for index, row in df.iterrows()]
test_labels = df['Same_Category'].values

train_labels_tensor = torch.tensor(train_labels, dtype=torch.float)
test_labels_tensor = torch.tensor(test_labels, dtype=torch.float)

train_dataset = TextPairDataset(train_texts, train_labels_tensor,tokenizer)
test_dataset = TextPairDataset(test_texts, test_labels_tensor,tokenizer)


In [36]:
model = ModifiedModelForBinaryClassification.from_pretrained('allenai/longformer-base-4096', 
                                                            gradient_checkpointing=False,
                                                             attention_window = 512)


pytorch_model.bin:   0%|          | 0.00/597M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


In [7]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [11]:
model.to(device)

ModifiedModelForBinaryClassification(
  (longformer): LongformerModel(
    (embeddings): LongformerEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
      (position_embeddings): Embedding(4098, 768, padding_idx=1)
    )
    (encoder): LongformerEncoder(
      (layer): ModuleList(
        (0-11): 12 x LongformerLayer(
          (attention): LongformerAttention(
            (self): LongformerSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (query_global): Linear(in_features=768, out_features=768, bias=True)
              (key_global): Linear(in_features=768, out_features=768, bias=True)
         

### Funzioni per calcolare le metriche

In [7]:
import numpy as np
from transformers import EvalPrediction
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score

def multi_label_metrics(predictions, labels):
    probs = predictions  
    y_pred = np.zeros(probs.shape)
    y_true = labels
    y_pred[probs >= 0.5] = 1  
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    roc_auc = roc_auc_score(y_true, probs, average='micro')  
    accuracy = accuracy_score(y_true, y_pred)

    metrics = {
        'f1': f1_micro_average,
        'roc_auc': roc_auc,
        'accuracy': accuracy
    }
    return metrics


def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    result = multi_label_metrics(predictions=preds, labels=p.label_ids)
    return result


###  Creazione trainer, e allenamento per 5 epoche , l'ultima epoca non è stata completata.

In [None]:

from transformers import TrainingArguments, Trainer

# Definizione degli argomenti di training
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=16,
    per_device_eval_batch_size=8,
    evaluation_strategy="epoch",
    warmup_steps=20,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=4,
    fp16=True,
    save_strategy="epoch",
    load_best_model_at_end=True,
    
) 

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
    
)

# Addestramento
trainer.train()

# Salva il modello migliore
model.save_pretrained("./best_model")

# Valutazione
trainer.evaluate()


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
  'labels': torch.tensor(self.labels[idx], dtype=torch.float)
Could not estimate the number of tokens of the input, floating-point operations will not be computed


Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
0,0.5475,0.524543,0.8672,0.91227,0.8672
2,0.5113,0.487825,0.930067,0.959081,0.930067
4,0.5127,0.473986,0.9418,0.967134,0.9418


  'labels': torch.tensor(self.labels[idx], dtype=torch.float)
  'labels': torch.tensor(self.labels[idx], dtype=torch.float)
  'labels': torch.tensor(self.labels[idx], dtype=torch.float)
  'labels': torch.tensor(self.labels[idx], dtype=torch.float)


### 5 epoca 

In [14]:

from transformers import TrainingArguments, Trainer
model = ModifiedModelForBinaryClassification.from_pretrained("/kaggle/working/results/checkpoint-3750").to(device)

# Definizione degli argomenti di training
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=6,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=16,
    per_device_eval_batch_size=8,
    evaluation_strategy="epoch",
    warmup_steps=20,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=4,
    fp16=True,
    save_strategy="epoch",
    load_best_model_at_end=True,
    resume_from_checkpoint = True,
    
) #save_strategy="epoch"
#load_best_model_at_end=True
 # Valuta ad ogni epoca
    #logging_strategy="steps",
#logging_strategy="steps",
# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    
)

# Addestramento

trainer.train("/kaggle/working/results/checkpoint-3750")

# Salva il modello migliore
model.save_pretrained("./best_model")

# Valutazione
trainer.evaluate()


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
  'labels': torch.tensor(self.labels[idx], dtype=torch.float)
Could not estimate the number of tokens of the input, floating-point operations will not be computed


Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
6,0.4469,0.477389,0.931667,0.96598,0.931667


  'labels': torch.tensor(self.labels[idx], dtype=torch.float)


{'eval_loss': 0.4799279272556305,
 'eval_f1': 0.935,
 'eval_roc_auc': 0.9624858119880568,
 'eval_accuracy': 0.935,
 'eval_runtime': 553.6594,
 'eval_samples_per_second': 5.418,
 'eval_steps_per_second': 0.677,
 'epoch': 6.0}

### Allenato il modello , ho riadattato il codice per estrarre gli embeddings , prima il Dataloader e infinie il modello

In [2]:

class SingleTextDataset(Dataset):
    def __init__(self, texts, tokenizer, max_length=1024):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __getitem__(self, idx):
        if idx >= len(self.texts):
            raise IndexError("Index out of range")
        
        text = self.texts[idx]
        inputs = self.tokenizer(text, return_tensors="pt", max_length=self.max_length, padding="max_length", truncation=True)
        
        return {
            'input_ids': inputs['input_ids'].squeeze(0),
            'attention_mask': inputs['attention_mask'].squeeze(0)
        }

    def __len__(self):
        return len(self.texts)


In [4]:


class ModifiedModelForBinaryClassification(LongformerPreTrainedModel):
    def __init__(self, config):
        super(ModifiedModelForBinaryClassification, self).__init__(config)
        self.longformer = LongformerModel(config)
        self.init_weights()
    
    def forward(self, input_ids_1, attention_mask_1, input_ids_2=None, attention_mask_2=None, labels=None):
    # global attention mask con attenzione solo sul primo token 
        global_attention_mask_1 = torch.zeros_like(input_ids_1)
        global_attention_mask_1[:, 0] = 1 
        
        outputs_1 = self.longformer(input_ids_1, attention_mask=attention_mask_1, 
                                    global_attention_mask=global_attention_mask_1)
        sequence_output_1 = outputs_1['last_hidden_state']
        cls_token_1 = sequence_output_1[:, 0, :]  
        
        if input_ids_2 is None:
            return cls_token_1
        
        
        global_attention_mask_2 = torch.zeros_like(input_ids_2)
        global_attention_mask_2[:, 0] = 1  
        
        outputs_2 = self.longformer(input_ids_2, attention_mask=attention_mask_2, 
                                    global_attention_mask=global_attention_mask_2)
        sequence_output_2 = outputs_2['last_hidden_state']
        cls_token_2 = sequence_output_2[:, 0, :]  
        if(inference == True):
            return cls_token_1,cls_token_2
        logits = cosine_similarity(cls_token_1, cls_token_2).unsqueeze(-1)  # Assicurati che i logits siano della forma [batch_size, 1]

        loss = None
        if labels is not None:
                loss_fct = nn.BCEWithLogitsLoss()
                labels = labels.view(-1, 1).float()  # Ridimensiona le etichette per batch_size =1
                loss = loss_fct(logits, labels)

        return (loss, logits) if loss is not None else logits



In [5]:
model = ModifiedModelForBinaryClassification.from_pretrained("/kaggle/working/results/checkpoint-3750").to(device)
model

ModifiedModelForBinaryClassification(
  (longformer): LongformerModel(
    (embeddings): LongformerEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
      (position_embeddings): Embedding(4098, 768, padding_idx=1)
    )
    (encoder): LongformerEncoder(
      (layer): ModuleList(
        (0-11): 12 x LongformerLayer(
          (attention): LongformerAttention(
            (self): LongformerSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (query_global): Linear(in_features=768, out_features=768, bias=True)
              (key_global): Linear(in_features=768, out_features=768, bias=True)
         

### Estrazione dei cls per i vari dataset

In [7]:
dataset = SingleTextDataset(df["Text"], tokenizer, max_length=1024)

data_loader = DataLoader(dataset, batch_size=16, shuffle=False)

model.eval()
embeddings = []
with torch.no_grad():
    for batch in data_loader:
        
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        outputs = model(input_ids_1=input_ids, attention_mask_1=attention_mask)
        
        # Supponendo che il modello restituisca gli embeddings come parte dell'output
        # Ad esempio, potresti avere modificato il modello per restituire il token CLS
        embeddings.append(outputs.cpu().squeeze(1))
        
embeddings_def =  torch.cat(embeddings, dim=0)

In [8]:
numpy_array = embeddings_def.numpy()
df_t = pd.DataFrame(numpy_array)
df_t.to_csv('emb_per_raffa_knn.csv', index=False) 

In [6]:
df = pd.read_csv("/kaggle/input/longformerdata/dataset_Longformer.csv")

In [7]:
df = df[df['Text'].apply(lambda x: isinstance(x, str))]

In [10]:
dataset = SingleTextDataset(df_1["Text"], tokenizer, max_length=1024)

data_loader = DataLoader(dataset, batch_size=64, shuffle=False)

model.eval()
embeddingsl = []
with torch.no_grad():
    for batch in data_loader:
        
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        outputs = model(input_ids_1=input_ids, attention_mask_1=attention_mask)
        
        embeddingsl.append(outputs.cpu().squeeze(1))
        
embeddings_def =  torch.cat(embeddingsl, dim=0)

In [12]:
numpy_array = embeddings_def.numpy()
df_l = pd.DataFrame(numpy_array)
df_l.to_csv('emb_per_raffa_long.csv', index=False) 

In [16]:
df_2.reset_index(inplace=True)

In [19]:
from tqdm import tqdm

dataset = SingleTextDataset(df_2["Text"], tokenizer, max_length=1024)
data_loader = DataLoader(dataset, batch_size=64, shuffle=False)
model.eval()
embeddingsl = []

# Misura il numero totale di iterazioni
total_iterations = len(data_loader)

# Crea una barra di avanzamento
progress_bar = tqdm(total=total_iterations, desc='Estrazione embeddings', unit='batch')

with torch.no_grad():
    for batch in data_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        outputs = model(input_ids_1=input_ids, attention_mask_1=attention_mask)
        embeddingsl.append(outputs.cpu().squeeze(1))
        
        # Aggiorna la barra di avanzamento
        progress_bar.update(1)

# Ferma la barra di avanzamento
progress_bar.close()

embeddings_def = torch.cat(embeddingsl, dim=0)


Estrazione embeddings: 100%|██████████| 633/633 [41:20<00:00,  3.92s/batch]


In [20]:
numpy_array = embeddings_def.numpy()
df_l = pd.DataFrame(numpy_array)
df_l.to_csv('emb_per_raffa_long_p2.csv', index=False) 