In [1]:
# Import standard libraries
import json
import time
import datetime
import gc
import random
import re

# Import data handling libraries
import numpy as np
import pandas as pd

# Import NLP libraries
import nltk
from nltk.corpus import stopwords

# Import utilities for displaying and progress tracking
from tabulate import tabulate
from tqdm import trange

# Import machine learning libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import classification_report

# Import PyTorch libraries
import torch
import torch.nn as nn
from torch.utils.data import (
    TensorDataset,
    DataLoader,
    RandomSampler,
    SequentialSampler,
    random_split
)


# Import Transformers library components
import transformers
from transformers import (
    BertForSequenceClassification,
    AdamW,
    BertConfig,
    BertTokenizer,
    get_linear_schedule_with_warmup
)


In [2]:
# Use the first CUDA GPU when one is available; otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [3]:
# Location of the AG News dataset and the parquet shard of each split.
AG_NEWS_ROOT = "hf://datasets/wangrongsheng/ag_news/"
splits = {'train': 'data/train-00000-of-00001.parquet', 'test': 'data/test-00000-of-00001.parquet'}

# Read both splits into DataFrames: `df` is the training split, `df_test` the test split.
df, df_test = (pd.read_parquet(AG_NEWS_ROOT + splits[name]) for name in ('train', 'test'))

# Label ids: World (0), Sports (1), Business (2), Sci/Tech (3).

df['label'].value_counts()

label
2    30000
3    30000
1    30000
0    30000
Name: count, dtype: int64

In [90]:
# Load the JSON Lines file (one JSON object per line) into a DataFrame.
# NOTE(review): this rebinds `df`, which the previous cell bound to the AG News
# training split; the "ETL primo" cells further down still read `df` — confirm
# the intended execution order.
df = pd.read_json('dataset.json', lines=True)


## ETL

In [4]:
# Download the NLTK stop-word corpus.
nltk.download('stopwords')
# English stop words; read as a global by clean_text.
sw = stopwords.words('english')

# Patterns and tables used by clean_text, built once instead of on every call.
_HTML_TAG_RE = re.compile(r'<.*?>')
_URL_RE = re.compile(r"http\S+")
_DISALLOWED_CHARS_RE = re.compile(r"[^a-zA-Z?.!,¿]+")
_PUNCTUATION_TABLE = str.maketrans('', '', '@#!?+&*[]-%.:/();$=><|{}^' + "'`" + '_')


def clean_text(text, stop_words=None):
    """Normalise a piece of text for downstream processing.

    Steps, in order:
      1. lower-case the text;
      2. strip HTML tags and URLs (anything starting with ``http``);
      3. replace every run of characters other than ASCII letters and
         ``? . ! , ¿`` with a single space (this also removes digits, emoji
         and any other symbols);
      4. delete the remaining ``! ? .`` characters;
      5. drop stop words and collapse whitespace.

    HTML and URL stripping must happen *before* step 3: once ``<``, ``>``,
    ``:`` and ``/`` have been turned into spaces the tag and URL patterns can
    no longer match, which left tag names and URL fragments in the output.

    Args:
        text: The string to clean.
        stop_words: Optional iterable of lower-case words to remove. When
            omitted, the module-level ``sw`` list is used.

    Returns:
        The cleaned text as a single space-separated string.
    """
    if stop_words is None:
        stop_words = sw
    # Set membership is O(1); the NLTK list would be scanned once per word.
    stop_words = frozenset(stop_words)

    text = text.lower()
    # Tags first, so a URL inside an attribute disappears with its tag instead
    # of swallowing the text that follows it.
    text = _HTML_TAG_RE.sub(" ", text)
    text = _URL_RE.sub(" ", text)
    text = _DISALLOWED_CHARS_RE.sub(" ", text)
    # NOTE(review): commas and '¿' survive this step, as they always have —
    # confirm whether that is intended.
    text = text.translate(_PUNCTUATION_TABLE)
    return " ".join(word for word in text.split() if word not in stop_words)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dswal\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### ETL primo

In [5]:
# Number of rows kept for every class in the balanced sample.
SAMPLES_PER_CLASS = 4400

# Merge the two datasets.
df_combined = pd.concat([df, df_test])

# Count the occurrences of each class in the combined dataset.
class_counts = df_combined['label'].value_counts()
print(class_counts)

# Draw the same number of rows from every class. GroupBy.sample replaces the
# former groupby(...).apply(lambda x: x.sample(...)), which emits a deprecation
# warning because apply operates on the grouping column. The draw is still
# deterministic for a given random_state, but the selected rows are not the
# same ones the apply-based version picked.
df_final = (
    df_combined
    .groupby('label')
    .sample(n=SAMPLES_PER_CLASS, random_state=42)
    .reset_index(drop=True)  # discard the original (duplicated) row labels
)

# Check that every class now has SAMPLES_PER_CLASS rows.
print(df_final['label'].value_counts())

label
2    31900
3    31900
1    31900
0    31900
Name: count, dtype: int64
label
0    4400
1    4400
2    4400
3    4400
Name: count, dtype: int64


  df_final = df_combined.groupby('label').apply(lambda x: x.sample(n=4400, random_state=42))


In [6]:
# Clean the sampled frame, not `df`: the next cell splits `df_final`, so
# cleaning `df` here never reached the train/test split.
df_final['text'] = df_final['text'].apply(clean_text)

In [7]:
# Split the sampled data 80% / 20% into training and test sets.
df_train, df_test_split = train_test_split(df_final, test_size=0.2, random_state=42)

# Report the size of the resulting datasets.
print(f"Training set size: {df_train.shape}")
print(f"Test set size: {df_test_split.shape}")


Training set size: (14080, 2)
Test set size: (3520, 2)


### ETL secondo

In [91]:
# Keep only the target column and the text column.
df = df.loc[:, ['category', 'short_description']]

In [92]:
# Number of rows after the column selection.
len(df)

209527

In [93]:
# Keep descriptions that contain between 20 and 100 whitespace-separated words (inclusive).
word_counts = df['short_description'].str.split().str.len()
df = df[word_counts.between(20, 100)]


In [94]:
# Rows per category after the length filter.
df['category'].value_counts()

category
WELLNESS          15568
POLITICS           9608
PARENTING          7605
TRAVEL             7401
STYLE & BEAUTY     7117
ENTERTAINMENT      3939
BUSINESS           3373
WEDDINGS           3039
DIVORCE            2861
HEALTHY LIVING     2844
QUEER VOICES       2705
FOOD & DRINK       2668
HOME & LIVING      2403
IMPACT             2243
PARENTS            1752
BLACK VOICES       1714
WOMEN              1555
MONEY              1463
COMEDY             1406
SPORTS             1319
WORLDPOST          1166
GREEN              1141
ENVIRONMENT        1066
RELIGION           1060
FIFTY              1021
WORLD NEWS         1016
CRIME               989
THE WORLDPOST       879
TECH                877
SCIENCE             869
ARTS                823
CULTURE & ARTS      816
MEDIA               746
EDUCATION           712
U.S. NEWS           695
COLLEGE             563
TASTE               545
STYLE               338
LATINO VOICES       337
GOOD NEWS           254
WEIRD NEWS          250
ARTS & 

In [95]:
# Restrict the dataset to the five categories used for classification.
categories_to_include = ['WELLNESS', 'POLITICS', 'TRAVEL', 'ENTERTAINMENT', 'BUSINESS']
# .copy() gives an independent frame, so the column assignment in the next
# cell does not write to a filtered slice (SettingWithCopyWarning).
df = df[df['category'].isin(categories_to_include)].copy()


In [None]:
# Apply clean_text to the 'short_description' column and store the result in a new column.
# NOTE(review): the later cells visible in this notebook build `sentences` from
# 'short_description', not from this cleaned column — confirm that is intended.
df['short_description_cleaned'] = df['short_description'].apply(clean_text)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dswal\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [98]:
# Number of rows left in the filtered dataset.
len(df)

37550

## Bilanciamento secondo dataset

In [None]:
# Size of the smallest category; every category is down-sampled to this size.
minority_class_size = df['category'].value_counts().min()

# Draw the same number of rows from each category (same seed for every group)
# and stack the per-category samples into a single DataFrame.
balanced_df = pd.concat([
    group.sample(n=min(minority_class_size, len(group)), random_state=42)
    for _, group in df.groupby('category')
])

# Check the result.
balanced_df['category'].value_counts()

category
BUSINESS         3183
ENTERTAINMENT    3183
POLITICS         3183
TRAVEL           3183
WELLNESS         3183
Name: count, dtype: int64

In [100]:
# Category distribution of the unbalanced frame, for comparison with balanced_df.
df['category'].value_counts()

category
WELLNESS         14463
POLITICS          9147
TRAVEL            6981
ENTERTAINMENT     3776
BUSINESS          3183
Name: count, dtype: int64

## Encoding

In [101]:
# Fit a LabelEncoder on the category names...
label_encoder = LabelEncoder().fit(balanced_df['category'])

# ...and replace the 'category' column with the corresponding integer codes.
balanced_df['category'] = label_encoder.transform(balanced_df['category'])

In [102]:
# Show which integer code the encoder assigned to each category name.
category_mapping = {name: code for code, name in enumerate(label_encoder.classes_)}

print("Corrispondenza tra categorie e numeri:")
print(category_mapping)


Corrispondenza tra categorie e numeri:
{'BUSINESS': 0, 'ENTERTAINMENT': 1, 'POLITICS': 2, 'TRAVEL': 3, 'WELLNESS': 4}


## Train_test_split

In [103]:
# Split the balanced data 80/20, stratified on the encoded category.
train_df, test_df = train_test_split(balanced_df, test_size=0.2, stratify=balanced_df['category'], random_state=42)

# Check the class distribution of each split.
print("Distribuzione nel training set:")
print(train_df['category'].value_counts())

print("\nDistribuzione nel testing set:")
print(test_df['category'].value_counts())

Distribuzione nel training set:
category
1    2547
4    2547
3    2546
2    2546
0    2546
Name: count, dtype: int64

Distribuzione nel testing set:
category
2    637
0    637
3    637
1    636
4    636
Name: count, dtype: int64


In [104]:
# Plain Python lists of the training texts and their encoded categories.
sentences = train_df['short_description'].tolist()
labels = train_df['category'].tolist()

## Tokenizzazione

In [105]:
# Load the tokenizer for the 'bert-base-uncased' checkpoint, with lower-casing enabled.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [106]:
# Show how the first training sentence is split into tokens and mapped to ids.
index=0
print(' Original: ', sentences[index])

# Tokenize once and reuse the tokens for the id lookup.
tokens = tokenizer.tokenize(sentences[index])
token_ids = tokenizer.convert_tokens_to_ids(tokens)
table = np.array([tokens, token_ids]).T
print(tabulate(table,headers = ['Tokens', 'Token IDs'],tablefmt = 'fancy_grid'))


 Original:  With Easter just around the corner, kid-friendly resortsare offering spring savings along with bunny-themed events and thousands upon thousands of hidden eggs.
╒═══════════╤═════════════╕
│ Tokens    │   Token IDs │
╞═══════════╪═════════════╡
│ with      │        2007 │
├───────────┼─────────────┤
│ easter    │       10957 │
├───────────┼─────────────┤
│ just      │        2074 │
├───────────┼─────────────┤
│ around    │        2105 │
├───────────┼─────────────┤
│ the       │        1996 │
├───────────┼─────────────┤
│ corner    │        3420 │
├───────────┼─────────────┤
│ ,         │        1010 │
├───────────┼─────────────┤
│ kid       │        4845 │
├───────────┼─────────────┤
│ -         │        1011 │
├───────────┼─────────────┤
│ friendly  │        5379 │
├───────────┼─────────────┤
│ resorts   │       16511 │
├───────────┼─────────────┤
│ ##are     │       12069 │
├───────────┼─────────────┤
│ offering  │        5378 │
├───────────┼─────────────┤
│ spring    │   

In [None]:
# Longest encoded training sentence, counting the `[CLS]` and `[SEP]` tokens
# that `encode` adds; 0 when there are no sentences.
max_len = max(
    (len(tokenizer.encode(sentence, add_special_tokens=True)) for sentence in sentences),
    default=0,
)

print('Max sentence length: ', max_len)

Max sentence length:  84


In [108]:
# Encode every training sentence in a single batched tokenizer call. For each
# sentence the tokenizer will:
#   (1) Tokenize the sentence.
#   (2) Prepend the `[CLS]` token to the start.
#   (3) Append the `[SEP]` token to the end.
#   (4) Map tokens to their IDs.
#   (5) Pad or truncate the sentence to `max_length`.
#   (6) Create attention masks that separate real tokens from `[PAD]` tokens.
# `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
encoded_batch = tokenizer(
    sentences,                      # Sentences to encode.
    add_special_tokens = True,      # Add '[CLS]' and '[SEP]'.
    max_length = max_len,           # Pad & truncate all sentences to this length.
    truncation = True,
    padding = 'max_length',
    return_attention_mask = True,   # Construct attention masks.
    return_tensors = 'pt',          # Return PyTorch tensors.
)

# One row per sentence, `max_len` columns each.
input_ids = encoded_batch['input_ids']
attention_masks = encoded_batch['attention_mask']

# as_tensor accepts both the original list and an existing tensor, so the cell
# can be re-run without a copy-construct warning.
labels = torch.as_tensor(labels)





In [109]:
# Print a randomly chosen training sentence, followed by its encoding as a tensor of token IDs.
i=random.randint(0,len(sentences)-1)
print('Original: ', sentences[i])
print('Input IDs:', input_ids[i])

Original:  One of the key components of transformation and innovation is the business model, and since the ability of companies to transform and reinvent themselves is crucial to their lifeline, I went straight to the source.
Input IDs: tensor([  101,  2028,  1997,  1996,  3145,  6177,  1997,  8651,  1998,  8144,
         2003,  1996,  2449,  2944,  1010,  1998,  2144,  1996,  3754,  1997,
         3316,  2000, 10938,  1998, 27788, 15338,  3209,  2003, 10232,  2000,
         2037,  2166,  4179,  1010,  1045,  2253,  3442,  2000,  1996,  3120,
         1012,   102,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0])


In [110]:
def print_rand_sentence_encoding():
  '''Displays tokens, token IDs and attention mask of a random text sample.

  Reads the module-level `sentences`, `input_ids`, `attention_masks` and
  `tokenizer`; prints the original sentence followed by a three-column table.
  '''
  index = random.randint(0, len(sentences) - 1)
  token_ids = input_ids[index].tolist()
  attention = attention_masks[index].tolist()
  # Map ids straight back to tokens. Decoding to a string and re-tokenizing it
  # is not guaranteed to give one token per id, which would misalign the
  # table columns.
  tokens = tokenizer.convert_ids_to_tokens(token_ids)

  print(sentences[index])
  print(tabulate(list(zip(tokens, token_ids, attention)),
                 headers = ['Tokens', 'Token IDs', 'Attention Mask'],
                 tablefmt = 'fancy_grid'))

print_rand_sentence_encoding()

When Kathy discovered that her son, Erik, was snorting heroin, she decided to give it a try, too. “I’m kind of embarrassed
╒═════════════╤═════════════╤══════════════════╕
│ Tokens      │   Token IDs │   Attention Mask │
╞═════════════╪═════════════╪══════════════════╡
│ [CLS]       │         101 │                1 │
├─────────────┼─────────────┼──────────────────┤
│ when        │        2043 │                1 │
├─────────────┼─────────────┼──────────────────┤
│ kathy       │       14986 │                1 │
├─────────────┼─────────────┼──────────────────┤
│ discovered  │        3603 │                1 │
├─────────────┼─────────────┼──────────────────┤
│ that        │        2008 │                1 │
├─────────────┼─────────────┼──────────────────┤
│ her         │        2014 │                1 │
├─────────────┼─────────────┼──────────────────┤
│ son         │        2365 │                1 │
├─────────────┼─────────────┼──────────────────┤
│ ,           │        1010 │               

## Train Validation Split

In [111]:
# Combine the training inputs into a TensorDataset.
dataset = TensorDataset(input_ids, attention_masks, labels)

# Create an 80-20 train-validation split.

# Calculate the number of samples to include in each set; the validation set
# takes whatever is left so that the two sizes always add up to len(dataset).
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size

# Divide the dataset by randomly selecting samples. The seeded generator makes
# the split identical on every run.
train_dataset, val_dataset = random_split(
    dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(42),
)

print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))

10,185 training samples
2,547 validation samples


In [None]:
# For fine-tuning BERT on a specific task, a batch size of 16 or 32 is recommended.
batch_size = 16

# Create the DataLoaders for the training set and the validation set.
# Training samples are drawn in random order.
train_dataloader = DataLoader(
            train_dataset,  # The training set.
            sampler = RandomSampler(train_dataset), # Select batches randomly.
            batch_size = batch_size # Train with this batch size.
        )


validation_dataloader = DataLoader(
            val_dataset, # The validation set.
            sampler = SequentialSampler(val_dataset), # Select batches sequentially.
            batch_size = batch_size # Evaluate the model with this batch size.
        )

## Metrics

In [None]:
def b_tp(preds, labels, positive_class=1):
  '''Returns True Positives (TP): samples of `positive_class` predicted as `positive_class`.'''
  preds, labels = np.asarray(preds), np.asarray(labels)
  return int(np.sum((preds == positive_class) & (labels == positive_class)))

def b_fp(preds, labels, positive_class=1):
  '''Returns False Positives (FP): samples of another class predicted as `positive_class`.'''
  preds, labels = np.asarray(preds), np.asarray(labels)
  return int(np.sum((preds == positive_class) & (labels != positive_class)))

def b_tn(preds, labels, positive_class=1):
  '''Returns True Negatives (TN): samples of another class not predicted as `positive_class`.'''
  preds, labels = np.asarray(preds), np.asarray(labels)
  return int(np.sum((preds != positive_class) & (labels != positive_class)))

def b_fn(preds, labels, positive_class=1):
  '''Returns False Negatives (FN): samples of `positive_class` predicted as another class.'''
  preds, labels = np.asarray(preds), np.asarray(labels)
  return int(np.sum((preds != positive_class) & (labels == positive_class)))

def b_metrics(preds, labels, positive_class=1):
  '''
  Returns the following metrics for one batch:
    - accuracy    = correct predictions / N   (over all classes)
    - precision   = TP / (TP + FP)
    - recall      = TP / (TP + FN)
    - specificity = TN / (TN + FP)

  Precision, recall and specificity are computed one-vs-rest for
  `positive_class`. With labels restricted to {0, 1} and the default
  `positive_class=1` every value equals the plain binary definition; with more
  classes the counts stay correct instead of silently ignoring classes >= 2.

  Args:
    preds: 2-D array of per-class scores, one row per sample.
    labels: array of integer class ids, one per sample.
    positive_class: class id treated as "positive" for TP/FP/TN/FN.

  Returns:
    (accuracy, precision, recall, specificity). A ratio whose denominator is
    zero is returned as the string 'nan', which callers filter out.

  Raises:
    ValueError: if `labels` is empty.
  '''
  preds = np.argmax(preds, axis = 1).flatten()
  labels = np.asarray(labels).flatten()
  if labels.size == 0:
    raise ValueError('b_metrics() requires at least one sample')
  tp = b_tp(preds, labels, positive_class)
  tn = b_tn(preds, labels, positive_class)
  fp = b_fp(preds, labels, positive_class)
  fn = b_fn(preds, labels, positive_class)
  # Count every correct prediction; (tp + tn) / N is only the accuracy when
  # there are exactly two classes.
  b_accuracy = int(np.count_nonzero(preds == labels)) / labels.size
  b_precision = tp / (tp + fp) if (tp + fp) > 0 else 'nan'
  b_recall = tp / (tp + fn) if (tp + fn) > 0 else 'nan'
  b_specificity = tn / (tn + fp) if (tn + fp) > 0 else 'nan'
  return b_accuracy, b_precision, b_recall, b_specificity

## Creazione modello BERT

In [None]:
# Load BertForSequenceClassification, the pretrained BERT model with a single
# linear classification layer on top.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", # Use the 12-layer BERT model, with an uncased vocab.
    # One output per encoded category. A hard-coded 2 does not match the
    # five classes produced by label_encoder, so labels >= 2 would be out of
    # range for the classification head.
    num_labels = len(label_encoder.classes_),
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
)

# Move the model to the selected device (GPU when available, otherwise CPU).
model = model.to(device)

In [None]:
# Use PyTorch's AdamW: the implementation exported by `transformers` is
# deprecated. weight_decay=0.0 keeps the update rule of the previous optimizer
# (torch's own default is 0.01).
optimizer = torch.optim.AdamW(model.parameters(),
                              lr = 2e-5, # args.learning_rate - default is 5e-5
                              eps = 1e-8, # args.adam_epsilon  - default is 1e-8.
                              weight_decay = 0.0
                             )

## Fine-tuning

In [None]:
# Number of training epochs. The BERT authors recommend between 2 and 4. 
# We chose to run for 4, but we'll see later that this may be over-fitting the
# training data.
epochs = 4

# Total number of training steps is [number of batches] x [number of epochs]. 
# (Note that this is not the same as the number of training samples).
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler. It is sized from `total_steps`, so it
# must be rebuilt whenever `epochs` or the training DataLoader changes.
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

In [None]:
def flat_accuracy(preds, labels):
    """Return the fraction of samples whose highest-scoring class equals the label.

    Args:
        preds: 2-D array of per-class scores, one row per sample.
        labels: NumPy array of integer class ids (any shape; it is flattened).
    """
    predicted = np.argmax(preds, axis=1).ravel()
    expected = labels.ravel()
    hits = (predicted == expected).sum()
    return hits / len(expected)

In [None]:
def format_time(elapsed):
    '''
    Format a duration given in seconds as an h:mm:ss string.

    The value is rounded to the nearest whole second before formatting.
    '''
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [None]:
# Fine-tune for `epochs` passes over the training data and report validation
# metrics after every epoch. `epochs` is the value defined together with the
# scheduler: `total_steps` was derived from it, so it is not redefined here.
for _ in trange(epochs, desc = 'Epoch'):
    
    # ========== Training ==========
    
    # Set model to training mode
    model.train()
    
    # Tracking variables
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0

    for step, batch in enumerate(train_dataloader):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        optimizer.zero_grad()
        # Forward pass; passing labels makes the model return the loss.
        train_output = model(b_input_ids, 
                             token_type_ids = None, 
                             attention_mask = b_input_mask, 
                             labels = b_labels)
        # Backward pass
        train_output.loss.backward()
        optimizer.step()
        # Advance the learning-rate schedule once per optimizer step; without
        # this call the scheduler has no effect and the rate stays constant.
        scheduler.step()
        # Update tracking variables
        tr_loss += train_output.loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1

    # ========== Validation ==========

    # Set model to evaluation mode
    model.eval()

    # Per-batch metric values; averaged over batches when printed below.
    val_accuracy = []
    val_precision = []
    val_recall = []
    val_specificity = []

    for batch in validation_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():
          # Forward pass
          eval_output = model(b_input_ids, 
                              token_type_ids = None, 
                              attention_mask = b_input_mask)
        logits = eval_output.logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        # Calculate validation metrics
        b_accuracy, b_precision, b_recall, b_specificity = b_metrics(logits, label_ids)
        val_accuracy.append(b_accuracy)
        # Update precision only when (tp + fp) !=0; ignore nan
        if b_precision != 'nan': val_precision.append(b_precision)
        # Update recall only when (tp + fn) !=0; ignore nan
        if b_recall != 'nan': val_recall.append(b_recall)
        # Update specificity only when (tn + fp) !=0; ignore nan
        if b_specificity != 'nan': val_specificity.append(b_specificity)

    print('\n\t - Train loss: {:.4f}'.format(tr_loss / nb_tr_steps))
    print('\t - Validation Accuracy: {:.4f}'.format(sum(val_accuracy)/len(val_accuracy)))
    print('\t - Validation Precision: {:.4f}'.format(sum(val_precision)/len(val_precision)) if len(val_precision)>0 else '\t - Validation Precision: NaN')
    print('\t - Validation Recall: {:.4f}'.format(sum(val_recall)/len(val_recall)) if len(val_recall)>0 else '\t - Validation Recall: NaN')
    print('\t - Validation Specificity: {:.4f}\n'.format(sum(val_specificity)/len(val_specificity)) if len(val_specificity)>0 else '\t - Validation Specificity: NaN')


## Test

In [None]:
# Texts to classify with the fine-tuned model.
test_tweets=['WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only.']

# Encode all texts in one batched call, with the same settings as the training
# data. truncation=True keeps inputs longer than max_len at the length the
# model was trained with; padding='max_length' replaces the deprecated
# pad_to_max_length=True.
test_encoding = tokenizer(
    test_tweets,
    add_special_tokens = True,
    max_length = max_len,
    truncation = True,
    padding = 'max_length',
    return_attention_mask = True,
    return_tensors = 'pt',
)
test_input_ids = test_encoding['input_ids']
test_attention_masks = test_encoding['attention_mask']

In [None]:
# Wrap the encoded test inputs (ids and attention masks, no labels) in a DataLoader.
test_dataset = TensorDataset(test_input_ids, test_attention_masks)
test_dataloader = DataLoader(
            test_dataset, # The test samples.
            sampler = SequentialSampler(test_dataset), # Pull out batches sequentially.
            batch_size = batch_size # Evaluate with this batch size.
        )

In [None]:
# Predict a class id for every test input.
# Set evaluation mode explicitly instead of relying on the state left behind
# by the training cell.
model.eval()

predictions = []
for batch in test_dataloader:
    b_input_ids = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    # No gradients are needed for inference.
    with torch.no_grad():
        output = model(b_input_ids,
                       token_type_ids=None,
                       attention_mask=b_input_mask)
    logits = output.logits.detach().cpu().numpy()
    # The predicted class is the index of the largest logit in each row.
    pred_flat = np.argmax(logits, axis=1).flatten()
    predictions.extend(list(pred_flat))

In [None]:
# Pair every input text with its predicted class id.
df_output = pd.DataFrame({'tweets': test_tweets, 'label': predictions})


In [None]:
# Preview the first rows of the prediction table.
df_output.head()