# Librerías

In [55]:
# Import necessary libraries

# --- Standard library ---
import gc  # For garbage collection to manage memory
import itertools  # For working with iterators
import os  # For filesystem checks (e.g. whether a checkpoint directory exists)
import re  # For regular expressions
import warnings  # For handling warnings
warnings.filterwarnings("ignore")  # Ignore warning messages

# --- Third-party ---
import pandas as pd  # For data manipulation and analysis
import numpy as np  # For numerical operations and arrays

import torch  # PyTorch library for deep learning
from transformers import AutoModel, AutoTokenizer  # Transformers library for natural language processing
from transformers import TextDataset, LineByLineTextDataset, DataCollatorForLanguageModeling, \
pipeline, Trainer, TrainingArguments, DataCollatorWithPadding  # Transformers components for text processing
from transformers import AutoModelForSequenceClassification  # Transformer model for sequence classification
from transformers.trainer_utils import get_last_checkpoint  # Locate the newest checkpoint in a directory

# NOTE: keep `nlp` before `datasets` -- both export `Dataset`, and the later import
# (datasets.Dataset) is the one the rest of the notebook relies on.
from nlp import Dataset  # Legacy 'nlp' package Dataset (shadowed by datasets.Dataset below)
from imblearn.over_sampling import RandomOverSampler  # For oversampling to handle class imbalance
import datasets  # Import datasets library
from datasets import Dataset, Image, ClassLabel  # 'Dataset', 'Image' and 'ClassLabel' from Hugging Face datasets
from transformers import pipeline  # Transformers library for pipelines
from bs4 import BeautifulSoup  # For parsing HTML content

import matplotlib.pyplot as plt  # For data visualization
from sklearn.metrics import (  # Import various metrics from scikit-learn
    accuracy_score,  # For calculating accuracy
    roc_auc_score,  # For ROC AUC score
    confusion_matrix,  # For confusion matrix
    classification_report,  # For classification report
    f1_score  # For F1 score
)

from datasets import load_metric  # Import load_metric function to load evaluation metrics

from tqdm import tqdm  # For displaying progress bars
tqdm.pandas()  # Enable progress bars for pandas operations

In [13]:
# Run configuration: every tunable value for the experiment lives in this cell.

# Fraction of the dataset intended for training; the remainder is held out for evaluation.
train_fraction = 0.8

# Number of full passes over the training data.
num_train_epochs = 10

# Peak learning rate handed to the optimizer.
learning_rate = 5e-6

# Per-device batch size used while training.
train_batch_size = 32

# Per-device batch size used while evaluating.
eval_batch_size = 128

# Number of optimizer steps over which the learning rate is warmed up.
warmup_steps = 50

# Weight-decay coefficient used for regularization.
weight_decay = 0.02

# Hub identifier of the pre-trained checkpoint used for both tokenizer and model.
BERT_MODEL = "bert-base-uncased"

# Directory that receives checkpoints and the final saved model.
output_dir = "Trained Models"

In [14]:
%%time
df = pd.read_json('Datasets/News_Category_Dataset_v3.json', lines=True)  # Read the JSON file into a DataFrame
item0 = df.shape[0]  # Store the initial number of items in the DataFrame
df = df.drop_duplicates()  # Remove duplicate rows from the DataFrame
item1 = df.shape[0]  # Store the number of items in the DataFrame after removing duplicates
print(f"There are {item0-item1} duplicates found in the dataset")  # Print the number of duplicates removed
df.category = df.category.replace('THE WORLDPOST','WORLDPOST')
df.category = df.category.replace('PARENTS','PARENTING')
df.category = df.category.replace('ARTS','ARTS & CULTURE')
df.category = df.category.replace('COLLEGE','EDUCATION')
df.category = df.category.replace('STYLE','STYLE & BEAUTY')
df.category = df.category.replace('TASTE','FOOD & DRINK')
df.category = df.category.replace('WEDDINGS','WEDDINGS & DIVORCES')
df.category = df.category.replace('CULTURE & ARTS','ARTS & CULTURE')
df.category = df.category.replace('GREEN','ENVIRONMENT')
df.category = df.category.replace('HEALTHY LIVING','WELLNESS')
df.category = df.category.replace('WORLDPOST','WORLD NEWS')
df.category = df.category.replace('DIVORCE','WEDDINGS & DIVORCES')

df = df.rename(columns={'category': 'label'})  # Rename the 'category' column to 'label'

df['title'] = df['headline'] + '\n\n\n' + df['short_description']  # Create a new 'title' column by combining 'headline' and 'short_description'

df = df[['label', 'title']]  # Select only the 'label' and 'title' columns
df = df[~df['title'].isnull()]  # Remove rows where 'title' is null
df = df[~df['label'].isnull()]  # Remove rows where 'label' is null

print(df.shape)  # Print the shape of the DataFrame after data preprocessing
df.sample(5).T  # Display a random sample of 5 rows from the DataFrame


There are 13 duplicates found in the dataset
(209514, 2)
CPU times: total: 484 ms
Wall time: 1.9 s


Unnamed: 0,45121,77368,122116,20188,189704
label,ENTERTAINMENT,TRAVEL,PARENTING,POLITICS,PARENTING
title,Nina Dobrev's 'La La Land'-Themed Birthday Is ...,3 Italian Medieval Villages You Don't Want To ...,Expert Tips on Getting Girls Into STEM\n\n\n's...,They Came To Clear Roy Moore's Character. They...,Day Care Cost: Report Reveals States Where Chi...


In [15]:
# Alphabetically ordered list of the distinct category names.
labels_list = sorted(df['label'].unique())

# Two-way lookup between category names and their integer ids
# (an id is simply the label's position in `labels_list`).
label2id = {name: idx for idx, name in enumerate(labels_list)}
id2label = {idx: name for name, idx in label2id.items()}

# Show both mappings for reference.
print("Mapping of IDs to Labels:", id2label, '\n')
print("Mapping of Labels to IDs:", label2id)

Mapping of IDs to Labels: {0: 'ARTS & CULTURE', 1: 'BLACK VOICES', 2: 'BUSINESS', 3: 'COMEDY', 4: 'CRIME', 5: 'EDUCATION', 6: 'ENTERTAINMENT', 7: 'ENVIRONMENT', 8: 'FIFTY', 9: 'FOOD & DRINK', 10: 'GOOD NEWS', 11: 'HOME & LIVING', 12: 'IMPACT', 13: 'LATINO VOICES', 14: 'MEDIA', 15: 'MONEY', 16: 'PARENTING', 17: 'POLITICS', 18: 'QUEER VOICES', 19: 'RELIGION', 20: 'SCIENCE', 21: 'SPORTS', 22: 'STYLE & BEAUTY', 23: 'TECH', 24: 'TRAVEL', 25: 'U.S. NEWS', 26: 'WEDDINGS & DIVORCES', 27: 'WEIRD NEWS', 28: 'WELLNESS', 29: 'WOMEN', 30: 'WORLD NEWS'} 

Mapping of Labels to IDs: {'ARTS & CULTURE': 0, 'BLACK VOICES': 1, 'BUSINESS': 2, 'COMEDY': 3, 'CRIME': 4, 'EDUCATION': 5, 'ENTERTAINMENT': 6, 'ENVIRONMENT': 7, 'FIFTY': 8, 'FOOD & DRINK': 9, 'GOOD NEWS': 10, 'HOME & LIVING': 11, 'IMPACT': 12, 'LATINO VOICES': 13, 'MEDIA': 14, 'MONEY': 15, 'PARENTING': 16, 'POLITICS': 17, 'QUEER VOICES': 18, 'RELIGION': 19, 'SCIENCE': 20, 'SPORTS': 21, 'STYLE & BEAUTY': 22, 'TECH': 23, 'TRAVEL': 24, 'U.S. NEWS': 25

In [16]:
# Convert the cleaned pandas DataFrame 'df' into a Hugging Face `Dataset`.
# NOTE(review): the DataFrame index appears to be carried along as an extra
# '__index_level_0__' column (a later cell removes a column of that name).
dataset = Dataset.from_pandas(df)

In [17]:
# ClassLabel feature that ties each category name to its integer id
# (ids follow the order of `labels_list`).
ClassLabels = ClassLabel(num_classes=len(labels_list), names=labels_list)


def map_label2id(example):
    """Replace the string label(s) in ``example`` with their integer class ids.

    Called through ``Dataset.map(..., batched=True)``, so ``example['label']``
    holds the labels of a whole batch; ``ClassLabel.str2int`` converts them in
    one call.

    Parameters:
        example (dict): Batch of rows containing a ``'label'`` entry.

    Returns:
        dict: The same batch with ``'label'`` converted to integer ids.
    """
    example['label'] = ClassLabels.str2int(example['label'])
    return example


dataset = dataset.map(map_label2id, batched=True)

# Declare the label column as a ClassLabel so the split below can stratify on it.
dataset = dataset.cast_column('label', ClassLabels)

# Fixed seed for the shuffle: training is resumed from checkpoints across kernel
# restarts, so the held-out rows must be the same every time, otherwise earlier
# test rows leak into the training split of a later session.
split_seed = 42

# Stratified split driven by the `train_fraction` parameter instead of a
# hard-coded ratio; the remainder becomes the test split.
dataset = dataset.train_test_split(
    train_size=train_fraction,
    shuffle=True,
    stratify_by_column="label",
    seed=split_seed,
)

# Training split.
df_train = dataset['train']

# Held-out test split.
df_test = dataset['test']

Map: 100%|██████████| 209514/209514 [00:00<00:00, 622741.45 examples/s]
Casting the dataset: 100%|██████████| 209514/209514 [00:00<00:00, 4825146.92 examples/s]


In [18]:
# The data now lives in `dataset` / `df_train` / `df_test`, so drop the name 'df'.
del df

# Run a garbage-collection pass; the value shown as the cell output is gc.collect()'s return value.
gc.collect()

117

In [19]:
# Load the tokenizer that matches the pre-trained checkpoint named by BERT_MODEL.
# - 'use_fast=True' requests the fast tokenizer implementation.
# The former 'low_cpu_mem_usage=False' argument was dropped: it is a model-loading
# option and has no meaning for a tokenizer.
tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL, use_fast=True)

In [20]:
# Tokenization step applied to the datasets before training.

def preprocess_function(examples):
    """Tokenize the ``'title'`` text of ``examples`` with the notebook's tokenizer.

    ``truncation=True`` asks the tokenizer to cut inputs that exceed the
    model's maximum input length. No padding is requested here.

    Parameters:
        examples: Mapping with a ``'title'`` entry holding the text(s) to tokenize.

    Returns:
        The tokenizer's output for those titles.
    """
    titles = examples["title"]
    encoded = tokenizer(titles, truncation=True)
    return encoded

# Tokenize both splits with preprocess_function.

# Despite the 'df_' prefix, df_train and df_test are the train / test splits of the
# Hugging Face dataset, not pandas DataFrames. Each row has a 'title' field to tokenize.

# 'map' applies the function across the dataset; 'batched=True' hands it
# batches of rows instead of one row at a time, which is more efficient.

df_train = df_train.map(preprocess_function, batched=True)
df_test = df_test.map(preprocess_function, batched=True)

Map: 100%|██████████| 167611/167611 [00:10<00:00, 16380.06 examples/s]
Map: 100%|██████████| 41903/41903 [00:02<00:00, 15673.84 examples/s]


In [21]:
# Columns that are no longer needed once the text has been tokenized:
# the raw 'title' text and the '__index_level_0__' column.
unused_columns = ['title', '__index_level_0__']

# Drop them from the training split...
df_train = df_train.remove_columns(unused_columns)

# ...and from the testing split.
df_test = df_test.remove_columns(unused_columns)

In [22]:
# Inspect the training split (features and row count).
df_train

Dataset({
    features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 167611
})

In [23]:
# Inspect the testing split (features and row count).
df_test

Dataset({
    features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 41903
})

In [24]:
# DataCollatorWithPadding builds the batches and pads each one dynamically to the
# length of its longest element, so all sequences in a batch have the same length.
# Padding could instead be done in the tokenizer call (padding=True), but dynamic
# per-batch padding is more efficient.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [25]:
# Sanity check: decode the 'input_ids' of the first training row back into text
# to see what the model will actually receive.
tokenizer.decode(df_train[0]['input_ids'])

'[CLS] salah abdeslam, prime suspect in paris attacks, will not fight extradition to france abdeslam is due in court in brussels on march 31. [SEP]'

# Instanciamiento y Entrenamiento

In [26]:
# Instantiate the pre-trained checkpoint with a sequence-classification head
# sized to the number of categories. Attention weights and hidden states are
# not needed downstream, so the model is told not to return them.
model = AutoModelForSequenceClassification.from_pretrained(
    BERT_MODEL,
    num_labels=len(labels_list),
    output_attentions=False,
    output_hidden_states=False,
)

# Store both label mappings in the model config so they are available later
# alongside the model.
model.config.label2id = label2id  # category name -> class index
model.config.id2label = id2label  # class index -> category name

# Report the number of trainable parameters, in millions.
trainable_params_millions = model.num_parameters(only_trainable=True) / 1e6
print(trainable_params_millions)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


109.506079


In [27]:
# Accuracy is computed locally with NumPy. The previous implementation went through
# `datasets.load_metric("accuracy")`, which is deprecated and loads the metric
# through the datasets metric loader every time the cell runs.

def compute_metrics(eval_pred):
    """Compute classification accuracy for the Trainer's evaluation loop.

    Parameters:
        eval_pred: Pair ``(logits, labels)``; ``logits`` holds one score per
            class for every example and ``labels`` the true class ids.

    Returns:
        dict: ``{"accuracy": value}`` where ``value`` is the fraction of
        examples whose highest-scoring class equals the true label.
    """
    logits, labels = eval_pred

    # Predicted class = index of the largest logit along the last axis.
    predictions = np.argmax(logits, axis=-1)

    # Mean of the element-wise matches; cast so the metric is a plain Python float.
    accuracy = float(np.mean(predictions == np.asarray(labels)))

    return {"accuracy": accuracy}

In [28]:
# Training configuration, grouped by concern.
training_args = TrainingArguments(
    # --- Where artefacts go ---
    output_dir=output_dir,  # Checkpoints and final model
    logging_dir='./logs',  # Training logs
    report_to="mlflow",  # Send metrics to MLflow

    # --- Optimisation ---
    num_train_epochs=num_train_epochs,
    learning_rate=learning_rate,
    warmup_steps=warmup_steps,
    weight_decay=weight_decay,
    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=eval_batch_size,

    # --- Logging ---
    logging_strategy='steps',
    logging_first_step=True,
    logging_steps=1,  # One log entry per optimizer step (very verbose; handy for debugging)

    # --- Evaluation and checkpointing ---
    evaluation_strategy='epoch',  # Evaluate once per epoch
    # NOTE(review): with evaluation_strategy='epoch' this value looks inert (it is
    # documented for the 'steps' strategy) -- confirm before relying on it.
    eval_steps=1,
    save_strategy='epoch',  # Checkpoint once per epoch
    save_total_limit=1,  # Cap the number of checkpoints kept on disk
    load_best_model_at_end=True,  # Restore the best checkpoint when training finishes
)

# Wire model, data, collator and metric function into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=df_train,
    eval_dataset=df_test,
    data_collator=data_collator,  # Pads each batch dynamically
    compute_metrics=compute_metrics,  # Reports accuracy at evaluation time
)

In [29]:
# Baseline: evaluate on the test split before any training in this session.
trainer.evaluate()
# Get initial metrics

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


KeyboardInterrupt: 

In [30]:
# Start (or continue) training the model.
# `resume_from_checkpoint=True` raises a ValueError when the output directory holds
# no checkpoint yet, which breaks a fresh run. Resolve the newest checkpoint
# explicitly instead and fall back to training from scratch (None) when none exists.
checkpoint_dir = training_args.output_dir
last_checkpoint = get_last_checkpoint(checkpoint_dir) if os.path.isdir(checkpoint_dir) else None
trainer.train(resume_from_checkpoint=last_checkpoint)

Trainer is attempting to log a value of "{0: 'ARTS & CULTURE', 1: 'BLACK VOICES', 2: 'BUSINESS', 3: 'COMEDY', 4: 'CRIME', 5: 'EDUCATION', 6: 'ENTERTAINMENT', 7: 'ENVIRONMENT', 8: 'FIFTY', 9: 'FOOD & DRINK', 10: 'GOOD NEWS', 11: 'HOME & LIVING', 12: 'IMPACT', 13: 'LATINO VOICES', 14: 'MEDIA', 15: 'MONEY', 16: 'PARENTING', 17: 'POLITICS', 18: 'QUEER VOICES', 19: 'RELIGION', 20: 'SCIENCE', 21: 'SPORTS', 22: 'STYLE & BEAUTY', 23: 'TECH', 24: 'TRAVEL', 25: 'U.S. NEWS', 26: 'WEDDINGS & DIVORCES', 27: 'WEIRD NEWS', 28: 'WELLNESS', 29: 'WOMEN', 30: 'WORLD NEWS'}" for key "id2label" as a parameter. MLflow's log_param() only accepts values no longer than 250 characters so we dropped this attribute. You can use `MLFLOW_FLATTEN_PARAMS` environment variable to flatten the parameters and avoid this message.
Trainer is attempting to log a value of "{'ARTS & CULTURE': 0, 'BLACK VOICES': 1, 'BUSINESS': 2, 'COMEDY': 3, 'CRIME': 4, 'EDUCATION': 5, 'ENTERTAINMENT': 6, 'ENVIRONMENT': 7, 'FIFTY': 8, 'FOOD &

{'loss': 0.6302, 'learning_rate': 2.0018154022549207e-06, 'epoch': 6.0}


 60%|██████    | 31430/52380 [00:08<00:02, 7504.10it/s]

{'loss': 0.5709, 'learning_rate': 2.0017198547678198e-06, 'epoch': 6.0}


 60%|██████    | 31431/52380 [00:12<00:02, 7504.10it/s]

{'loss': 0.448, 'learning_rate': 2.001624307280719e-06, 'epoch': 6.0}


 60%|██████    | 31432/52380 [00:17<00:02, 7504.10it/s]

{'loss': 0.5491, 'learning_rate': 2.0015287597936174e-06, 'epoch': 6.0}


 60%|██████    | 31433/52380 [00:21<00:02, 7504.10it/s]

{'loss': 0.8002, 'learning_rate': 2.0014332123065164e-06, 'epoch': 6.0}


 60%|██████    | 31434/52380 [00:25<00:23, 900.05it/s] 

{'loss': 0.7119, 'learning_rate': 2.0013376648194155e-06, 'epoch': 6.0}


 60%|██████    | 31435/52380 [00:29<00:28, 736.35it/s]

{'loss': 0.8113, 'learning_rate': 2.0012421173323145e-06, 'epoch': 6.0}


 60%|██████    | 31436/52380 [00:33<00:28, 736.35it/s]

{'loss': 0.815, 'learning_rate': 2.001146569845213e-06, 'epoch': 6.0}


 60%|██████    | 31437/52380 [00:37<00:28, 736.35it/s]

{'loss': 0.5468, 'learning_rate': 2.001051022358112e-06, 'epoch': 6.0}


KeyboardInterrupt: 

 60%|██████    | 31437/52380 [00:41<00:28, 736.35it/s]

In [None]:
# Final model evaluation on the test split (the Trainer's eval_dataset).
trainer.evaluate()

In [31]:
# Run the trainer's model over the test split 'df_test' and keep the full
# prediction output (used by the evaluation cell below).
outputs = trainer.predict(df_test)

# Print the metrics attached to the prediction output.
print(outputs.metrics)



KeyboardInterrupt: 

In [None]:
# True class ids as returned with the prediction output.
y_true = outputs.label_ids

# Predicted class id = index of the largest score along axis 1
# (assumes predictions has one row per example and one column per class -- TODO confirm).
y_pred = outputs.predictions.argmax(1)

# Define a function to plot a confusion matrix
def plot_confusion_matrix(cm, classes, title='Confusion Matrix', cmap=plt.cm.Blues, figsize=(10, 8)):
    """
    Plot a confusion matrix as an annotated heat map and show it.

    Parameters:
        cm (array-like): Confusion matrix as returned by sklearn.metrics.confusion_matrix.
            Nested lists are accepted as well; the input is converted to a NumPy array.
        classes (list): List of class names, e.g., ['Class 0', 'Class 1'], used as tick labels
            on both axes.
        title (str): Title for the plot.
        cmap (matplotlib colormap): Colormap for the plot.
        figsize (tuple): Figure size (width, height) passed to matplotlib.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Accept any array-like: .max() and .shape below need an ndarray.
    cm = np.asarray(cm)

    # Explicit Figure/Axes objects instead of the implicit pyplot state.
    fig, ax = plt.subplots(figsize=figsize)

    # Display the confusion matrix as an image with a colormap
    image = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.set_title(title)
    fig.colorbar(image, ax=ax)

    # Define tick marks and labels for the classes on the axes
    tick_marks = np.arange(len(classes))
    ax.set_xticks(tick_marks)
    ax.set_xticklabels(classes, rotation=90)
    ax.set_yticks(tick_marks)
    ax.set_yticklabels(classes)

    fmt = '.0f'
    # Annotate every cell with its value; cells above half the maximum get
    # white text, the others black.
    thresh = cm.max() / 2.0
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        ax.text(j, i, format(cm[i, j], fmt), horizontalalignment="center",
                color="white" if cm[i, j] > thresh else "black")

    # Label the axes
    ax.set_ylabel('True label')
    ax.set_xlabel('Predicted label')

    # Ensure the plot layout is tight
    fig.tight_layout()
    # Display the plot
    plt.show()

# Calculate accuracy and macro-averaged F1 score over the test-set predictions
accuracy = accuracy_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred, average='macro')

# Display accuracy and F1 score
print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")

# Pin the class ids explicitly. Without `labels=`, scikit-learn infers the classes
# from the values present in y_true / y_pred, so a class missing from both would
# shrink the matrix and misalign it with the names in `labels_list`.
label_ids = list(range(len(labels_list)))

# Get the confusion matrix if there are a relatively small number of labels
if len(labels_list) <= 120:
    # Compute the confusion matrix with one row/column per entry of labels_list
    cm = confusion_matrix(y_true, y_pred, labels=label_ids)

    # Plot the confusion matrix using the defined function
    plot_confusion_matrix(cm, labels_list, figsize=(18, 16))

# Finally, display classification report
print()
print("Classification report:")
print()
print(classification_report(y_true, y_pred, labels=label_ids, target_names=labels_list, digits=4))

# Guardado y prueba del modelo

In [None]:
# Save the trainer's model; no path is given, so the trainer's configured output location is used.
trainer.save_model()

In [None]:
# Save the complete tokenizer (vocabulary plus its configuration files) next to the
# model. `save_vocabulary` only wrote the vocabulary file, which is not enough to
# reload the tokenizer from this directory, and it expects the directory to exist.
tokenizer.save_pretrained(output_dir)

In [54]:
# Smoke test: build a text-classification pipeline from a saved checkpoint and
# classify one sample headline. The tokenizer is loaded from the base model name.
pipe = pipeline(
    task="text-classification",
    model="Trained Models/checkpoint-31428",
    tokenizer=BERT_MODEL,
)
sample_title = '''Best U.S. Candidates if you are a college student'''

# Show the ten highest-scoring categories for the sample.
pipe(sample_title, top_k=10)

[{'label': 'POLITICS', 'score': 0.6102012395858765},
 {'label': 'EDUCATION', 'score': 0.08717089146375656},
 {'label': 'BUSINESS', 'score': 0.07915697246789932},
 {'label': 'COMEDY', 'score': 0.07314491271972656},
 {'label': 'TRAVEL', 'score': 0.023624446243047714},
 {'label': 'FIFTY', 'score': 0.013131772167980671},
 {'label': 'WELLNESS', 'score': 0.012738733552396297},
 {'label': 'FOOD & DRINK', 'score': 0.009817044250667095},
 {'label': 'MONEY', 'score': 0.009589473716914654},
 {'label': 'WORLD NEWS', 'score': 0.008676856756210327}]