In [None]:
# Libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [None]:
# Read the stock-news CSV, using its first column as the row index
df = pd.read_csv("stock_news.csv", index_col=0)

In [None]:
# Map the labels (Positive: 2, Neutral: 1, Negative: 0)
label_mapping = {'Positive': 2, 'Neutral': 1, 'Negative': 0}

# Re-running this cell must be harmless: Series.map turns every value that is
# not a key of the mapping into NaN, so mapping already-encoded ids a second
# time would wipe the whole column. Only encode while text labels remain.
if not df['label'].isin(list(label_mapping.values())).all():
    encoded_labels = df['label'].map(label_mapping)

    # Fail loudly on anything outside the mapping instead of carrying NaN
    # labels into the split and the training loop.
    unknown_labels = df.loc[encoded_labels.isna(), 'label'].unique().tolist()
    if unknown_labels:
        raise ValueError(f"Unexpected values in 'label' column: {unknown_labels}")

    # Request int64 explicitly so the dtype does not depend on the platform's
    # default integer width.
    df['label'] = encoded_labels.astype('int64')

In [None]:
# Two-stage hold-out split with a fixed seed:
#   1) 70% of the rows become the training set, 30% are held back;
#   2) the held-back rows are divided again, 70% validation / 30% test.
train_texts, temp_text, train_labels, temp_labels = train_test_split(
    df['headline'].values,
    df['label'].values,
    test_size=0.3,
    random_state=42,
)

val_texts, test_texts, val_labels, test_labels = train_test_split(
    temp_text,
    temp_labels,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Run this cell for RoBERTa base models
from transformers import RobertaForSequenceClassification, RobertaTokenizer

# Instantiate the pre-trained RoBERTa checkpoint with a 3-way classification head
model = RobertaForSequenceClassification.from_pretrained(
    'roberta-base',
    num_labels=3,
)
# Tokenizer belonging to the same checkpoint
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

In [None]:
# Run this cell for BERT base models
from transformers import BertForSequenceClassification, BertTokenizer

# Instantiate the pre-trained BERT checkpoint with a 3-way classification head
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=3,
)
# Tokenizer belonging to the same checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
# Run this to create a class that organizes the labels and the training texts and transforms them to tensors
# Run for all models
import torch

class NewsDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    Args:
        encodings: Mapping from field name to an indexable collection holding
            one entry per sample.
        labels: Indexable collection with one label per sample.

    Each item is a dict containing every encoding field converted to a tensor,
    plus the sample's label under the key 'labels'.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The dataset size is defined by the number of labels
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [None]:
# Tokenize each split with the tokenizer loaded earlier. The encodings were
# previously never defined anywhere in the notebook, so this cell failed with
# a NameError on a fresh kernel.
# padding=True pads every sequence of a split to that split's longest
# sequence; truncation=True cuts sequences at the model's maximum input length.
train_encodings = tokenizer(list(train_texts), truncation=True, padding=True)
val_encodings = tokenizer(list(val_texts), truncation=True, padding=True)
test_encodings = tokenizer(list(test_texts), truncation=True, padding=True)

# Create datasets for training, validation and testing
train_dataset = NewsDataset(train_encodings, train_labels)
val_dataset = NewsDataset(val_encodings, val_labels)
test_dataset = NewsDataset(test_encodings, test_labels)

In [None]:
# Training loop for FFT models
from transformers import Trainer, TrainingArguments
from transformers import EarlyStoppingCallback

# Define training arguments. Evaluation, checkpointing and loss logging all
# happen once per epoch, so the logged history holds one training-loss and one
# validation-loss entry per epoch.
training_args = TrainingArguments(
    # NOTE(review): machine-specific absolute path; point it at a directory
    # that exists on the machine running the notebook.
    output_dir='C:\\Users\\34618\\OneDrive\\Documentos\\UNI\\LLMs\\',
    num_train_epochs=5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=500,
    weight_decay=0.01,  # Regularization
    eval_strategy="epoch",  # Evaluate at the end of each epoch
    save_strategy="epoch",  # Save the model at the end of each epoch (to match eval_strategy)
    logging_strategy="epoch",  # Log the training loss per epoch (default is every 500 steps)
    load_best_model_at_end=True,  # Load the best model found at the end of training
)

# Initialize the Trainer with early stopping
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # patience=1: stop as soon as one evaluation fails to improve on the best one
    callbacks=[EarlyStoppingCallback(early_stopping_patience=1)],
)

# Train the model
trainer.train()

# Evaluate the model on the validation data
eval_result = trainer.evaluate()
print(eval_result)

In [None]:
# Plot the training and validation curves
def plot_loss_curves(trainer):
    """Plot logged training and validation loss against the epoch of each log.

    The points are read from ``trainer.state.log_history``: entries carrying a
    'loss' key feed the training curve and entries carrying 'eval_loss' feed
    the validation curve. The x position of every point is the 'epoch' value
    stored in the same entry, so the plot adapts to however many epochs ran
    before training stopped.

    Args:
        trainer: Object exposing ``state.log_history`` as a list of dicts.
    """
    train_curve = {}
    val_curve = {}
    for log in trainer.state.log_history:
        epoch = log.get('epoch')
        if epoch is None:
            continue
        # setdefault keeps the FIRST value seen for an epoch. An evaluation run
        # after training has finished is logged under the last epoch again;
        # keeping the first entry stops it from replacing that epoch's loss.
        if 'loss' in log:
            train_curve.setdefault(epoch, log['loss'])
        if 'eval_loss' in log:
            val_curve.setdefault(epoch, log['eval_loss'])

    plt.figure(figsize=(10, 6))
    # Markers keep a curve visible even when it consists of a single point
    plt.plot(list(train_curve.keys()), list(train_curve.values()), marker='o', label="Training Loss")
    plt.plot(list(val_curve.keys()), list(val_curve.values()), marker='o', label="Validation Loss")

    plt.title("Training vs Validation Loss")
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.legend()
    plt.show()

# Draw the loss curves recorded by the trainer
plot_loss_curves(trainer=trainer)

In [None]:
# Print the classification report
import numpy as np
from scipy.special import softmax
from sklearn.metrics import classification_report, classification_report, roc_curve, auc, precision_recall_curve

# Make predictions
predictions = trainer.predict(test_dataset)

# Turn the raw model outputs into class probabilities, then pick the most
# probable class for every sample
pred_probs = softmax(predictions.predictions, axis=1)
pred_labels = pred_probs.argmax(axis=1)

unique_classes = np.unique(test_labels)

# The label ids are pinned to the full set [0, 1, 2] so that they always pair
# one-to-one with target_names. Passing only the classes present in the test
# split could yield fewer labels than names, and the names would then no
# longer line up with the class ids they describe.
report = classification_report(
    test_labels,
    pred_labels,
    labels=[0, 1, 2],
    target_names=["Negative", "Neutral", "Positive"],
    output_dict=True,
)

# Print classification report with specified labels
print("Test Classification Report:")
print(report)

In [None]:
# Plot heatmap with the classification report
import seaborn as sns

# Build the per-class metrics table from the report dict: one row per report
# entry, with the 'support' column removed
df_report = pd.DataFrame(report).transpose().drop(columns=['support'])

# Keep only the class rows; the summary rows are left out of the heatmap
df_report = df_report.loc[~df_report.index.isin(['accuracy', 'macro avg', 'weighted avg'])]

# Render the table as an annotated heatmap
plt.figure(figsize=(10, 6))
sns.heatmap(
    df_report,
    annot=True,
    fmt=".2f",
    cmap="Blues",
    cbar=False,
    linewidths=0.5,
)

# Title and axis labels
plt.title("Classification Report Heatmap", fontsize=16)
plt.xlabel("Metrics", fontsize=12)
plt.ylabel("Classes", fontsize=12)

# Show plot
plt.show()

In [None]:
# Plot ROC curve
def plot_roc_curve(y_true, y_scores, pos_label=2):
    """Plot one ROC curve per class, treating each class in turn as positive.

    Column ``i`` of ``y_scores`` is used as the score for class id ``i``, and
    the curve for that column is computed with ``pos_label=i``. The number of
    curves follows the number of score columns.

    Args:
        y_true: True class ids, one per sample.
        y_scores: 2-D array-like of per-class scores, one column per class.
        pos_label: Unused; every class is plotted. Kept so existing calls that
            pass it keep working.

    Raises:
        ValueError: If ``y_scores`` is not two-dimensional.
    """
    y_true = np.asarray(y_true)
    y_scores = np.asarray(y_scores)
    if y_scores.ndim != 2:
        raise ValueError("y_scores must be 2-D: one row per sample, one column per class")

    # Derived from the data instead of being fixed to three classes
    n_classes = y_scores.shape[1]

    plt.figure(figsize=(10, 8))

    for i in range(n_classes):
        fpr, tpr, _ = roc_curve(y_true, y_scores[:, i], pos_label=i)
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, label='Class {} (AUC = {:.2f})'.format(i, roc_auc))

    # Plot diagonal line for random chance
    plt.plot([0, 1], [0, 1], color='red', linestyle='--', label='Random chance')

    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Multi-Class Receiver Operating Characteristic (ROC) Curve')
    plt.legend(loc='lower right')
    plt.grid()
    plt.show()

# Draw the per-class ROC curves for the test split
plot_roc_curve(y_true=test_labels, y_scores=pred_probs)

In [None]:
# Display wrong predictions for error analysis
def error_analysis(texts, true_labels, predicted_labels):
    """Print every sample whose predicted label differs from its true label.

    Args:
        texts: Indexable collection of input texts, aligned with the labels.
        true_labels: True labels, one per sample (array or plain sequence).
        predicted_labels: Predicted labels, one per sample (array or plain
            sequence).

    Raises:
        ValueError: If the two label collections do not have the same shape.
    """
    # Coerce to arrays so the comparison is element-wise. Comparing two plain
    # lists with != yields a single bool, not one flag per sample.
    true_arr = np.asarray(true_labels)
    pred_arr = np.asarray(predicted_labels)
    if true_arr.shape != pred_arr.shape:
        raise ValueError(
            f"true_labels and predicted_labels differ in shape: {true_arr.shape} vs {pred_arr.shape}"
        )

    for i in np.flatnonzero(true_arr != pred_arr):
        print(f'Text: {texts[i]}')
        print(f'True Label: {true_arr[i]}, Predicted Label: {pred_arr[i]}')
        print('---')

# List the misclassified test headlines
error_analysis(texts=test_texts, true_labels=test_labels, predicted_labels=pred_labels)