In [None]:
import numpy as np
import pandas as pd
import umap
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.nn as nn
import torch.nn.functional as F


from typing import List, Tuple, Optional
from scipy.stats import zscore
from transformers import AutoTokenizer, AutoModel, TrainingArguments, Trainer
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import StratifiedShuffleSplit

# Brief Data Exploration

My goal is to identify:
1. Are the topics balanced?
2. Is the question or the excerpt the reason for the topic?
3. Length of the question and excerpt

However, based on the assumption that this data is correct, we just need to focus on the modeling.

In [None]:
# Placeholders for the labeled (`processed`) and unlabeled (`unprocessed`) DataFrames.
# Later cells read the 'question' and 'excerpt' columns of both, and 'topic' of `processed`.
# NOTE(review): the loading code is elided here — replace `...` with the real loaders.
processed = ...
unprocessed = ...

# Process dataset with helper functions

In [None]:
# Translate topic labels to integer class ids for the model, and back again
class LabelMapper:
    """Two-way lookup between labels and their positions in ``labels``.

    Attributes:
        labels: The label collection exactly as passed in.
        label_map: label -> integer id (position in ``labels``).
        inverse_map: integer id -> label.
    """

    def __init__(self, labels: List[str]):
        self.labels = labels
        self.label_map = {}
        self.inverse_map = {}
        # Build both directions in a single pass over the labels
        for index, name in enumerate(labels):
            self.label_map[name] = index
            self.inverse_map[index] = name

    def map(self, label):
        """Return the integer id of ``label`` (KeyError if unknown)."""
        class_id = self.label_map[label]
        return class_id

    def inverse(self, label):
        """Return the label stored for integer id ``label`` (KeyError if unknown)."""
        name = self.inverse_map[label]
        return name

In [None]:
def combine_question_answer(row: pd.Series) -> str:
    """Merge a row's 'question' and 'excerpt' fields into one model input string."""
    question = row['question']
    excerpt = row['excerpt']
    return f"Question: {question} Excerpt: {excerpt}"

### Process the labeled data

In [None]:
# Combine the question and excerpt into a single string for extra context,
# then map the topic labels to integers.
processed['combined'] = processed.apply(combine_question_answer, axis=1)
# `len` of a string counts characters, so 'length' is a character count
processed['length'] = processed['combined'].apply(len)
# Label ids follow the order in which topics first appear in the data
label_mapper = LabelMapper(processed['topic'].unique())
processed['label'] = processed['topic'].apply(label_mapper.map)
processed

In [None]:
# 'length' was computed with len() on the combined string, so the unit is characters
print(f"Max length: {processed['length'].max()} characters")

### Process the unlabeled data

In [None]:
# Build the same combined "Question: ... Excerpt: ..." text for the unlabeled rows
unprocessed['combined'] = unprocessed.apply(combine_question_answer, axis=1)
# `len` of a string counts characters, so 'length' is a character count
unprocessed['length'] = unprocessed['combined'].apply(len)
unprocessed

In [None]:
# 'length' was computed with len() on the combined string, so the unit is characters
print(f"Max length: {unprocessed['length'].max()} characters")

### Create stratified train and test datasets

In [None]:
# Parse the data into x (combined question/excerpt text) and y (integer topic label)
x = processed['combined']
y = processed['label']

In [None]:
# Split the data into train, val, and test sets.
# split() yields *positional* indices into whatever array it is given, so the
# second split returns positions relative to `train_val_idx`; they must be
# mapped back through `train_val_idx` to index the full dataset.
stratified_splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_val_idx, test_idx = next(stratified_splitter.split(x, y))
rel_train_idx, rel_val_idx = next(
    stratified_splitter.split(train_val_idx, y.iloc[train_val_idx])
)
train_idx = train_val_idx[rel_train_idx]
val_idx = train_val_idx[rel_val_idx]

# Guard against leakage: the three index sets must be pairwise disjoint
assert not set(train_idx) & set(val_idx)
assert not set(train_idx) & set(test_idx)
assert not set(val_idx) & set(test_idx)

# Select rows positionally (.iloc) so the split does not depend on the Series index
x_train, y_train = x.iloc[train_idx], y.iloc[train_idx]
x_val, y_val = x.iloc[val_idx], y.iloc[val_idx]
x_test, y_test = x.iloc[test_idx], y.iloc[test_idx]

In [None]:
# Check the distribution of the labels in the train, val, and test sets
train_topics = y_train.map(label_mapper.inverse)
val_topics = y_val.map(label_mapper.inverse)
test_topics = y_test.map(label_mapper.inverse)

# Calculate proportions for each set
train_proportions = train_topics.value_counts(normalize=True)
val_proportions = val_topics.value_counts(normalize=True)
test_proportions = test_topics.value_counts(normalize=True)

# Align the three series on topic before plotting. value_counts orders each
# series by its own frequencies, so plotting them separately on one axis can
# put bars under the wrong topic tick.
split_proportions = pd.DataFrame({
    'Train': train_proportions,
    'Validation': val_proportions,
    'Test': test_proportions,
}).fillna(0.0)

# Plotting: one group of three bars per topic
fig, ax = plt.subplots(figsize=(10, 6))
split_proportions.plot(kind='bar', color=['blue', 'orange', 'green'], alpha=0.7, ax=ax)
ax.set_xlabel('Topic')
ax.set_ylabel('Proportion')
ax.set_title('Distribution of Topics in Different Sets')
ax.legend()
plt.show()

In [None]:
class CustomDataset(Dataset):
    """Torch dataset that tokenizes text on access, optionally paired with labels.

    Args:
        x: Series of input texts; rows are read positionally via ``.iloc``.
        tokenizer: Callable invoked with the text plus ``max_length``,
            ``truncation``, ``padding`` and ``return_tensors`` keyword arguments.
        max_length: Each sample is truncated/padded to this many tokens.
        y: Optional Series of integer labels, positionally aligned with ``x``.
            When omitted, items carry no ``'labels'`` key (inference mode).
    """

    def __init__(self, x: pd.Series, tokenizer: AutoTokenizer, max_length: int, y: Optional[pd.Series] = None,):
        self.x = x
        self.y = y
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of samples (length of ``x``)."""
        return len(self.x)

    def __getitem__(self, idx):
        """Return a dict with 'input_ids', 'attention_mask' and, if labels exist, 'labels'."""
        text = self.x.iloc[idx]

        # Tokenize the input text
        inputs = self.tokenizer(
            text,
            max_length=self.max_length,
            truncation=True,
            padding='max_length',
            return_tensors='pt',
            )
        # The tokenizer output for a single text is expected to carry a leading
        # batch axis of size 1. Drop only that axis: a bare squeeze() would also
        # collapse the token axis when max_length == 1.
        item = {
            'input_ids': inputs['input_ids'].squeeze(0).long(),
            'attention_mask': inputs['attention_mask'].squeeze(0).long(),
        }

        if self.y is not None:
            item['labels'] = torch.tensor(self.y.iloc[idx]).long()

        return item

In [None]:
# Checkpoint name; the tokenizer is loaded from it here
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Token budget per sample (CustomDataset pads/truncates every text to this length)
max_length = 512
# Number of samples per batch
batch_size = 8

In [None]:
# Create a CustomDataset for each set.
# Arguments are passed by keyword: CustomDataset's positional order is
# (x, tokenizer, max_length, y), so passing (x, y, tokenizer) positionally
# would put the labels in the tokenizer slot and raise a TypeError.
dataset_train = CustomDataset(
    x=x_train,
    y=y_train,
    tokenizer=tokenizer,
    max_length=max_length,
    )

dataset_val = CustomDataset(
    x=x_val,
    y=y_val,
    tokenizer=tokenizer,
    max_length=max_length,
    )

dataset_test = CustomDataset(
    x=x_test,
    y=y_test,
    tokenizer=tokenizer,
    max_length=max_length,
    )

# Create a DataLoader for each set (only the training loader shuffles).
# NOTE: the name `dataloder_train` (sic) is kept because later cells reference it.
dataloder_train = DataLoader(
    dataset_train,
    batch_size=batch_size,
    shuffle=True,
    )

dataloader_val = DataLoader(
    dataset_val,
    batch_size=batch_size,
    shuffle=False,
    )

dataloader_test = DataLoader(
    dataset_test,
    batch_size=batch_size,
    shuffle=False,
    )

## Build model

In [None]:
class DistilBERTClassifier(nn.Module):
    """Pretrained encoder (loaded from ``model_name``) with a linear classification head.

    Args:
        n_classes: Number of output classes (width of the logits).
    """

    def __init__(self, n_classes: int):
        super().__init__()
        encoder = AutoModel.from_pretrained(model_name)
        self.bert = encoder
        self.classifier = nn.Linear(encoder.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask, labels=None):
        """Classify from the hidden state of the first token.

        Returns:
            ``logits`` when ``labels`` is None, otherwise the tuple
            ``(cross-entropy loss, logits)``.
        """
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Position 0 of the sequence axis is used as the sentence representation
        first_token_state = encoded.last_hidden_state[:, 0, :]
        logits = self.classifier(first_token_state)

        if labels is None:
            return logits
        return F.cross_entropy(logits, labels), logits

## If we need a custom Loss Function

In [None]:
class MultiRegressionTrainer(Trainer):
    """Trainer variant that computes a plain cross-entropy loss over the model's logits."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute cross-entropy between the model's logits and the batch labels.

        Args:
            model: Model called as ``model(**inputs)`` after the labels are removed.
            inputs: Batch dict; labels are read from ``'labels'`` (the key emitted
                by ``CustomDataset``), falling back to the legacy ``'label'`` key.
            return_outputs: When True, return ``(loss, outputs)`` instead of ``loss``.
            **kwargs: Extra keyword arguments are accepted and ignored.
                NOTE(review): newer transformers releases appear to pass
                ``num_items_in_batch`` here — confirm against the installed version.
        """
        label_key = "labels" if "labels" in inputs else "label"
        labels = inputs.pop(label_key)
        outputs = model(**inputs)
        # Called without labels, DistilBERTClassifier returns a bare logits tensor;
        # outputs that expose `.logits` are supported as well.
        logits = outputs.logits if hasattr(outputs, "logits") else outputs
        loss_fct = nn.CrossEntropyLoss()
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss

## Create Training Loop

In [None]:
# Instantiate the model on the GPU when one is available, otherwise on the CPU
# (a hard-coded 'cuda' raises on CPU-only machines)
model = DistilBERTClassifier(n_classes=len(label_mapper.labels)).to(
    'cuda' if torch.cuda.is_available() else 'cpu'
)

In [None]:
n_epochs = 10
output_dir = './results'
# len(dataloder_train) is the number of batches per epoch. The step interval used
# for logging/eval/saving is clamped to 1 so a small dataset never yields 0.
logging_steps = max(1, len(dataloder_train) // batch_size)
num_training_steps = n_epochs * len(dataloder_train)
# Warm up the learning rate over the first 10% of the training steps
num_warmup_steps = int(num_training_steps * 0.1)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

training_args = TrainingArguments(
    # Basic configuration and paths
    output_dir = output_dir,
    log_level = 'error',

    # Training and evaluation configuration
    num_train_epochs = n_epochs, # Consider adjusting based on observed convergence
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size = batch_size,
    learning_rate = 3e-5,
    weight_decay = 0.01,
    warmup_steps = num_warmup_steps,
    lr_scheduler_type = 'cosine', # Consider adjusting to 'linear' or 'polynomial'

    # Logging and Saving configuration
    logging_dir = output_dir,
    logging_steps = logging_steps,
    disable_tqdm = False,
    save_strategy = 'steps',
    save_steps = logging_steps,
    save_total_limit = 1,

    # Evaluation configuration (same step interval as saving, so the best
    # checkpoint can be restored at the end of training)
    # NOTE(review): newer transformers releases rename `evaluation_strategy`
    # to `eval_strategy` — confirm against the installed version.
    evaluation_strategy = 'steps',
    eval_steps = logging_steps,
    load_best_model_at_end = True,
    metric_for_best_model = 'eval_loss',
    greater_is_better = False,

    # Optimizer configuration
    optim= "adamw_torch",
    adam_beta1 = 0.9,
    adam_beta2 = 0.999,
    adam_epsilon = 1e-8,
)

In [None]:
# Instantiate the Trainer: trains on dataset_train and evaluates on dataset_val
# using the settings collected in training_args
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = dataset_train,
    eval_dataset = dataset_val,
)

In [None]:
# Train the model (runs the full training loop configured in training_args)
trainer.train()

In [None]:
# Save the trained model under the output directory
trainer.save_model(f"{output_dir}/model.hf")

## Evaluate the model

In [None]:
def predict(
        model: torch.nn.Module,
        dataloader: DataLoader,
        device: torch.device,
        ) -> Tuple[List[int], Optional[List[int]]]:
    """Run the model over a dataloader and collect the predicted class ids.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` and returning
            a ``(batch, n_classes)`` logits tensor.
        dataloader: Iterable of batch dicts holding 'input_ids', 'attention_mask'
            and optionally 'labels' tensors.
        device: Device the model and inputs are moved to.

    Returns:
        ``(predictions, labels)`` as lists of Python ints. ``labels`` is ``None``
        when no batch contained a 'labels' entry.
    """
    model.eval()
    model.to(device)

    predictions: List[int] = []
    all_labels: List[int] = []

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            logits = model(input_ids, attention_mask)
            # tolist() yields plain Python ints, matching the declared return type
            predictions.extend(logits.argmax(dim=1).tolist())

            # Labels are only collected, never fed to the model, so they do not
            # need to be moved to the device first.
            if 'labels' in batch:
                all_labels.extend(batch['labels'].tolist())

    # If no labels were found, return None for the labels
    return predictions, (all_labels if all_labels else None)

In [None]:
# Compare Train, Validation, and Test set performance.
# predict() collects predictions and labels from the same batches, so each
# (pred, true) pair stays aligned even for the shuffled training loader.
y_train_pred, y_train_true = predict(model, dataloder_train, device)
y_val_pred, y_val_true = predict(model, dataloader_val, device)
y_test_pred, y_test_true = predict(model, dataloader_test, device)

## Evaluation Metrics

### Precision
The precision is the ratio tp / (tp + fp) where tp is the number of true positives and fp the number of false positives. The precision is intuitively the ability of the classifier not to label a negative sample as positive.

### Recall
The recall is the ratio tp / (tp + fn) where tp is the number of true positives and fn the number of false negatives. The recall is intuitively the ability of the classifier to find all the positive samples.

### F1-Score
The F1-score is the harmonic mean of precision and recall. It provides a single metric that balances both precision and recall. The F1-score reaches its best value at 1 and its worst value at 0. It is a useful metric when you want to seek a balance between precision and recall.

### Support
Support indicates the number of occurrences of each class in the true labels (y_true). It provides insight into the distribution of classes in the dataset and can help evaluate the significance of the precision, recall, and F1-score for each class.

In [None]:
# Per-class precision / recall / F1 / support for each split, as DataFrames
# with one row per class (plus the aggregate rows of the report)
report_options = dict(target_names=label_mapper.labels, output_dict=True)

train_report = pd.DataFrame(
    classification_report(y_train_true, y_train_pred, **report_options)
).T
val_report = pd.DataFrame(
    classification_report(y_val_true, y_val_pred, **report_options)
).T
test_report = pd.DataFrame(
    classification_report(y_test_true, y_test_pred, **report_options)
).T

In [None]:
# Display the classification report for the training set
train_report

In [None]:
# Display the classification report for the validation set
val_report

In [None]:
# Display the classification report for the test set
test_report

### Confusion Matrix

A confusion matrix is a table that is often used to describe the performance of a classification model on a set of test data for which the true values are known. It allows the visualization of the performance of an algorithm.

Our results show that the model performs very well but struggles with the topics 'android', 'apple' and 'unix'. This is likely because these topics are very similar in nature, which makes it hard for the model to differentiate between them.

In [None]:
# Make Confusion Matrix (rows: true labels, columns: predicted labels).
# Stored under its own name: assigning the result to `confusion_matrix` would
# shadow the imported sklearn function and break the cell when it is re-run.
test_confusion_matrix = confusion_matrix(y_test_true, y_test_pred)

# Plot confusion matrix using Seaborn
plt.figure(figsize=(8, 6))
sns.set(font_scale=1.2)  # Adjust font scale if needed
sns.heatmap(test_confusion_matrix, annot=True, fmt='d', cmap='Blues', cbar=False,
            xticklabels=label_mapper.labels, yticklabels=label_mapper.labels)
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix')
plt.show()

## Make Predictions

In [None]:
# Create a label-free CustomDataset for the unprocessed data
# (no `y` is passed, so its items carry no 'labels' key)
dataset_inference = CustomDataset(
    x=unprocessed['combined'],
    tokenizer=tokenizer,
    max_length=max_length,
    )

# Create a DataLoader for the inference set; shuffle=False keeps the
# outputs in the same order as the rows of `unprocessed`
dataloader_inference = DataLoader(
    dataset_inference,
    batch_size=batch_size,
    shuffle=False,
    )

In [None]:
# Predict the labels for the unprocessed data. The inference dataset has no
# labels, so predict() returns None as its second value, which is discarded.
y_inference_pred, _ = predict(model, dataloader_inference, device)

In [None]:
# Append the predicted labels to the unprocessed data and map them back to the original labels.
# The predictions are in row order, so they are attached with the frame's own
# index. A bare pd.Series(...) would align on 0..n-1 and produce NaN or
# misaligned labels whenever `unprocessed` has a non-default index.
unprocessed['label'] = pd.Series(y_inference_pred, index=unprocessed.index)
unprocessed['topic'] = unprocessed['label'].map(label_mapper.inverse)
unprocessed

In [None]:
# Export the predicted topic with its question and excerpt as JSON Lines (one record per line)
unprocessed[['topic', 'question', 'excerpt']].to_json('unprocessed-data-with-labels.json', orient='records', lines=True)

# Visualization Helper with UMAP

In [None]:
def get_sentence_embeddings(
        model: torch.nn.Module,
        dataloader: DataLoader,
        device: torch.device,
        ) -> np.ndarray:
    """Compute mean-pooled sentence embeddings from the model's encoder.

    Args:
        model: Module exposing a ``bert`` sub-module whose output has a
            ``last_hidden_state`` of shape ``(batch, seq, hidden)``.
        dataloader: Iterable of batch dicts with 'input_ids' and 'attention_mask'.
        device: Device the model and inputs are moved to.

    Returns:
        Array with one row per sample holding the attention-masked mean of the
        token hidden states. A row whose attention mask is all zeros yields a
        zero vector rather than NaN.
    """
    model.eval()
    model.to(device)
    sentence_embeddings = []

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            # Pass data through the encoder only (the classifier head is skipped)
            hidden_state = model.bert(input_ids, attention_mask).last_hidden_state
            # Broadcast the mask over the hidden dimension to ignore padding tokens
            mask = attention_mask.unsqueeze(-1).expand(hidden_state.size()).float()
            # Masked mean over the sequence axis; the clamp avoids 0/0 on rows
            # that contain no attended tokens
            token_sum = (hidden_state * mask).sum(1)
            token_count = mask.sum(1).clamp(min=1e-9)
            embedding = token_sum / token_count
            # Append the embeddings to the list
            sentence_embeddings.extend(embedding.cpu().numpy())

    return np.array(sentence_embeddings)

In [None]:
# Get the sentence embeddings for the unprocessed data (one row per sample,
# in the same order as the unshuffled inference dataloader)
unprocessed_embeddings = get_sentence_embeddings(model, dataloader_inference, device)

In [None]:
# Reduce the dimensionality of the embeddings to 2D using UMAP.
# UMAP is stochastic; fixing random_state makes the projection (and the
# centroid/anomaly analysis built on it) reproducible across runs.
umap_model = umap.UMAP(n_neighbors=10, min_dist=0.1, n_components=2, random_state=42)
umap_embeddings = umap_model.fit_transform(unprocessed_embeddings)

In [None]:
# Store the full embedding (as a Python list per row) and its 2D UMAP coordinates
unprocessed['embedding'] = unprocessed_embeddings.tolist()
unprocessed['umap_x'] = umap_embeddings[:, 0]
unprocessed['umap_y'] = umap_embeddings[:, 1]
# Optional export, currently disabled:
#unprocessed.to_parquet('unprocessed_with_embeddings.parquet', index=False)

In [None]:
# Visualization of UMAP embeddings with topic labels:
# one point per unprocessed row, colored by its predicted topic
plt.figure(figsize=(10, 8))
sns.scatterplot(x='umap_x', y='umap_y', hue='topic', data=unprocessed, palette='tab20', legend='full', alpha=0.7)

# Add labels and other plot elements
plt.title('UMAP Embedding with Topic Labels')
plt.xlabel('UMAP Dimension 1')
plt.ylabel('UMAP Dimension 2')
plt.grid(True, linestyle='--', alpha=0.5)
# Anchor the legend outside the axes so it does not cover the points
plt.legend(title='Topic', loc='upper right', bbox_to_anchor=(1.25, 1))
plt.tight_layout()
plt.show()

In [None]:
# Calculate cluster centroids for each topic: the mean UMAP coordinate of the
# rows predicted as that topic. A single groupby replaces the per-label
# filter + concat loop (which grew the frame quadratically and relied on
# concatenating onto an empty DataFrame).
centroid_means = unprocessed.groupby('topic')[['umap_x', 'umap_y']].mean()
df_centroids = (
    centroid_means
    # One row per known label, in label order; labels with no predicted rows get NaN
    .reindex(label_mapper.labels)
    .rename(columns={'umap_x': 'centroid_x', 'umap_y': 'centroid_y'})
    .rename_axis('topic')
    .reset_index()
)

In [None]:
# Calculate the Euclidean distance (in UMAP space) between each point and the
# centroid of its own predicted topic. The centroid of every row is looked up
# by topic in one vectorised step instead of filtering df_centroids per row.
centroid_lookup = df_centroids.drop_duplicates('topic').set_index('topic')
# astype(float) guards against object-dtype centroid columns
point_centroid_x = unprocessed['topic'].map(centroid_lookup['centroid_x']).astype(float)
point_centroid_y = unprocessed['topic'].map(centroid_lookup['centroid_y']).astype(float)

distances = np.sqrt(
    (unprocessed['umap_x'] - point_centroid_x) ** 2
    + (unprocessed['umap_y'] - point_centroid_y) ** 2
)

unprocessed['distance'] = distances

In [None]:
# Standardise the centroid distances within each topic (z-score), so distances
# are comparable between topics with different spreads
for topic in label_mapper.labels:
    topic_mask = unprocessed['topic'] == topic
    unprocessed.loc[topic_mask, 'distance_z_score'] = zscore(unprocessed.loc[topic_mask, 'distance'])

unprocessed.head()

In [None]:
# Create hexbin plot: UMAP space binned into hexagons, each colored by the
# distance z-scores of the points that fall inside it
plt.figure(figsize=(10, 8))
plt.hexbin(unprocessed['umap_x'], unprocessed['umap_y'], C=unprocessed['distance_z_score'], gridsize=30, cmap='viridis')
plt.colorbar(label='Distance Z-Score')
plt.xlabel('UMAP X')
plt.ylabel('UMAP Y')
plt.title('Hexbin Plot of Distance Z-Score')
plt.show()

In [None]:
# Set up subplots for each topic (one stacked panel per topic)
num_topics = len(label_mapper.labels)
fig, axs = plt.subplots(num_topics, 1, figsize=(8, 6 * num_topics))
# plt.subplots returns a bare Axes (not an array) when there is a single
# panel; normalise to a 1-D array so axs[i] works for any number of topics
axs = np.atleast_1d(axs)

# Iterate over each topic
for i, topic in enumerate(label_mapper.labels):
    # Filter data points for the current topic
    topic_data = unprocessed[unprocessed['topic'] == topic]

    # Plot histogram of the standardised distances (z-scores, not raw distances)
    axs[i].hist(topic_data['distance_z_score'], bins=20, color='skyblue', edgecolor='black')
    axs[i].set_title(f'Distribution of Distances for Topic: {topic}')
    axs[i].set_xlabel('Distance to Centroid (z-score)')
    axs[i].set_ylabel('Frequency')

plt.tight_layout()
plt.show()

In [None]:
# Assuming the per-topic distances are roughly normally distributed, the
# 68-95-99.7 rule puts about 95% of points within 2 standard deviations of the
# mean distance; rows outside that band are flagged as anomalies.
# NOTE(review): .abs() also flags rows with z < -2, i.e. points unusually
# *close* to their centroid — confirm whether only far points (z > 2) are intended.
annomalies = unprocessed[unprocessed['distance_z_score'].abs() > 2]
annomalies