LOOCV: SciBERT embeddings

In [None]:
import pandas as pd
from pandas import read_csv

import numpy as np
from numpy import mean, std

from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
# Import Models

from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score, confusion_matrix, ConfusionMatrixDisplay

import matplotlib.pyplot as plt
import seaborn as sns

import scipy as sp

from flair.embeddings import TransformerWordEmbeddings
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentPoolEmbeddings
from flair.data import Sentence
from flair import torch

import umap

from transformers import AutoTokenizer, AutoModel
import tqdm as notebook_tqdm

In [2]:
# Location of the EUCT dataset export.
# TODO: point this at the real CSV file; the value below is only a placeholder.
EUCT_CSV_PATH = "euct_df.csv"

# The original cell passed `euct_df` itself as the path, which is undefined on a
# fresh kernel; the path now lives in an explicit, editable constant.
euct_df = pd.read_csv(EUCT_CSV_PATH)
# The data is already preprocessed (done for the classification tasks).

LOOCV

In [None]:
# Build one text field per row from the four free-text columns.
# Missing fields are replaced with "" *before* joining: in pandas `str + NaN`
# is NaN, so concatenating first would blank out the whole row whenever a
# single field is missing.
text_columns = ['Title', 'Objective', 'pr_endpoint', 'endpoint_description']
text_parts = euct_df[text_columns].fillna('').astype(str)
euct_df['concat_corpus'] = text_parts[text_columns[0]].str.cat(
    [text_parts[col] for col in text_columns[1:]], sep=' '
)

In [4]:
# Replace NaN entries of the concatenated text with an empty string.
euct_df['concat_corpus'] = euct_df['concat_corpus'].where(
    euct_df['concat_corpus'].notna(), ''
)

In [None]:
# Tokenizer and encoder are loaded from the same pretrained SciBERT checkpoint;
# the identifier is kept in one place so the two cannot drift apart.
SCIBERT_CHECKPOINT = "allenai/scibert_scivocab_uncased"
tokenizer = AutoTokenizer.from_pretrained(SCIBERT_CHECKPOINT)
model = AutoModel.from_pretrained(SCIBERT_CHECKPOINT)

In [6]:
# Tokenize the text and generate embeddings
def generate_embeddings(texts, tokenizer, model, max_len=512, batch_size=32):
    """Generate one embedding per text from the model's first-token ([CLS]) state.

    Parameters
    ----------
    texts : sequence of str
        Documents to embed. Anything exposing ``.tolist()`` (pandas Series,
        numpy array), any other iterable of strings, or a single string.
    tokenizer : callable
        Called as ``tokenizer(batch, padding=True, truncation=True,
        max_length=max_len, return_tensors="pt")``; must return a mapping of
        tensors that can be unpacked into ``model``.
    model : callable
        Encoder whose output exposes ``last_hidden_state``. It is switched to
        evaluation mode and left in that mode.
    max_len : int, default 512
        Maximum token length passed to the tokenizer; longer texts are truncated.
    batch_size : int or None, default 32
        Number of texts per forward pass. ``None`` processes everything in a
        single batch (the behaviour before batching was added).

    Returns
    -------
    numpy.ndarray
        Array with one row per input text, in input order.

    Raises
    ------
    ValueError
        If ``texts`` is empty or ``batch_size`` is smaller than 1.
    """
    # Normalise the input to a plain list so it can be sliced into batches.
    if isinstance(texts, str):
        texts = [texts]
    elif hasattr(texts, "tolist"):
        texts = texts.tolist()
    else:
        texts = list(texts)

    if not texts:
        raise ValueError("texts must contain at least one document.")
    if batch_size is None:
        batch_size = len(texts)
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer or None.")

    # If the model reports a device, inputs are moved there before the forward
    # pass; objects without a `device` attribute are used as-is.
    device = getattr(model, "device", None)

    model.eval()  # Set the model to evaluation mode
    batch_embeddings = []
    with torch.no_grad():
        # Batching keeps peak memory bounded instead of pushing the whole
        # corpus through the model in one forward pass.
        for start in range(0, len(texts), batch_size):
            inputs = tokenizer(
                texts[start:start + batch_size],
                padding=True,
                truncation=True,
                max_length=max_len,
                return_tensors="pt",
            )
            if device is not None:
                inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
            outputs = model(**inputs)
            # Use the [CLS] token representation (typically at index 0);
            # .cpu() makes the numpy conversion valid for non-CPU tensors too.
            batch_embeddings.append(outputs.last_hidden_state[:, 0, :].cpu().numpy())
    return np.concatenate(batch_embeddings, axis=0)

In [7]:
# Text column that is fed to the embedding model.
a = euct_df.loc[:, 'concat_corpus']

In [8]:
# Embed the concatenated EUCT-NS corpus; the result becomes the feature matrix.
X = generate_embeddings(texts=a, tokenizer=tokenizer, model=model)

In [9]:
# X already holds the embedding matrix from the previous cell (the former
# `X = X` self-assignment did nothing and is removed).
# The target is the manual label of each row, as a NumPy array.
y = euct_df['manual_label'].to_numpy()

In [None]:
# Drop rows that lack text or a label. X and y were built from euct_df before
# this point, so the same row mask is applied to them; otherwise they would
# keep the dropped rows (including NaN labels) and drift out of alignment with
# the DataFrame.
print("Initial DataFrame shape:", euct_df.shape)
valid_rows = euct_df[['concat_corpus', 'manual_label']].notna().all(axis=1).to_numpy()
X, y = X[valid_rows], y[valid_rows]
euct_df.dropna(subset=['concat_corpus', 'manual_label'], inplace=True)  # Drop rows with NaN values in relevant columns
print("Shape after dropping NaNs:", euct_df.shape)

In [None]:
# Quick visual check that features and labels have matching row counts.
print('Features shape: {}, Target shape: {}'.format(X.shape, y.shape))

In [12]:
# Guard: stop before any model fitting if features and labels disagree on
# the number of samples.
n_feature_rows, n_label_rows = X.shape[0], y.shape[0]
if n_feature_rows != n_label_rows:
    raise ValueError("Features and target variable have inconsistent number of samples.")

In [None]:
from sklearn.model_selection import GridSearchCV
# Hyper-parameter search: a StandardScaler + model pipeline is tuned with
# 10-fold cross-validation on all of X and y.
# Define the parameters to search. Want to figure out the best parameters for the model
# NOTE(review): the value below is a placeholder and, as written, a *set* literal.
# GridSearchCV expects a dict (or list of dicts) mapping parameter names to
# candidate values; for a make_pipeline estimator the names presumably need the
# `<step name>__<parameter>` form. Confirm when filling this in.
parameters = {'parameters of the model'}

# NOTE(review): MDL is a placeholder; no such name is defined or imported in the
# visible notebook, so this cell does not run until a concrete estimator is chosen.
mdl = MDL()
pipeline = make_pipeline(StandardScaler(), mdl)
# `mdl` is rebound here from the bare estimator to the grid-search wrapper.
mdl = GridSearchCV(pipeline, parameters, cv=10)
mdl.fit(X, y)

# Display the best parameters found by GridSearchCV
print("Best parameters found: ", mdl.best_params_)

In [11]:
# Accumulators for the held-out predictions and their true labels, plus the
# leave-one-out splitter used by the evaluation loop.
predictions, actuals = [], []
loo = LeaveOneOut()

In [12]:
# Placeholder: replace MDL with the chosen estimator class and pass the best
# hyper-parameters reported by GridSearchCV as keyword arguments to MDL(...).
# (The original line had its closing parenthesis inside the comment, which is a
# syntax error.)
# The model is wrapped in the same StandardScaler pipeline that the grid search
# tuned, so the LOOCV estimate uses the same preprocessing as the search.
mdl = make_pipeline(StandardScaler(), MDL())

In [13]:
# Leave-one-out cross-validation: fit on all samples but one, predict the
# held-out sample, and record prediction and true label.
# The accumulators are reset here so that re-running this cell does not append
# a second copy of every fold's result.
predictions, actuals = [], []
for train_index, test_index in loo.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    mdl.fit(X_train, y_train)
    y_pred = mdl.predict(X_test)

    # Only the first (index 0) element of each fold's prediction and label is kept.
    predictions.append(y_pred[0])
    actuals.append(y_test[0])

Evaluation metrics

In [None]:
from sklearn.metrics import classification_report

# Aggregate scores over all LOOCV predictions; precision, recall and F1 are
# averaged over classes with support weighting.
accuracy_weighted = accuracy_score(actuals, predictions)
precision_weighted = precision_score(actuals, predictions, average='weighted')
recall_weighted = recall_score(actuals, predictions, average='weighted')
f1_weighted = f1_score(actuals, predictions, average='weighted')

# NOTE(review): this is the same call as accuracy_weighted, so the "Weighted"
# and "Unweighted" accuracy lines print the same number.
accuracy_unweighted = accuracy_score(actuals, predictions)
classification_metrics = classification_report(actuals, predictions, output_dict=True)

# The report dict mixes per-class rows with summary rows. Keep everything that
# is not a summary row, instead of testing `cls.isdigit()`, which silently
# drops classes whose labels are not non-negative integer strings
# (e.g. "1.0" or text labels) and then yields NaN means below.
summary_rows = {'accuracy', 'micro avg', 'macro avg', 'weighted avg', 'samples avg'}
per_class_metrics = {
    cls: metrics
    for cls, metrics in classification_metrics.items()
    if cls not in summary_rows
}

# Print metrics
print(f'LOOCV Accuracy (Weighted): {accuracy_weighted:.2f}')
print(f'LOOCV Precision (Weighted): {precision_weighted:.2f}')
print(f'LOOCV Recall (Weighted): {recall_weighted:.2f}')
print(f'LOOCV F1 Score (Weighted): {f1_weighted:.2f}')
print()

print(f'LOOCV Accuracy (Unweighted): {accuracy_unweighted:.2f}')
print("Precision, Recall, and F1 Score by Class:")
for cls, metrics in per_class_metrics.items():
    print(f"  Class {cls}: Precision={metrics['precision']:.2f}, Recall={metrics['recall']:.2f}, F1 Score={metrics['f1-score']:.2f}")

# Calculate mean and standard deviation of precision, recall, and F1 scores across classes
class_precisions = [metrics['precision'] for metrics in per_class_metrics.values()]
class_recalls = [metrics['recall'] for metrics in per_class_metrics.values()]
class_f1_scores = [metrics['f1-score'] for metrics in per_class_metrics.values()]

print()
print(f'Mean and Standard Deviation of Precision: Mean={np.mean(class_precisions):.2f}, Std={np.std(class_precisions):.2f}')
print(f'Mean and Standard Deviation of Recall: Mean={np.mean(class_recalls):.2f}, Std={np.std(class_recalls):.2f}')
print(f'Mean and Standard Deviation of F1 Score: Mean={np.mean(class_f1_scores):.2f}, Std={np.std(class_f1_scores):.2f}')

In [None]:
# The label set is taken from the LOOCV results themselves. The original cell
# read `rfc.classes_`, but no `rfc` is defined in this notebook; deriving the
# labels here also keeps the tick labels in the same order as the matrix rows.
class_labels = sorted(set(actuals) | set(predictions))
conf_matrix = confusion_matrix(actuals, predictions, labels=class_labels)
disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix, display_labels=class_labels)
disp.plot(cmap=plt.cm.Greens)
plt.title('Confusion Matrix of Estimated Performance When Making Predictions On New Data Based on Concatenated Text')
plt.show()

LOOCV was repeated with UMAP dimensionality reduction applied to the SciBERT embeddings, to see whether the performance estimates improved.

In [None]:
# Project the feature matrix X down to 2 components with UMAP; random_state is
# fixed so repeated runs use the same seed.
# NOTE(review): the reducer is fitted on every row of X, including the rows that
# are later held out one at a time in the LOOCV loop, so the held-out samples
# influence the projection. Confirm this is acceptable for the reported estimate.
umap_reducer = umap.UMAP(n_components=2, random_state=42) 
umap_result = umap_reducer.fit_transform(X)

In [17]:
# Use the 2-D UMAP projection as the feature matrix for the second LOOCV run.
# `y` is intentionally not re-read from euct_df here: euct_df has had rows
# dropped in place since the embeddings were computed, whereas the existing `y`
# is still row-aligned with the matrix that `umap_result` was computed from.
X = umap_result

In [None]:
# Report the row count before and after removing rows with a missing text or label.
print("Initial DataFrame shape:", euct_df.shape)
euct_df.dropna(
    axis=0,
    how='any',
    subset=['concat_corpus', 'manual_label'],
    inplace=True,
)
print("Shape after dropping NaNs:", euct_df.shape)

In [None]:
# Quick visual check that the reduced features and labels have matching row counts.
print('Features shape: {}, Target shape: {}'.format(X.shape, y.shape))

In [24]:
# Guard: stop before any model fitting if features and labels disagree on
# the number of samples.
n_feature_rows, n_label_rows = X.shape[0], y.shape[0]
if n_feature_rows != n_label_rows:
    raise ValueError("Features and target variable have inconsistent number of samples.")

In [None]:
# Hyper-parameter search on the UMAP-reduced features: a StandardScaler + model
# pipeline is tuned with 10-fold cross-validation on all of X and y.
# GridSearchCV is not imported in this cell; it relies on the import made in the
# earlier grid-search cell.
# NOTE(review): the value below is a placeholder and, as written, a *set* literal.
# GridSearchCV expects a dict (or list of dicts) mapping parameter names to
# candidate values; for a make_pipeline estimator the names presumably need the
# `<step name>__<parameter>` form. Confirm when filling this in.
parameters = {'parameters of the model'}

# NOTE(review): MDL is a placeholder; no such name is defined or imported in the
# visible notebook, so this cell does not run until a concrete estimator is chosen.
mdl = MDL()
pipeline = make_pipeline(StandardScaler(), mdl)
# `mdl` is rebound here from the bare estimator to the grid-search wrapper.
mdl = GridSearchCV(pipeline, parameters, cv=10)
mdl.fit(X, y)

# Display the best parameters found by GridSearchCV
print("Best parameters found: ", mdl.best_params_)

In [19]:
# Accumulators for the held-out predictions and their true labels, plus the
# leave-one-out splitter used by the evaluation loop.
predictions, actuals = [], []
loo = LeaveOneOut()

In [20]:
# Placeholder: replace MDL with the chosen estimator class and pass the best
# hyper-parameters reported by GridSearchCV as keyword arguments to MDL(...).
# (The original line had its closing parenthesis inside the comment, which is a
# syntax error.)
# The model is wrapped in the same StandardScaler pipeline that the grid search
# tuned, so the LOOCV estimate uses the same preprocessing as the search.
mdl = make_pipeline(StandardScaler(), MDL())

In [21]:
# Leave-one-out cross-validation on the UMAP-reduced features: fit on all
# samples but one, predict the held-out sample, and record prediction and label.
# The accumulators are reset here so that re-running this cell does not append
# a second copy of every fold's result.
predictions, actuals = [], []
for train_index, test_index in loo.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    mdl.fit(X_train, y_train)
    y_pred = mdl.predict(X_test)

    # Only the first (index 0) element of each fold's prediction and label is kept.
    predictions.append(y_pred[0])
    actuals.append(y_test[0])

Evaluation metrics

In [None]:
# Aggregate scores over all LOOCV predictions; precision, recall and F1 are
# averaged over classes with support weighting.
# classification_report is not imported in this cell; it relies on the import
# made in the earlier evaluation cell.
accuracy_weighted = accuracy_score(actuals, predictions)
precision_weighted = precision_score(actuals, predictions, average='weighted')
recall_weighted = recall_score(actuals, predictions, average='weighted')
f1_weighted = f1_score(actuals, predictions, average='weighted')

# NOTE(review): this is the same call as accuracy_weighted, so the "Weighted"
# and "Unweighted" accuracy lines print the same number.
accuracy_unweighted = accuracy_score(actuals, predictions)
classification_metrics = classification_report(actuals, predictions, output_dict=True)

# The report dict mixes per-class rows with summary rows. Keep everything that
# is not a summary row, instead of testing `cls.isdigit()`, which silently
# drops classes whose labels are not non-negative integer strings
# (e.g. "1.0" or text labels) and then yields NaN means below.
summary_rows = {'accuracy', 'micro avg', 'macro avg', 'weighted avg', 'samples avg'}
per_class_metrics = {
    cls: metrics
    for cls, metrics in classification_metrics.items()
    if cls not in summary_rows
}

# Print metrics
print(f'LOOCV Accuracy (Weighted): {accuracy_weighted:.2f}')
print(f'LOOCV Precision (Weighted): {precision_weighted:.2f}')
print(f'LOOCV Recall (Weighted): {recall_weighted:.2f}')
print(f'LOOCV F1 Score (Weighted): {f1_weighted:.2f}')
print()

print(f'LOOCV Accuracy (Unweighted): {accuracy_unweighted:.2f}')
print("Precision, Recall, and F1 Score by Class:")
for cls, metrics in per_class_metrics.items():
    print(f"  Class {cls}: Precision={metrics['precision']:.2f}, Recall={metrics['recall']:.2f}, F1 Score={metrics['f1-score']:.2f}")

# Calculate mean and standard deviation of precision, recall, and F1 scores across classes
class_precisions = [metrics['precision'] for metrics in per_class_metrics.values()]
class_recalls = [metrics['recall'] for metrics in per_class_metrics.values()]
class_f1_scores = [metrics['f1-score'] for metrics in per_class_metrics.values()]

print()
print(f'Mean and Standard Deviation of Precision: Mean={np.mean(class_precisions):.2f}, Std={np.std(class_precisions):.2f}')
print(f'Mean and Standard Deviation of Recall: Mean={np.mean(class_recalls):.2f}, Std={np.std(class_recalls):.2f}')
print(f'Mean and Standard Deviation of F1 Score: Mean={np.mean(class_f1_scores):.2f}, Std={np.std(class_f1_scores):.2f}')

In [None]:
# The label set is taken from the LOOCV results themselves. The original cell
# read `rfc.classes_`, but no `rfc` is defined in this notebook; deriving the
# labels here also keeps the tick labels in the same order as the matrix rows.
class_labels = sorted(set(actuals) | set(predictions))
conf_matrix = confusion_matrix(actuals, predictions, labels=class_labels)
disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix, display_labels=class_labels)
disp.plot(cmap=plt.cm.Greens)
plt.title('Confusion Matrix of Estimated Performance When Making Predictions On New Data Based on Concatenated Text with UMAP')
plt.show()