Testing classificiation of primary endpoint type in datasets using MLP with SciBERT embeddings

In [1]:
import pandas as pd

import numpy as np # Don't think I need this but it's just habit at this point
import re
import string

#import torch
#from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib

import nltk

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [2]:
import torch
from transformers import AutoTokenizer, AutoModel

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import tqdm as notebook_tqdm

Train the classifier

In [4]:
euct_ns = pd.read_csv('c:\\Users\\s2421127\\Documents\\NLP Project\\ObuayaO\\NLP project\\Chapter 3\\euct_ns.csv', encoding='unicode_escape')

In [5]:
text_columns = ['Title', 'Objective', 'pr_endpoint', 'endpoint_description']
X = euct_ns[text_columns] 
y = euct_ns['manual_label'].values

X is words so needs to be converted into numerical features

In [6]:
X = X[text_columns].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)

In [7]:
# Load pretrained SciBERT tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_uncased")
model = AutoModel.from_pretrained("allenai/scibert_scivocab_uncased")



In [8]:
# Tokenize the text and generate embeddings
def generate_embeddings(texts, tokenizer, model, max_len=512):
    """Generate embeddings for a list of texts using SciBERT."""
    model.eval()  # Set the model to evaluation mode
    with torch.no_grad():
        inputs = tokenizer(
            texts.tolist(), 
            padding=True, 
            truncation=True, 
            max_length=max_len, 
            return_tensors="pt"
        )
        outputs = model(**inputs)
        # Use the [CLS] token representation (typically at index 0)
        embeddings = outputs.last_hidden_state[:, 0, :].numpy()
    return embeddings

In [9]:
# Generate embeddings for the dataset
X_embeddings = generate_embeddings(X, tokenizer, model)

In [10]:
joblib.dump(X_embeddings, "embeddings.pkl")

['embeddings.pkl']

In [11]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X_embeddings, y, test_size=0.2, random_state=42
)

In [12]:
# Scale the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

joblib.dump(scaler, "scaler.pkl")

['scaler.pkl']

In [13]:
print("Shape of X_train_embeddings during scaler training:", X_train.shape)

Shape of X_train_embeddings during scaler training: (152, 768)


In [14]:
# Create an MLPClassifier model
mlp = MLPClassifier(hidden_layer_sizes=(64, 32),
                    max_iter=1000, random_state=3)

In [15]:
# Train the model on the training data
mlp.fit(X_train, y_train)

# Make predictions on the test data
y_pred = mlp.predict(X_test)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2%}")

Accuracy: 65.79%


In [16]:
joblib.dump(mlp, "model.pkl")

['model.pkl']

In [None]:
print(y_test)

In [None]:
print(y_pred) # There is no cases of intermediate outcomes in the pred set. Do I re-run it?

In [None]:
# Generate a classification report
class_report = classification_report(y_test, y_pred)
print(class_report)

Apply MLP model to NS-HRA dataset

In [17]:
ns_hra = pd.read_csv('c:\\Users\\s2421127\\Documents\\NLP Project\\ObuayaO\\NLP project\\Chapter 3\\ns_hra.csv', encoding='unicode_escape')

Now this should be properly cleaned and I can re-run everything

In [18]:
embeddings = joblib.load('embeddings.pkl')
scaler = joblib.load('scaler_train.pkl')

In [None]:
model = joblib.load('model.pkl')

In [20]:
text_columns = ['Title', 'Objective', '1ry_endpoint'] # In the HRA REC forms, the primary endpoint and endpoint description are together
X2 = ns_hra[text_columns] 

In [21]:
X2 = X2[text_columns].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)

In [22]:
X2_embeddings = generate_embeddings(X2, tokenizer, model)

AttributeError: 'MLPClassifier' object has no attribute 'eval'

In [None]:
X2 = X2_embeddings

In [None]:
X2 = scaler.transform(X2_embeddings) # I did this in the training so I guess I have to do that here

In [None]:
y_pred = mlp.predict(X2)

In [None]:
confidence_scores = mlp.predict_proba(X2) # How sure is the model on the predictions that it made?

In [None]:
euct_ns_pred = pd.DataFrame(confidence_scores, columns=['PFO_0', 'IO_1', 'SO_2'])
euct_ns_pred['Predicted_label'] = y_pred

In [None]:
print(euct_ns_pred.head())

In [None]:
euct_ns_pred.to_csv('euct_ns_pred.csv', index=False)

Descriptive statistics
# Need to know what the hell is going on

In [None]:
# Data visualisation of clusters - t-SNE
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

In [None]:
# Convert sparse matrix to dense
X2_dense = X2.toarray()

# Reduce dimensions using t-SNE
tsne = TSNE(n_components=2, random_state=42)
X2_tsne = tsne.fit_transform(X2_dense)

plt.figure(figsize=(10, 6))
scatter = plt.scatter(X2_tsne[:, 0], X2_tsne[:, 1], c=y_pred, cmap='viridis', marker='o', edgecolor='k')
plt.title('t-SNE of predicted primary endpoint types in the NS-HRA dataset')
plt.colorbar(scatter)
plt.grid(True)
#plt.savefig('t-SNE of predicted primary endpoint types in the NS-HRA dataset')
plt.show()

In [None]:
# Slope chart for frequency distribution across different labels
pfo_df = ns_hra[y_pred == 0]
io_df = ns_hra[y_pred == 1]
so_df = ns_hra[y_pred == 2]

In [None]:
pfo_df.head() #Sanity check

In [None]:
pfo_df = pfo_df.copy()
io_df = io_df.copy()
so_df = so_df.copy()

In [None]:
pfo_df['concat_corpus'] = pfo_df['Title']+ " " + pfo_df['Objective'] + " " + pfo_df['1ry_endpoint'] 
io_df['concat_corpus'] = io_df['Title']+ " " + io_df['Objective'] + " " + io_df['1ry_endpoint'] 
so_df['concat_corpus'] = so_df['Title']+ " " + so_df['Objective'] + " " +so_df['1ry_endpoint'] 

In [None]:
pfo_df.head() #sanity check

In [None]:
# Fill missing values with an empty string
so_df['concat_corpus'] = so_df['concat_corpus'].fillna('')

tfidf_matrix = vectorizer.fit_transform(so_df['concat_corpus'])
so_tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())
print(so_tfidf_df)

In [None]:
so_tfidf_scores = np.asarray(tfidf_matrix.mean(axis=0)).ravel()

In [None]:
feature_names = vectorizer.get_feature_names_out()

In [None]:
so_tfidf_df_list = pd.DataFrame({'word': feature_names, 'tfidf_score': so_tfidf_scores})

In [None]:
so_tfidf = so_tfidf_df_list.sort_values(by='tfidf_score', ascending=False)

In [None]:
 top_n = 618

In [None]:
so_tfidf.head(top_n).to_csv('tf-idf so.csv', index=False)

In [None]:
pfo_tfidf_df = pd.read_csv('c:\\Users\\s2421127\\Documents\\NLP Project\\ObuayaO\\NLP project\\Chapter 3\\tf-idf pfo.csv', encoding='unicode_escape')
io_tfidf_df = pd.read_csv('c:\\Users\\s2421127\\Documents\\NLP Project\\ObuayaO\\NLP project\\Chapter 3\\tf-idf io.csv', encoding='unicode_escape')
so_tfidf_df = pd.read_csv('c:\\Users\\s2421127\\Documents\\NLP Project\\ObuayaO\\NLP project\\Chapter 3\\tf-idf so.csv', encoding='unicode_escape')

In [None]:
merged_df = pfo_tfidf_df.merge(io_tfidf_df, on='word', suffixes=('_pfo', '_io'))
merged_df = merged_df.merge(so_tfidf_df, on='word')
merged_df.rename(columns={'tfidf_score': 'tfidf_score_so'}, inplace=True)

In [None]:
merged_df.head()
# The first one is clearly a problem with the structure of XML files and i've cleaned as best as I can google. I'll just delete row 1.

In [None]:
merged_df = merged_df.iloc[1:]
merged_df = merged_df.copy()

In [None]:
merged_df['max_diff'] = merged_df[['tfidf_score_pfo', 'tfidf_score_io', 'tfidf_score_so']].max(axis=1) - \
                        merged_df[['tfidf_score_pfo', 'tfidf_score_io', 'tfidf_score_so']].min(axis=1)

sorted_df = merged_df.sort_values(by='max_diff', ascending=False)

top_features_df = sorted_df.head(15)
print(top_features_df)

In [None]:
top_features_df = top_features_df.copy()

In [None]:
fig, ax = plt.subplots(figsize=(12, 8))

jitter_strength = 0.4

for i in range(len(top_features_df)):
    x_positions = np.array([0, 1, 2]) + np.random.normal(0, jitter_strength, size=3)
    ax.plot(['pfo', 'io', 'so'], 
            [top_features_df.iloc[i]['tfidf_score_pfo'], top_features_df.iloc[i]['tfidf_score_io'], top_features_df.iloc[i]['tfidf_score_so']],
            marker='o', label=top_features_df['word'].iloc[i])

for i, word in enumerate(top_features_df['word']):
    for j, subset in enumerate(['pfo', 'io', 'so']):
        ax.text(j, top_features_df.iloc[i, j+1], f'{word}', verticalalignment='center', fontsize=8)

ax.axvline(x=1, color='gray', linestyle='--', linewidth=1) 

# Customize the plot
ax.set_title('Slope Chart of Largest Change in TF-IDF Score Across Predicted Primary Endpoint Label in The NS-HRA dataset', fontsize=16)
ax.set_ylabel('TF-IDF Score')
ax.set_xticks(['pfo', 'io', 'so'])
ax.grid(True, which='both', axis='y', linestyle='--', linewidth=0.7)

# Display the slope chart
plt.show()

In [None]:
# Correlation matrix of TF-IDF scores

In [None]:
concordance_df = merged_df[['word', 'tfidf_score_pfo', 'tfidf_score_io', 'tfidf_score_so']]
correlation_matrix = concordance_df[['tfidf_score_pfo', 'tfidf_score_io', 'tfidf_score_so']].corr()
print(correlation_matrix)

In [None]:
import seaborn as sns

In [None]:
plt.figure(figsize=(8, 6))

sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='Greens', square=True)

plt.title('Correlation Matrix of TF-IDF Scores Across Predicted Labels in NS-HRA dataset', fontsize=16)
plt.show()