Testing classificiation of primary endpoint type in datasets using MLP

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np # Don't think I need this but it's just habit at this point
import re
import string

from sklearn.neural_network import MLPClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [20]:
import nltk

In [21]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

For some reason, up til now I have yet to save a clean CSV file of the EUCT-NS dataset

In [3]:
# Load the untouched EUCT-NS dataset
df = pd.read_csv('c:\\Users\\s2421127\\Documents\\NLP Project\\ObuayaO\\NLP project\\untouched_euct_ns.csv', encoding='unicode_escape')

In [4]:
print(df.head())

       EudraCT_No                                              Title Phase  \
0  2018-003243-39  A Phase 3, Randomized, Double-Blind, Placebo-C...     3   
1  2009-016138-29  âRandomized, Multicenter, Open-label, Phase ...   iii   
2  2016-000474-38  A Multicenter, 2-Cohort Trial to First Assess ...     0   
3  2014-000418-75  A Multicenter, Multinational, Randomized, Doub...     0   
4  2012-002933-12  A Phase II pilot study to explore treatment wi...    ii   

                                           Objective   End_date  Sample_size  \
0  The primary purpose of this study is to evalua...        NaN          175   
1  To compare the efficacy of plitidepsin in comb...  20-Nov-17          255   
2  To demonstrate that fenfluramine hydrochloride...  05-Jun-18           87   
3  The primary objective of this study was to ass...  19-Jun-18          352   
4  To determine whether patients taking a medicin...  10-Dec-18            8   

                                         pr_endpoi

In [7]:
# Clean the data 
lemmatizer = WordNetLemmatizer()

In [22]:
def preprocess_text(text):
    text = re.sub(r'\W', ' ', text)  # Remove all non-word characters
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    text = text.lower()  # Convert to lower case
    text = text.split()  # Split into words
    text = [lemmatizer.lemmatize(word) for word in text if word not in stopwords.words('english')]  # Lemmatize and remove stopwords # Get rid of primary as a stop word
    return ' '.join(text)

In [23]:
import unicodedata
def strip_accents(text):
    return ''. join(word for word in unicodedata.normalize ('NFD', text)
                     if unicodedata.category(word) != 'Mn')

In [24]:
df['Title'] = df['Title'].apply(preprocess_text)
df['Objective'] = df['Objective'].apply(preprocess_text)
df['pr_endpoint'] = df['pr_endpoint'].apply(preprocess_text)

In [25]:
df['endpoint_description'] = df['endpoint_description'].apply(str)
df['endpoint_description'] = df['endpoint_description'].apply(preprocess_text)
df['Title'] = df['Title'].apply(strip_accents)

In [26]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\s2421127\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [27]:
def no_primary(text):
    stop = nltk.corpus.stopwords.words('english')
    stop.append("primary")
    return ' '.join([word for word in text.split() if word not in stop])

In [28]:
df['pr_endpoint'] = df['pr_endpoint'].apply(no_primary)

In [29]:
display(df.head())

Unnamed: 0,EudraCT_No,Title,Phase,Objective,End_date,Sample_size,pr_endpoint,endpoint_description,Treatment,LT_followup,manual_label
0,2018-003243-39,phase 3 randomized double blind placebo contro...,3,primary purpose study evaluate safety efficacy...,,175,change baseline myasthenia gravis activity dai...,mg adl 8 point questionnaire focusing relevant...,Ravulizumab,No,0
1,2009-016138-29,randomized multicenter open label phase iii st...,iii,compare efficacy plitidepsin combination dexam...,20-Nov-17,255,progression free survival independent review c...,primary study analysis based externally assess...,Aplidin,No,2
2,2016-000474-38,multicenter 2 cohort trial first as pharmacoki...,0,demonstrate fenfluramine hydrochloride superio...,05-Jun-18,87,change convulsive seizure frequency csf baseli...,baseline adjusted csf mean number convulsive s...,fenfluramine hydrochloride,Yes,0
3,2014-000418-75,multicenter multinational randomized double bl...,0,primary objective study as efficacy laquinimod...,19-Jun-18,352,change baseline uhdrs tm week 52,uhdrs as motor function cognition behaviour fu...,Placebo,No,0
4,2012-002933-12,phase ii pilot study explore treatment sodium ...,ii,determine whether patient taking medicine call...,10-Dec-18,8,workload,participant cycled cycle ergometer oxygen cons...,Sodium Valproate,No,1


In [30]:
df.to_csv('euct_ns.csv', index=False)

Train the classifier

In [83]:
import joblib # Need this to save the fitted vectorizer when I apply it to the NS-HRA dataset

In [82]:
euct_ns = pd.read_csv('c:\\Users\\s2421127\\Documents\\NLP Project\\ObuayaO\\NLP project\\Chapter 3\\euct_ns.csv', encoding='unicode_escape')

In [84]:
text_columns = ['Title', 'Objective', 'pr_endpoint', 'endpoint_description']
X = euct_ns[text_columns] 
y = euct_ns['manual_label'].values

X is words so needs to be converted into numerical features

In [85]:
X = X[text_columns].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)

In [34]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [86]:
tfidf = TfidfVectorizer(ngram_range=(1,3))

In [87]:
X = tfidf.fit_transform(X)

In [88]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

In [89]:
joblib.dump(tfidf, 'tfidf_train.pkl')

['tfidf_train.pkl']

In [90]:
# Standardize features by removing the mean and scaling to unit variance
scaler = StandardScaler(with_mean=False) # Sparse dataset
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [91]:
joblib.dump(scaler, 'scaler_train.pkl')

['scaler_train.pkl']

In [59]:
# Create an MLPClassifier model
mlp = MLPClassifier(hidden_layer_sizes=(64, 32),
                    max_iter=1000, random_state=3)

In [60]:
# Train the model on the training data
mlp.fit(X_train, y_train)

# Make predictions on the test data
y_pred = mlp.predict(X_test)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2%}")

Accuracy: 73.68%


In [61]:
print(y_test)

[0 0 2 0 0 0 2 2 2 2 1 0 0 0 0 0 0 1 2 1 2 0 0 0 0 0 0 0 0 0 2 0 0 0 0 2 0
 0]


In [62]:
print(y_pred) # There is no cases of intermediate outcomes in the pred set. Do I re-run it?

[0 0 0 0 0 0 2 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0]


In [46]:
# Generate a classification report
class_report = classification_report(y_test, y_pred)
print(f"Classification Report:\"class_report")

Classification Report:"class_report


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Apply MLP model to NS-HRA dataset

In [63]:
eleven = pd.read_csv('c:\\Users\\s2421127\\Documents\\NLP Project\\ObuayaO\\NLP project\\Chapter 3\\ns_hra_11.csv', encoding='unicode_escape')
twelve = pd.read_csv('c:\\Users\\s2421127\\Documents\\NLP Project\\ObuayaO\\NLP project\\Chapter 3\\ns_hra_12.csv', encoding='unicode_escape')
thirteen = pd.read_csv('c:\\Users\\s2421127\\Documents\\NLP Project\\ObuayaO\\NLP project\\Chapter 3\\ns_hra_13.csv', encoding='unicode_escape')
fourteen = pd.read_csv('c:\\Users\\s2421127\\Documents\\NLP Project\\ObuayaO\\NLP project\\Chapter 3\\ns_hra_14.csv', encoding='unicode_escape')
fifteen = pd.read_csv('c:\\Users\\s2421127\\Documents\\NLP Project\\ObuayaO\\NLP project\\Chapter 3\\ns_hra_15.csv', encoding='unicode_escape')
sixteen = pd.read_csv('c:\\Users\\s2421127\\Documents\\NLP Project\\ObuayaO\\NLP project\\Chapter 3\\ns_hra_16.csv', encoding='unicode_escape')
seventeen = pd.read_csv('c:\\Users\\s2421127\\Documents\\NLP Project\\ObuayaO\\NLP project\\Chapter 3\\ns_hra_17.csv', encoding='unicode_escape')
eighteen = pd.read_csv('c:\\Users\\s2421127\\Documents\\NLP Project\\ObuayaO\\NLP project\\Chapter 3\\ns_hra_18.csv', encoding='unicode_escape')
nineteen = pd.read_csv('c:\\Users\\s2421127\\Documents\\NLP Project\\ObuayaO\\NLP project\\Chapter 3\\ns_hra_19.csv', encoding='unicode_escape')
twenty = pd.read_csv('c:\\Users\\s2421127\\Documents\\NLP Project\\ObuayaO\\NLP project\\Chapter 3\\ns_hra_20.csv', encoding='unicode_escape')

In [64]:
ns_hra_untouched = pd.concat([eleven, twelve, thirteen, fourteen, fifteen, sixteen, seventeen, eighteen, nineteen, twenty], axis=0)

In [65]:
len(ns_hra_untouched)

694

In [66]:
ns_hra_untouched.to_csv('untouched_ns_hra.csv', index=False)

In [67]:
df = pd.read_csv('c:\\Users\\s2421127\\Documents\\NLP Project\\ObuayaO\\NLP project\\Chapter 3\\untouched_ns_hra.csv', encoding='unicode_escape')

In [68]:
display(df.head())

Unnamed: 0,Unique_ID,Title,Phase,Objective,End_date,Sample_size,1ry_endpoint,Treatment,LT_followup
0,IRAS_projectID_57754,"Phase II, multicenter, randomized, adaptive, d...",ii,To assess the efficacy and the safety of liqui...,31/05/2013 00:00,150.0,The primary outcome measure is the Motor Funct...,,Subjects will participate in the study for a t...
1,IRAS_projectID_82035,"A Randomized, Controlled, Long-term Safety Stu...",0,The principal objective of the study is to ass...,30/09/2013 00:00,450.0,The primary endpoint in this study is the chan...,,Subjects will be involved in this study for ju...
2,IRAS_projectID_64187,"A Randomized, Double-Blind, Double-Dummy, Para...",0,To assess whether the efficacy of Ocrelizumab ...,31/03/2015 00:00,800.0,The primary efficacy analysis for this trial w...,,Screening 2 weeks\nTreatment phase 96 weeks\nS...
3,IRAS_projectID_72673,"Multi-Centre, Open-Label, Randomised Trial Inv...",0,To evaluate the pharmacokinetics (PK) of two d...,15/10/2011 00:00,10.0,Primary Endpoints: Pharmacokinetic Endpoints\n...,,The subjectÃ¢ÂÂs participation in the trial ...
4,IRAS_projectID_67978,The use of carer assisted adherence therapy fo...,0,PRIMARY Aim:\nTo investigate if a seven week p...,,120.0,The primary outcome measures for the study are...,,Three to three and a half months for patients ...


In [70]:
df['Title'] = df['Title'].apply(str)

In [71]:
df['Title'] = df['Title'].apply(preprocess_text)
df['Objective'] = df['Objective'].apply(preprocess_text)
df['1ry_endpoint'] = df['1ry_endpoint'].apply(preprocess_text)

In [72]:
df['Title'] = df['Title'].apply(strip_accents)

In [73]:
df['1ry_endpoint'] = df['1ry_endpoint'].apply(no_primary)

In [74]:
display(df.head())

Unnamed: 0,Unique_ID,Title,Phase,Objective,End_date,Sample_size,1ry_endpoint,Treatment,LT_followup
0,IRAS_projectID_57754,phase ii multicenter randomized adaptive doubl...,ii,ass efficacy safety liquid suspension formulat...,31/05/2013 00:00,150.0,outcome measure motor function measure mfm d1 ...,,Subjects will participate in the study for a t...
1,IRAS_projectID_82035,randomized controlled long term safety study e...,0,principal objective study ass long term safety...,30/09/2013 00:00,450.0,endpoint study change baseline total score nor...,,Subjects will be involved in this study for ju...
2,IRAS_projectID_64187,randomized double blind double dummy parallel ...,0,ass whether efficacy ocrelizumab given two dos...,31/03/2015 00:00,800.0,efficacy analysis trial compare annualized pro...,,Screening 2 weeks\nTreatment phase 96 weeks\nS...
3,IRAS_projectID_72673,multi centre open label randomised trial inves...,0,evaluate pharmacokinetics pk two different bat...,15/10/2011 00:00,10.0,endpoint pharmacokinetic endpoint pk endpoint ...,,The subjectÃ¢ÂÂs participation in the trial ...
4,IRAS_projectID_67978,use carer assisted adherence therapy people pa...,0,primary aim investigate seven week programme c...,,120.0,outcome measure study morisky medication asses...,,Three to three and a half months for patients ...


In [75]:
df.to_csv('ns_hra.csv', index=False)

Make predictions of primary endpoint type on NS-HRA dataset

In [76]:
ns_hra = pd.read_csv('c:\\Users\\s2421127\\Documents\\NLP Project\\ObuayaO\\NLP project\\Chapter 3\\ns_hra.csv', encoding='unicode_escape')

In [92]:
vectorizer = joblib.load('tfidf_train.pkl')
scaler = joblib.load('scaler_train.pkl')

In [93]:
text_columns = ['Title', 'Objective', '1ry_endpoint'] # In the HRA REC forms, the primary endpoint and endpoint description are together
X2 = ns_hra[text_columns] 

In [94]:
X2 = X2[text_columns].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)

In [95]:
X2 = vectorizer.transform(X2)

In [96]:
X2 = scaler.transform(X2) # I did this in the training so I guess I have to do that here

In [97]:
y_pred = mlp.predict(X2)

In [99]:
confidence_scores = mlp.predict_proba(X2) # How sure is the model on the predictions that it made?

In [101]:
euct_ns_pred = pd.DataFrame(confidence_scores, columns=['PFO_0', 'IO_1', 'SO_2'])
euct_ns_pred['Predicted_label'] = y_pred

In [102]:
print(euct_ns_pred.head())

      PFO_0      IO_1      SO_2  Predicted_label
0  0.966957  0.023504  0.009539                0
1  0.992628  0.005143  0.002228                0
2  0.998461  0.001181  0.000358                0
3  0.844298  0.087099  0.068603                0
4  0.961650  0.026418  0.011932                0


In [103]:
euct_ns_pred.to_csv('euct_ns_pred.csv', index=False)