In [2]:
#!pip install simpletransformers
#!pip install wandb
#!pip install torch

import logging
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, classification_report, average_precision_score
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.datasets import make_classification

#from imblearn.over_sampling import SMOTE
#from simpletransformers.classification import ClassificationModel
#import wandb

In [None]:
#from google.colab import drive
#drive.mount('/content/drive')

## Preprocessing for NLP model

In [3]:
# Input workbook and output folder for the NLP preprocessing step.
# Colab paths (enable these instead of the local ones when running on Google Colab)
#raw_data_path = '/content/drive/My Drive/NLP/ML_xlnet/training_test.xlsx'
#destination_folder = '/content/drive/My Drive/NLP/ML_xlnet/preprocessed'

# Local paths, relative to the working directory of the notebook
raw_data_path = 'Data/wo_Meta/unprocessed/training_test.xlsx'
destination_folder = 'Data/wo_Meta/preprocessed'

# Read raw data; the following cells use its 'label', 'title', 'text' and 'DOI' columns
df_raw = pd.read_excel(raw_data_path)

In [7]:
# Prepare columns: binary target (1 for 'YES', 0 for anything else) and a
# combined "title. text" field. A missing title or text makes titletext NaN.
df_raw['label'] = (df_raw['label'] == 'YES').astype('int')
df_raw['titletext'] = df_raw['title'] + ". " + df_raw['text']

# Drop rows whose text is shorter than 5 characters, then rows without a title.
# Rows with a missing text are not matched by the length test (NaN < 5 is False).
df_raw = df_raw.drop(df_raw[df_raw['text'].str.len() < 5].index)
# .copy() turns the filtered slice into an independent frame, so the in-place
# drop/replace/dropna calls that follow do not operate on a view of the raw data
# (which triggers SettingWithCopyWarning).
df_raw = df_raw[df_raw['title'].notnull()].copy()
#df_raw = df_raw[df_raw['text'].fillna("")]

def trim_string(x, first_n_words=1000):
    """Return the first ``first_n_words`` whitespace-separated words of ``x``.

    The word limit used to be read from a module-level ``first_n_words``
    variable that is commented out in this notebook, so calling the function
    raised NameError. It is now a parameter whose default matches that
    commented-out value; pass another limit explicitly, e.g.
    ``series.apply(trim_string, first_n_words=500)``.

    Parameters
    ----------
    x : str
        Text to truncate. Non-string values (e.g. NaN for a missing text) are
        returned unchanged instead of raising AttributeError.
    first_n_words : int, default 1000
        Maximum number of words to keep.

    Returns
    -------
    str
        The kept words joined by single spaces.
    """
    if not isinstance(x, str):
        return x

    # maxsplit stops splitting after the n-th word; the unsplit remainder ends
    # up as one trailing element, which the slice below discards.
    words = x.split(maxsplit=first_n_words)
    return ' '.join(words[:first_n_words])

# Optional truncation of text and titletext to first_n_words (currently disabled)
#first_n_words = 1000
#df_raw['text'] = df_raw['text'].apply(trim_string)
#df_raw['titletext'] = df_raw['titletext'].apply(trim_string) 

# title and text are redundant once titletext exists. Empty strings are turned
# into NaN so that the dropna on titletext also removes empty rows.
nan_value = float("NaN")
df_raw = (
    df_raw
    .drop(columns=["title", "text"])
    .replace("", nan_value)
    .dropna(subset=["titletext"])
)

# Features (combined text plus the DOI identifier) and target
X = df_raw.loc[:, ["titletext", "DOI"]]
y = df_raw.loc[:, "label"]

In [None]:
# train test split
# n_splits=5 draws five stratified shuffles and the loop keeps only the last
# one; this is left unchanged so the partition stays the same as in earlier runs.
sss = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
for train_index, test_index in sss.split(X, y):
    X_train_pre, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train_pre, y_test = y.iloc[train_index], y.iloc[test_index]

#We will not need this later, this is a preliminary mix of metadata filtering with NLP test-set
#X_test_filtered = X_test[~X_test['DOI'].isin(DOIs_for_prefiltering)]
#ids_X_test_filtered = X_test_filtered.index.values.tolist()
#y_test_filtered = y_test[y_test.index.isin(ids_X_test_filtered)]

#X_test = X_test.drop("DOI", axis=1)
#X_test_filtered = X_test_filtered.drop("DOI", axis=1)

## train valid split (20% of the remaining training data, again the last of five shuffles)
sss = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
for train_index, test_index in sss.split(X_train_pre, y_train_pre):
    X_train, X_valid = X_train_pre.iloc[train_index], X_train_pre.iloc[test_index]
    y_train, y_valid = y_train_pre.iloc[train_index], y_train_pre.iloc[test_index]

# Write preprocessed data
X_train.to_csv(destination_folder + '/X_train.csv', index=False)
y_train.to_csv(destination_folder + '/y_train.csv', index=False)
X_test.to_csv(destination_folder + '/X_test.csv', index=False)
y_test.to_csv(destination_folder + '/y_test.csv', index=False)
X_valid.to_csv(destination_folder + '/X_valid.csv', index=False)
y_valid.to_csv(destination_folder + '/y_valid.csv', index=False)

# The filtered test set is only produced by the disabled lines above, so writing
# it raised NameError. Re-enable these two lines together with those lines.
#X_test_filtered.to_csv(destination_folder + '/X_test_filtered.csv', index=False)
#y_test_filtered.to_csv(destination_folder + '/y_test_filtered.csv', index=False)

# X_* are DataFrames (titletext, DOI) and have no to_frame(); the label Series is
# joined directly on the shared row index.
df_train = X_train.join(y_train)
df_test = X_test.join(y_test)
df_valid = X_valid.join(y_valid)

In [None]:
# Reload the train/test split from CSV.
# NOTE(review): these files are read from the working directory, while the split
# cell writes to destination_folder — confirm which location is intended.
X_train = pd.read_csv("./X_train.csv")
y_train = pd.read_csv("./y_train.csv")
X_test = pd.read_csv("./X_test.csv")
y_test = pd.read_csv("./y_test.csv")

# Build the labelled frames from copies. Plain assignment (df_train = X_train)
# only aliases the frame, so adding "label" also put the target into X_train/X_test.
df_train = X_train.copy()
df_train["label"] = y_train
df_test = X_test.copy()
df_test["label"] = y_test

## Pre-Processing and matching metadata with annotations data

In [2]:
# Colab paths
#raw_data_path_meta = '/content/drive/My Drive/NLP/Data/w_Meta/raw/Pubmed_final_df_v2.csv'
#raw_data_path = '/content/drive/My Drive/NLP/Data/wo_Meta/raw/annotated/Scope extract (1).xlsx'
#destination_folder = '/content/drive/My Drive/NLP/Data/w_Meta/preprocessed'

# Local paths
raw_data_path_meta = 'Data/w_Meta/raw/Pubmed_final_df_v2.csv'
raw_data_path = 'Data/wo_Meta/raw/annotated/Scope extract (1).xlsx'
destination_folder = 'Data/w_Meta/preprocessed'

# Metadata table (CSV) and annotated table (Excel). The annotation column
# 'Title' is renamed to 'ArticleTitle' right after loading.
df_raw_meta = pd.read_csv(raw_data_path_meta)
df_raw = pd.read_excel(raw_data_path).rename(columns={'Title': 'ArticleTitle'})

# Disabled DOI/PMID lookup tables
#doi2pmid = "/content/drive/My Drive/NLP/Data/DOI2PMID/DOIs and PMIDs (1).csv"
#doi2pmid2 = "/content/drive/My Drive/NLP/Data/DOI2PMID/DOIs and PMIDs (2).csv"
#text2pmid = "/content/drive/My Drive/NLP/Data/wo_Meta/raw/Pubmed_final_df.csv"

#df_doi2pmid = pd.read_csv(doi2pmid)
#df_doi2pmid2 = pd.read_csv(doi2pmid2)
#df_text2pmid = pd.read_csv(text2pmid)

#df_doi2pmid2.drop(columns=['Unnamed: 0'], inplace=True)

# Inner join on DOI: only articles present in both tables are kept. Columns that
# exist in both inputs receive the default _x (metadata) / _y (annotation) suffixes.
df_merged = df_raw_meta.merge(df_raw, how='inner', on=['DOI'])

# MESH and Keywords hold stringified lists. Remove quotes, square brackets and
# spaces in one pass so that a plain comma-separated string remains.
# regex=True is passed explicitly: the default of Series.str.replace differs
# between pandas versions, and a bare "[" or "]" is not a valid regular
# expression on its own, so the four separate replace calls depended on the
# installed version.
list_punctuation = r"['\[\] ]"

df_merged["MESH"] = (
    df_merged["MESH"]
    .str.replace(list_punctuation, "", regex=True)
    .fillna("No data provided")  # placeholder for articles without MeSH terms
)

df_merged["Keywords"] = (
    df_merged["Keywords"]
    .str.replace(list_punctuation, "", regex=True)
    .fillna("")
)

# Remove the leftover index column, the abstract and the annotation-side title,
# then give the metadata-side title its plain name back.
df_merged = (
    df_merged
    .drop(columns=["Unnamed: 0", "AbstractText", "ArticleTitle_y"])
    .rename(columns={"ArticleTitle_x": "ArticleTitle"})
)

# Colab
#df_merged.to_csv('/content/drive/My Drive/NLP/Data/w_Meta/annotated/merged_1000only.csv', index=False)

# Local
df_merged.to_csv('./Data/w_Meta/annotated/merged_1000only.csv', index=False)

In [20]:
# Colab path
#data_merged = '/content/drive/My Drive/NLP/Data/w_Meta/annotated/merged_1000only.csv'

# Local path (the file written by the merge cell)
data_merged = 'Data/w_Meta/annotated/merged_1000only.csv'

# Reload the merged table from disk.
# NOTE(review): read_csv parses empty fields as NaN by default, so the empty
# Keywords strings written by the merge cell presumably come back as NaN — confirm.
df_merged = pd.read_csv(data_merged)

In [3]:
def converter(x):
    """Split a comma-separated string into a list of its parts.

    Values without a ``split`` method (for example NaN or None) yield None.
    """
    split = getattr(x, "split", None)
    if split is None:
        return None
    return split(',')

# One entry per article: the list of MeSH terms, or None where converter
# cannot split the value.
dic_MESH = pd.Series(df_merged.MESH.values).apply(converter).to_numpy()

# Same structure for the author keywords
dic_key = pd.Series(df_merged.Keywords.values).apply(converter).to_numpy()

## Make Dictionary / Frequencies

In [28]:
# Flatten the per-article MeSH term lists into one list of stripped terms.
# The loop variable is no longer called `list` (that shadowed the builtin for
# the rest of the kernel session), and entries without a term list (None) are
# skipped instead of raising TypeError.
frequency_flattened = [
    term.strip()
    for terms in dic_MESH
    if terms is not None
    for term in terms
]

# Count occurrences, then sort by descending count -> list of (term, count) tuples
wordfreq = {}
for word in frequency_flattened:
    wordfreq[word] = wordfreq.get(word, 0) + 1
wordfreq = sorted(wordfreq.items(), key=lambda item: item[1], reverse=True)

In [None]:
# Flatten the per-article keyword lists into one list of stripped keywords.
# converter returns None for articles whose Keywords value is not a string
# (e.g. NaN), so those entries are skipped; iterating over them raised
# TypeError. The loop variable is no longer called `list`, which shadowed the
# builtin for the rest of the kernel session.
frequency_flattened = [
    keyword.strip()
    for keywords in dic_key
    if keywords is not None
    for keyword in keywords
]

# Count occurrences, then sort by descending count -> list of (keyword, count) tuples
wordfreq = {}
for word in frequency_flattened:
    wordfreq[word] = wordfreq.get(word, 0) + 1
wordfreq = sorted(wordfreq.items(), key=lambda item: item[1], reverse=True)

In [None]:
# Display the most recently computed (term, count) pairs, most frequent first
wordfreq

## Dataset with Metadata pre-screening

In [4]:
# Replace the comma-separated MESH strings by the per-article term lists
df_merged["MESH"] = dic_MESH.copy()

# One row per article, one indicator column per MeSH term: expand each list
# into columns, stack to (article, position) -> term, one-hot encode, then add
# the indicators up per article. groupby(level=0).sum() replaces the
# DataFrame.sum(level=0) spelling, which pandas 2.0 removed.
mesh_terms = df_merged["MESH"].apply(pd.Series).stack()
df_meta_only = pd.get_dummies(mesh_terms).groupby(level=0).sum()

# Encode the target as 1 ('YES') / 0 (anything else). Skipped when the column
# is already numeric: encoding twice would compare integers with 'YES' and
# silently set every label to 0.
if not pd.api.types.is_numeric_dtype(df_merged["SHORTLIST?"]):
    df_merged["SHORTLIST?"] = (df_merged["SHORTLIST?"] == 'YES').astype('int')

# Carry target and identifiers over (aligned on the row index)
df_meta_only["SHORTLIST?"] = df_merged["SHORTLIST?"].copy()
df_meta_only["PMID"] = df_merged["PMID"].copy()
df_meta_only["DOI"] = df_merged["DOI"].copy()

In [5]:
# took 'Transcriptome' out of negative list due to poor performance 
# took DeliveryofHealthCare from positive to negative list 
# NOTE(review): 'DeliveryofHealthCare' is still the first entry of positive_list
# and is absent from negative_list, which contradicts the line above — confirm
# which list it belongs to. The lists are left unchanged here.
negative_list = [
    'Animals', 'Mice', 'MolecularDockingSimulation', 'GeneExpressionProfiling',
    'Rats', 'Proteomics', 'BindingSites', 'Animal', 'Ligands',
    'AminoAcidSequence', 'Plant',
]
positive_list = [
    'DeliveryofHealthCare', 'ElectronicHealthRecords',
    'RadiographicImageInterpretation', 'Radiography', 'SeverityofIllnessIndex',
    'ClinicalDecision-Making', 'Electrocardiography', 'TreatmentOutcome',
    'Mammography', 'Ultrasonography', 'Asthma', 'PulmonaryDisease',
    'ChronicObstructive', 'EarlyDetectionofCancer', 'Hospitals',
    'AtrialFibrillation', 'Triage', 'HealthCare', 'Hospitalization', 'Anemia',
    'ChronicDisease', 'Radiology', 'CoronaryAngiography', 'ParkinsonDisease',
    'Dementia', 'RenalInsufficiency', 'CoronaryArteryDisease',
    'AcuteCoronarySyndrome', 'Electroencephalography', 'Echocardiography',
    'HeartVentricles', 'CriticalCare', 'PulmonaryEmbolism', 'DiagnosticImaging',
    'MagneticResonanceAngiography', 'MyocardialInfarction', 'IntensiveCareUnits',
]

# Articles carrying at least one term of the respective list.
# axis is passed by keyword: the positional form any(1) is rejected by pandas 2.x.
df_sanity_check_negative = df_meta_only[df_meta_only[negative_list].any(axis=1)]
df_sanity_check_positive = df_meta_only[df_meta_only[positive_list].any(axis=1)]

In [6]:
#How good are the lists overall?
# Share of positive-list articles that are shortlisted, and share of
# negative-list articles that are not.
positive_rate = df_sanity_check_positive["SHORTLIST?"].sum() / len(df_sanity_check_positive)
negative_rate = 1 - df_sanity_check_negative["SHORTLIST?"].sum() / len(df_sanity_check_negative)
print(
    "percentage positive correctly classified: " + str(positive_rate),
    "percentage negative correctly classified: " + str(negative_rate),
    sep='\n',
)

percentage positive correctly classified: 0.376865671641791
percentage negative correctly classified: 0.9774774774774775


In [9]:
#Worst classifiers
# For every positive-list term: how many of the articles carrying the term are
# shortlisted, as a percentage and in absolute numbers.
for item in positive_list:
    # axis by keyword: the positional form any(1) is rejected by pandas 2.x
    df_temp = df_meta_only[df_meta_only[[item]].any(axis=1)]
    n_positive = df_temp["SHORTLIST?"].sum()
    n_total = df_temp[item].count()
    # Scale to percent first and round afterwards; round(x, 2) * 100 can print
    # float artefacts such as 56.99999999999999.
    percent = round(n_positive / n_total * 100, 0)
    print(item + ' accurcacy: ' + str(percent) + '%, and in numbers: ' + str(n_positive) + " are positive. Total frequency '" + item + "' occurs: " + str(n_total))

DeliveryofHealthCare accurcacy: 0.0%, and in numbers: 0 are positive. Total frequency 'DeliveryofHealthCare' occurs: 14
ElectronicHealthRecords accurcacy: 73.0%, and in numbers: 37 are positive. Total frequency 'ElectronicHealthRecords' occurs: 51
RadiographicImageInterpretation accurcacy: 70.0%, and in numbers: 16 are positive. Total frequency 'RadiographicImageInterpretation' occurs: 23
Radiography accurcacy: 48.0%, and in numbers: 10 are positive. Total frequency 'Radiography' occurs: 21
SeverityofIllnessIndex accurcacy: 44.0%, and in numbers: 7 are positive. Total frequency 'SeverityofIllnessIndex' occurs: 16
ClinicalDecision-Making accurcacy: 31.0%, and in numbers: 4 are positive. Total frequency 'ClinicalDecision-Making' occurs: 13
Electrocardiography accurcacy: 46.0%, and in numbers: 6 are positive. Total frequency 'Electrocardiography' occurs: 13
TreatmentOutcome accurcacy: 8.0%, and in numbers: 1 are positive. Total frequency 'TreatmentOutcome' occurs: 13
Mammography accurcacy

In [7]:
# Identifiers of the articles matched by the negative MeSH list
PMIDs_for_prefiltering, DOIs_for_prefiltering = (
    df_sanity_check_negative[id_column].tolist() for id_column in ("PMID", "DOI")
)

# Meta data ML Model

In [22]:
# Rebuild the MeSH indicator matrix (one row per article, one column per term).
df_merged["MESH"] = dic_MESH.copy()
mesh_terms = df_merged["MESH"].apply(pd.Series).stack()
# groupby(level=0).sum() replaces DataFrame.sum(level=0), removed in pandas 2.0
df_meta_only = pd.get_dummies(mesh_terms).groupby(level=0).sum()

# Encode the target only while it still holds the raw 'YES'/other strings.
# The pre-screening cell above already converts it; converting again compared
# integers with 'YES' and set every label to 0 on a top-to-bottom run.
if not pd.api.types.is_numeric_dtype(df_merged["SHORTLIST?"]):
    df_merged["SHORTLIST?"] = (df_merged["SHORTLIST?"] == 'YES').astype('int')
df_meta_only["SHORTLIST?"] = df_merged["SHORTLIST?"].copy()

#df_meta_only.drop([col for col, val in df_meta_only.sum().iteritems() if val < 6], axis=1, inplace=True)

# Target and features (all MeSH indicator columns)
y = df_meta_only["SHORTLIST?"]
X = df_meta_only.drop(columns="SHORTLIST?")

# train test split
# n_splits=5 draws five stratified shuffles and the loop keeps only the last
# one; left unchanged so the partition stays the same as in earlier runs.
sss = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
for train_index, test_index in sss.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

In [None]:
#sm = SMOTE(random_state=42)
#X_train, y_train = sm.fit_resample(X_train, y_train)

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split.
# 'sqrt' is what 'auto' meant for classifiers; 'auto' itself was removed in
# scikit-learn 1.3, while 'sqrt' is accepted by old and new versions alike.
max_features = ['sqrt']
# Maximum number of levels in tree (None = unlimited)
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
# Class 1 (shortlisted) is weighted 4x. The forest is seeded as well, so the
# search result is reproducible; previously only the parameter sampling was seeded.
clf = RandomForestClassifier(class_weight = {1:4}, random_state=42)
# 10 sampled parameter combinations, 3-fold cross-validation, all cores
rf_random = RandomizedSearchCV(estimator = clf, param_distributions = random_grid, n_iter = 10, cv = 3, verbose=2, random_state=42, n_jobs = -1)
rf_random.fit(X_train, y_train)

In [None]:
# Best estimator found by the randomized search
best_random = rf_random.best_estimator_
# Score on the training split itself: an optimistic figure. The held-out
# evaluation follows in the next cell.
best_random.score(X_train, y_train)

In [None]:
# Predict the held-out test split with the tuned forest
y_pred = best_random.predict(X_test)
# Text summary of the per-class metrics for those predictions
print(classification_report(y_test, y_pred))

In [None]:
# Feature importances of the tuned forest and their spread across the single trees
importances = best_random.feature_importances_
per_tree_importances = [tree.feature_importances_ for tree in best_random.estimators_]
std = np.std(per_tree_importances, axis=0)
# Column positions ordered from most to least important
indices = np.argsort(importances)[::-1]

# Print the feature ranking
print("Feature ranking:")

for f, feature_position in enumerate(indices[:X.shape[1]]):
    print("%d. feature %d (%f)" % (f + 1, feature_position, importances[feature_position]))

In [None]:
# Plot the impurity-based feature importances of the forest:
# bars in ranking order, error bars showing the spread across trees.
n_features = X.shape[1]
fig, ax = plt.subplots()
ax.set_title("Feature importances")
ax.bar(range(n_features), importances[indices],
       color="r", yerr=std[indices], align="center")
# Label every bar with the column position of its feature
ax.set_xticks(range(n_features))
ax.set_xticklabels(indices)
ax.set_xlim([-1, n_features])
plt.show()

In [None]:
# Name of the feature at column position 188.
# NOTE(review): 188 is presumably a position read off the feature ranking
# above — confirm it is still the intended one after re-fitting.
X.columns[188]

In [None]:
from sklearn.metrics import average_precision_score, PrecisionRecallDisplay
import matplotlib.pyplot as plt

# Average precision is a ranking metric and needs a continuous score. Hard 0/1
# predictions collapse the curve to a single threshold, and the AP in the title
# then disagrees with the plotted curve. Use the predicted probability of the
# positive class (label 1) instead.
positive_column = np.flatnonzero(best_random.classes_ == 1)[0]
y_score = best_random.predict_proba(X_test)[:, positive_column]

average_precision = average_precision_score(y_test, y_score)

# plot_precision_recall_curve was removed in scikit-learn 1.2; its replacement
# PrecisionRecallDisplay.from_estimator exists from 1.0 on. Use whichever the
# installed version provides.
if hasattr(PrecisionRecallDisplay, "from_estimator"):
    disp = PrecisionRecallDisplay.from_estimator(best_random, X_test, y_test)
else:
    from sklearn.metrics import plot_precision_recall_curve
    disp = plot_precision_recall_curve(best_random, X_test, y_test)
disp.ax_.set_title('2-class Precision-Recall curve: '
                   'AP={0:0.2f}'.format(average_precision))

## Add additional test set for NLP classification


In [11]:
# Local paths for the additional annotated set and its output folder
raw_data_path = 'Data/wo_Meta/unprocessed/validation2.xlsx'
destination_folder = 'Data/wo_Meta/preprocessed/additional'

# Read raw data (rebinds df_raw, raw_data_path and destination_folder used earlier in the notebook)
df_raw = pd.read_excel(raw_data_path)

In [12]:
# Prepare columns: binary target (1 for 'YES', 0 for anything else) and a
# combined "title. text" field. A missing title or text makes titletext NaN.
df_raw['label'] = (df_raw['label'] == 'YES').astype('int')
df_raw['titletext'] = df_raw['title'] + ". " + df_raw['text']

# Drop rows whose text is shorter than 5 characters, then rows without a title.
# Rows with a missing text are not matched by the length test (NaN < 5 is False).
df_raw = df_raw.drop(df_raw[df_raw['text'].str.len() < 5].index)
# .copy() turns the filtered slice into an independent frame, so the in-place
# drop/replace/dropna calls that follow do not operate on a view of the raw data
# (which triggers SettingWithCopyWarning).
df_raw = df_raw[df_raw['title'].notnull()].copy()
#df_raw = df_raw[df_raw['text'].fillna("")]

def trim_string(x, first_n_words=1000):
    """Return the first ``first_n_words`` whitespace-separated words of ``x``.

    The word limit used to be read from a module-level ``first_n_words``
    variable that is commented out in this notebook, so calling the function
    raised NameError. It is now a parameter whose default matches that
    commented-out value; pass another limit explicitly, e.g.
    ``series.apply(trim_string, first_n_words=500)``.

    Parameters
    ----------
    x : str
        Text to truncate. Non-string values (e.g. NaN for a missing text) are
        returned unchanged instead of raising AttributeError.
    first_n_words : int, default 1000
        Maximum number of words to keep.

    Returns
    -------
    str
        The kept words joined by single spaces.
    """
    if not isinstance(x, str):
        return x

    # maxsplit stops splitting after the n-th word; the unsplit remainder ends
    # up as one trailing element, which the slice below discards.
    words = x.split(maxsplit=first_n_words)
    return ' '.join(words[:first_n_words])

# Optional truncation of text and titletext to first_n_words (currently disabled)
#first_n_words = 1000
#df_raw['text'] = df_raw['text'].apply(trim_string)
#df_raw['titletext'] = df_raw['titletext'].apply(trim_string) 

# title and text are redundant once titletext exists. Empty strings are turned
# into NaN so that the dropna on titletext also removes empty rows.
nan_value = float("NaN")
df_raw = (
    df_raw
    .drop(columns=["title", "text"])
    .replace("", nan_value)
    .dropna(subset=["titletext"])
)

# Features (combined text plus the PMID identifier) and target
X = df_raw.loc[:, ["titletext", "PMID"]]
y = df_raw.loc[:, "label"]

In [14]:
# Display the label Series of the additional set (index = surviving row labels)
y

0     1
2     1
3     1
4     0
5     0
     ..
88    1
92    0
93    0
94    0
95    1
Name: label, Length: 71, dtype: int64

In [23]:
# train test split
# NOTE(review): test_size=2 is an integer, which presumably means two rows in
# the test part, unlike the 0.2 fraction used for the other splits in this
# notebook — confirm this is intended.
sss = StratifiedShuffleSplit(n_splits=5, test_size=2, random_state=0)
# Five shuffles are generated and only the last one is used
train_index, test_index = [*sss.split(X, y)][-1]
X_train, X_test = X.iloc[train_index], X.iloc[test_index]
y_train, y_test = y.iloc[train_index], y.iloc[test_index]

# Write preprocessed data
for frame, file_name in (
    (X_train, '/X_train.csv'),
    (y_train, '/y_train.csv'),
    (X_test, '/X_test.csv'),
    (y_test, '/y_test.csv'),
):
    frame.to_csv(destination_folder + file_name, index=False)