# DISEASE PREDICTION WITH NLP - OA

### Loading Data and Necessary packages

In [None]:
#!pip install torch torchtext transformers sentencepiece pandas tqdm datasets
#!pip install keras
#!pip install tensorflow
#!curl https://ftp.ncbi.nlm.nih.gov/pub/lu/Suppl/BioSentVec/BioWordVec_PubMed_MIMICIII_d200.vec.bin
# alternatively, do this with a progress bar
#!curl  --progress-bar -O https://ftp.ncbi.nlm.nih.gov/pub/lu/Suppl/BioSentVec/BioWordVec_PubMed_MIMICIII_d200.vec.bin
#!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
#!pip install torch torchvision torchaudio
# Installing SPARQLWrapper to access the disease ontology
#!pip install SPARQLWrapper
#!pip install ipywidgets
#!jupyter nbextension enable --py widgetsnbextension --sys-prefix
#!pip install gensim
#!pip install keras
#!pip install tensorflow
#!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
#!pip install transformers
#!pip install tf-keras
#!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_ner_bc5cdr_md-0.5.0.tar.gz
#!pip install nlpaug
#!pip install transformers[torch]

In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4') 
nltk.download('punkt')
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input, LSTM, Conv1D, MaxPooling1D, Flatten, Activation
import tensorflow as tf
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import matplotlib.pyplot as plt
from transformers import AutoTokenizer, TFAutoModelForTokenClassification, pipeline
from gensim.models import KeyedVectors
from SPARQLWrapper import SPARQLWrapper, JSON
from tqdm import tqdm
import time
import joblib
import ast


#### Loading the Data

In [None]:
## Major Datasets
nls2d = pd.read_csv("Natural language Symptom2Disease prediction.csv")
textsympdata = pd.read_csv("full_textbased_symptom_dataset.csv")

In [None]:
textsympdata.head()

In [None]:
textsympdata.sample(5)

In [None]:
# Choosing only the Disease and description columns
nls2d = nls2d[['Disease', 'text']]
nls2d.head()

In [None]:
textsympdata.shape

In [None]:
nls2d.shape

#### Combining the disease symptom and text descriptions datasets

In [None]:
# Outer-join the two sources on their shared "Disease" key; an outer join keeps
# diseases that appear in only one of the datasets (missing columns become NaN).
combineddata = nls2d.merge(textsympdata, how="outer", on="Disease")
print(combineddata)
# Persist the merged table so it can be inspected or reused outside the notebook
combineddata.to_csv("Combined disease and symptom text data.csv", index=False)

In [None]:
combineddata.head()

In [None]:
combineddata.sample(2)

#### Understanding which diseases are responsible for the dataset explosion

In [None]:
# Count of rows for each disease in each dataset
nls2d_counts = nls2d['Disease'].value_counts()
textsympdata_counts = textsympdata['Disease'].value_counts()

# Merging the counts into one DataFrame (0 = disease absent from that dataset)
disease_merge_counts = pd.DataFrame({
    "nls2d_count": nls2d_counts,
    "textsympdata_count": textsympdata_counts
}).fillna(0)

# Adding a column for the expected rows.
# A disease present in both datasets produces left_count * right_count rows
# (many-to-many join). The merge is an OUTER join, so a disease present in only
# one dataset still keeps all of its rows from that side; the plain product
# would wrongly count those diseases as 0 rows.
in_both_datasets = (
    (disease_merge_counts['nls2d_count'] > 0)
    & (disease_merge_counts['textsympdata_count'] > 0)
)
disease_merge_counts['expected_rows'] = np.where(
    in_both_datasets,
    disease_merge_counts['nls2d_count'] * disease_merge_counts['textsympdata_count'],
    # One of the two counts is 0 here, so the sum is simply the non-zero count
    disease_merge_counts['nls2d_count'] + disease_merge_counts['textsympdata_count'],
)

# Summary
print(disease_merge_counts)
print(f"Expected total rows after merge: {disease_merge_counts['expected_rows'].sum()}")


## Baseline modelling - TFIDF Logistic regression


In [None]:
X = combineddata['text'].fillna('')  # Replacing all the NaN values with empty strings
y = combineddata['Disease']

# Splitting data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature extraction with TF-IDF (vocabulary and IDF weights come from the
# training split only; the test split is merely transformed)
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Logistic regression wrapped in a One-vs-Rest classifier: one binary model per
# disease. This is a single-label, multiclass problem (one Disease per row).
model = OneVsRestClassifier(LogisticRegression(max_iter=1000))
model.fit(X_train_tfidf, y_train)

# Predictions and evaluation
# NOTE(review): combineddata comes from a many-to-many merge, so identical
# 'text' values can land on both sides of the split; the scores below are
# probably optimistic - confirm with a split grouped by text.
y_pred = model.predict(X_test_tfidf)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


In [None]:
# Defining a function to process and predict from user input - For demo
def predict_disease(symptom_text, model, tfidf_vectorizer, top_n=5):
    """Rank the most probable diseases for a free-text symptom description.

    Args:
        symptom_text: Symptom description as a single string.
        model: Fitted classifier exposing ``predict_proba`` and ``classes_``.
        tfidf_vectorizer: Fitted vectorizer whose ``transform`` accepts a list
            of strings.
        top_n: Maximum number of diseases to report. Values <= 0 give an
            empty result; values above the number of classes return all
            classes.

    Returns:
        A list of ``(disease_label, probability)`` tuples ordered from the
        most to the least probable. The same ranking is printed.
    """
    # Converting the input text into a TF-IDF vector
    symptom_vector = tfidf_vectorizer.transform([symptom_text])

    # Predicting probabilities for each disease (single input -> first row)
    probabilities = model.predict_proba(symptom_vector)[0]

    # Rank every class best-first, then truncate. Slicing with [-top_n:]
    # would return *all* classes for top_n == 0 because -0 == 0.
    top_n = max(top_n, 0)
    top_disease_indices = probabilities.argsort()[::-1][:top_n]
    top_disease_probs = probabilities[top_disease_indices]
    top_disease_labels = model.classes_[top_disease_indices]

    # Results
    predictions = list(zip(top_disease_labels, top_disease_probs))
    print("Top predicted diseases and probabilities:")
    for disease, prob in predictions:
        print(f"{disease}: {prob:.4f}")

    return predictions


In [None]:
# Sample 1: Predictions using training data language
Patient_input1 = "I have a headache and a high fever"
predictions = predict_disease(Patient_input1, model, tfidf, top_n=5)

In [None]:
# Sample 2 : Using unseen Language
patient_input2 = "I always want to pee, i have a headache and i have a high fever"
predictions = predict_disease(patient_input2, model, tfidf, top_n=5)

In [None]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import VotingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from collections import Counter as ctr
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [None]:
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger')

In [None]:
nls2d.head(10)

In [None]:
ctr(nls2d['Disease'])

In [None]:
nls2d.sample(10)

In [None]:
def text_preprocessing(text):
    """Tokenize *text*, keep purely alphabetic tokens, lowercase and stem them.

    Returns the stemmed tokens joined by single spaces.
    """
    stemmer = SnowballStemmer('english')
    stemmed = []
    for word in word_tokenize(text):
        # Skip anything that is not letters only (numbers, punctuation, ...)
        if not word.isalpha():
            continue
        stemmed.append(stemmer.stem(word.lower()))
    return ' '.join(stemmed)

nls2d["text"] = nls2d["text"].apply(text_preprocessing)


In [None]:
# Encode the disease names as integer class ids
label_encoder = LabelEncoder()
b = label_encoder.fit_transform(nls2d["Disease"])

# TF-IDF features of the stemmed descriptions.
# NOTE(review): the vectorizer is fitted on every row before the train/test
# split, so its vocabulary and IDF weights also see the test rows.
tfidf_vectorizer = TfidfVectorizer()
A = tfidf_vectorizer.fit_transform(nls2d["text"])

In [None]:
A_train, A_test, b_train, b_test = train_test_split(A, b, test_size=0.2, random_state=42)

base_models = [
    ("nb", MultinomialNB()),
    ("rf", RandomForestClassifier()),
    ("lr", LogisticRegression()),
    ("svm", SVC(kernel = "linear", probability = True))
]

voting_classifier = VotingClassifier(estimators=base_models, voting='hard')

In [None]:
voting_classifier.fit(A_train, b_train)

In [None]:
accuracy = voting_classifier.score(A_test, b_test)
print("Accuracy:", accuracy)

In [None]:
import joblib

joblib.dump(voting_classifier, 'voting_classifier_model_Disease_pred_97_percent_acc.pkl')
loaded_model = joblib.load('voting_classifier_model_Disease_pred_97_percent_acc.pkl')


In [None]:
# Sample text
sample_text = "I have to pee all the time and I am stressed."
sample_text_processed = text_preprocessing(sample_text)
sample_text_transformed = tfidf_vectorizer.transform([sample_text_processed])
predicted_label = label_encoder.inverse_transform(voting_classifier.predict(sample_text_transformed))

print("Predicted Label:", predicted_label)

In [None]:
# Predict once on the held-out split. Every metric below compares the true
# labels (b_test) with these predictions; comparing b_test with itself would
# always report a perfect score.
b_pred = voting_classifier.predict(A_test)

# Compute accuracy
accuracy = accuracy_score(b_test, b_pred)
print("Accuracy:", accuracy)

# Compute precision
precision = precision_score(b_test, b_pred, average='macro')  # 'macro' computes precision for each label, and returns the average
print("Precision:", precision)

# Compute recall
recall = recall_score(b_test, b_pred, average='macro')  # 'macro' computes recall for each label, and returns the average
print("Recall:", recall)

# Compute F1-score
f1 = f1_score(b_test, b_pred, average='macro')  # 'macro' computes F1-score for each label, and returns the average
print("F1-score:", f1)

# Compute confusion matrix (rows = true labels, columns = predicted labels)
conf_matrix = confusion_matrix(b_test, b_pred)
print("Confusion Matrix:")
print(conf_matrix)

In [None]:
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer_disease_nlp.joblib')
joblib.dump(label_encoder, 'label_encoder_disease_nlp.joblib')

import joblib
voting_classifier = joblib.load('voting_classifier_model_Disease_pred_97_percent_acc.pkl')
tfidf_vectorizer = joblib.load('tfidf_vectorizer_disease_nlp.joblib')
label_encoder = joblib.load('label_encoder_disease_nlp.joblib')

In [None]:
# full code with everything saved to joblib

# Model, vectorizer and label encoder
voting_classifier = joblib.load('voting_classifier_model_Disease_pred_97_percent_acc.pkl')
tfidf_vectorizer = joblib.load('tfidf_vectorizer_disease_nlp.joblib')
label_encoder = joblib.load('label_encoder_disease_nlp.joblib')

# Sample text
sample_text = "I have been experiencing a skin rash on my arm for the past few weeks."

# Preprocess the sample text
def preprocess_text(text):
    """Return *text* as space-joined, lowercased Snowball stems of its alphabetic tokens."""
    stemmer = SnowballStemmer('english')
    # Keep only tokens made of letters, then stem their lowercase form
    alphabetic_tokens = filter(str.isalpha, word_tokenize(text))
    return ' '.join(stemmer.stem(token.lower()) for token in alphabetic_tokens)

sample_text_processed = preprocess_text(sample_text)

# Transform the preprocessed sample text using the loaded vectorizer
sample_text_transformed = tfidf_vectorizer.transform([sample_text_processed])

# Predict using the loaded model
predicted_label_encoded = voting_classifier.predict(sample_text_transformed)

# Decode the predicted label
predicted_label = label_encoder.inverse_transform(predicted_label_encoded)

print("Predicted Label:", predicted_label)

## Additional Data preparation 

Checking the combined data to see which diseases have text descriptions and which don't. I'd also like to see what percentage of the data is covered by diseases that have a text-based symptom description, so I'll know whether the best approach is simply to drop the ones that don't have one.

In [None]:
# Normalise the disease names so that the same disease is not treated as two
# labels (or dropped) because of capitalisation or stray whitespace.

# Every cell becomes a string; missing values become ''
combineddata = combineddata.fillna('').astype(str)

# Lowercase first, then trim leading/trailing whitespace
lowercased_names = combineddata['Disease'].str.lower()
combineddata['Disease'] = lowercased_names.str.strip()

# Number of distinct diseases left after normalisation
unique_diseases = combineddata['Disease'].unique()
print(f"Unique diseases after normalization: {len(unique_diseases)}")


In [None]:
summary_diseases = combineddata['Disease'].value_counts()
print(summary_diseases)

In [None]:
# Mapping each disease to its first non-empty text description
has_text = combineddata['text'].str.strip() != ''
disease_text_mapping = combineddata[has_text].groupby('Disease')['text'].first().to_dict()

# Populating the missing text descriptions using the mapping.
# Done with a vectorised Series.map / Series.mask instead of a row-wise
# DataFrame.apply(axis=1), which calls a Python lambda for every row.
mapped_text = combineddata['Disease'].map(disease_text_mapping)
# Only blank rows whose disease has a description are replaced; everything
# else keeps its original text.
fillable = ~has_text & mapped_text.notna()
combineddata['text'] = combineddata['text'].mask(fillable, mapped_text)

In [None]:
# Just checking
combineddata.sample(3)

In [None]:
# Boolean mask: True where the text description is empty or whitespace-only
missing_text = combineddata['text'].str.strip().eq('')

# Row counts: overall, with a description, without a description
total_rows = len(combineddata)
num_with_text = (~missing_text).sum()
num_without_text = missing_text.sum()

# Share of each group, in percent of all rows
coverage_with_text = (num_with_text / total_rows) * 100
coverage_without_text = (num_without_text / total_rows) * 100

# Report
print(f"Total rows: {total_rows}")
print(f"Rows with text descriptions: {num_with_text} ({coverage_with_text:.2f}%)")
print(f"Rows without text descriptions: {num_without_text} ({coverage_without_text:.2f}%)")

# Which diseases fall into each group
diseases_without_text = combineddata.loc[missing_text, 'Disease'].unique()
diseases_with_text = combineddata.loc[~missing_text, 'Disease'].unique()
print("Diseases without text descriptions:", diseases_without_text)
print("Diseases with text descriptions:", diseases_with_text)

Since 97.44% of the rows in the dataset have a text description of the disease, I have decided to drop the rows that do not contain a text description and to document which diseases were dropped from the dataset. This should simplify the rest of the preprocessing and model building. If at any later point in the development of this model I feel it is necessary to re-include these diseases, I will check SNOMED-CT or the Disease Ontology for standard descriptions of the dropped diseases, or work to collect data on them for the models.

The old dataset contains 44 unique diseases

The new dataset contains 24 Unique diseases namely: 'Acne' 'Arthritis' 'Bronchial Asthma' 'Cervical spondylosis'
 'Chicken pox' 'Common Cold' 'Dengue' 'Dimorphic Hemorrhoids'
 'Fungal infection' 'Hypertension' 'Impetigo' 'Jaundice' 'Malaria'
 'Migraine' 'Pneumonia' 'Psoriasis' 'Typhoid' 'Varicose Veins' 'allergy'
 'diabetes' 'drug reaction' 'gastroesophageal reflux disease'
 'peptic ulcer disease' 'urinary tract infection'

In [None]:
# Dropping the rows without Text based symptom descriptions
combineddata_with_text = combineddata[~missing_text]

# Print remaining rows
print(f"Remaining rows after dropping: {len(combineddata_with_text)}")

In [None]:
combineddata_with_text.sample(10)

In [None]:
# Saving the new combined dataset with text to my computer for use in modelling
combineddata_with_text.to_csv("New combined dataset with text.csv", index=False)

print("Dataset saved as 'New combined dataset with text'.")


In [None]:
"""# Documenting the dropped rows and diseases for future records and advancements
diseases_without_text = combineddata[missing_text]['Disease'].unique()
dropped_diseases_df = pd.DataFrame(diseases_without_text, columns=["Disease"])
dropped_diseases_df.to_csv("All_dropped_diseases.csv", index=False)

print(f"Dropped diseases saved to 'All dropped_diseases.csv'")
"""

**Notes:**

The new dataset containing 24 unique diseases is now prepared. This new dataset will be used for NER and SNOMED-CT/Disease Ontology mapping as well as for the final modelling.

## Feature Extraction

#### Loading the updated data

In [None]:
dpnlp = pd.read_csv("New combined dataset with text.csv")

# Replace NaN with empty strings and ensure all values are strings
dpnlp = dpnlp.fillna('').astype(str)
dpnlp.sample(10)

#### Data preprocessing

In [None]:
#  initializing tools
nltk.download('stopwords')
nltk.download('wordnet')
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Text Preprocessing Function
def preprocess_text(text):
    """Normalise free text before TF-IDF / embedding lookup.

    Steps: lowercase, replace every character other than a-z and whitespace
    with a space, split on whitespace, drop stopwords, lemmatize.

    Relies on the module-level ``stop_words`` set and ``lemmatizer`` object.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned words joined by single spaces.
    """
    # Lowercase
    text = text.lower()
    # Replace (rather than delete) punctuation, digits and underscores with a
    # space, so that "fever,cough" or "skin_rash" remain two words instead of
    # being glued together into "fevercough" / "skinrash".
    text = re.sub(r'[^a-z\s]', ' ', text)
    # Tokenize on whitespace (runs of spaces produce no empty tokens)
    words = text.split()
    # Remove stopwords, then lemmatize what is left
    return ' '.join(
        lemmatizer.lemmatize(word) for word in words if word not in stop_words
    )

# Appling preprocessing to the 'text' column
dpnlp['processed_text'] = dpnlp['text'].apply(preprocess_text)

# Saving the preprocessed dataset
dpnlp.to_csv("Preprocessed_dataset.csv", index=False)

dpnlp.sample(3)

In [None]:
# Names of the structured symptom columns: Symptom_1 ... Symptom_17
symptom_columns = [f'Symptom_{i}' for i in range(1, 18)]

# Standardizing the symptom columns (lowercase, no surrounding whitespace)
for col in symptom_columns:
    dpnlp[col] = dpnlp[col].str.lower().str.strip()

# Concatenating the symptoms into one space-separated string per row for feature extraction
dpnlp['all_symptoms'] = dpnlp[symptom_columns].apply(' '.join, axis=1)

# Preprocessing the concatenated symptoms
dpnlp['processed_symptoms'] = dpnlp['all_symptoms'].apply(preprocess_text)


## Feature Extraction with TFIDF and BioWordVec

In [None]:
# TFIDF feature extraction
tfidf = TfidfVectorizer(max_features=5000)
X_tfidf = tfidf.fit_transform(dpnlp['processed_symptoms'])


In [None]:
# Using gensim library to load BioWordVec
bio_word_vec = KeyedVectors.load_word2vec_format('BioWordVec_PubMed_MIMICIII_d200.vec.bin', binary=True)

# Test loading
print(bio_word_vec['rash'])  

In [None]:
# Retrieving the word embeddings from Bio-WordVec
def get_word_embeddings(word, word_vectors):
    """Look up *word* in *word_vectors*, falling back to an all-zero vector.

    ``word_vectors`` must support ``in``, item lookup and expose
    ``vector_size`` (the length of the zero vector returned for unknown words).
    """
    dim = word_vectors.vector_size
    if word in word_vectors:
        return word_vectors[word]
    return np.zeros(dim)


def text_to_embeddings(text, word_vectors):
    """Embed *text* as the mean of the vectors of its whitespace-separated tokens.

    Unknown tokens contribute zero vectors to the mean; text without any
    token maps to a zero vector of length ``word_vectors.vector_size``.
    """
    vectors = [get_word_embeddings(token, word_vectors) for token in text.split()]
    if not vectors:
        # Nothing to average
        return np.zeros(word_vectors.vector_size)
    return np.mean(vectors, axis=0)

# Applying BioWordVec to transformprocessed text
dpnlp['embeddings'] = dpnlp['processed_text'].apply(lambda x: text_to_embeddings(x, bio_word_vec))


Initial results before adding post processing function to deal with the word splitting in tokenization

text = "The patient was diagnosed with diabetes and hypertension."
Entity: The, Label: 0, Score: 0.999998927116394
Entity: patient, Label: 0, Score: 0.999998927116394
Entity: was, Label: 0, Score: 0.9999988079071045
Entity: diagnosed, Label: 0, Score: 0.9999986886978149
Entity: with, Label: 0, Score: 0.9999985694885254
Entity: diabetes, Label: B-DISEASE, Score: 0.9999868869781494
Entity: and, Label: 0, Score: 0.999997615814209
Entity: h, Label: B-DISEASE, Score: 0.9999666213989258
Entity: ##yper, Label: B-DISEASE, Score: 0.9349839687347412
Entity: ##tens, Label: I-DISEASE, Score: 0.9963584542274475
Entity: ##ion, Label: I-DISEASE, Score: 0.9990299940109253
Entity: ., Label: 0, Score: 0.9999988079071045

## Finetuned TFIDF Logistic regression modelling with BioWord Vec and structured symptoms processed from dataset

In [None]:
# Corpus used to build the vocabulary for the embedding matrix
texts = dpnlp['processed_text'].tolist()

# The Keras tokenizer learns a word -> integer index mapping from the corpus
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)

# One extra slot on top of the vocabulary (the indices in word_index are used
# directly as row numbers below)
vocab_size = len(tokenizer.word_index) + 1

# One row per vocabulary index; rows stay all-zero for words that BioWordVec
# does not contain
embedding_dim = 200
embedding_matrix = np.zeros((vocab_size, embedding_dim))

# Copy the BioWordVec vector of every known word into its row
for word, idx in tokenizer.word_index.items():
    if word not in bio_word_vec:
        continue
    embedding_matrix[idx] = bio_word_vec[word]

print(f"Embedding matrix shape: {embedding_matrix.shape}")

In [None]:
# Training test split
X = X_tfidf
y = dpnlp['Disease']  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
## Baseline Log reg modelling with one vs rest classification
model = OneVsRestClassifier(LogisticRegression(max_iter=1000))
model.fit(X_train, y_train)


In [None]:
# Model evaluation
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


TF-IDF feature extraction and logistic regression with One-vs-Rest give a model with almost perfect accuracy. Further review of the other evaluation metrics shows that the model predicts excellently for diseases with a large sample size, moderately well (around 85% on average) for diseases with a sample size of around 30, and not at all (i.e. a score of 0) for diseases with fewer than 15 instances in the data.

In [None]:
# Saving the TFIDF and finetuned logistic regression model
joblib.dump(model, 'b_model.pkl')
joblib.dump(tfidf, 'tfidf_vectorizer.pkl')

In [None]:
# Testing the model with sample inputs
sample_input = "I have a headache and a high fever"
sample_input_processed = preprocess_text(sample_input)
sample_input_tfidf = tfidf.transform([sample_input_processed])
sample_input_prediction = model.predict(sample_input_tfidf)

In [None]:
sample_input_prediction

## Feed Forward Neural Network (FFNN) Modelling

#### Training test split FFNN

In [None]:
# Stacking the embeddings into a 2D array
# Convert the embeddings column to a 2D array
X = np.stack(dpnlp['embeddings'].values)

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(dpnlp['Disease'])  # Converts disease names into integers

# Training Test split
from sklearn.model_selection import train_test_split

# 80% training, 20% testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#Handling data imbalance with SMOTE
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)


#### Feed forward neural network (FFNN)

In [None]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Defining the model class with configurable hyperparameters
class DiseasePredictionModel(nn.Module):
    """Feed-forward classifier with one hidden layer.

    Architecture: Linear -> ReLU -> Dropout -> Linear.

    ``forward`` returns raw class scores (logits). ``nn.CrossEntropyLoss``
    applies log-softmax internally, so feeding it outputs that already went
    through Softmax normalises twice and flattens the gradients. The argmax
    of the logits equals the argmax of the probabilities, so predicted
    classes are unaffected. Use ``predict_proba`` when probabilities are
    needed.

    Args:
        input_size: Number of input features per sample.
        hidden_size: Width of the hidden layer.
        num_classes: Number of output classes.
        dropout: Dropout probability applied after the hidden activation.
    """

    def __init__(self, input_size, hidden_size=128, num_classes=10, dropout=0.3):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)
        self.fc2 = nn.Linear(hidden_size, num_classes)
        # Kept as an attribute for backward compatibility; used by predict_proba
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Return logits with one score per class along dim 1."""
        x = self.fc1(x)
        x = self.relu(x)
        x = self.dropout(x)
        return self.fc2(x)

    def predict_proba(self, x):
        """Return class probabilities (softmax of the logits along dim 1)."""
        return self.softmax(self.forward(x))

# Hyperparameters (architecture settings first, optimisation settings last)
hyperparams = dict(
    hidden_size=128,
    dropout=0.3,
    num_classes=len(np.unique(y)),  # one output per distinct label
    input_size=X_train.shape[1],  # number of feature columns
    learning_rate=0.001,
    epochs=20,
    batch_size=32,
)

# Initializing the model from the architecture-related entries only
model_kwargs = {
    name: hyperparams[name]
    for name in ("input_size", "hidden_size", "num_classes", "dropout")
}
model = DiseasePredictionModel(**model_kwargs)


In [None]:
# Preparing the data

# Converting to PyTorch tensors.
# Train on the SMOTE-balanced split: X_train_balanced / y_train_balanced were
# computed earlier but never used, so the oversampling had no effect on
# training. The test split is left untouched so that evaluation only sees
# real (non-synthetic) samples.
train_data = torch.tensor(X_train_balanced, dtype=torch.float32)
train_labels = torch.tensor(y_train_balanced, dtype=torch.long)
test_data = torch.tensor(X_test, dtype=torch.float32)
test_labels = torch.tensor(y_test, dtype=torch.long)

# Creating Dataloaders
train_dataset = TensorDataset(train_data, train_labels)
test_dataset = TensorDataset(test_data, test_labels)

# Shuffle only the training batches; keep the evaluation order fixed
train_loader = DataLoader(train_dataset, batch_size=hyperparams["batch_size"], shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=hyperparams["batch_size"], shuffle=False)


In [None]:
# Model training

# Use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def _run_epoch(model, loader, criterion, optimizer=None):
    """Make one pass over *loader*; update weights only when *optimizer* is given.

    Returns a ``(mean loss per batch, accuracy)`` tuple.
    """
    is_training = optimizer is not None
    loss_sum = 0.0
    n_correct, n_seen = 0, 0

    # Gradients are only tracked for the training pass
    with torch.set_grad_enabled(is_training):
        for batch_inputs, batch_labels in loader:
            # moving data to device
            batch_inputs = batch_inputs.to(device)
            batch_labels = batch_labels.to(device)

            if is_training:
                optimizer.zero_grad()
            outputs = model(batch_inputs)
            loss = criterion(outputs, batch_labels)
            if is_training:
                loss.backward()
                optimizer.step()

            loss_sum += loss.item()
            predicted = torch.max(outputs, 1).indices
            n_seen += batch_labels.size(0)
            n_correct += (predicted == batch_labels).sum().item()

    return loss_sum / len(loader), n_correct / n_seen


def train_feedforward(model, train_loader, test_loader, hyperparams):
    """Train *model* with Adam and cross-entropy, validating after every epoch.

    Args:
        model: The ``torch.nn.Module`` to train; it is moved to ``device``.
        train_loader: Iterable of ``(inputs, labels)`` training batches.
        test_loader: Iterable of ``(inputs, labels)`` validation batches.
        hyperparams: Mapping providing ``"learning_rate"`` and ``"epochs"``.

    Returns:
        Four lists with one entry per epoch: training losses, validation
        losses, training accuracies, validation accuracies.
    """
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=hyperparams["learning_rate"])

    history = {"train_loss": [], "val_loss": [], "train_acc": [], "val_acc": []}

    model.to(device)  # moving model to the device

    n_epochs = hyperparams["epochs"]
    for epoch in range(n_epochs):
        # Training pass (dropout active, weights updated)
        model.train()
        epoch_loss, epoch_acc = _run_epoch(model, train_loader, criterion, optimizer)
        history["train_loss"].append(epoch_loss)
        history["train_acc"].append(epoch_acc)

        # Validation pass (dropout disabled, no weight updates)
        model.eval()
        epoch_loss, epoch_acc = _run_epoch(model, test_loader, criterion)
        history["val_loss"].append(epoch_loss)
        history["val_acc"].append(epoch_acc)

        print(
            f"Epoch [{epoch + 1}/{hyperparams['epochs']}], "
            f"Train Loss: {history['train_loss'][-1]:.4f}, Val Loss: {history['val_loss'][-1]:.4f}, "
            f"Train Acc: {history['train_acc'][-1]:.4f}, Val Acc: {history['val_acc'][-1]:.4f}"
        )

    return (
        history["train_loss"],
        history["val_loss"],
        history["train_acc"],
        history["val_acc"],
    )


In [None]:
# Monitoring training and validation loss

# Training the Feed Forward model
train_losses, val_losses, train_accuracies, val_accuracies = train_feedforward(
    model, train_loader, test_loader, hyperparams
)


In [None]:
# One figure per metric: (training series, validation series, metric name, output file)
curves = [
    (train_losses, val_losses, "Loss", "feedforward_loss_plot1.png"),
    (train_accuracies, val_accuracies, "Accuracy", "feedforward_accuracy_plot1.png"),
]

for train_curve, val_curve, metric, filename in curves:
    plt.figure(figsize=(12, 6))
    plt.plot(train_curve, label=f"Training {metric}")
    plt.plot(val_curve, label=f"Validation {metric}")
    plt.xlabel("Epochs")
    plt.ylabel(metric)
    plt.title(f"Training vs. Validation {metric} (Feed Forward Network)")
    plt.legend()
    # Save before show()
    plt.savefig(filename)
    plt.show()


In [None]:
#Rechecking for data imbalance
print(dpnlp['Disease'].value_counts())

In [None]:
# Model Evaluation

def evaluate_model(model, test_loader):
    """Compute and print the classification accuracy of *model* on *test_loader*.

    Args:
        model: A ``torch.nn.Module`` returning one score per class along dim 1.
        test_loader: Iterable of ``(inputs, labels)`` batches.

    Returns:
        The fraction of correctly classified samples, or ``0.0`` when the
        loader yields no samples.
    """
    model.eval()

    # Evaluate on whatever device the model lives on. train_feedforward moves
    # the model to the GPU when one is available, and feeding it CPU batches
    # would then fail with a device-mismatch error.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs = inputs.to(model_device)
            labels = labels.to(model_device)
            outputs = model(inputs)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    # Guard against an empty loader instead of dividing by zero
    accuracy = correct / total if total else 0.0
    print(f"Test Accuracy: {accuracy:.4f}")
    return accuracy

# Evaluating the model
evaluate_model(model, test_loader)


### Data augmentation for the under-represented diseases to balance the dataset - NLPAug

In [None]:
# Importing NLPAug and setting up Augmenters
import nlpaug.augmenter.word as naw

# Augmenters
synonym_augmenter = naw.SynonymAug(aug_src='wordnet')  # Augmenting using WordNet synonyms
random_swap_augmenter = naw.RandomWordAug(action="swap")  # Random word swapping
random_deletion_augmenter = naw.RandomWordAug(action="delete")  # Random word deletion


In [None]:
# Confirming package download
print(pos_tag(word_tokenize("This is a test sentence")))


#### Augmenting the Underrepresented Diseases - 1

In [None]:
# Augmenting the underrepresented diseases

# Filter underrepresented classes
underrepresented_classes = ['urinary tract infection', 'varicose veins', 'hypertension',
                            'allergy', 'drug reaction', 'diabetes',
                            'dimorphic hemorrhoids', 'gastroesophageal reflux disease',
                            'peptic ulcer disease']


def _augment_to_text(text):
    """Return one synonym-augmented variant of *text* as a plain string.

    ``augment`` can hand back a list even for a single input; joining it here
    keeps 'processed_text' a string column instead of a column of lists.
    """
    augmented = synonym_augmenter.augment(text)
    if isinstance(augmented, list):
        return ' '.join(str(item) for item in augmented)
    return str(augmented)


# Collect one augmented copy per class and concatenate once at the end;
# calling pd.concat inside the loop re-copies the accumulated frame each time.
augmented_frames = []
for disease in underrepresented_classes:
    disease_samples = dpnlp[dpnlp['Disease'] == disease]
    augmented_samples = disease_samples.copy()

    # Augmenting the 'processed_text' column
    augmented_samples['processed_text'] = augmented_samples['processed_text'].apply(
        _augment_to_text
    )

    augmented_frames.append(augmented_samples)

# DataFrame holding the augmented data (only the augmented copies of the
# underrepresented classes, not the original rows)
augmented_data = pd.concat(augmented_frames) if augmented_frames else pd.DataFrame()


In [None]:
print(augmented_data["Disease"].value_counts())

In [None]:
# First few entries
print(augmented_data['processed_text'].head())

# Data types in the columns
print(augmented_data['processed_text'].apply(type).value_counts())

In [None]:
# Normalising entries to strings so the text column can be augmented again
def flatten_and_convert_to_string(entry):
    """Return ``entry`` as a string.

    Strings pass through untouched, a list becomes its items (each converted
    with ``str``) joined by single spaces, and anything else is converted
    with ``str``. Nested lists are not flattened recursively.
    """
    if isinstance(entry, str):
        return entry
    if isinstance(entry, list):
        return ' '.join(map(str, entry))
    return str(entry)

# Convert every 'processed_text' entry to a plain string
text_as_strings = augmented_data['processed_text'].apply(flatten_and_convert_to_string)
augmented_data['processed_text'] = text_as_strings

# Sanity check: tally of stored types after the conversion
print(augmented_data['processed_text'].apply(type).value_counts())

In [None]:
# Re-check the stored types and preview the converted text
converted_text = augmented_data['processed_text']
print(converted_text.apply(type).value_counts())
print(converted_text.head())

#### Augmenting the last 3 Disease classes and the first 6 separately - 2

In [None]:
# Repeating augmentation for underrepresented classes

# Re-create the synonym augmenter with aug_p=0.3
synonym_augmenter = naw.SynonymAug(aug_src='wordnet', aug_p=0.3) 

# Checking if GPU is available (informational only: `device` is not passed
# to the augmenter anywhere in this cell)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Defining separate groups
group_170 = ['urinary tract infection', 'varicose veins', 'hypertension', 
             'allergy', 'drug reaction', 'diabetes']
group_50 = ['dimorphic hemorrhoids', 'gastroesophageal reflux disease', 'peptic ulcer disease']

# Target sizes
min_sample_size_170 = 3000
min_sample_size_50 = 6000


def _synonym_augment_to_text(text):
    """Return a synonym-augmented version of ``text`` as one string.

    ``augment`` may hand back a list rather than a string; a list result is
    joined with spaces so that 'processed_text' always stores plain strings.
    """
    augmented = synonym_augmenter.augment(text)
    if isinstance(augmented, (list, tuple)):
        return ' '.join(str(part) for part in augmented)
    return augmented


def _augment_class_to_minimum(data, disease, min_samples):
    """Grow the rows of ``disease`` in ``data`` to at least ``min_samples``.

    Each pass appends an augmented copy of every row the class currently has
    (so the class doubles per pass and may end above ``min_samples``).
    Returns the grown frame; ``data`` itself is not modified in place.
    """
    current_count = int((data['Disease'] == disease).sum())

    # With zero rows there is nothing to copy and the loop could never finish
    if current_count == 0:
        print(f"No samples found for {disease}; skipping augmentation.")
        return data

    while current_count < min_samples:
        new_samples = data[data['Disease'] == disease].copy()
        new_samples['processed_text'] = new_samples['processed_text'].apply(
            _synonym_augment_to_text
        )
        data = pd.concat([data, new_samples], ignore_index=True)
        current_count += len(new_samples)
        print(f"Augmented {current_count}/{min_samples} samples for {disease}.")
    return data


# Augmenting Group 1 (170 samples)
for disease in group_170:
    augmented_data = _augment_class_to_minimum(augmented_data, disease, min_sample_size_170)

# Augmenting Group 2 (50 samples)
for disease in group_50:
    augmented_data = _augment_class_to_minimum(augmented_data, disease, min_sample_size_50)

print("Data augmentation completed!")


Because the last 3 diseases have the fewest samples, the while loop was playing catch-up by over-augmenting the larger classes while trying to bring the smallest classes up to the minimum level. To resolve this, I have decided to augment each group of classes separately. The classes with 170 samples will be augmented together, and so will the classes with only 50. That way, both groups are augmented separately and neither is oversampled in favor of the other.

In [None]:
# Rows per disease after the second augmentation pass
balanced_class_counts = augmented_data['Disease'].value_counts()
print(balanced_class_counts)

In [None]:
# Saving to avoid losing augmented data
backup_path = 'augmented_data_backup.csv'
augmented_data.to_csv(backup_path, index=False)
# Report the save itself (the previous message repeated the augmentation status)
print(f"Augmented data backed up to {backup_path}")


In [None]:
# Stack the original rows and the augmented rows; duplicates are kept
def _tokens_to_text(value):
    """Join list/array entries with spaces; return anything else unchanged."""
    if isinstance(value, (list, np.ndarray)):
        return ' '.join(value)
    return value


final_data = pd.concat([dpnlp, augmented_data], ignore_index=True)

# Make sure 'processed_text' holds strings rather than token sequences
final_data['processed_text'] = final_data['processed_text'].apply(_tokens_to_text)

In [None]:
# Spot check: five randomly sampled rows of the combined dataset
final_data.sample(5)

In [None]:
# Persist the combined dataset as CSV (without the index column)
file_path = "final_Balanced_Augmented Datasset.csv"
final_data.to_csv(path_or_buf=file_path, index=False)
print("Combined dataset saved at " + file_path)

## Final Models

### Data preparation

In [None]:
# Reload the saved dataset and lower-case the disease descriptions
# stored in the 'text' column
final_data = pd.read_csv("final_Balanced_Augmented Datasset.csv")
final_data = final_data.assign(text=final_data["text"].str.lower())

# Preview of the updated dataframe
final_data.head()

In [None]:
# Row count per disease class in the reloaded dataset
final_data['Disease'].value_counts()

In [None]:
# Load the pretrained BioWordVec vectors from the binary word2vec file
biowordvec = KeyedVectors.load_word2vec_format(
    fname="BioWordVec_PubMed_MIMICIII_d200.vec.bin",
    binary=True,
)


In [None]:
# Function for creating text embeddings
def embed_text(text, embedding_model):
    """Return the average word vector of ``text``.

    Args:
        text: Whitespace-separated text. Non-string values (for example a
            NaN read back from a CSV) are treated as having no words.
        embedding_model: Object supporting ``word in model``, ``model[word]``
            and a ``vector_size`` attribute.

    Returns:
        The element-wise mean of the vectors of all words found in
        ``embedding_model``, or a zero vector of length ``vector_size``
        when no word is found.
    """
    # Non-string input has no words to look up, so it yields an empty list
    words = text.split() if isinstance(text, str) else []
    word_vectors = [embedding_model[word] for word in words if word in embedding_model]
    if not word_vectors:
        return np.zeros(embedding_model.vector_size)  # Fallback for missing words
    return np.mean(word_vectors, axis=0)

# Adding embeddings: one averaged BioWordVec vector per row of 'text'
# NOTE(review): the augmentation cells rewrite only 'processed_text', so
# augmented rows presumably carry the same 'text' (and hence the same
# embedding) as the rows they were copied from - confirm this is intended.
final_data['embeddings'] = final_data['text'].apply(embed_text, embedding_model=biowordvec)

### Model 1:  Dense Neural Network with BioWordVec Embeddings

In [None]:
# Split data
from sklearn.utils.class_weight import compute_class_weight

# One-hot labels. The dtype is pinned to float32 because newer pandas
# versions return boolean dummy columns by default.
disease_dummies = pd.get_dummies(final_data["Disease"], dtype="float32")

X = np.stack(final_data["embeddings"])
y = disease_dummies.values

# 80% train; the remaining 20% is halved into validation and test
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

#  Model
b_model = Sequential([
    Input(shape=(X.shape[1],)),  
    Dense(128, activation="relu"),
    Dropout(0.5),
    Dense(64, activation="relu"),
    Dropout(0.5),
    Dense(y.shape[1], activation="softmax")  
])

b_model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

# Baseline only: the model has not been trained yet at this point
b_model.evaluate(X_test, y_test)

# Adding class weights to balance the importance of the minority classes.
# The classes are taken from the one-hot columns so that weight index i is
# guaranteed to refer to output unit i of the softmax layer.
class_labels = disease_dummies.columns.to_numpy()
class_weights = compute_class_weight('balanced', classes=class_labels, y=final_data["Disease"])
class_weights_dict = dict(enumerate(class_weights))

# Model training
history = b_model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=5,
    batch_size=12,
    class_weight=class_weights_dict
)

# Evaluating the trained model on the test set
b_model.evaluate(X_test, y_test)


In [None]:
import matplotlib.pyplot as plt

# One figure per tracked metric: training curve against validation curve
for history_key, metric_label in (("accuracy", "Accuracy"), ("loss", "Loss")):
    plt.plot(history.history[history_key], label=f"Training {metric_label}")
    plt.plot(history.history[f"val_{history_key}"], label=f"Validation {metric_label}")
    plt.title(f"Model {metric_label}")
    plt.xlabel('Epoch')
    plt.ylabel(metric_label)
    plt.legend()
    plt.show()


In [None]:
# Classification report and confusion matrix on the test set

# Predicted and true class indices (argmax over the class axis)
y_pred = b_model.predict(X_test)
y_pred_classes = y_pred.argmax(axis=1)
y_true = y_test.argmax(axis=1)

# Per-class metrics
print("Classification Report:", classification_report(y_true, y_pred_classes), sep="\n")

# Confusion matrix
print("Confusion Matrix:", confusion_matrix(y_true, y_pred_classes), sep="\n")


In [None]:
# Making disease predictions with the model

def predict_disease(input_text, model, embedding_model, class_names, top_k=5):
    """Return the ``top_k`` most probable diseases for a free-text description.

    Args:
        input_text: Symptom description written by the user.
        model: Trained classifier whose ``predict`` returns one row of class
            probabilities per input row.
        embedding_model: Word-vector model passed on to ``embed_text``.
        class_names: Class labels, indexed like the model's output units.
        top_k: Number of predictions to return (default 5). Fewer are
            returned when the model has fewer classes.

    Returns:
        A list of ``(class_name, probability)`` tuples, most probable first.
    """
    # The training text was lower-cased before embedding, so the input has to
    # be lower-cased too or capitalised words miss their vectors
    normalized_text = input_text.lower() if isinstance(input_text, str) else input_text

    # Single averaged embedding with a leading batch axis of size 1
    input_embedding = embed_text(normalized_text, embedding_model)
    input_embedding = np.expand_dims(input_embedding, axis=0)   

    # Predicting probabilities
    probabilities = model.predict(input_embedding)[0]

    # Indices sorted by descending probability, truncated to top_k
    top_indices = np.argsort(probabilities)[::-1][:max(top_k, 0)]
    top_diseases = [(class_names[idx], probabilities[idx]) for idx in top_indices]

    return top_diseases

# Class names, in the column order of the one-hot encoding
disease_dummy_columns = pd.get_dummies(final_data["Disease"]).columns
class_names = list(disease_dummy_columns)

In [None]:
# Testing the prediction function with a sample input
input_text = "The past few days I haven’t been feeling well. My head has a full feeling. And my sinuses are all congested. I can’t go 10 minutes without coughing or sneezing. "
top_diseases = predict_disease(
    input_text=input_text,
    model=b_model,
    embedding_model=biowordvec,
    class_names=class_names,
)

# Ranked candidates, probabilities shown to two decimals
print("Top predicted diseases with probabilities:")
for disease_name, probability in top_diseases:
    print(f"{disease_name}: {probability:.2f}")

In [None]:
# Sample input test 5
input_text = "I have been having strange rashes all oer my body with some having round blusters. They are not itchy, just red and around my abdomen. I am always tired and I cant sleep well"
top_diseases = predict_disease(
    input_text=input_text,
    model=b_model,
    embedding_model=biowordvec,
    class_names=class_names,
)

# Ranked candidates, probabilities shown to two decimals
print("Top predicted diseases with probabilities:")
for disease_name, probability in top_diseases:
    print(f"{disease_name}: {probability:.2f}")

In [None]:
# Sample input test 2 - this result is a little iffy: how did we get
# hypertension from that?
input_text = "I am getting a severe headache and i am unable to sleep "
top_diseases = predict_disease(
    input_text=input_text,
    model=b_model,
    embedding_model=biowordvec,
    class_names=class_names,
)

# Ranked candidates, probabilities shown to two decimals
print("Top predicted diseases with probabilities:")
for disease_name, probability in top_diseases:
    print(f"{disease_name}: {probability:.2f}")

In [None]:
# Sample input test 3
input_text = "I had few blood clot patches on my legs. It happens with me when I'm feeling weak or I had few foot cramps during winter."
top_diseases = predict_disease(
    input_text=input_text,
    model=b_model,
    embedding_model=biowordvec,
    class_names=class_names,
)

# Ranked candidates, probabilities shown to two decimals
print("Top predicted diseases with probabilities:")
for disease_name, probability in top_diseases:
    print(f"{disease_name}: {probability:.2f}")

In [None]:
# Sample input test 4
input_text = "ts just a normal cough and fever"
top_diseases = predict_disease(
    input_text=input_text,
    model=b_model,
    embedding_model=biowordvec,
    class_names=class_names,
)

# Ranked candidates, probabilities shown to two decimals
print("Top predicted diseases with probabilities:")
for disease_name, probability in top_diseases:
    print(f"{disease_name}: {probability:.2f}")

Based on the outputs of all the test samples, it looks to me like descriptions that are more verbose or complete tend to produce better predictions that closely match the predictions of Medical ChatGPT. While all the predictions seem possible, some are more likely than others. Shorter descriptions tend to produce far-fetched predictions. For instance, even though I know hypertension is a plausible disease for sample input 2, there is too little information there to predict hypertension. Overall, the model performs much better when more information is provided.

### Model 2: Experimental Zero-Shot learning with Disease ontology 

In [None]:
# Installing SPARQLWrapper to access the disease ontology
#!pip install SPARQLWrapper

In [None]:
# Preprocessing function
def preprocess_text(text):
    """Lower-case ``text`` and drop everything except letters, digits and whitespace.

    Returns the cleaned text without leading or trailing whitespace, or an
    empty string when ``text`` is not a string.
    """
    if isinstance(text, str):
        lowered = text.lower()
        # Removed characters are deleted outright, not replaced by a space
        cleaned = re.sub(r"[^a-zA-Z0-9\s]", "", lowered)
        return cleaned.strip()
    return ""

# Function to compute sentence embeddings
def sentence_embedding(sentence, word2index, embedding_matrix, device=None):
    """Return the mean embedding of the known words in ``sentence``.

    Args:
        sentence: Whitespace-separated text.
        word2index: Mapping from word to its row index in ``embedding_matrix``.
        embedding_matrix: 2-D tensor with one embedding per row.
        device: Device for the result. Defaults to "cuda" when a GPU is
            available and to "cpu" otherwise, so the function also works on
            machines without CUDA.

    Returns:
        A 1-D tensor on ``device``: the mean of the matching rows, or a zero
        vector (same width and dtype as ``embedding_matrix``) when no word of
        ``sentence`` is in ``word2index``.
    """
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"

    indices = [word2index[word] for word in sentence.split() if word in word2index]
    if not indices:
        return torch.zeros(
            embedding_matrix.shape[1], dtype=embedding_matrix.dtype, device=device
        )

    # Only the rows for this sentence are moved to the target device
    embeddings = embedding_matrix[indices].to(device)
    return embeddings.mean(dim=0)

In [None]:
from SPARQLWrapper import SPARQLWrapper, JSON

# Query the Disease Ontology SPARQL endpoint for id, label and definition
# source of every non-deprecated class, asking for JSON results
sparql = SPARQLWrapper("https://sparql.disease-ontology.org/")
sparql.setReturnFormat(JSON)
sparql.setQuery("""
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX obo: <http://purl.obolibrary.org/obo/>
PREFIX oboInOwl: <http://www.geneontology.org/formats/oboInOwl#>
PREFIX dc: <http://purl.org/dc/elements/1.1/>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>

SELECT ?id ?label ?source
WHERE {
  ?iri a owl:Class ;
       oboInOwl:id ?id ;
       rdfs:label ?label ;
       obo:IAO_0000115 ?definition ;
       oboInOwl:hasOBONamespace "disease_ontology" .

  # Ignore obsolete
  FILTER NOT EXISTS { ?iri owl:deprecated ?deprecated . }

  [] owl:annotatedSource ?iri ;
     owl:annotatedProperty obo:IAO_0000115 ;
     owl:annotatedTarget ?definition ;
     oboInOwl:hasDbXref ?def_src .

  BIND(REPLACE(?def_src, "url:", "") AS ?source)
}
""")

# Run the query and decode the response
results = sparql.query().convert()

# One record per result binding, then straight into a DataFrame
data = [
    {
        "Disease ID": binding["id"]["value"],
        "Disease Name": binding["label"]["value"],
        "Source": binding["source"]["value"],
    }
    for binding in results["results"]["bindings"]
]

disease_descriptions = pd.DataFrame(data)

In [None]:
#!pip install accelerate transformers[torch]


In [None]:
# Loading BioWordVec and preparing embeddings
# biowordvec is already in the environment (i.e. the word vectors).
# as_tensor wraps the existing NumPy array where possible instead of copying it.
embedding_matrix = torch.as_tensor(biowordvec.vectors, device="cpu")
# gensim already maintains the word -> row index mapping
word2index = biowordvec.key_to_index

# Adding embeddings to disease ontology; the names get the same cleaning as
# the symptom descriptions so both sides are looked up the same way
disease_descriptions['embedding'] = disease_descriptions['Disease Name'].apply(
    lambda x: sentence_embedding(preprocess_text(x), word2index, embedding_matrix)
)

# Dataset
final_data = pd.read_csv("final_Balanced_Augmented Datasset.csv")

# Handling missing and non-string values. fillna has to run before the string
# cast, otherwise a missing value turns into the literal text "nan".
final_data['processed_symptoms'] = final_data['processed_symptoms'].fillna("").astype(str)

# Distinct, non-empty symptom descriptions (unique() returns a NumPy array,
# which has no dropna(), so empties are filtered explicitly)
symptom_descriptions = [
    symptom for symptom in final_data['processed_symptoms'].unique() if symptom.strip()
]

# Generating embeddings for symptoms (once, on the cleaned text)
symptom_embeddings = torch.stack(
    [sentence_embedding(preprocess_text(symptom), word2index, embedding_matrix)
     for symptom in symptom_descriptions]
)

# Cosine similarities as a product of L2-normalised rows. The result has one
# row per symptom and one column per disease, without materialising the
# (symptoms x diseases x dim) tensor that broadcasting would create.
disease_embedding_matrix = torch.stack(disease_descriptions['embedding'].tolist())
similarities = (
    torch.nn.functional.normalize(symptom_embeddings, dim=-1)
    @ torch.nn.functional.normalize(disease_embedding_matrix, dim=-1).T
)

# Symptoms to disease matching
top_k = 5  
# topk raises when k exceeds the number of candidates, so cap it
effective_k = min(top_k, similarities.shape[1])
top_matches = torch.topk(similarities, effective_k, dim=1).indices.tolist()
for symptom, top_indices in zip(symptom_descriptions, top_matches):
    top_diseases = disease_descriptions.iloc[top_indices]['Disease Name']
    print(f"Symptom: '{symptom}' -> Top Diseases: {list(top_diseases)}")
    