### Import Libraries

In [4]:
# Importing necessary libraries
import pandas as pd  # For data manipulation and analysis
import numpy as np  # For numerical computations
from sklearn.model_selection import train_test_split  # For splitting data into train and test sets
from sklearn.metrics import classification_report, confusion_matrix  # To evaluate model performance
from sklearn.naive_bayes import MultinomialNB  # Multinomial Naive Bayes Classifier
import joblib  # For saving and loading trained models and data
import seaborn as sns  # For visualization
import matplotlib.pyplot as plt  # For plotting
import warnings  # To handle warnings

# Suppress specific warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

### Load the Dataset

In [6]:
# Load the dataset from the file
file_path = '../resources/disease_symptom_data.csv'
disease_symptom_df = pd.read_csv(file_path)
disease_symptom_df

Unnamed: 0,disease_id,d_name,symptom_id,s_name
0,C0020538,Hypertensive Disease,C0008031,Pain Chest
1,C0020538,Hypertensive Disease,C0392680,Shortness Of Breath
2,C0020538,Hypertensive Disease,C0012833,Dizziness
3,C0020538,Hypertensive Disease,C0004093,Asthenia
4,C0020538,Hypertensive Disease,C0085639,Fall
...,...,...,...,...
1901,C0233472,Affect Labile,C0741453,Bedridden
1902,C0233472,Affect Labile,C0242453,Prostatism
1903,C0011127,Decubitus Ulcer,C0232257,Systolic Murmur
1904,C0011127,Decubitus Ulcer,C0871754,Frail


### Disease-Symptom one hot coding

In [7]:
# Convert the 'symptom_id' column into binary features (one-hot encoding)
binary_features = pd.get_dummies(disease_symptom_df['symptom_id'])

# Aggregate data by 'disease_id' to create a disease-symptom matrix
disease_symptom_hotcoded = pd.concat([disease_symptom_df['disease_id'], binary_features], axis=1).groupby('disease_id').sum().reset_index()
disease_symptom_hotcoded

Unnamed: 0,disease_id,C0000727,C0000731,C0000737,C0002416,C0002962,C0003123,C0003126,C0003862,C0003962,...,C1320716,C1321756,C1384489,C1384606,C1405524,C1444773,C1456822,C1511606,C1513183,C1517205
0,C0001175,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,C0001418,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,C0001511,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,C0001973,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,C0002395,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128,C1258215,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
129,C1456784,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
130,C1510475,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
131,C1565489,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


### Separate Features and Labels

In [8]:
# Separate features (symptoms) and labels (disease IDs)
X = disease_symptom_hotcoded.iloc[:, 1:]  # Symptom data as features
y = disease_symptom_hotcoded['disease_id']  # Disease IDs as labels

# Print basic statistics about the dataset
print(f"Total number of data points: {X.shape[0]}")  
print(f"Total number of features (symptoms): {X.shape[1]}")  
print(f"Number of unique diseases: {len(y.unique())}")  

Total number of data points: 133
Total number of features (symptoms): 406
Number of unique diseases: 133


### Split training and test Data

In [9]:
# Split the balanced dataset into training and testing subsets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
# Train Multinomial Naive Bayes on training data
mnb = MultinomialNB()
mnb.fit(X_train, y_train)

# Evaluate the model on the test data
accuracy = mnb.score(X_test, y_test)
print(accuracy)

0.0


In [14]:
# Train Multinomial Naive Bayes on training data
mnb_tot = MultinomialNB()
mnb_tot = mnb_tot.fit(X, y)

# Evaluate the model on the test data
accuracy = mnb_tot.score(X_test, y_test)
print(accuracy)

1.0


In [15]:
# Make predictions
disease_pred = mnb.predict(X_test)
disease_real = y_test.values

# Check for mismatches
mismatch_found = False
for pred, actual in zip(disease_pred, disease_real):
    if pred != actual:
        mismatch_found = True
        break

if not mismatch_found:
    pass


In [10]:
disease_pred = mnb_tot.predict(X)

In [11]:
disease_real = y.values

In [12]:
mismatch_found = False  # Flag to check for mismatches

for i in range(0, len(disease_real)):
    print(f"Predicted: {disease_pred[i]} | Actual: {disease_real[i]}")
    if disease_pred[i] != disease_real[i]:
        print('Pred: {0} Actual:{1}'.format(disease_pred[i], disease_real[i]))
        mismatch_found = True  # Set the flag to True if a mismatch is found

# Check the flag after the loop
if not mismatch_found:
    print("==========================================================")
    print("No mismatches found between predictions and actual values.")

Predicted: C0001175 | Actual: C0001175
Predicted: C0001418 | Actual: C0001418
Predicted: C0001511 | Actual: C0001511
Predicted: C0001973 | Actual: C0001973
Predicted: C0002395 | Actual: C0002395
Predicted: C0002871 | Actual: C0002871
Predicted: C0002895 | Actual: C0002895
Predicted: C0003507 | Actual: C0003507
Predicted: C0003537 | Actual: C0003537
Predicted: C0003864 | Actual: C0003864
Predicted: C0004096 | Actual: C0004096
Predicted: C0004610 | Actual: C0004610
Predicted: C0005001 | Actual: C0005001
Predicted: C0005586 | Actual: C0005586
Predicted: C0006142 | Actual: C0006142
Predicted: C0006266 | Actual: C0006266
Predicted: C0006277 | Actual: C0006277
Predicted: C0006826 | Actual: C0006826
Predicted: C0006840 | Actual: C0006840
Predicted: C0007097 | Actual: C0007097
Predicted: C0007102 | Actual: C0007102
Predicted: C0007642 | Actual: C0007642
Predicted: C0007787 | Actual: C0007787
Predicted: C0008325 | Actual: C0008325
Predicted: C0008350 | Actual: C0008350
Predicted: C0009319 | Act

In [13]:
import os
import joblib

joblib.dump(mnb, os.path.join('', 'disease_symptom_mnb.pkl'), protocol=2)


['disease_symptom_mnb.pkl']

In [14]:
df= X
cols = df.columns
features = cols # = symptoms
features_raw = [str(features[x]) for x in range(len(features))]
features_raw = ','.join(map(str, features_raw))

In [15]:

# convert feature array into dict of symptom: index
feature_dict = {}
for i,f in enumerate(features):
    feature_dict[f] = i

In [16]:
def findFeatures(disease):
    return disease_symptom_hotcoded.loc[disease_symptom_hotcoded['disease_id'] == disease]["symptom_id"].values.astype(str)

In [37]:
import numpy as np

sample = np.zeros((len(features),), dtype=int)
sample = sample.tolist()


In [38]:
search = ["C0008031", "C0392680", "C0233481"]
for i,s in enumerate(search):
    sample[feature_dict[s]] = 1

In [39]:
sample = np.array(sample).reshape(1,len(sample))

In [40]:
results = mnb.predict_proba(sample)[0]

In [41]:

# gets a dictionary of {'class_name': probability}
prob_per_class_dictionary = dict(zip(mnb.classes_, results))

In [42]:
# gets a list of ['most_probable_class', 'second_most_probable_class', ..., 'least_class']
results_ordered_by_probability = map(lambda x: {"disease": x[0],"prop": x[1] * 100, "sy": findFeatures(x[0])}, sorted(zip(mnb.classes_, results), key=lambda x: x[1], reverse=True))
print (results_ordered_by_probability)

<map object at 0x00000231F7777E20>


In [43]:
print (list(results_ordered_by_probability))

KeyError: 'symptom_id'

In [None]:

#store the predicted probabilities for class 1
y_pred_prob = mnb.predict_proba(sample)[0]