In [15]:
# Note book to Train the following models
#Logistic Regression, Decision Trees, Random Forests, and SVM 

In [58]:
!pip install imbalanced-learn



In [44]:
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC 
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.naive_bayes import GaussianNB
import warnings
warnings.filterwarnings('ignore')


In [45]:
# Test/Train Data Preparation 

decease_df = pd.read_csv('./resources/Disease_data.csv')
decease_df.shape
# Get the data types of each column
dtypes = decease_df.dtypes
# Find the unique data types
unique_dtypes = dtypes.unique()
#print("\nUnique data types in the DataFrame:")
print(unique_dtypes)


object_columns = decease_df.select_dtypes(include=['object'])
object_columns

decease_df.replace({True: 1, False: 0, '0':0 ,'False':0, 'True':1}, inplace=True)

object_columns = decease_df.select_dtypes(include=['object'])

[dtype('float64') dtype('int64') dtype('O')]


In [18]:
y = decease_df['prognosis']
X = decease_df.copy()
X.drop(columns='prognosis',inplace=True)

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [32]:
# Initialize the models
def executeModels(X_train, X_test, y_train, y_test):
    models = {
        "Decision Tree": DecisionTreeClassifier(),
        "Random Forest": RandomForestClassifier(),
        "Naive Bayes": GaussianNB(),
        "SVC": SVC()
    }
    
    # Train and evaluate each model
    results = {}
    
    for model_name, model in models.items():
    
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='weighted')
        recall = recall_score(y_test, y_pred, average='weighted')
        f1 = f1_score(y_test, y_pred, average='weighted')
        cm = confusion_matrix(y_test, y_pred)
        
        results[model_name] = {
            "Accuracy": accuracy,
            "Precision": precision,
            "Recall": recall,
            "F1 Score": f1,
            "Confusion Matrix": cm
        }
    
    # Display the results
    for model_name, metrics in results.items():
        print(f"Model: {model_name}")
        for metric_name, value in metrics.items():
            print(f"{metric_name}: {value}")
        print("\n")

    return results

In [33]:
executeModels(X_train, X_test, y_train, y_test)

Model: Decision Tree
Accuracy: 0.6882188374261734
Precision: 0.6884560602292769
Recall: 0.6882188374261734
F1 Score: 0.6882674697304584
Confusion Matrix: [[26  0  0 ...  0  0  0]
 [ 0 22  0 ...  0  0  0]
 [ 0  0 13 ...  0  0  0]
 ...
 [ 0  1  0 ... 27  0  0]
 [ 0  0  0 ...  0 23  0]
 [ 0  0  0 ...  0  0 21]]


Model: Random Forest
Accuracy: 0.7000310848616723
Precision: 0.6960209871997592
Recall: 0.7000310848616723
F1 Score: 0.6977271212722559
Confusion Matrix: [[26  0  0 ...  0  0  0]
 [ 0 22  0 ...  0  0  0]
 [ 0  0 13 ...  0  0  0]
 ...
 [ 0  0  0 ... 28  0  0]
 [ 0  0  0 ...  0 23  0]
 [ 0  0  0 ...  0  0 21]]


Model: Naive Bayes
Accuracy: 0.7379546161019583
Precision: 0.7284748827464953
Recall: 0.7379546161019583
F1 Score: 0.7277051759200379
Confusion Matrix: [[26  0  0 ...  0  0  0]
 [ 0 22  0 ...  0  0  0]
 [ 0  1 12 ...  0  0  0]
 ...
 [ 0  0  0 ... 28  0  0]
 [ 0  0  0 ...  0 23  0]
 [ 0  0  0 ...  0  0 21]]


Model: SVC
Accuracy: 0.6092632887783649
Precision: 0.4808511091077

{'Decision Tree': {'Accuracy': 0.6882188374261734,
  'Precision': 0.6884560602292769,
  'Recall': 0.6882188374261734,
  'F1 Score': 0.6882674697304584,
  'Confusion Matrix': array([[26,  0,  0, ...,  0,  0,  0],
         [ 0, 22,  0, ...,  0,  0,  0],
         [ 0,  0, 13, ...,  0,  0,  0],
         ...,
         [ 0,  1,  0, ..., 27,  0,  0],
         [ 0,  0,  0, ...,  0, 23,  0],
         [ 0,  0,  0, ...,  0,  0, 21]])},
 'Random Forest': {'Accuracy': 0.7000310848616723,
  'Precision': 0.6960209871997592,
  'Recall': 0.7000310848616723,
  'F1 Score': 0.6977271212722559,
  'Confusion Matrix': array([[26,  0,  0, ...,  0,  0,  0],
         [ 0, 22,  0, ...,  0,  0,  0],
         [ 0,  0, 13, ...,  0,  0,  0],
         ...,
         [ 0,  0,  0, ..., 28,  0,  0],
         [ 0,  0,  0, ...,  0, 23,  0],
         [ 0,  0,  0, ...,  0,  0, 21]])},
 'Naive Bayes': {'Accuracy': 0.7379546161019583,
  'Precision': 0.7284748827464953,
  'Recall': 0.7379546161019583,
  'F1 Score': 0.7277051759

In [34]:
# Model Performance is below 70% . Let's try the following fine tuning to achive better performance.
# Let's validate Balancing 
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply PCA
# Apply PCA to reduce the dimensions to 20 for visualization
pca = PCA(n_components=20)
X_pca = pca.fit_transform(X_scaled)

X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.3, random_state=42)

executeModels(X_train, X_test, y_train, y_test)

Model: Decision Tree
Accuracy: 0.6829344109418714
Precision: 0.681854166960828
Recall: 0.6829344109418714
F1 Score: 0.682365642815157
Confusion Matrix: [[35  0  0 ...  0  0  0]
 [ 0 36  0 ...  0  0  0]
 [ 0  0 28 ...  0  0  0]
 ...
 [ 0  0  0 ... 43  0  0]
 [ 0  0  0 ...  0 37  0]
 [ 0  0  0 ...  0  0 37]]


Model: Random Forest
Accuracy: 0.7241736607605429
Precision: 0.7136879579156352
Recall: 0.7241736607605429
F1 Score: 0.7153430867380022
Confusion Matrix: [[35  0  0 ...  0  0  0]
 [ 0 36  0 ...  0  0  0]
 [ 0  0 28 ...  0  0  0]
 ...
 [ 0  0  0 ... 43  0  0]
 [ 0  0  0 ...  0 37  0]
 [ 0  0  0 ...  0  0 37]]


Model: Naive Bayes
Accuracy: 0.7109107864470003
Precision: 0.7084215224790911
Recall: 0.7109107864470003
F1 Score: 0.7095702785258959
Confusion Matrix: [[35  0  0 ...  0  0  0]
 [ 0 36  0 ...  0  0  0]
 [ 0  0 28 ...  0  0  0]
 ...
 [ 0  0  0 ... 43  0  0]
 [ 0  0  0 ...  0 37  0]
 [ 0  0  0 ...  0  0 37]]


Model: SVC
Accuracy: 0.7148482022588333
Precision: 0.791733871644694

{'Decision Tree': {'Accuracy': 0.6829344109418714,
  'Precision': 0.681854166960828,
  'Recall': 0.6829344109418714,
  'F1 Score': 0.682365642815157,
  'Confusion Matrix': array([[35,  0,  0, ...,  0,  0,  0],
         [ 0, 36,  0, ...,  0,  0,  0],
         [ 0,  0, 28, ...,  0,  0,  0],
         ...,
         [ 0,  0,  0, ..., 43,  0,  0],
         [ 0,  0,  0, ...,  0, 37,  0],
         [ 0,  0,  0, ...,  0,  0, 37]])},
 'Random Forest': {'Accuracy': 0.7241736607605429,
  'Precision': 0.7136879579156352,
  'Recall': 0.7241736607605429,
  'F1 Score': 0.7153430867380022,
  'Confusion Matrix': array([[35,  0,  0, ...,  0,  0,  0],
         [ 0, 36,  0, ...,  0,  0,  0],
         [ 0,  0, 28, ...,  0,  0,  0],
         ...,
         [ 0,  0,  0, ..., 43,  0,  0],
         [ 0,  0,  0, ...,  0, 37,  0],
         [ 0,  0,  0, ...,  0,  0, 37]])},
 'Naive Bayes': {'Accuracy': 0.7109107864470003,
  'Precision': 0.7084215224790911,
  'Recall': 0.7109107864470003,
  'F1 Score': 0.709570278525

In [35]:
# Model performance improved, but still the overall performance is below 75. Let's use reblanancing.

y_train.value_counts()
# Completely inblanaced dataaset. More requests for Monkey Pox, No Disease Identified and Celiac.
# Take out the records for 'No Disease Identified'


prognosis
Monkey Pox                                 11116
No disease identified                       6641
Celiac                                      1279
Hepatitis E                                   95
Jaundice                                      94
Gastroenteritis                               93
Acne                                          93
Psoriasis                                     92
Migraine                                      92
Impetigo                                      91
GERD                                          90
Allergy                                       89
Paralysis (brain hemorrhage)                  89
Chronic cholestasis                           89
Hypothyroidism                                88
Cervical spondylosis                          88
Hepatitis B                                   87
(vertigo) Paroymsal  Positional Vertigo       86
Hypertension                                  86
Fungal infection                              86
Hyperthyro

In [53]:
# Delete all the records that below to no decease
#decease_df.drop(decease_df.loc[(decease_df['prognosis']=='No disease identified')],inplace=True,axis=1)

decease_df = decease_df.loc[(decease_df['prognosis']!='No disease identified')]

decease_df.shape

#decease_df.size
y = decease_df['prognosis']
X = decease_df.copy()
X.drop(columns='prognosis',inplace=True)

In [56]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
executeModels(X_train, X_test, y_train, y_test)

Model: Decision Tree
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0
Confusion Matrix: [[40  0  0 ...  0  0  0]
 [ 0 42  0 ...  0  0  0]
 [ 0  0 28 ...  0  0  0]
 ...
 [ 0  0  0 ... 37  0  0]
 [ 0  0  0 ...  0 40  0]
 [ 0  0  0 ...  0  0 32]]


Model: Random Forest
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0
Confusion Matrix: [[40  0  0 ...  0  0  0]
 [ 0 42  0 ...  0  0  0]
 [ 0  0 28 ...  0  0  0]
 ...
 [ 0  0  0 ... 37  0  0]
 [ 0  0  0 ...  0 40  0]
 [ 0  0  0 ...  0  0 32]]


Model: Naive Bayes
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0
Confusion Matrix: [[40  0  0 ...  0  0  0]
 [ 0 42  0 ...  0  0  0]
 [ 0  0 28 ...  0  0  0]
 ...
 [ 0  0  0 ... 37  0  0]
 [ 0  0  0 ...  0 40  0]
 [ 0  0  0 ...  0  0 32]]


Model: SVC
Accuracy: 0.7989728539985327
Precision: 0.6613220287792579
Recall: 0.7989728539985327
F1 Score: 0.715487107378886
Confusion Matrix: [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0

{'Decision Tree': {'Accuracy': 1.0,
  'Precision': 1.0,
  'Recall': 1.0,
  'F1 Score': 1.0,
  'Confusion Matrix': array([[40,  0,  0, ...,  0,  0,  0],
         [ 0, 42,  0, ...,  0,  0,  0],
         [ 0,  0, 28, ...,  0,  0,  0],
         ...,
         [ 0,  0,  0, ..., 37,  0,  0],
         [ 0,  0,  0, ...,  0, 40,  0],
         [ 0,  0,  0, ...,  0,  0, 32]])},
 'Random Forest': {'Accuracy': 1.0,
  'Precision': 1.0,
  'Recall': 1.0,
  'F1 Score': 1.0,
  'Confusion Matrix': array([[40,  0,  0, ...,  0,  0,  0],
         [ 0, 42,  0, ...,  0,  0,  0],
         [ 0,  0, 28, ...,  0,  0,  0],
         ...,
         [ 0,  0,  0, ..., 37,  0,  0],
         [ 0,  0,  0, ...,  0, 40,  0],
         [ 0,  0,  0, ...,  0,  0, 32]])},
 'Naive Bayes': {'Accuracy': 1.0,
  'Precision': 1.0,
  'Recall': 1.0,
  'F1 Score': 1.0,
  'Confusion Matrix': array([[40,  0,  0, ...,  0,  0,  0],
         [ 0, 42,  0, ...,  0,  0,  0],
         [ 0,  0, 28, ...,  0,  0,  0],
         ...,
         [ 0,  0, 

In [57]:
y_train.value_counts()

prognosis
Monkey Pox                                 11136
Celiac                                      1263
Chronic cholestasis                           93
Acne                                          93
Heart attack                                  92
Hepatitis B                                   92
Fungal infection                              91
Dimorphic hemmorhoids(piles)                  91
Migraine                                      90
Hyperthyroidism                               90
Allergy                                       89
GERD                                          89
Malaria                                       89
hepatitis A                                   89
Hepatitis E                                   88
Peptic ulcer diseae                           88
Pneumonia                                     88
Psoriasis                                     86
Chicken pox                                   86
Gastroenteritis                               86
Hypoglycem

In [59]:
# Import RandomUnderSampler from imblearn
from imblearn.under_sampling import RandomUnderSampler

# Instantiate the RandomUnderSampler instance
rus = RandomUnderSampler(random_state=1)

# Fit the data to the model
X_resampled, y_resampled = rus.fit_resample(X_train, y_train)

In [60]:
y_resampled.value_counts()

prognosis
(vertigo) Paroymsal  Positional Vertigo    75
Monkey Pox                                 75
Hypertension                               75
Hyperthyroidism                            75
Hypoglycemia                               75
Hypothyroidism                             75
Impetigo                                   75
Jaundice                                   75
Malaria                                    75
Migraine                                   75
Osteoarthristis                            75
Hepatitis D                                75
Paralysis (brain hemorrhage)               75
Peptic ulcer diseae                        75
Pneumonia                                  75
Psoriasis                                  75
Tuberculosis                               75
Typhoid                                    75
Urinary tract infection                    75
Varicose veins                             75
Hepatitis E                                75
Hepatitis C             

In [61]:
executeModels(X_resampled, X_test, y_resampled, y_test)

Model: Decision Tree
Accuracy: 0.9991195891415994
Precision: 0.9992344253405212
Recall: 0.9991195891415994
F1 Score: 0.99911145949277
Confusion Matrix: [[40  0  0 ...  0  0  0]
 [ 0 42  0 ...  0  0  0]
 [ 0  0 28 ...  0  0  0]
 ...
 [ 0  0  0 ... 37  0  0]
 [ 0  0  0 ...  0 40  0]
 [ 0  0  0 ...  0  0 32]]


Model: Random Forest
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0
Confusion Matrix: [[40  0  0 ...  0  0  0]
 [ 0 42  0 ...  0  0  0]
 [ 0  0 28 ...  0  0  0]
 ...
 [ 0  0  0 ... 37  0  0]
 [ 0  0  0 ...  0 40  0]
 [ 0  0  0 ...  0  0 32]]


Model: Naive Bayes
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0
Confusion Matrix: [[40  0  0 ...  0  0  0]
 [ 0 42  0 ...  0  0  0]
 [ 0  0 28 ...  0  0  0]
 ...
 [ 0  0  0 ... 37  0  0]
 [ 0  0  0 ...  0 40  0]
 [ 0  0  0 ...  0  0 32]]


Model: SVC
Accuracy: 0.09581804842259721
Precision: 0.07981216291640904
Recall: 0.09581804842259721
F1 Score: 0.05724096597234466
Confusion Matrix: [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [

{'Decision Tree': {'Accuracy': 0.9991195891415994,
  'Precision': 0.9992344253405212,
  'Recall': 0.9991195891415994,
  'F1 Score': 0.99911145949277,
  'Confusion Matrix': array([[40,  0,  0, ...,  0,  0,  0],
         [ 0, 42,  0, ...,  0,  0,  0],
         [ 0,  0, 28, ...,  0,  0,  0],
         ...,
         [ 0,  0,  0, ..., 37,  0,  0],
         [ 0,  0,  0, ...,  0, 40,  0],
         [ 0,  0,  0, ...,  0,  0, 32]])},
 'Random Forest': {'Accuracy': 1.0,
  'Precision': 1.0,
  'Recall': 1.0,
  'F1 Score': 1.0,
  'Confusion Matrix': array([[40,  0,  0, ...,  0,  0,  0],
         [ 0, 42,  0, ...,  0,  0,  0],
         [ 0,  0, 28, ...,  0,  0,  0],
         ...,
         [ 0,  0,  0, ..., 37,  0,  0],
         [ 0,  0,  0, ...,  0, 40,  0],
         [ 0,  0,  0, ...,  0,  0, 32]])},
 'Naive Bayes': {'Accuracy': 1.0,
  'Precision': 1.0,
  'Recall': 1.0,
  'F1 Score': 1.0,
  'Confusion Matrix': array([[40,  0,  0, ...,  0,  0,  0],
         [ 0, 42,  0, ...,  0,  0,  0],
         [ 0, 

In [62]:
# Encode the target variable
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
# Mapping of encoded labels to original labels
label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print("Label Mapping:", label_mapping)



Label Mapping: {'(vertigo) Paroymsal  Positional Vertigo': 0, 'AIDS': 1, 'Acne': 2, 'Alcoholic hepatitis': 3, 'Allergy': 4, 'Arthritis': 5, 'Bronchial Asthma': 6, 'Celiac': 7, 'Cervical spondylosis': 8, 'Chicken pox': 9, 'Chronic cholestasis': 10, 'Common Cold': 11, 'Dengue': 12, 'Diabetes ': 13, 'Dimorphic hemmorhoids(piles)': 14, 'Drug Reaction': 15, 'Fungal infection': 16, 'GERD': 17, 'Gastroenteritis': 18, 'Heart attack': 19, 'Hepatitis B': 20, 'Hepatitis C': 21, 'Hepatitis D': 22, 'Hepatitis E': 23, 'Hypertension ': 24, 'Hyperthyroidism': 25, 'Hypoglycemia': 26, 'Hypothyroidism': 27, 'Impetigo': 28, 'Jaundice': 29, 'Malaria': 30, 'Migraine': 31, 'Monkey Pox': 32, 'Osteoarthristis': 33, 'Paralysis (brain hemorrhage)': 34, 'Peptic ulcer diseae': 35, 'Pneumonia': 36, 'Psoriasis': 37, 'Tuberculosis': 38, 'Typhoid': 39, 'Urinary tract infection': 40, 'Varicose veins': 41, 'hepatitis A': 42}


In [75]:
import pickle
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Train the Naive Bayes model
model = GaussianNB()
model.fit(X_train, y_train)

# Save the trained model to disk
with open('naive_bayes_model_v2.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)


In [78]:
import json

cols = X_resampled.columns

type(cols)

index_keys = X_train.index
default_value = 0
index_dict = {key: default_value for key in cols}

# Convert dictionary to JSON
index_json = json.dumps(index_dict)

print(index_json)

print ("-"*40)

features = ""
for col in cols : 
    features+="data[\'"+ col +"\'],"


print(features)

{"Age": 0, "Gender": 0, "itching": 0, "skin_rash": 0, "nodal_skin_eruptions": 0, "continuous_sneezing": 0, "shivering": 0, "chills": 0, "joint_pain": 0, "stomach_pain": 0, "acidity": 0, "ulcers_on_tongue": 0, "muscle_wasting": 0, "vomiting": 0, "burning_micturition": 0, "spotting_ urination": 0, "fatigue": 0, "weight_gain": 0, "anxiety": 0, "cold_hands_and_feets": 0, "mood_swings": 0, "weight_loss": 0, "restlessness": 0, "lethargy": 0, "patches_in_throat": 0, "irregular_sugar_level": 0, "cough": 0, "high_fever": 0, "sunken_eyes": 0, "breathlessness": 0, "sweating": 0, "dehydration": 0, "indigestion": 0, "headache": 0, "yellowish_skin": 0, "dark_urine": 0, "nausea": 0, "loss_of_appetite": 0, "pain_behind_the_eyes": 0, "back_pain": 0, "constipation": 0, "mild_fever": 0, "yellow_urine": 0, "yellowing_of_eyes": 0, "acute_liver_failure": 0, "fluid_overload": 0, "swelling_of_stomach": 0, "swelled_lymph_nodes": 0, "malaise": 0, "blurred_and_distorted_vision": 0, "phlegm": 0, "throat_irritatio