In [1]:
# Note book to Train the following models
#Logistic Regression, Decision Trees, Random Forests, and SVM 

In [2]:
!pip install imbalanced-learn



In [3]:
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC 
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix,classification_report
from sklearn.naive_bayes import GaussianNB
import warnings
warnings.filterwarnings('ignore')


In [4]:
# Load the data that's already pre-processed
# Encode the True/False values to 1/0

In [5]:
# Test/Train Data Preparation 

decease_df = pd.read_csv('./resources/Disease_data.csv')
decease_df.shape
# Get the data types of each column
dtypes = decease_df.dtypes
# Find the unique data types
unique_dtypes = dtypes.unique()
#print("\nUnique data types in the DataFrame:")
print(unique_dtypes)


object_columns = decease_df.select_dtypes(include=['object'])
object_columns

decease_df.replace({True: 1, False: 0, '0':0 ,'False':0, 'True':1}, inplace=True)

object_columns = decease_df.select_dtypes(include=['object'])

[dtype('float64') dtype('int64') dtype('O')]


In [6]:
# Prognosis is the target variable. Prepare X and Y
y = decease_df['prognosis']
X = decease_df.copy()
X.drop(columns='prognosis',inplace=True)

In [7]:
# Create a function to execute Models and print performance matrix.
def executeModels(X_train, X_test, y_train, y_test):
    models = {
        "Decision Tree": DecisionTreeClassifier(),
        "Random Forest": RandomForestClassifier(),
        "Naive Bayes": GaussianNB(),
        "SVC": SVC(),
        "KNN": KNeighborsClassifier()
    }
    
    # Train and evaluate each model
    results = {}
    
    for model_name, model in models.items():
    
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='weighted')
        recall = recall_score(y_test, y_pred, average='weighted')
        f1 = f1_score(y_test, y_pred, average='weighted')
        cm = confusion_matrix(y_test, y_pred)
        
        results[model_name] = {
            "Accuracy": accuracy,
            "Precision": precision,
            "Recall": recall,
            "F1 Score": f1,
            "Confusion Matrix": cm
        }
    
    # Display the results
    for model_name, metrics in results.items():
        print(f"Model: {model_name}")
        for metric_name, value in metrics.items():
            print(f"{metric_name}: {value}")
        print("\n")

    return results

In [8]:
# Create Training Set and Test Set from the original dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
executeModels(X_train, X_test, y_train, y_test)

# Model performance is as follows 
# Decision Tree - 0.6882188374261734
# Random Forest - 0.7000310848616723
# Naive Bayes - 0.7379546161019583
# SVC -  0.6092632887783649

# Room to improve the performance.

Model: Decision Tree
Accuracy: 0.6839094752549117
Precision: 0.685145374929967
Recall: 0.6839094752549117
F1 Score: 0.684509873368169
Confusion Matrix: [[31  0  0 ...  0  0  0]
 [ 0 29  0 ...  0  0  0]
 [ 0  0 21 ...  0  0  0]
 ...
 [ 0  0  0 ... 36  0  0]
 [ 0  0  0 ...  0 31  0]
 [ 0  0  0 ...  0  0 31]]


Model: Random Forest
Accuracy: 0.6998259139517533
Precision: 0.6960713506322336
Recall: 0.6998259139517533
F1 Score: 0.6976862196394452
Confusion Matrix: [[31  0  0 ...  0  0  0]
 [ 0 29  0 ...  0  0  0]
 [ 0  0 21 ...  0  0  0]
 ...
 [ 0  0  0 ... 36  0  0]
 [ 0  0  0 ...  0 31  0]
 [ 0  0  0 ...  0  0 31]]


Model: Naive Bayes
Accuracy: 0.717607560308381
Precision: 0.8195452905119949
Recall: 0.717607560308381
F1 Score: 0.6164818492572341
Confusion Matrix: [[31  0  0 ...  0  0  0]
 [ 0 29  0 ...  0  0  0]
 [ 0  0 21 ...  0  0  0]
 ...
 [ 0  0  0 ... 36  0  0]
 [ 0  0  0 ...  0 31  0]
 [ 0  0  0 ...  0  0 31]]


Model: SVC
Accuracy: 0.5778413330017409
Precision: 0.5650547637140237


{'Decision Tree': {'Accuracy': 0.6839094752549117,
  'Precision': 0.685145374929967,
  'Recall': 0.6839094752549117,
  'F1 Score': 0.684509873368169,
  'Confusion Matrix': array([[31,  0,  0, ...,  0,  0,  0],
         [ 0, 29,  0, ...,  0,  0,  0],
         [ 0,  0, 21, ...,  0,  0,  0],
         ...,
         [ 0,  0,  0, ..., 36,  0,  0],
         [ 0,  0,  0, ...,  0, 31,  0],
         [ 0,  0,  0, ...,  0,  0, 31]])},
 'Random Forest': {'Accuracy': 0.6998259139517533,
  'Precision': 0.6960713506322336,
  'Recall': 0.6998259139517533,
  'F1 Score': 0.6976862196394452,
  'Confusion Matrix': array([[31,  0,  0, ...,  0,  0,  0],
         [ 0, 29,  0, ...,  0,  0,  0],
         [ 0,  0, 21, ...,  0,  0,  0],
         ...,
         [ 0,  0,  0, ..., 36,  0,  0],
         [ 0,  0,  0, ...,  0, 31,  0],
         [ 0,  0,  0, ...,  0,  0, 31]])},
 'Naive Bayes': {'Accuracy': 0.717607560308381,
  'Precision': 0.8195452905119949,
  'Recall': 0.717607560308381,
  'F1 Score': 0.61648184925723

In [9]:
# Let's try PCA to reduce the diemantions and Execute Models.
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply PCA
# Apply PCA to reduce the dimensions to 20 for visualization
pca = PCA(n_components=20)
X_pca = pca.fit_transform(X_scaled)

X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.3, random_state=42)

executeModels(X_train, X_test, y_train, y_test)

#Model: 
#Decision Tree : 0.6829344109418714
#Random Forest : 0.7241736607605429
#Naive Bayes : 0.7109107864470003
#SVC : 0.7148482022588333

# Not much of an improvement . Only slight improvement on SVC. 
# Still there is room to improve the performance.

Model: Decision Tree
Accuracy: 0.6803440058025075
Precision: 0.6811081724570016
Recall: 0.6803440058025075
F1 Score: 0.6807205060288314
Confusion Matrix: [[35  0  0 ...  0  0  0]
 [ 0 36  0 ...  0  0  0]
 [ 0  0 28 ...  0  0  0]
 ...
 [ 0  0  0 ... 43  0  0]
 [ 0  0  0 ...  0 37  0]
 [ 0  0  0 ...  0  0 37]]


Model: Random Forest
Accuracy: 0.7156771319034297
Precision: 0.7077080161638643
Recall: 0.7156771319034297
F1 Score: 0.7101022201629119
Confusion Matrix: [[35  0  0 ...  0  0  0]
 [ 0 36  0 ...  0  0  0]
 [ 0  0 28 ...  0  0  0]
 ...
 [ 0  0  0 ... 43  0  0]
 [ 0  0  0 ...  0 37  0]
 [ 0  0  0 ...  0  0 37]]


Model: Naive Bayes
Accuracy: 0.7183711532483681
Precision: 0.7142844886698051
Recall: 0.7183711532483681
F1 Score: 0.7160086569570636
Confusion Matrix: [[35  0  0 ...  0  0  0]
 [ 0 36  0 ...  0  0  0]
 [ 0  0 28 ...  0  0  0]
 ...
 [ 0  0  0 ... 43  0  0]
 [ 0  0  0 ...  0 37  0]
 [ 0  0  0 ...  0  0 37]]


Model: SVC
Accuracy: 0.7145373536421096
Precision: 0.7899889627887

{'Decision Tree': {'Accuracy': 0.6803440058025075,
  'Precision': 0.6811081724570016,
  'Recall': 0.6803440058025075,
  'F1 Score': 0.6807205060288314,
  'Confusion Matrix': array([[35,  0,  0, ...,  0,  0,  0],
         [ 0, 36,  0, ...,  0,  0,  0],
         [ 0,  0, 28, ...,  0,  0,  0],
         ...,
         [ 0,  0,  0, ..., 43,  0,  0],
         [ 0,  0,  0, ...,  0, 37,  0],
         [ 0,  0,  0, ...,  0,  0, 37]])},
 'Random Forest': {'Accuracy': 0.7156771319034297,
  'Precision': 0.7077080161638643,
  'Recall': 0.7156771319034297,
  'F1 Score': 0.7101022201629119,
  'Confusion Matrix': array([[35,  0,  0, ...,  0,  0,  0],
         [ 0, 36,  0, ...,  0,  0,  0],
         [ 0,  0, 28, ...,  0,  0,  0],
         ...,
         [ 0,  0,  0, ..., 43,  0,  0],
         [ 0,  0,  0, ...,  0, 37,  0],
         [ 0,  0,  0, ...,  0,  0, 37]])},
 'Naive Bayes': {'Accuracy': 0.7183711532483681,
  'Precision': 0.7142844886698051,
  'Recall': 0.7183711532483681,
  'F1 Score': 0.7160086569

In [10]:
# Validated the dataset and identified that the dataset is imbalanced. Lot more data assocaited with Monkey Pox, No Disease Identified and Celiac.
# Take out the records for 'No Disease Identified'

y_train.value_counts()


prognosis
Monkey Pox                                 11116
No disease identified                       6641
Celiac                                      1279
Hepatitis E                                   95
Jaundice                                      94
Gastroenteritis                               93
Acne                                          93
Psoriasis                                     92
Migraine                                      92
Impetigo                                      91
GERD                                          90
Allergy                                       89
Paralysis (brain hemorrhage)                  89
Chronic cholestasis                           89
Hypothyroidism                                88
Cervical spondylosis                          88
Hepatitis B                                   87
(vertigo) Paroymsal  Positional Vertigo       86
Hypertension                                  86
Fungal infection                              86
Hyperthyro

In [11]:
# Import RandomUnderSampler from imblearn
from imblearn.under_sampling import RandomUnderSampler

# Instantiate the RandomUnderSampler instance
rus = RandomUnderSampler(random_state=1)

# Fit the data to the model
X_resampled, y_resampled = rus.fit_resample(X_train, y_train)

In [12]:
y_resampled.value_counts()

prognosis
(vertigo) Paroymsal  Positional Vertigo    74
AIDS                                       74
Hypertension                               74
Hyperthyroidism                            74
Hypoglycemia                               74
Hypothyroidism                             74
Impetigo                                   74
Jaundice                                   74
Malaria                                    74
Migraine                                   74
Monkey Pox                                 74
No disease identified                      74
Osteoarthristis                            74
Paralysis (brain hemorrhage)               74
Peptic ulcer diseae                        74
Pneumonia                                  74
Psoriasis                                  74
Tuberculosis                               74
Typhoid                                    74
Urinary tract infection                    74
Varicose veins                             74
Hepatitis E             

In [13]:
executeModels(X_resampled, X_test, y_resampled, y_test)

Model: Decision Tree
Accuracy: 0.6708113148896487
Precision: 0.6715020079810089
Recall: 0.6708113148896487
F1 Score: 0.6710595387357778
Confusion Matrix: [[35  0  0 ...  0  0  0]
 [ 0 36  0 ...  0  0  0]
 [ 0  0 28 ...  0  0  0]
 ...
 [ 0  0  0 ... 43  0  0]
 [ 0  0  0 ...  0 37  0]
 [ 0  0  0 ...  0  0 37]]


Model: Random Forest
Accuracy: 0.7032431872344834
Precision: 0.7014725017816497
Recall: 0.7032431872344834
F1 Score: 0.7023009583164617
Confusion Matrix: [[35  0  0 ...  0  0  0]
 [ 0 36  0 ...  0  0  0]
 [ 0  0 28 ...  0  0  0]
 ...
 [ 0  0  0 ... 43  0  0]
 [ 0  0  0 ...  0 37  0]
 [ 0  0  0 ...  0  0 37]]


Model: Naive Bayes
Accuracy: 0.7218941042379028
Precision: 0.7146575206056143
Recall: 0.7218941042379028
F1 Score: 0.71700968554525
Confusion Matrix: [[35  0  0 ...  0  0  0]
 [ 0 36  0 ...  0  0  0]
 [ 0  0 28 ...  0  0  0]
 ...
 [ 0  0  0 ... 43  0  0]
 [ 0  0  0 ...  0 37  0]
 [ 0  0  0 ...  0  0 37]]


Model: SVC
Accuracy: 0.7084239975132111
Precision: 0.813859683847033

{'Decision Tree': {'Accuracy': 0.6708113148896487,
  'Precision': 0.6715020079810089,
  'Recall': 0.6708113148896487,
  'F1 Score': 0.6710595387357778,
  'Confusion Matrix': array([[35,  0,  0, ...,  0,  0,  0],
         [ 0, 36,  0, ...,  0,  0,  0],
         [ 0,  0, 28, ...,  0,  0,  0],
         ...,
         [ 0,  0,  0, ..., 43,  0,  0],
         [ 0,  0,  0, ...,  0, 37,  0],
         [ 0,  0,  0, ...,  0,  0, 37]])},
 'Random Forest': {'Accuracy': 0.7032431872344834,
  'Precision': 0.7014725017816497,
  'Recall': 0.7032431872344834,
  'F1 Score': 0.7023009583164617,
  'Confusion Matrix': array([[35,  0,  0, ...,  0,  0,  0],
         [ 0, 36,  0, ...,  0,  0,  0],
         [ 0,  0, 28, ...,  0,  0,  0],
         ...,
         [ 0,  0,  0, ..., 43,  0,  0],
         [ 0,  0,  0, ...,  0, 37,  0],
         [ 0,  0,  0, ...,  0,  0, 37]])},
 'Naive Bayes': {'Accuracy': 0.7218941042379028,
  'Precision': 0.7146575206056143,
  'Recall': 0.7218941042379028,
  'F1 Score': 0.7170096855

In [14]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
executeModels(X_train_scaled, X_test_scaled, y_train, y_test)

Model: Decision Tree
Accuracy: 0.6806548544192311
Precision: 0.6811420613533611
Recall: 0.6806548544192311
F1 Score: 0.6808943782384752
Confusion Matrix: [[35  0  0 ...  0  0  0]
 [ 0 36  0 ...  0  0  0]
 [ 0  0 28 ...  0  0  0]
 ...
 [ 0  0  0 ... 43  0  0]
 [ 0  0  0 ...  0 37  0]
 [ 0  0  0 ...  0  0 37]]


Model: Random Forest
Accuracy: 0.7157807481090043
Precision: 0.7076648603389768
Recall: 0.7157807481090043
F1 Score: 0.7100633585477024
Confusion Matrix: [[35  0  0 ...  0  0  0]
 [ 0 36  0 ...  0  0  0]
 [ 0  0 28 ...  0  0  0]
 ...
 [ 0  0  0 ... 43  0  0]
 [ 0  0  0 ...  0 37  0]
 [ 0  0  0 ...  0  0 37]]


Model: Naive Bayes
Accuracy: 0.7183711532483681
Precision: 0.7142844886698051
Recall: 0.7183711532483681
F1 Score: 0.7160086569570636
Confusion Matrix: [[35  0  0 ...  0  0  0]
 [ 0 36  0 ...  0  0  0]
 [ 0  0 28 ...  0  0  0]
 ...
 [ 0  0  0 ... 43  0  0]
 [ 0  0  0 ...  0 37  0]
 [ 0  0  0 ...  0  0 37]]


Model: SVC
Accuracy: 0.7145373536421096
Precision: 0.7893338715373

{'Decision Tree': {'Accuracy': 0.6806548544192311,
  'Precision': 0.6811420613533611,
  'Recall': 0.6806548544192311,
  'F1 Score': 0.6808943782384752,
  'Confusion Matrix': array([[35,  0,  0, ...,  0,  0,  0],
         [ 0, 36,  0, ...,  0,  0,  0],
         [ 0,  0, 28, ...,  0,  0,  0],
         ...,
         [ 0,  0,  0, ..., 43,  0,  0],
         [ 0,  0,  0, ...,  0, 37,  0],
         [ 0,  0,  0, ...,  0,  0, 37]])},
 'Random Forest': {'Accuracy': 0.7157807481090043,
  'Precision': 0.7076648603389768,
  'Recall': 0.7157807481090043,
  'F1 Score': 0.7100633585477024,
  'Confusion Matrix': array([[35,  0,  0, ...,  0,  0,  0],
         [ 0, 36,  0, ...,  0,  0,  0],
         [ 0,  0, 28, ...,  0,  0,  0],
         ...,
         [ 0,  0,  0, ..., 43,  0,  0],
         [ 0,  0,  0, ...,  0, 37,  0],
         [ 0,  0,  0, ...,  0,  0, 37]])},
 'Naive Bayes': {'Accuracy': 0.7183711532483681,
  'Precision': 0.7142844886698051,
  'Recall': 0.7183711532483681,
  'F1 Score': 0.7160086569

In [15]:
# Delete all the records that below to no decease
#decease_df.drop(decease_df.loc[(decease_df['prognosis']=='No disease identified')],inplace=True,axis=1)
decease_df = decease_df.loc[(decease_df['prognosis']!='No disease identified')]

#decease_df.size
y = decease_df['prognosis']
X = decease_df.copy()
X.drop(columns='prognosis',inplace=True)


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
executeModels(X_train, X_test, y_train, y_test)

Model: Decision Tree
Accuracy: 0.9991195891415994
Precision: 0.9992344253405212
Recall: 0.9991195891415994
F1 Score: 0.99911145949277
Confusion Matrix: [[40  0  0 ...  0  0  0]
 [ 0 42  0 ...  0  0  0]
 [ 0  0 28 ...  0  0  0]
 ...
 [ 0  0  0 ... 37  0  0]
 [ 0  0  0 ...  0 40  0]
 [ 0  0  0 ...  0  0 32]]


Model: Random Forest
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0
Confusion Matrix: [[40  0  0 ...  0  0  0]
 [ 0 42  0 ...  0  0  0]
 [ 0  0 28 ...  0  0  0]
 ...
 [ 0  0  0 ... 37  0  0]
 [ 0  0  0 ...  0 40  0]
 [ 0  0  0 ...  0  0 32]]


Model: Naive Bayes
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0
Confusion Matrix: [[40  0  0 ...  0  0  0]
 [ 0 42  0 ...  0  0  0]
 [ 0  0 28 ...  0  0  0]
 ...
 [ 0  0  0 ... 37  0  0]
 [ 0  0  0 ...  0 40  0]
 [ 0  0  0 ...  0  0 32]]


Model: SVC
Accuracy: 0.7989728539985327
Precision: 0.6613220287792579
Recall: 0.7989728539985327
F1 Score: 0.715487107378886
Confusion Matrix: [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0

{'Decision Tree': {'Accuracy': 0.9991195891415994,
  'Precision': 0.9992344253405212,
  'Recall': 0.9991195891415994,
  'F1 Score': 0.99911145949277,
  'Confusion Matrix': array([[40,  0,  0, ...,  0,  0,  0],
         [ 0, 42,  0, ...,  0,  0,  0],
         [ 0,  0, 28, ...,  0,  0,  0],
         ...,
         [ 0,  0,  0, ..., 37,  0,  0],
         [ 0,  0,  0, ...,  0, 40,  0],
         [ 0,  0,  0, ...,  0,  0, 32]])},
 'Random Forest': {'Accuracy': 1.0,
  'Precision': 1.0,
  'Recall': 1.0,
  'F1 Score': 1.0,
  'Confusion Matrix': array([[40,  0,  0, ...,  0,  0,  0],
         [ 0, 42,  0, ...,  0,  0,  0],
         [ 0,  0, 28, ...,  0,  0,  0],
         ...,
         [ 0,  0,  0, ..., 37,  0,  0],
         [ 0,  0,  0, ...,  0, 40,  0],
         [ 0,  0,  0, ...,  0,  0, 32]])},
 'Naive Bayes': {'Accuracy': 1.0,
  'Precision': 1.0,
  'Recall': 1.0,
  'F1 Score': 1.0,
  'Confusion Matrix': array([[40,  0,  0, ...,  0,  0,  0],
         [ 0, 42,  0, ...,  0,  0,  0],
         [ 0, 

In [16]:
# Encode the target variable
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y_resampled)
# Mapping of encoded labels to original labels
label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print("Label Mapping:", label_mapping)


Label Mapping: {'(vertigo) Paroymsal  Positional Vertigo': 0, 'AIDS': 1, 'Acne': 2, 'Alcoholic hepatitis': 3, 'Allergy': 4, 'Arthritis': 5, 'Bronchial Asthma': 6, 'Celiac': 7, 'Cervical spondylosis': 8, 'Chicken pox': 9, 'Chronic cholestasis': 10, 'Common Cold': 11, 'Dengue': 12, 'Diabetes ': 13, 'Dimorphic hemmorhoids(piles)': 14, 'Drug Reaction': 15, 'Fungal infection': 16, 'GERD': 17, 'Gastroenteritis': 18, 'Heart attack': 19, 'Hepatitis B': 20, 'Hepatitis C': 21, 'Hepatitis D': 22, 'Hepatitis E': 23, 'Hypertension ': 24, 'Hyperthyroidism': 25, 'Hypoglycemia': 26, 'Hypothyroidism': 27, 'Impetigo': 28, 'Jaundice': 29, 'Malaria': 30, 'Migraine': 31, 'Monkey Pox': 32, 'No disease identified': 33, 'Osteoarthristis': 34, 'Paralysis (brain hemorrhage)': 35, 'Peptic ulcer diseae': 36, 'Pneumonia': 37, 'Psoriasis': 38, 'Tuberculosis': 39, 'Typhoid': 40, 'Urinary tract infection': 41, 'Varicose veins': 42, 'hepatitis A': 43}


In [17]:
import pickle
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_encoded, test_size=0.2, random_state=42)

# Train the Naive Bayes model
model = GaussianNB()
model.fit(X_train, y_train)

# Save the trained model to disk
with open('naive_bayes_model_v2.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)
