In [40]:
# Note book to Train the following models
#Logistic Regression, Decision Trees, Random Forests, and SVM 

In [41]:
!pip install imbalanced-learn



In [42]:
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC 
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix,classification_report
from sklearn.naive_bayes import GaussianNB
import warnings
warnings.filterwarnings('ignore')


In [43]:
# Load the data that's already pre-processed
# Encode the True/False values to 1/0

In [44]:
# Test/Train Data Preparation 
def defineData():
    decease_df = pd.read_csv('./resources/Disease_data.csv')
    # Get the data types of each column
    dtypes = decease_df.dtypes
    # Find the unique data types
    unique_dtypes = dtypes.unique()
    #print("\nUnique data types in the DataFrame:")
    #print(unique_dtypes)
    object_columns = decease_df.select_dtypes(include=['object'])
    return decease_df



In [45]:
decease_df = defineData()
decease_df.replace({True: 1, False: 0, '0':0 ,'False':0, 'True':1}, inplace=True)
object_columns = decease_df.select_dtypes(include=['object'])

In [46]:
# Prognosis is the target variable. Prepare X and Y
y = decease_df['prognosis']
X = decease_df.copy()
X.drop(columns='prognosis',inplace=True)

In [47]:
# Create a function to execute Models and print performance matrix.
def executeModels(X_train, X_test, y_train, y_test):
    models = {
        "Decision Tree": DecisionTreeClassifier(),
        "Random Forest": RandomForestClassifier(),
        "Naive Bayes": GaussianNB(),
        "SVC": SVC(),
        "KNN": KNeighborsClassifier()
    }
    
    # Train and evaluate each model
    results = {}
    
    for model_name, model in models.items():
    
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='weighted')
        recall = recall_score(y_test, y_pred, average='weighted')
        f1 = f1_score(y_test, y_pred, average='weighted')
        cm = confusion_matrix(y_test, y_pred)
        #r2 = r2_score(y_test, y_pred)
        
        
        results[model_name] = {
            "Accuracy": accuracy,
            "Precision": precision,
            "Recall": recall,
            "F1 Score": f1,
            "Confusion Matrix": cm
        }
    
    # Display the results
    for model_name, metrics in results.items():
        print(f"Model: {model_name}")
        for metric_name, value in metrics.items():
            print(f"{metric_name}: {value}")
        print("\n")

    return results

In [48]:
# Create Training Set and Test Set from the original dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
executeModels(X_train, X_test, y_train, y_test)

# Model performance is as follows 
# Decision Tree - 0.6882188374261734
# Random Forest - 0.6980850534692863
# Naive Bayes - 0.717607560308381
# SVC -  0.5778413330017409
#KNN - 0.700945038547625

# Room to improve the performance.

Model: Decision Tree
Accuracy: 0.6835364337229545
Precision: 0.6851397861887639
Recall: 0.6835364337229545
F1 Score: 0.6843072574076298
Confusion Matrix: [[31  0  0 ...  0  0  0]
 [ 0 29  0 ...  0  0  0]
 [ 0  0 21 ...  0  0  0]
 ...
 [ 0  0  0 ... 36  0  0]
 [ 0  0  0 ...  0 31  0]
 [ 0  0  0 ...  0  0 31]]


Model: Random Forest
Accuracy: 0.7013180800795822
Precision: 0.6976858242151098
Recall: 0.7013180800795822
F1 Score: 0.699252512884198
Confusion Matrix: [[31  0  0 ...  0  0  0]
 [ 0 29  0 ...  0  0  0]
 [ 0  0 21 ...  0  0  0]
 ...
 [ 0  0  0 ... 36  0  0]
 [ 0  0  0 ...  0 31  0]
 [ 0  0  0 ...  0  0 31]]


Model: Naive Bayes
Accuracy: 0.717607560308381
Precision: 0.8195452905119949
Recall: 0.717607560308381
F1 Score: 0.6164818492572341
Confusion Matrix: [[31  0  0 ...  0  0  0]
 [ 0 29  0 ...  0  0  0]
 [ 0  0 21 ...  0  0  0]
 ...
 [ 0  0  0 ... 36  0  0]
 [ 0  0  0 ...  0 31  0]
 [ 0  0  0 ...  0  0 31]]


Model: SVC
Accuracy: 0.5778413330017409
Precision: 0.5650547637140237

{'Decision Tree': {'Accuracy': 0.6835364337229545,
  'Precision': 0.6851397861887639,
  'Recall': 0.6835364337229545,
  'F1 Score': 0.6843072574076298,
  'Confusion Matrix': array([[31,  0,  0, ...,  0,  0,  0],
         [ 0, 29,  0, ...,  0,  0,  0],
         [ 0,  0, 21, ...,  0,  0,  0],
         ...,
         [ 0,  0,  0, ..., 36,  0,  0],
         [ 0,  0,  0, ...,  0, 31,  0],
         [ 0,  0,  0, ...,  0,  0, 31]])},
 'Random Forest': {'Accuracy': 0.7013180800795822,
  'Precision': 0.6976858242151098,
  'Recall': 0.7013180800795822,
  'F1 Score': 0.699252512884198,
  'Confusion Matrix': array([[31,  0,  0, ...,  0,  0,  0],
         [ 0, 29,  0, ...,  0,  0,  0],
         [ 0,  0, 21, ...,  0,  0,  0],
         ...,
         [ 0,  0,  0, ..., 36,  0,  0],
         [ 0,  0,  0, ...,  0, 31,  0],
         [ 0,  0,  0, ...,  0,  0, 31]])},
 'Naive Bayes': {'Accuracy': 0.717607560308381,
  'Precision': 0.8195452905119949,
  'Recall': 0.717607560308381,
  'F1 Score': 0.6164818492572

In [49]:
# Let's try PCA to reduce the diemantions and Execute Models.
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply PCA
# Apply PCA to reduce the dimensions to 20 for visualization
pca = PCA(n_components=20)
X_pca = pca.fit_transform(X_scaled)

X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.3, random_state=42)

executeModels(X_train, X_test, y_train, y_test)

#Model: 
#Decision Tree : 0.6829344109418714
#Random Forest : 0.7241736607605429
#Naive Bayes : 0.7109107864470003
#SVC : 0.7148482022588333

# Not much of an improvement . Only slight improvement on SVC. 
# Still there is room to improve the performance.

Model: Decision Tree
Accuracy: 0.6815874002694021
Precision: 0.6809148048610744
Recall: 0.6815874002694021
F1 Score: 0.6812413807311043
Confusion Matrix: [[35  0  0 ...  0  0  0]
 [ 0 36  0 ...  0  0  0]
 [ 0  0 28 ...  0  0  0]
 ...
 [ 0  0  0 ... 43  0  0]
 [ 0  0  0 ...  0 37  0]
 [ 0  0  0 ...  0  0 37]]


Model: Random Forest
Accuracy: 0.7206507097710082
Precision: 0.7106124878858237
Recall: 0.7206507097710082
F1 Score: 0.7126563567282569
Confusion Matrix: [[35  0  0 ...  0  0  0]
 [ 0 36  0 ...  0  0  0]
 [ 0  0 28 ...  0  0  0]
 ...
 [ 0  0  0 ... 43  0  0]
 [ 0  0  0 ...  0 37  0]
 [ 0  0  0 ...  0  0 37]]


Model: Naive Bayes
Accuracy: 0.7047974303181017
Precision: 0.7064748307516505
Recall: 0.7047974303181017
F1 Score: 0.7055674509024571
Confusion Matrix: [[35  0  0 ...  0  0  0]
 [ 0 36  0 ...  0  0  0]
 [ 0  0 28 ...  0  0  0]
 ...
 [ 0  0  0 ... 43  0  0]
 [ 0  0  0 ...  0 37  0]
 [ 0  0  0 ...  0  0 37]]


Model: SVC
Accuracy: 0.7143301212309605
Precision: 0.7897779987015

{'Decision Tree': {'Accuracy': 0.6815874002694021,
  'Precision': 0.6809148048610744,
  'Recall': 0.6815874002694021,
  'F1 Score': 0.6812413807311043,
  'Confusion Matrix': array([[35,  0,  0, ...,  0,  0,  0],
         [ 0, 36,  0, ...,  0,  0,  0],
         [ 0,  0, 28, ...,  0,  0,  0],
         ...,
         [ 0,  0,  0, ..., 43,  0,  0],
         [ 0,  0,  0, ...,  0, 37,  0],
         [ 0,  0,  0, ...,  0,  0, 37]])},
 'Random Forest': {'Accuracy': 0.7206507097710082,
  'Precision': 0.7106124878858237,
  'Recall': 0.7206507097710082,
  'F1 Score': 0.7126563567282569,
  'Confusion Matrix': array([[35,  0,  0, ...,  0,  0,  0],
         [ 0, 36,  0, ...,  0,  0,  0],
         [ 0,  0, 28, ...,  0,  0,  0],
         ...,
         [ 0,  0,  0, ..., 43,  0,  0],
         [ 0,  0,  0, ...,  0, 37,  0],
         [ 0,  0,  0, ...,  0,  0, 37]])},
 'Naive Bayes': {'Accuracy': 0.7047974303181017,
  'Precision': 0.7064748307516505,
  'Recall': 0.7047974303181017,
  'F1 Score': 0.7055674509

In [50]:
# Validated the dataset and identified that the dataset is imbalanced. Lot more data assocaited with Monkey Pox, No Disease Identified and Celiac.
# Take out the records for 'No Disease Identified'
y_train.value_counts()


prognosis
Monkey Pox                                 11116
No disease identified                       6641
Celiac                                      1279
Hepatitis E                                   95
Jaundice                                      94
Gastroenteritis                               93
Acne                                          93
Psoriasis                                     92
Migraine                                      92
Impetigo                                      91
GERD                                          90
Allergy                                       89
Paralysis (brain hemorrhage)                  89
Chronic cholestasis                           89
Hypothyroidism                                88
Cervical spondylosis                          88
Hepatitis B                                   87
(vertigo) Paroymsal  Positional Vertigo       86
Hypertension                                  86
Fungal infection                              86
Hyperthyro

In [51]:
# Import RandomUnderSampler from imblearn
from imblearn.under_sampling import RandomUnderSampler

# Instantiate the RandomUnderSampler instance
rus = RandomUnderSampler(random_state=1)

# Fit the data to the model
X_resampled, y_resampled = rus.fit_resample(X_train, y_train)

In [52]:
y_resampled.value_counts()

prognosis
(vertigo) Paroymsal  Positional Vertigo    74
AIDS                                       74
Hypertension                               74
Hyperthyroidism                            74
Hypoglycemia                               74
Hypothyroidism                             74
Impetigo                                   74
Jaundice                                   74
Malaria                                    74
Migraine                                   74
Monkey Pox                                 74
No disease identified                      74
Osteoarthristis                            74
Paralysis (brain hemorrhage)               74
Peptic ulcer diseae                        74
Pneumonia                                  74
Psoriasis                                  74
Tuberculosis                               74
Typhoid                                    74
Urinary tract infection                    74
Varicose veins                             74
Hepatitis E             

In [53]:
executeModels(X_resampled, X_test, y_resampled, y_test)

Model: Decision Tree
Accuracy: 0.6809657030359548
Precision: 0.6919438837796743
Recall: 0.6809657030359548
F1 Score: 0.6849192512985133
Confusion Matrix: [[35  0  0 ...  0  0  0]
 [ 0 36  0 ...  0  0  0]
 [ 0  0 28 ...  0  0  0]
 ...
 [ 0  0  0 ... 43  0  0]
 [ 0  0  0 ...  0 37  0]
 [ 0  0  0 ...  0  0 37]]


Model: Random Forest
Accuracy: 0.7101854730079784
Precision: 0.7103777812226266
Recall: 0.7101854730079784
F1 Score: 0.7102080874103486
Confusion Matrix: [[35  0  0 ...  0  0  0]
 [ 0 36  0 ...  0  0  0]
 [ 0  0 28 ...  0  0  0]
 ...
 [ 0  0  0 ... 43  0  0]
 [ 0  0  0 ...  0 37  0]
 [ 0  0  0 ...  0  0 37]]


Model: Naive Bayes
Accuracy: 0.7240700445549684
Precision: 0.7137734120538948
Recall: 0.7240700445549684
F1 Score: 0.7155027077016313
Confusion Matrix: [[35  0  0 ...  0  0  0]
 [ 0 36  0 ...  0  0  0]
 [ 0  0 28 ...  0  0  0]
 ...
 [ 0  0  0 ... 43  0  0]
 [ 0  0  0 ...  0 37  0]
 [ 0  0  0 ...  0  0 37]]


Model: SVC
Accuracy: 0.708216765102062
Precision: 0.81364871975981

{'Decision Tree': {'Accuracy': 0.6809657030359548,
  'Precision': 0.6919438837796743,
  'Recall': 0.6809657030359548,
  'F1 Score': 0.6849192512985133,
  'Confusion Matrix': array([[35,  0,  0, ...,  0,  0,  0],
         [ 0, 36,  0, ...,  0,  0,  0],
         [ 0,  0, 28, ...,  0,  0,  0],
         ...,
         [ 0,  0,  0, ..., 43,  0,  0],
         [ 0,  0,  0, ...,  0, 37,  0],
         [ 0,  0,  0, ...,  0,  0, 37]])},
 'Random Forest': {'Accuracy': 0.7101854730079784,
  'Precision': 0.7103777812226266,
  'Recall': 0.7101854730079784,
  'F1 Score': 0.7102080874103486,
  'Confusion Matrix': array([[35,  0,  0, ...,  0,  0,  0],
         [ 0, 36,  0, ...,  0,  0,  0],
         [ 0,  0, 28, ...,  0,  0,  0],
         ...,
         [ 0,  0,  0, ..., 43,  0,  0],
         [ 0,  0,  0, ...,  0, 37,  0],
         [ 0,  0,  0, ...,  0,  0, 37]])},
 'Naive Bayes': {'Accuracy': 0.7240700445549684,
  'Precision': 0.7137734120538948,
  'Recall': 0.7240700445549684,
  'F1 Score': 0.7155027077

In [54]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
executeModels(X_train_scaled, X_test_scaled, y_train, y_test)

Model: Decision Tree
Accuracy: 0.6879079888094498
Precision: 0.6877824472194354
Recall: 0.6879079888094498
F1 Score: 0.6878431933594606
Confusion Matrix: [[35  0  0 ...  0  0  0]
 [ 0 36  0 ...  0  0  0]
 [ 0  0 28 ...  0  0  0]
 ...
 [ 0  0  0 ... 43  0  0]
 [ 0  0  0 ...  0 37  0]
 [ 0  0  0 ...  0  0 37]]


Model: Random Forest
Accuracy: 0.7210651745933064
Precision: 0.7105680651402697
Recall: 0.7210651745933064
F1 Score: 0.7124677082858349
Confusion Matrix: [[35  0  0 ...  0  0  0]
 [ 0 36  0 ...  0  0  0]
 [ 0  0 28 ...  0  0  0]
 ...
 [ 0  0  0 ... 43  0  0]
 [ 0  0  0 ...  0 37  0]
 [ 0  0  0 ...  0  0 37]]


Model: Naive Bayes
Accuracy: 0.7047974303181017
Precision: 0.7064748307516505
Recall: 0.7047974303181017
F1 Score: 0.7055674509024571
Confusion Matrix: [[35  0  0 ...  0  0  0]
 [ 0 36  0 ...  0  0  0]
 [ 0  0 28 ...  0  0  0]
 ...
 [ 0  0  0 ... 43  0  0]
 [ 0  0  0 ...  0 37  0]
 [ 0  0  0 ...  0  0 37]]


Model: SVC
Accuracy: 0.7141228888198115
Precision: 0.7889278651807

{'Decision Tree': {'Accuracy': 0.6879079888094498,
  'Precision': 0.6877824472194354,
  'Recall': 0.6879079888094498,
  'F1 Score': 0.6878431933594606,
  'Confusion Matrix': array([[35,  0,  0, ...,  0,  0,  0],
         [ 0, 36,  0, ...,  0,  0,  0],
         [ 0,  0, 28, ...,  0,  0,  0],
         ...,
         [ 0,  0,  0, ..., 43,  0,  0],
         [ 0,  0,  0, ...,  0, 37,  0],
         [ 0,  0,  0, ...,  0,  0, 37]])},
 'Random Forest': {'Accuracy': 0.7210651745933064,
  'Precision': 0.7105680651402697,
  'Recall': 0.7210651745933064,
  'F1 Score': 0.7124677082858349,
  'Confusion Matrix': array([[35,  0,  0, ...,  0,  0,  0],
         [ 0, 36,  0, ...,  0,  0,  0],
         [ 0,  0, 28, ...,  0,  0,  0],
         ...,
         [ 0,  0,  0, ..., 43,  0,  0],
         [ 0,  0,  0, ...,  0, 37,  0],
         [ 0,  0,  0, ...,  0,  0, 37]])},
 'Naive Bayes': {'Accuracy': 0.7047974303181017,
  'Precision': 0.7064748307516505,
  'Recall': 0.7047974303181017,
  'F1 Score': 0.7055674509

In [55]:
# Delete all the records that below to no decease
#decease_df.drop(decease_df.loc[(decease_df['prognosis']=='No disease identified')],inplace=True,axis=1)
decease_df_NoDisease = decease_df.loc[(decease_df['prognosis']!='No disease identified')]

#decease_df.size
y = decease_df_NoDisease['prognosis']
X = decease_df_NoDisease.copy()
X.drop(columns='prognosis',inplace=True)


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

y_train.value_counts()

rus = RandomUnderSampler(random_state=1)
X_resampled, y_resampled = rus.fit_resample(X_train, y_train)

y_resampled.value_counts()
executeModels(X_resampled, X_test, y_resampled, y_test)

Model: Decision Tree
Accuracy: 0.979457079970653
Precision: 0.9940659404407678
Recall: 0.979457079970653
F1 Score: 0.9853556848868662
Confusion Matrix: [[40  0  0 ...  0  0  0]
 [ 0 42  0 ...  0  0  0]
 [ 0  0 28 ...  0  0  0]
 ...
 [ 0  0  0 ... 34  0  0]
 [ 0  0  0 ...  0 40  0]
 [ 0  0  0 ...  0  0 32]]


Model: Random Forest
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0
Confusion Matrix: [[40  0  0 ...  0  0  0]
 [ 0 42  0 ...  0  0  0]
 [ 0  0 28 ...  0  0  0]
 ...
 [ 0  0  0 ... 37  0  0]
 [ 0  0  0 ...  0 40  0]
 [ 0  0  0 ...  0  0 32]]


Model: Naive Bayes
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0
Confusion Matrix: [[40  0  0 ...  0  0  0]
 [ 0 42  0 ...  0  0  0]
 [ 0  0 28 ...  0  0  0]
 ...
 [ 0  0  0 ... 37  0  0]
 [ 0  0  0 ...  0 40  0]
 [ 0  0  0 ...  0  0 32]]


Model: SVC
Accuracy: 0.09581804842259721
Precision: 0.07981216291640904
Recall: 0.09581804842259721
F1 Score: 0.05724096597234466
Confusion Matrix: [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [

{'Decision Tree': {'Accuracy': 0.979457079970653,
  'Precision': 0.9940659404407678,
  'Recall': 0.979457079970653,
  'F1 Score': 0.9853556848868662,
  'Confusion Matrix': array([[40,  0,  0, ...,  0,  0,  0],
         [ 0, 42,  0, ...,  0,  0,  0],
         [ 0,  0, 28, ...,  0,  0,  0],
         ...,
         [ 0,  0,  0, ..., 34,  0,  0],
         [ 0,  0,  0, ...,  0, 40,  0],
         [ 0,  0,  0, ...,  0,  0, 32]])},
 'Random Forest': {'Accuracy': 1.0,
  'Precision': 1.0,
  'Recall': 1.0,
  'F1 Score': 1.0,
  'Confusion Matrix': array([[40,  0,  0, ...,  0,  0,  0],
         [ 0, 42,  0, ...,  0,  0,  0],
         [ 0,  0, 28, ...,  0,  0,  0],
         ...,
         [ 0,  0,  0, ..., 37,  0,  0],
         [ 0,  0,  0, ...,  0, 40,  0],
         [ 0,  0,  0, ...,  0,  0, 32]])},
 'Naive Bayes': {'Accuracy': 1.0,
  'Precision': 1.0,
  'Recall': 1.0,
  'F1 Score': 1.0,
  'Confusion Matrix': array([[40,  0,  0, ...,  0,  0,  0],
         [ 0, 42,  0, ...,  0,  0,  0],
         [ 0, 

In [56]:
#decease_df_MonkeyPoxNegative
decease_df.shape
#decease_df_MonkeyPoxNegative.shape

(32168, 154)

In [57]:
decease_df = defineData()
decease_df = decease_df.loc[(decease_df['prognosis'] == 'No disease identified') & (decease_df['Rectal Pain'] != '0' ) == False]
decease_df.replace({True: 1, False: 0, '0':0 ,'False':0, 'True':1}, inplace=True)
y = decease_df['prognosis']
X = decease_df.copy()
X.drop(columns='prognosis',inplace=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
executeModels(X_train, X_test, y_train, y_test)

Model: Decision Tree
Accuracy: 0.9997111496244945
Precision: 0.9997111496244945
Recall: 0.9997111496244945
F1 Score: 0.9997111496244945
Confusion Matrix: [[37  0  0 ...  0  0  0]
 [ 0 41  0 ...  0  0  0]
 [ 0  0 29 ...  0  0  0]
 ...
 [ 0  0  0 ... 40  0  0]
 [ 0  0  0 ...  0 40  0]
 [ 0  0  0 ...  0  0 33]]


Model: Random Forest
Accuracy: 0.9998555748122473
Precision: 0.999855824251259
Recall: 0.9998555748122473
F1 Score: 0.9998552630673804
Confusion Matrix: [[37  0  0 ...  0  0  0]
 [ 0 41  0 ...  0  0  0]
 [ 0  0 29 ...  0  0  0]
 ...
 [ 0  0  0 ... 40  0  0]
 [ 0  0  0 ...  0 40  0]
 [ 0  0  0 ...  0  0 33]]


Model: Naive Bayes
Accuracy: 0.9997111496244945
Precision: 0.9997121456602721
Recall: 0.9997111496244945
F1 Score: 0.999709894634467
Confusion Matrix: [[37  0  0 ...  0  0  0]
 [ 0 41  0 ...  0  0  0]
 [ 0  0 29 ...  0  0  0]
 ...
 [ 0  0  0 ... 40  0  0]
 [ 0  0  0 ...  0 40  0]
 [ 0  0  0 ...  0  0 33]]


Model: SVC
Accuracy: 0.7863951473136915
Precision: 0.635295880190108

{'Decision Tree': {'Accuracy': 0.9997111496244945,
  'Precision': 0.9997111496244945,
  'Recall': 0.9997111496244945,
  'F1 Score': 0.9997111496244945,
  'Confusion Matrix': array([[37,  0,  0, ...,  0,  0,  0],
         [ 0, 41,  0, ...,  0,  0,  0],
         [ 0,  0, 29, ...,  0,  0,  0],
         ...,
         [ 0,  0,  0, ..., 40,  0,  0],
         [ 0,  0,  0, ...,  0, 40,  0],
         [ 0,  0,  0, ...,  0,  0, 33]])},
 'Random Forest': {'Accuracy': 0.9998555748122473,
  'Precision': 0.999855824251259,
  'Recall': 0.9998555748122473,
  'F1 Score': 0.9998552630673804,
  'Confusion Matrix': array([[37,  0,  0, ...,  0,  0,  0],
         [ 0, 41,  0, ...,  0,  0,  0],
         [ 0,  0, 29, ...,  0,  0,  0],
         ...,
         [ 0,  0,  0, ..., 40,  0,  0],
         [ 0,  0,  0, ...,  0, 40,  0],
         [ 0,  0,  0, ...,  0,  0, 33]])},
 'Naive Bayes': {'Accuracy': 0.9997111496244945,
  'Precision': 0.9997121456602721,
  'Recall': 0.9997111496244945,
  'F1 Score': 0.99970989463

In [63]:
# Encode the target variable
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
# Mapping of encoded labels to original labels
label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print("Label Mapping:", label_mapping)


Label Mapping: {'(vertigo) Paroymsal  Positional Vertigo': 0, 'AIDS': 1, 'Acne': 2, 'Alcoholic hepatitis': 3, 'Allergy': 4, 'Arthritis': 5, 'Bronchial Asthma': 6, 'Celiac': 7, 'Cervical spondylosis': 8, 'Chicken pox': 9, 'Chronic cholestasis': 10, 'Common Cold': 11, 'Dengue': 12, 'Diabetes ': 13, 'Dimorphic hemmorhoids(piles)': 14, 'Drug Reaction': 15, 'Fungal infection': 16, 'GERD': 17, 'Gastroenteritis': 18, 'Heart attack': 19, 'Hepatitis B': 20, 'Hepatitis C': 21, 'Hepatitis D': 22, 'Hepatitis E': 23, 'Hypertension ': 24, 'Hyperthyroidism': 25, 'Hypoglycemia': 26, 'Hypothyroidism': 27, 'Impetigo': 28, 'Jaundice': 29, 'Malaria': 30, 'Migraine': 31, 'Monkey Pox': 32, 'No disease identified': 33, 'Osteoarthristis': 34, 'Paralysis (brain hemorrhage)': 35, 'Peptic ulcer diseae': 36, 'Pneumonia': 37, 'Psoriasis': 38, 'Tuberculosis': 39, 'Typhoid': 40, 'Urinary tract infection': 41, 'Varicose veins': 42, 'hepatitis A': 43}


In [64]:
import pickle
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.3, random_state=42)
#X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_encoded, test_size=0.2, random_state=42)

# Train the Naive Bayes model
model = GaussianNB()
model.fit(X_train, y_train)

# Save the trained model to disk
with open('naive_bayes_model_v2.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)
