In [13]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, accuracy_score
import pandas as pd
import joblib
from sklearn.preprocessing import LabelEncoder

In [14]:
# Load the dataset
dataset = pd.read_csv('Training_new.csv').dropna(axis=1)

# Separate features and labels
X = dataset.drop('prognosis', axis=1)
y = dataset['prognosis']
encoder = LabelEncoder()
dataset["prognosis"] = encoder.fit_transform(dataset["prognosis"])

# Splitting the data into training and testing sets (80:20)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

# Limiting the size of the training data to avoid overfitting
X_train_small, _, y_train_small, _ = train_test_split(X_train, y_train, test_size=0.5, random_state=42)

# Random Forest Classifier
rf_classifier = RandomForestClassifier()
rf_classifier.fit(X_train_small, y_train_small)
rf_predictions_train = rf_classifier.predict(X_train)
rf_cm_train = confusion_matrix(y_train, rf_predictions_train)

# Save the Random Forest model
joblib.dump(rf_classifier, 'rf_model.pkl')

# SVM Classifier
svm_classifier = SVC()
svm_classifier.fit(X_train_small, y_train_small)
svm_predictions_train = svm_classifier.predict(X_train)
svm_cm_train = confusion_matrix(y_train, svm_predictions_train)

# Save the SVM model
joblib.dump(svm_classifier, 'svm_model.pkl')

# Gaussian Naive Bayes Classifier
nb_classifier = GaussianNB()
nb_classifier.fit(X_train_small, y_train_small)
nb_predictions_train = nb_classifier.predict(X_train)
nb_cm_train = confusion_matrix(y_train, nb_predictions_train)

# Save the Naive Bayes model
joblib.dump(nb_classifier, 'nb_model.pkl')

['nb_model.pkl']

In [15]:
# Calculate accuracy for each model on the training data
rf_accuracy_train = accuracy_score(y_train, rf_predictions_train)
svm_accuracy_train = accuracy_score(y_train, svm_predictions_train)
nb_accuracy_train = accuracy_score(y_train, nb_predictions_train)

In [16]:
# Display confusion matrix for each model in 2x2 format
print("Random Forest Confusion Matrix (Training):")
print("True Positive:", rf_cm_train[1, 1])
print("True Negative:", rf_cm_train[0, 0])
print("False Positive:", rf_cm_train[0, 1])
print("False Negative:", rf_cm_train[1, 0])

print("\nSVM Confusion Matrix (Training):")
print("True Positive:", svm_cm_train[1, 1])
print("True Negative:", svm_cm_train[0, 0])
print("False Positive:", svm_cm_train[0, 1])
print("False Negative:", svm_cm_train[1, 0])

print("\nGaussian Naive Bayes Confusion Matrix (Training):")
print("True Positive:", nb_cm_train[1, 1])
print("True Negative:", nb_cm_train[0, 0])
print("False Positive:", nb_cm_train[0, 1])
print("False Negative:", nb_cm_train[1, 0])

Random Forest Confusion Matrix (Training):
True Positive: 5
True Negative: 0
False Positive: 0
False Negative: 0

SVM Confusion Matrix (Training):
True Positive: 4
True Negative: 0
False Positive: 0
False Negative: 0

Gaussian Naive Bayes Confusion Matrix (Training):
True Positive: 5
True Negative: 0
False Positive: 0
False Negative: 0


In [17]:
# Print accuracy for each model
print("\nRandom Forest Accuracy (Training):", rf_accuracy_train)
print("SVM Accuracy (Training):", svm_accuracy_train)
print("Gaussian Naive Bayes Accuracy (Training):", nb_accuracy_train)


Random Forest Accuracy (Training): 0.9510869565217391
SVM Accuracy (Training): 0.8967391304347826
Gaussian Naive Bayes Accuracy (Training): 0.9510869565217391


In [18]:
# Define a function to convert multi-class confusion matrix to 2x2 confusion matrix
def multiclass_to_2x2_confusion_matrix(cm):
    # Sum of diagonal elements represents TP
    TP = cm.diagonal().sum()
    
    # Sum of all elements in the confusion matrix represents the total
    total = cm.sum()
    
    # False Positive (FP) is the sum of all elements in the predicted column excluding TP
    FP = cm.sum(axis=0).sum() - TP
    
    # False Negative (FN) is the sum of all elements in the actual row excluding TP
    FN = cm.sum(axis=1).sum() - TP
    
    # True Negative (TN) is the total minus TP, FP, and FN
    TN = total - TP - FP - FN
    
    # Return 2x2 confusion matrix
    return TP, FP, FN, TN

# Convert each multi-class confusion matrix to 2x2 confusion matrix
rf_TP, rf_FP, rf_FN, rf_TN = multiclass_to_2x2_confusion_matrix(rf_cm_train)
svm_TP, svm_FP, svm_FN, svm_TN = multiclass_to_2x2_confusion_matrix(svm_cm_train)
nb_TP, nb_FP, nb_FN, nb_TN = multiclass_to_2x2_confusion_matrix(nb_cm_train)

# Print 2x2 confusion matrices
print("Random Forest Confusion Matrix (Training):")
print(f"True Positive: {rf_TP}, False Positive: {rf_FP}, False Negative: {rf_FN}, True Negative: {rf_TN}")

print("\nSVM Confusion Matrix (Training):")
print(f"True Positive: {svm_TP}, False Positive: {svm_FP}, False Negative: {svm_FN}, True Negative: {svm_TN}")

print("\nGaussian Naive Bayes Confusion Matrix (Training):")
print(f"True Positive: {nb_TP}, False Positive: {nb_FP}, False Negative: {nb_FN}, True Negative: {nb_TN}")


Random Forest Confusion Matrix (Training):
True Positive: 175, False Positive: 9, False Negative: 9, True Negative: -9

SVM Confusion Matrix (Training):
True Positive: 165, False Positive: 19, False Negative: 19, True Negative: -19

Gaussian Naive Bayes Confusion Matrix (Training):
True Positive: 175, False Positive: 9, False Negative: 9, True Negative: -9


In [19]:
import joblib
import numpy as np
from scipy.stats import mode
from sklearn.preprocessing import LabelEncoder
import warnings

# Load the saved models
rf_model = joblib.load('rf_model.pkl')
svm_model = joblib.load('svm_model.pkl')
nb_model = joblib.load('nb_model.pkl')

# Define label encoder
label_encoder = LabelEncoder()
label_encoder.fit(y.unique())
symptoms = X.columns.values
prediction_classes = encoder.classes_
symptom_index = {}
for index, value in enumerate(symptoms):
    symptom = " ".join([i.capitalize() for i in value.split("_")])
    symptom_index[symptom] = index
data_dict = {
    "symptom_index": symptom_index,
    "predictions_classes": prediction_classes
}


def predict_disease(symptoms):
    input_data = [0] * len(data_dict["symptom_index"])  # Initialize input_data with correct length
    for symptom in symptoms:
        index = data_dict["symptom_index"].get(symptom.capitalize())
        if index is not None:
            input_data[index] = 1

    input_data = np.array(input_data).reshape(1, -1)

    # Predictions from models
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        rf_prediction = rf_model.predict(input_data)
        nb_prediction = nb_model.predict(input_data)
        svm_prediction = svm_model.predict(input_data)

    # Convert predictions to numeric labels
    rf_prediction_label = label_encoder.transform(rf_prediction)[0]
    nb_prediction_label = label_encoder.transform(nb_prediction)[0]
    svm_prediction_label = label_encoder.transform(svm_prediction)[0]

    # Calculate majority prediction
    predictions = [rf_prediction_label, nb_prediction_label, svm_prediction_label]
    if len(set(predictions)) == 1:
        final_prediction = predictions[0]  # All predictions are the same
    else:
        final_prediction = mode(predictions)[0][0]

    # Decode prediction label to disease name
    final_prediction = label_encoder.inverse_transform([final_prediction])[0]
    
    output = {
        "Random Forest": label_encoder.inverse_transform([rf_prediction_label])[0],
        "SVM": label_encoder.inverse_transform([svm_prediction_label])[0],
        "Gaussian Naive Bayes": label_encoder.inverse_transform([nb_prediction_label])[0],
        "Final Prediction": final_prediction
    }

    return output


In [20]:
# Define symptoms
test_symptoms = ["continuous_sneezing", "fatigue", "cough", "high_fever", "headache"]

# Call predict_disease function
predictions = predict_disease(test_symptoms)

# Print the predictions
for model, disease in predictions.items():
    print(f"{model} Prediction:", disease)


Random Forest Prediction: Paralysis (brain hemorrhage)
SVM Prediction: Paralysis (brain hemorrhage)
Gaussian Naive Bayes Prediction: Paralysis (brain hemorrhage)
Final Prediction Prediction: Paralysis (brain hemorrhage)
