In [66]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset
data = pd.read_csv("dataset.csv")

# Drop the column containing the diseases in column form
data = data.drop(columns="prognosis")
# Remove unnecessary columns (e.g., those with 'Unnamed')
data = data.loc[:, ~data.columns.str.contains('^Unnamed')]

# List of symptoms and diseases
# 132 Unique Symptoms
# 41 Unique Diseases
symptoms = data.iloc[:, :132]  # First 132 columns (symptoms)
diseases = data.iloc[:, 132:]  # Columns from 132 onwards (diseases)

# Calculate posterior probabilities
def predict_disease_given_symptoms(observed_symptoms):
    posteriors = {}
    
    # Calculate posterior probabilities for each disease
    for disease in diseases.columns:
        # Prior P(D)
        prior = diseases[disease].mean()
        
        # Likelihood P(S|D) for observed symptoms
        likelihood = 1
        for symptom, value in observed_symptoms.items():
            P_S1_D1 = data[data[disease] == 1][symptom].mean()  # P(S=1|D=1)
            P_S1_D0 = data[data[disease] == 0][symptom].mean()  # P(S=1|D=0)
            likelihood *= P_S1_D1 if value == 1 else (1 - P_S1_D1)
        
        # Evidence P(S)
        evidence = 1
        for symptom, value in observed_symptoms.items():
            P_S1 = symptoms[symptom].mean()  # Overall probability of symptom
            evidence *= P_S1 if value == 1 else (1 - P_S1)
        
        # Posterior P(D|S)
        posterior = (likelihood * prior) / evidence
        posteriors[disease] = posterior

    # Sort diseases by posterior probabilities and return the top 10
    sorted_posteriors = sorted(posteriors.items(), key=lambda x: x[1], reverse=True)[:10]
    
    # Return top 10 diseases and their probabilities
    top_10_diseases = [(disease, probability * 100) for disease, probability in sorted_posteriors]
    
    return top_10_diseases

# Initialize user symptoms (132 symptoms)
user_symptoms = [0] * 132

# Mark the symptoms as per the user input (AG, AI, AJ, AR)
# AG -> Index 33, AI -> Index 34, AJ -> Index 35, AR -> Index 43
#Antecedent
user_symptoms[32] = 1  # yellowish_skin
user_symptoms[34] = 1  # nausea
user_symptoms[35] = 1  # loss of appetite
#Consequent
user_symptoms[43] = 1  # yellowing of eyes

#Antecedent
# AH -> AG
#user_symptoms[33] = 1  # dark-urine
#Consequent
#user_symptoms[32] = 1  # yellowish_skin
# Convert the list of symptoms into a dictionary with the symptom names
observed_symptoms = dict(zip(symptoms.columns, user_symptoms))

# Make prediction based on the observed symptoms
top_10_diseases = predict_disease_given_symptoms(observed_symptoms)
for i, (disease, probability) in enumerate(top_10_diseases, 1):
    print(f"{i}. Disease: {disease}, Probability: {probability:.2f}%")

#Antecedent Result
#The most probable disease is: Hepatitis D
#The probability of this disease is: 0.78%

#Consequent Result



1. Disease: Hepatitis C, Probability: 3485.63%
2. Disease: Chronic Cholestasis, Probability: 182.36%
3. Disease: Hepatitis D, Probability: 0.45%
4. Disease: Fungal Infection, Probability: 0.00%
5. Disease: Allergy, Probability: 0.00%
6. Disease: GERD, Probability: 0.00%
7. Disease: Drug Reaction, Probability: 0.00%
8. Disease: Peptic Ulcer Disease, Probability: 0.00%
9. Disease: AIDS, Probability: 0.00%
10. Disease: Diabetes , Probability: 0.00%
