In [1]:
import pandas as pd
import numpy as np

In [2]:
import pickle

with open('disease_prediction_model.pkl', 'rb') as file:
    model = pickle.load(file)

In [3]:
with open("symptoms_dict.pkl", "rb") as file:
    symptoms_dict = pickle.load(file)

In [4]:
with open("diseases_list.pkl", "rb") as file:
    diseases_list = pickle.load(file)

In [5]:
def predict_diseases_with_probabilities_rf(patient_symptoms):
    
    patient_symptoms = [symptom.lower().replace(" ", "_") for symptom in patient_symptoms]
    input_vector = np.zeros(len(symptoms_dict))
    for symptom in patient_symptoms:
        if symptom in symptoms_dict:
            input_vector[symptoms_dict[symptom]] = 1
        else:
            print(f"Warning: Symptom '{symptom}' not found in training data.")
    
    # Get probabilities from the Random Forest model
    predicted_probs = model.predict_proba([input_vector])[0]
    
    # Map probabilities to disease names
    predicted_diseases = {
        diseases_list[label]: prob 
        for label, prob in enumerate(predicted_probs) if prob > 0.01
    }
    
    # Sort diseases by probability in descending order
    sorted_diseases = sorted(predicted_diseases.items(), key=lambda x: x[1], reverse=True)[:5]
    
    return sorted_diseases

In [6]:
input_symptoms = ['red spots over body', 'mild fever']  # Example inputs
predicted_diseases = predict_diseases_with_probabilities_rf(input_symptoms)

print("Predicted Diseases with Probabilities:")
for disease, prob in predicted_diseases:
    print(f"{disease}: {prob * 100:.2f}%")

Predicted Diseases with Probabilities:
Chicken pox: 27.00%
Impetigo: 9.00%
AIDS: 6.00%
Cervical spondylosis: 6.00%
Acne: 5.00%




In [7]:
speciality_df = pd.read_csv("disease_speciality.csv")
speciality_df.head()

Unnamed: 0,Disease,Specialization
0,Fungal infection,Dermatology
1,Allergy,Immunology
2,GERD,Gastroenterology
3,Chronic cholestasis,Hepatology
4,Drug Reaction,Immunology


In [8]:
doctor_df = pd.read_csv("corrected_doctor_dataset.csv")
doctor_df.head()

Unnamed: 0,Doctor Name,Specialization,Experience (Years),Consultation Fee ($),Location,Availability,Insurance Accepted,Patient Rating,Number of Ratings,Doctor ID
0,Dr. Sharma 89,Dermatology,6,967,Jaipur,Weekends,Not Accepted,4.5,810,DOC00000
1,Dr. Sharma 13,Neurology,24,839,Kolkata,Weekends,Accepted,4.8,491,DOC00003
2,Dr. Iyer 22,Dermatology,11,631,Pune,Afternoon Shift,Not Accepted,3.2,368,DOC00004
3,Dr. Sharma 69,Neurology,16,504,Hyderabad,Evenings,Not Accepted,4.7,980,DOC00008
4,Dr. Iyer 88,Gastroenterology,30,768,Ahmedabad,Weekends,Not Accepted,3.9,12,DOC00010


In [9]:
doctor_df.rename(columns={'Consultation Fee ($)': 'Consultation Fee'}, inplace=True)


In [10]:
doctor_df.shape

(1000, 10)

In [11]:
rec_df = pd.merge(doctor_df, speciality_df, on="Specialization")
rec_df.head()

Unnamed: 0,Doctor Name,Specialization,Experience (Years),Consultation Fee,Location,Availability,Insurance Accepted,Patient Rating,Number of Ratings,Doctor ID,Disease
0,Dr. Sharma 89,Dermatology,6,967,Jaipur,Weekends,Not Accepted,4.5,810,DOC00000,Fungal infection
1,Dr. Sharma 89,Dermatology,6,967,Jaipur,Weekends,Not Accepted,4.5,810,DOC00000,Acne
2,Dr. Sharma 89,Dermatology,6,967,Jaipur,Weekends,Not Accepted,4.5,810,DOC00000,Psoriasis
3,Dr. Sharma 89,Dermatology,6,967,Jaipur,Weekends,Not Accepted,4.5,810,DOC00000,Impetigo
4,Dr. Sharma 13,Neurology,24,839,Kolkata,Weekends,Accepted,4.8,491,DOC00003,Migraine


In [12]:
rec_df.shape

(3297, 11)

In [13]:
rec_df.isnull().sum()

Doctor Name           0
Specialization        0
Experience (Years)    0
Consultation Fee      0
Location              0
Availability          0
Insurance Accepted    0
Patient Rating        0
Number of Ratings     0
Doctor ID             0
Disease               0
dtype: int64

In [14]:
pd.set_option('display.float_format', lambda x: '%.3f' % x)
print(rec_df['Number of Ratings'].describe())

count   3297.000
mean     520.021
std      301.129
min       10.000
25%      255.000
50%      526.000
75%      792.000
max      999.000
Name: Number of Ratings, dtype: float64


In [15]:
pd.set_option('display.float_format', lambda x: '%.3f' % x)
print(rec_df['Patient Rating'].describe())

count   3297.000
mean       3.618
std        0.989
min        1.000
25%        3.100
50%        3.800
75%        4.400
max        5.000
Name: Patient Rating, dtype: float64


In [16]:
pd.set_option('display.float_format', lambda x: '%.3f' % x)
print(rec_df['Experience (Years)'].describe())

count   3297.000
mean      20.887
std       10.635
min        1.000
25%       11.000
50%       21.000
75%       30.000
max       40.000
Name: Experience (Years), dtype: float64


In [17]:
C = rec_df["Patient Rating"].mean()
m = rec_df["Number of Ratings"].quantile(0.25)
R = rec_df["Patient Rating"]
v = rec_df["Number of Ratings"]

In [18]:
rec_df["weighted_avg"] = (R*v + C*m)/(v+m)
rec_df.head()

Unnamed: 0,Doctor Name,Specialization,Experience (Years),Consultation Fee,Location,Availability,Insurance Accepted,Patient Rating,Number of Ratings,Doctor ID,Disease,weighted_avg
0,Dr. Sharma 89,Dermatology,6,967,Jaipur,Weekends,Not Accepted,4.5,810,DOC00000,Fungal infection,4.289
1,Dr. Sharma 89,Dermatology,6,967,Jaipur,Weekends,Not Accepted,4.5,810,DOC00000,Acne,4.289
2,Dr. Sharma 89,Dermatology,6,967,Jaipur,Weekends,Not Accepted,4.5,810,DOC00000,Psoriasis,4.289
3,Dr. Sharma 89,Dermatology,6,967,Jaipur,Weekends,Not Accepted,4.5,810,DOC00000,Impetigo,4.289
4,Dr. Sharma 13,Neurology,24,839,Kolkata,Weekends,Accepted,4.8,491,DOC00003,Migraine,4.396


In [19]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
rec_df[["normalized_exp", "norm_weighted_avg"]] = scaler.fit_transform(rec_df[["Experience (Years)", "weighted_avg"]])
rec_df.head()

Unnamed: 0,Doctor Name,Specialization,Experience (Years),Consultation Fee,Location,Availability,Insurance Accepted,Patient Rating,Number of Ratings,Doctor ID,Disease,weighted_avg,normalized_exp,norm_weighted_avg
0,Dr. Sharma 89,Dermatology,6,967,Jaipur,Weekends,Not Accepted,4.5,810,DOC00000,Fungal infection,4.289,0.128,0.867
1,Dr. Sharma 89,Dermatology,6,967,Jaipur,Weekends,Not Accepted,4.5,810,DOC00000,Acne,4.289,0.128,0.867
2,Dr. Sharma 89,Dermatology,6,967,Jaipur,Weekends,Not Accepted,4.5,810,DOC00000,Psoriasis,4.289,0.128,0.867
3,Dr. Sharma 89,Dermatology,6,967,Jaipur,Weekends,Not Accepted,4.5,810,DOC00000,Impetigo,4.289,0.128,0.867
4,Dr. Sharma 13,Neurology,24,839,Kolkata,Weekends,Accepted,4.8,491,DOC00003,Migraine,4.396,0.59,0.901


In [20]:
weights_W = rec_df["norm_weighted_avg"]*0.7
weights_exp = rec_df["normalized_exp"]*0.3

In [21]:
rec_df["Score"] = weights_W + weights_exp
rec_df.head()

Unnamed: 0,Doctor Name,Specialization,Experience (Years),Consultation Fee,Location,Availability,Insurance Accepted,Patient Rating,Number of Ratings,Doctor ID,Disease,weighted_avg,normalized_exp,norm_weighted_avg,Score
0,Dr. Sharma 89,Dermatology,6,967,Jaipur,Weekends,Not Accepted,4.5,810,DOC00000,Fungal infection,4.289,0.128,0.867,0.645
1,Dr. Sharma 89,Dermatology,6,967,Jaipur,Weekends,Not Accepted,4.5,810,DOC00000,Acne,4.289,0.128,0.867,0.645
2,Dr. Sharma 89,Dermatology,6,967,Jaipur,Weekends,Not Accepted,4.5,810,DOC00000,Psoriasis,4.289,0.128,0.867,0.645
3,Dr. Sharma 89,Dermatology,6,967,Jaipur,Weekends,Not Accepted,4.5,810,DOC00000,Impetigo,4.289,0.128,0.867,0.645
4,Dr. Sharma 13,Neurology,24,839,Kolkata,Weekends,Accepted,4.8,491,DOC00003,Migraine,4.396,0.59,0.901,0.808


In [22]:
rec_df["Score"].min()

0.061538461538461535

In [23]:
rec_df["Score"].max()

0.9649572913228301

In [24]:
import pandas as pd

def recommend_doctors(predicted_diseases, user_location, top_n=5):
   
    recommendations = {}

    for disease in predicted_diseases:
      
        filtered_doctors = rec_df[(rec_df["Disease"] == disease) & (rec_df["Location"] == user_location)]
        
        if not filtered_doctors.empty:
            
            ranked_doctors = filtered_doctors.sort_values(by="Score", ascending=False).head(top_n)
        else:
            
            gp_doctors = rec_df[(rec_df["Specialization"] == "General Practitioner") & (rec_df["Location"] == user_location)]
            ranked_doctors = gp_doctors.sort_values(by="Score", ascending=False).head(top_n)

    
        if ranked_doctors.empty:
            recommendations[disease] = "No doctors available for this disease in this location. Search in your nearby location."
        else:
            recommendations[disease] = ranked_doctors[["Doctor ID", "Doctor Name"]].to_dict(orient="records")

    return recommendations

In [25]:
predicted_diseases = ["Diabetes", "Hypertension"]
user_location = "Delhi"


recommendations = recommend_doctors(predicted_diseases, user_location)


print("\nTest Output:")
for disease, doctors in recommendations.items():
    print(f"\nTop Doctors for {disease}:")
    if isinstance(doctors, str): 
        print(doctors)
    else:
        for doctor in doctors:
            print(f"Doctor ID: {doctor['Doctor ID']}, Name: {doctor['Doctor Name']}")


Test Output:

Top Doctors for Diabetes:
Doctor ID: DOC00186, Name: Dr. Kumar 12
Doctor ID: DOC00018, Name: Dr. Mehta 42
Doctor ID: DOC00123, Name: Dr. Ghosh 93
Doctor ID: DOC00144, Name: Dr. Sharma 87
Doctor ID: DOC00060, Name: Dr. Kumar 87

Top Doctors for Hypertension:
Doctor ID: DOC00550, Name: Dr. Sharma 79
Doctor ID: DOC00217, Name: Dr. Verma 65
Doctor ID: DOC00205, Name: Dr. Iyer 13
Doctor ID: DOC00462, Name: Dr. Rao 90
Doctor ID: DOC00431, Name: Dr. Mehta 14


In [26]:
df3 = rec_df[rec_df['Doctor ID'] == 'DOC00550']
df3

Unnamed: 0,Doctor Name,Specialization,Experience (Years),Consultation Fee,Location,Availability,Insurance Accepted,Patient Rating,Number of Ratings,Doctor ID,Disease,weighted_avg,normalized_exp,norm_weighted_avg,Score
1476,Dr. Sharma 79,Cardiology,16,428,Delhi,Weekends,Accepted,4.9,921,DOC00550,Hypertension,4.622,0.385,0.973,0.796
1477,Dr. Sharma 79,Cardiology,16,428,Delhi,Weekends,Accepted,4.9,921,DOC00550,Heart attack,4.622,0.385,0.973,0.796
