# Importing dependencies

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import accuracy_score, confusion_matrix,precision_score,recall_score,f1_score
import pickle

# Data Collection & Processing

In [2]:
# Extracting zip file into a folder named 'data'

import zipfile
import os

zip_file_path = './DPS_data.zip'
extract_to_dir = 'data'

os.makedirs(extract_to_dir, exist_ok=True) # Making req folders

with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extract_to_dir)

In [3]:
data = pd.read_csv('./data/FilteredDataset.csv')
data = data.drop(columns="Unnamed: 0")

In [4]:
#printing the first 5 rows of the dataframe
data.head()

Unnamed: 0,diseases,anxiety and nervousness,depression,shortness of breath,depressive or psychotic symptoms,sharp chest pain,dizziness,insomnia,abnormal involuntary movements,chest tightness,...,redness in or around nose,wrinkles on skin,foot or toe weakness,hand or finger cramps or spasms,back stiffness or tightness,wrist lump or mass,skin pain,low urine output,sore in nose,ankle weakness
0,panic disorder,1,0,1,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,panic disorder,0,0,1,1,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,panic disorder,1,1,1,1,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,panic disorder,1,0,0,1,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
4,panic disorder,1,1,0,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 246945 entries, 0 to 246944
Columns: 329 entries, diseases to ankle weakness
dtypes: int64(328), object(1)
memory usage: 619.8+ MB


In [6]:
# Since each symptom only consists of 0 and 1 it will be much better to have the data in uint8 (1byte) instead of int64 (8bytes)
binary_columns = data.drop(columns='diseases').columns  # Exclude 'disease' column
data[binary_columns] = data[binary_columns].astype('uint8')

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 246945 entries, 0 to 246944
Columns: 329 entries, diseases to ankle weakness
dtypes: object(1), uint8(328)
memory usage: 79.1+ MB


Note the drastic decrease in memory usage (Almost 8 times!)

In [7]:
#number of rows and columns
data.shape

(246945, 329)

In [8]:
#number of unique diseases
len(data['diseases'].unique())

773

In [9]:
#check no. of missing values in each column
data.isnull().sum()

diseases                            0
anxiety and nervousness             0
depression                          0
shortness of breath                 0
depressive or psychotic symptoms    0
                                   ..
wrist lump or mass                  0
skin pain                           0
low urine output                    0
sore in nose                        0
ankle weakness                      0
Length: 329, dtype: int64

Our dataset comprises 773 diseases and  329 symptoms, with no missing values. As a result, there is no need for any missing value handling procedures.

# Spliting into training & test data

Seperating feature & target

In [10]:
X = data.drop(columns='diseases', axis=1)
y = data['diseases']

In [11]:
# To allow k-folds
class_counts = y.value_counts()
classes_to_remove = class_counts[class_counts <= 10].index

indices_to_remove = y[y.isin(classes_to_remove)].index

X_filtered = X.drop(indices_to_remove, axis=0)
y_filtered = y.drop(indices_to_remove, axis=0)

### Encoding the Categorical columns   
(not requried you can have the labels inputted directly)

In [115]:
# Y = LabelEncoder().fit_transform(y)

Checking encoded or not

In [13]:
y

0                 panic disorder
1                 panic disorder
2                 panic disorder
3                 panic disorder
4                 panic disorder
                   ...          
246940    open wound of the nose
246941    open wound of the nose
246942    open wound of the nose
246943    open wound of the nose
246944    open wound of the nose
Name: diseases, Length: 246945, dtype: object

In [28]:
y.unique()

array(['panic disorder', 'vocal cord polyp', 'turner syndrome',
       'cryptorchidism', 'poisoning due to ethylene glycol',
       'atrophic vaginitis', 'fracture of the hand',
       'cellulitis or abscess of mouth', 'eye alignment disorder',
       'headache after lumbar puncture', 'pyloric stenosis',
       'salivary gland disorder', 'osteochondrosis', 'injury to the knee',
       'metabolic disorder', 'vaginitis', 'sick sinus syndrome',
       'tinnitus of unknown cause', 'glaucoma', 'eating disorder',
       'transient ischemic attack', 'pyelonephritis',
       'rotator cuff injury', 'chronic pain disorder',
       'problem during pregnancy', 'liver cancer', 'atelectasis',
       'injury to the hand', 'choledocholithiasis', 'injury to the hip',
       'cirrhosis', 'thoracic aortic aneurysm', 'subdural hemorrhage',
       'diabetic retinopathy', 'fibromyalgia', 'ischemia of the bowel',
       'fetal alcohol syndrome', 'peritonitis', 'injury to the abdomen',
       'acute pancreati

Spliting the data into training data & test data

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X_filtered, y_filtered, test_size=0.2, random_state=42)

Shape of training data & test data

In [117]:
print(X_filtered.shape, X_train.shape, X_test.shape)

(246362, 328) (197089, 328) (49273, 328)


In [118]:
print(y_filtered.shape, y_train.shape, y_test.shape)

(246362,) (197089,) (49273,)


# Model training & evaluation

**Model Training :**
* Random Forest Classifier is well-suited
for this size of data, providing good performance with reasonable training times.
* It can effectively manage the feature space and offer insights into feature importance.


In [119]:
from joblib import load
# Change to true if you want to use a already saved model
if False: # Don't run the next 3 cells if you chose to run this
  model = load('./DRISHTI_DPS.joblib')

KeyboardInterrupt: 

In [57]:
#selecting randomforestclassifier as model
model = RandomForestClassifier(random_state=42)
# model = ExtraTreesClassifier(random_state=42)

To get the required hyper parameters for our data, let's perform RandomSearchCV (Not using GridSearch due to the shear size of our data)

In [24]:
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
import pandas as pd

param_dist = {
    'max_depth': [180, 190, 200],
    'n_estimators' : [90, 100],
    'min_samples_split': [5,6]
}

RS = RandomizedSearchCV(estimator=model, param_distributions=param_dist,
                        n_iter=10, scoring='accuracy', cv=4, n_jobs=3, verbose=2, random_state=42)

RS.fit(X_train, y_train)

print("Best parameters found: ", RS.best_params_)
print("Best cross-validation accuracy: ", RS.best_score_)

df = pd.DataFrame(RS.cv_results_)
df = df.sort_values("rank_test_score")
df.to_csv("randomForest.csv", index=False)  # Saving data for further analysis


Fitting 4 folds for each of 10 candidates, totalling 40 fits


KeyboardInterrupt: 

In [24]:
import numpy as np
from sklearn.model_selection import GridSearchCV  # Change here
import pandas as pd
from joblib import parallel_backend  

param_grid = {  # Change variable name to param_grid
    'n_estimators': [80, 90],
    'min_samples_split': [4, 5]
}

with parallel_backend('threading'):  # This enables threading
    GS = GridSearchCV(estimator=model, param_grid=param_grid,
                      scoring='accuracy', cv=4, n_jobs=3, verbose=2)
    
    GS.fit(X_train, y_train)

print("Best parameters found: ", GS.best_params_)
print("Best cross-validation accuracy: ", GS.best_score_)

df = pd.DataFrame(GS.cv_results_)  # Change GS to reflect the new object
df = df.sort_values("rank_test_score")
df.to_csv("randomForest.csv", index=False)  # Saving data for further analysis

Fitting 4 folds for each of 4 candidates, totalling 16 fits
[CV] END ...............min_samples_split=4, n_estimators=80; total time= 1.5min
[CV] END ...............min_samples_split=4, n_estimators=80; total time= 1.5min
[CV] END ...............min_samples_split=4, n_estimators=80; total time= 1.5min
[CV] END ...............min_samples_split=4, n_estimators=80; total time= 1.5min
[CV] END ...............min_samples_split=4, n_estimators=90; total time= 1.7min
[CV] END ...............min_samples_split=4, n_estimators=90; total time= 1.7min
[CV] END ...............min_samples_split=4, n_estimators=90; total time= 1.7min
[CV] END ...............min_samples_split=5, n_estimators=80; total time= 1.5min
[CV] END ...............min_samples_split=4, n_estimators=90; total time= 1.7min
[CV] END ...............min_samples_split=5, n_estimators=80; total time= 1.5min
[CV] END ...............min_samples_split=5, n_estimators=80; total time= 1.5min
[CV] END ...............min_samples_split=5, n_es

In [13]:
model = RandomForestClassifier(n_estimators= 90, min_samples_split= 5, min_samples_leaf= 20, max_depth= 200, bootstrap= False, random_state=42) # other params are the default

model.fit(X_train, y_train)

In [27]:
# Cross validation accuracy:
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(model, X_train, y_train, cv=4, scoring='accuracy')
print("Cross-validation accuracy scores:", cv_scores)
print("Mean cross-validation accuracy:", cv_scores.mean())


Cross-validation accuracy scores: [0.83163193 0.83235915 0.8325824  0.83093846]
Mean cross-validation accuracy: 0.8318779853064203


### Stratified K-Fold

In [23]:
from sklearn.model_selection import StratifiedKFold, cross_val_score

sk_folds = StratifiedKFold(n_splits = 5)
sk_scores = cross_val_score(model, X_train, y_train, cv=sk_folds)
print("Cross-validation accuracy scores:", sk_scores)
print("Mean cross-validation accuracy:", sk_scores.mean())



KeyboardInterrupt: 

## Model Evaluation

Accuracy on training data

In [121]:
X_train_prediction = model.predict(X_train)

In [122]:
training_data_accuracy = accuracy_score(y_train, X_train_prediction)
print('Accuracy score of training data : ', training_data_accuracy)

Accuracy score of training data :  0.8556997092684016


Precision metric on training data


In [29]:
training_data_precision= precision_score(y_train, X_train_prediction,average='weighted')
print('Precision score of training data : ', training_data_precision)

Precision score of training data :  0.8646479313302363


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Recall score of training data


In [30]:
training_data_recall= recall_score(y_train, X_train_prediction,average='weighted')
print('recall score of training data : ', training_data_recall)

recall score of training data :  0.8588045921156533


In [45]:
y_train, X_train_prediction

(68668                                   gout
 121111                          heart attack
 103701    polycystic ovarian syndrome (pcos)
 210928                  seborrheic keratosis
 214341                             gallstone
                          ...                
 119879                        tooth disorder
 103694    polycystic ovarian syndrome (pcos)
 131932        otitis externa (swimmer's ear)
 146867                                 croup
 121958         oral thrush (yeast infection)
 Name: diseases, Length: 197556, dtype: object,
 array(['gout', 'heart attack', 'polycystic ovarian syndrome (pcos)', ...,
        "otitis externa (swimmer's ear)", 'croup',
        'oral thrush (yeast infection)'], dtype=object))

Confusion Matrix for training data

In [103]:
pd.crosstab(y_train, X_train_prediction,rownames=['Actual values'],colnames=['Predicted values'])

Predicted values,abdominal aortic aneurysm,abdominal hernia,abscess of nose,abscess of the pharynx,acariasis,achalasia,acne,actinic keratosis,acute bronchiolitis,acute bronchitis,...,vitamin b12 deficiency,vitamin d deficiency,vitreous degeneration,vocal cord polyp,volvulus,vulvar disorder,vulvodynia,white blood cell disease,whooping cough,yeast infection
Actual values,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
abdominal aortic aneurysm,109,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
abdominal hernia,0,300,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
abscess of nose,0,0,189,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
abscess of the lung,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
abscess of the pharynx,0,0,0,243,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
white blood cell disease,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,374,0,0
whooping cough,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,24,0
wilson disease,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
yeast infection,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,374


f1 score of training data

In [31]:
training_data_F1= f1_score(y_train, X_train_prediction,average='weighted')
print('F1_score of training data : ', training_data_F1)

F1_score of training data :  0.8546308887787893


In [14]:
# Saving the model
from joblib import dump

# Save the trained model
dump(model, './DRISHTI_DPS.joblib')

['./DRISHTI_DPS.joblib']

Accuracy on test data

In [62]:
X_test_prediction = model.predict(X_test)

In [63]:
test_data_accuracy = accuracy_score(y_test, X_test_prediction)
print('Accuracy score of test data : ', test_data_accuracy)

Accuracy score of test data :  0.840399407383354


Precision on test data


In [51]:
test_data_precision = precision_score(Y_test, X_test_prediction,average='weighted')
print('Precision score of test data : ', test_data_precision)

NameError: name 'Y_test' is not defined

### Confusion Matrix (Test)

In [None]:
pd.crosstab(Y_test, X_test_prediction,rownames=['Actual values'],colnames=['Predicted values'])

Predicted values,0,1,2,3,4,5,6,7,8,9,...,31,32,33,34,35,36,37,38,39,40
Actual values,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,29,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,20,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,30,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,28,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,21,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,30,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,26,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,23,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,29,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,21,...,0,0,0,0,0,0,0,0,0,0


Test data recall


In [None]:
test_data_recall= recall_score(Y_test, X_test_prediction,average='weighted')
print('recall score of testing data : ', test_data_recall)

recall score of testing data :  1.0


Test data f1 score

In [None]:
test_data_F1= f1_score(Y_test, X_test_prediction,average='weighted')
print('F1_score of test data : ', test_data_F1)

F1_score of test data :  1.0


# Single prediction testing

In [None]:
#load model
if False: # Change to true to load model
    with open('/content/model.pkl', 'rb') as file:
        model = pickle.load(file)

In [71]:
# Predict on the entire test set at once
predictions = model.predict(X_test)

success = (predictions == y_test).sum()

accuracy = success / len(X_test)
print(accuracy)

0.840399407383354


In [74]:
#test 1
print("Predicted Label : ",model.predict(X_test.iloc[0].values.reshape(1, -1)))
print("Actual Label : ",y_test)

Predicted Label :  ['nose disorder']
Actual Label :  95805                   nose disorder
126651           arthritis of the hip
91262                   alcohol abuse
203690               pleural effusion
69356                    otitis media
                     ...             
134440                     presbyopia
246478    conjunctivitis due to virus
19968         vaginal yeast infection
50067                bipolar disorder
215860             ulcerative colitis
Name: diseases, Length: 49273, dtype: object




In [123]:
#test 2
print("Predicted Label : ",model.predict(X_test.iloc[1].values.reshape(1, -1)))
print("Actual Label : ",y_test[126651])



Predicted Label :  ['arthritis of the hip']
Actual Label :  arthritis of the hip




# Logic for recommendations

Reference symptoms & disease dictionary

In [None]:
symptoms_dict = {'itching': 0, 'skin_rash': 1, 'nodal_skin_eruptions': 2, 'continuous_sneezing': 3,
                 'shivering': 4, 'chills': 5, 'joint_pain': 6, 'stomach_pain': 7, 'acidity': 8,
                 'ulcers_on_tongue': 9, 'muscle_wasting': 10, 'vomiting': 11, 'burning_micturition': 12,
                 'spotting_ urination': 13, 'fatigue': 14, 'weight_gain': 15, 'anxiety': 16, 'cold_hands_and_feets': 17,
                 'mood_swings': 18, 'weight_loss': 19, 'restlessness': 20, 'lethargy': 21, 'patches_in_throat': 22,
                 'irregular_sugar_level': 23, 'cough': 24, 'high_fever': 25, 'sunken_eyes': 26, 'breathlessness': 27,
                 'sweating': 28, 'dehydration': 29, 'indigestion': 30, 'headache': 31, 'yellowish_skin': 32, 'dark_urine': 33,
                 'nausea': 34, 'loss_of_appetite': 35, 'pain_behind_the_eyes': 36, 'back_pain': 37, 'constipation': 38,
                 'abdominal_pain': 39, 'diarrhoea': 40, 'mild_fever': 41, 'yellow_urine': 42, 'yellowing_of_eyes': 43,
                 'acute_liver_failure': 44, 'fluid_overload': 45, 'swelling_of_stomach': 46, 'swelled_lymph_nodes': 47,
                 'malaise': 48, 'blurred_and_distorted_vision': 49, 'phlegm': 50, 'throat_irritation': 51, 'redness_of_eyes': 52,
                 'sinus_pressure': 53, 'runny_nose': 54, 'congestion': 55, 'chest_pain': 56, 'weakness_in_limbs': 57,
                 'fast_heart_rate': 58, 'pain_during_bowel_movements': 59, 'pain_in_anal_region': 60, 'bloody_stool': 61,
                 'irritation_in_anus': 62, 'neck_pain': 63, 'dizziness': 64, 'cramps': 65, 'bruising': 66, 'obesity': 67,
                 'swollen_legs': 68, 'swollen_blood_vessels': 69, 'puffy_face_and_eyes': 70, 'enlarged_thyroid': 71,
                 'brittle_nails': 72, 'swollen_extremeties': 73, 'excessive_hunger': 74, 'extra_marital_contacts': 75,
                 'drying_and_tingling_lips': 76, 'slurred_speech': 77, 'knee_pain': 78, 'hip_joint_pain': 79, 'muscle_weakness': 80,
                 'stiff_neck': 81, 'swelling_joints': 82, 'movement_stiffness': 83, 'spinning_movements': 84, 'loss_of_balance': 85,
                 'unsteadiness': 86, 'weakness_of_one_body_side': 87, 'loss_of_smell': 88, 'bladder_discomfort': 89,
                 'foul_smell_of urine': 90, 'continuous_feel_of_urine': 91, 'passage_of_gases': 92, 'internal_itching': 93,
                 'toxic_look_(typhos)': 94, 'depression': 95, 'irritability': 96, 'muscle_pain': 97, 'altered_sensorium': 98,
                 'red_spots_over_body': 99, 'belly_pain': 100, 'abnormal_menstruation': 101, 'dischromic _patches': 102,
                 'watering_from_eyes': 103, 'increased_appetite': 104, 'polyuria': 105, 'family_history': 106, 'mucoid_sputum': 107,
                 'rusty_sputum': 108, 'lack_of_concentration': 109, 'visual_disturbances': 110, 'receiving_blood_transfusion': 111,
                 'receiving_unsterile_injections': 112, 'coma': 113, 'stomach_bleeding': 114, 'distention_of_abdomen': 115,
                 'history_of_alcohol_consumption': 116, 'fluid_overload.1': 117, 'blood_in_sputum': 118, 'prominent_veins_on_calf': 119,
                 'palpitations': 120, 'painful_walking': 121, 'pus_filled_pimples': 122, 'blackheads': 123, 'scurring': 124,
                 'skin_peeling': 125, 'silver_like_dusting': 126, 'small_dents_in_nails': 127, 'inflammatory_nails': 128, 'blister': 129,
                 'red_sore_around_nose': 130, 'yellow_crust_ooze': 131}

diseases_list = {14:'Fungal Infection',3: 'Allergy',15: 'GERD',8: 'Chronic Cholestasis',
       13: 'Drug Reaction',33: 'Peptic Ulcer Disease',0: 'AIDS',11: 'Diabetes ',
       16: 'Gastroenteritis',5: 'Bronchial Asthma',23: 'Hypertension ',30: 'Migraine',
       6: 'Cervical Spondylosis',32: 'Paralysis (brain hemorrhage)',28: 'Jaundice',
       29: 'Malaria',7: 'Chickenpox',10: 'Dengue',37: 'Typhoid',18: 'Hepatitis A',
       19: 'Hepatitis B',20: 'Hepatitis C',21: 'Hepatitis D',22: 'Hepatitis E',
       2: 'Alcoholic Hepatitis',36: 'Tuberculosis',9: 'Common Cold',34: 'Pneumonia',
       12: 'Dimorphic Hemmorhoids (piles)',17: 'Heart Attack',39: 'Varicose Veins',
       26: 'Hypothyroidism',24: 'Hyperthyroidism',25: 'Hypoglycemia',
       31: 'Osteoarthritis',4: 'Arthritis',40: 'Vertigo',1: 'Acne',
       38: 'Urinary Tract Infection',35: 'Psoriasis',27: 'Impetigo'}


**Disease Prediction Function**

In [20]:
column_names = ','.join(X.columns)
with open("./columns.txt", 'w') as file:
    file.write(column_names)

In [23]:
symptoms_list = X.columns
symptom_dict = {symptom: index for index, symptom in enumerate(symptoms_list)}

symptom_dict

{'anxiety and nervousness': 0,
 'depression': 1,
 'shortness of breath': 2,
 'depressive or psychotic symptoms': 3,
 'sharp chest pain': 4,
 'dizziness': 5,
 'insomnia': 6,
 'abnormal involuntary movements': 7,
 'chest tightness': 8,
 'palpitations': 9,
 'irregular heartbeat': 10,
 'breathing fast': 11,
 'hoarse voice': 12,
 'sore throat': 13,
 'difficulty speaking': 14,
 'cough': 15,
 'nasal congestion': 16,
 'throat swelling': 17,
 'diminished hearing': 18,
 'lump in throat': 19,
 'throat feels tight': 20,
 'difficulty in swallowing': 21,
 'skin swelling': 22,
 'retention of urine': 23,
 'groin mass': 24,
 'leg pain': 25,
 'hip pain': 26,
 'suprapubic pain': 27,
 'blood in stool': 28,
 'lack of growth': 29,
 'emotional symptoms': 30,
 'elbow weakness': 31,
 'back weakness': 32,
 'symptoms of the scrotum and testes': 33,
 'swelling of scrotum': 34,
 'pain in testicles': 35,
 'flatulence': 36,
 'pus draining from ear': 37,
 'jaundice': 38,
 'mass in scrotum': 39,
 'white discharge from

In [26]:
import numpy as np

def get_predicted_value(patient_symptoms, model):
    # Initialize input_vector with zeros
    input_vector = np.zeros(len(symptom_dict.keys()))

    # Populate input_vector based on patient_symptoms
    for userSymptom in patient_symptoms:
        input_vector[symptom_dict[userSymptom]] = 1

    print(input_vector.sum())
    predicted_disease = model.predict([input_vector])[0]
    return predicted_disease


**Testing Prediction Function**

In [27]:
heartburn_related_symptoms = [
    'burning chest pain'
]
get_predicted_value(heartburn_related_symptoms, model)

1.0




'venous insufficiency'