## Load the Dataset and Tools

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the data from the csv file
dataset = pd.read_csv("datasets/Training.csv")

In [3]:
# reading the database
dataset.head()

# Multiclass class Classification Problem

Unnamed: 0,itching,skin_rash,nodal_skin_eruptions,continuous_sneezing,shivering,chills,joint_pain,stomach_pain,acidity,ulcers_on_tongue,...,blackheads,scurring,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,blister,red_sore_around_nose,yellow_crust_ooze,prognosis
0,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
1,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
2,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
3,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
4,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection


In [4]:
dataset.shape

(4920, 133)

In [5]:
# finding all the unique diseases in the prognosis column
dataset['prognosis'].unique()

array(['Fungal infection', 'Allergy', 'GERD', 'Chronic cholestasis',
       'Drug Reaction', 'Peptic ulcer diseae', 'AIDS', 'Diabetes ',
       'Gastroenteritis', 'Bronchial Asthma', 'Hypertension ', 'Migraine',
       'Cervical spondylosis', 'Paralysis (brain hemorrhage)', 'Jaundice',
       'Malaria', 'Chicken pox', 'Dengue', 'Typhoid', 'hepatitis A',
       'Hepatitis B', 'Hepatitis C', 'Hepatitis D', 'Hepatitis E',
       'Alcoholic hepatitis', 'Tuberculosis', 'Common Cold', 'Pneumonia',
       'Dimorphic hemmorhoids(piles)', 'Heart attack', 'Varicose veins',
       'Hypothyroidism', 'Hyperthyroidism', 'Hypoglycemia',
       'Osteoarthristis', 'Arthritis',
       '(vertigo) Paroymsal  Positional Vertigo', 'Acne',
       'Urinary tract infection', 'Psoriasis', 'Impetigo'], dtype=object)

In [6]:
# counting the number of diseases
len(dataset['prognosis'].unique())

41

In [7]:
# all the symptoms are input column and the prognosis is the output column



# Train Test Split

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder 
# labelencoder would be used to encode the last column 'prognosis'
# will divide the columns into X and y 
# X would take the first 132 columns and the output y would take the last column for output

In [9]:
# for X will drop the last column of dataset
X = dataset.drop("prognosis",axis=1)
# axis=1 will delete it columnwise
# axis=0 will delete it rowwise
# X.head()
# and the prognosis column will go to y
y = dataset['prognosis']

In [10]:
# print X and y
# X

In [11]:
# y

In [12]:
le = LabelEncoder()
le.fit(y)
Y = le.transform(y)
# convert the string array of y into an integer array
# we encoded the string to integer

In [13]:
Y

array([15, 15, 15, ..., 38, 35, 27])

In [14]:
X_train,X_test,y_train,y_test = train_test_split(X,Y,test_size = 0.3, random_state = 20)
# will take all the input variable values of X i.e the 4920 rows and consider 70% from it for X_train and rest for the X_test
# same for y

In [15]:
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((3444, 132), (1476, 132), (3444,), (1476,))

## Training Top Models

In [16]:
# now we will train the top 5 multi classification models
# we need to import the algorithms

In [17]:
from sklearn.datasets import make_classification   # this is optional
from sklearn.svm import SVC                        # our first algorithm (Support Vector Machine)
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier # these 2 give good results
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB

# we are importing multiple algorithms so that we can see the accuracy of each model and select the best one
from sklearn.metrics import accuracy_score, confusion_matrix  
# accuracy_score -> hr model ka score
# confusion matrix -> how good or wrong the predictions are

# if we train all these individually it will consume a lot of time
# so instead we will create a dictionary to store the models

# create dictionary 
# dictionary contains key value pair
# key would be the model name
# and value is the 
models ={
    "SVC":SVC(kernel = 'linear'), # will take the linear kernel # this will create the hyperplane between 2 classes
    "RandomForest": RandomForestClassifier(n_estimators=100,random_state=42),
    "GradientBoosting":GradientBoostingClassifier(n_estimators=100,random_state=42),
    "KNeighbors":KNeighborsClassifier(n_neighbors=5),
    "MultinomialNB":MultinomialNB() #multinomial Naive Bayes
    # model_name : model
}
for model_name, model in models.items():
    print(model_name,":",model)
    # train model
    model.fit(X_train, y_train) # will take the 70% data of input and output for training
    
    #test model
    predictions = model.predict(X_test)
    
    #calculate accuracy
    accuracy = accuracy_score(y_test,predictions) # wo data jo abhi tak model ne nhi dekha i.e. y_test and jo model ne predict kiya hai
    
    #calculate confusion matrix
    cm = confusion_matrix(y_test,predictions)
    
    print(f"{model_name} accuracy : {accuracy}") 
    #print (f"()")  here f is used for formatting
    
    print(f"{model_name} Confusion Matrix:")
    #will need to convert cm from array to string
    
    print(np.array2string(cm,separator=', '))
    
    # the above for loop will run 5 times for each model
    
    # in confusion matrix is the values above and below the diagnol are 0 means our model is going perfectly with no error

SVC : SVC(kernel='linear')
SVC accuracy : 1.0
SVC Confusion Matrix:
[[40,  0,  0, ...,  0,  0,  0],
 [ 0, 43,  0, ...,  0,  0,  0],
 [ 0,  0, 28, ...,  0,  0,  0],
 ...,
 [ 0,  0,  0, ..., 34,  0,  0],
 [ 0,  0,  0, ...,  0, 41,  0],
 [ 0,  0,  0, ...,  0,  0, 31]]
RandomForest : RandomForestClassifier(random_state=42)
RandomForest accuracy : 1.0
RandomForest Confusion Matrix:
[[40,  0,  0, ...,  0,  0,  0],
 [ 0, 43,  0, ...,  0,  0,  0],
 [ 0,  0, 28, ...,  0,  0,  0],
 ...,
 [ 0,  0,  0, ..., 34,  0,  0],
 [ 0,  0,  0, ...,  0, 41,  0],
 [ 0,  0,  0, ...,  0,  0, 31]]
GradientBoosting : GradientBoostingClassifier(random_state=42)
GradientBoosting accuracy : 1.0
GradientBoosting Confusion Matrix:
[[40,  0,  0, ...,  0,  0,  0],
 [ 0, 43,  0, ...,  0,  0,  0],
 [ 0,  0, 28, ...,  0,  0,  0],
 ...,
 [ 0,  0,  0, ..., 34,  0,  0],
 [ 0,  0,  0, ...,  0, 41,  0],
 [ 0,  0,  0, ...,  0,  0, 31]]
KNeighbors : KNeighborsClassifier()
KNeighbors accuracy : 1.0
KNeighbors Confusion Matrix:
[[4

## Single Prediction

In [18]:
# All the models above are giving 100% accuracy for the test data
# so to choose the best model we need to check it on a single input 
# i.e. we need to make a single prediction

In [19]:
svc = SVC(kernel='linear')
svc.fit(X_train,y_train)
ypred = svc.predict(X_test)
accuracy_score(y_test,ypred)

1.0

In [20]:
# now we need to save the model
import pickle
pickle.dump(svc,open("models/svc.pkl",'wb')) # open me proper name and give binary mode
# while saving we give the write binary mode
# while loading the model we use the 'rb' i.e. the read binary (this we will see in the Pycharm in the production level)


In [21]:
# load the model to use it 
svc = pickle.load(open("models/svc.pkl",'rb'))

In [22]:
# X_test
# X_test.iloc[0] # gives the first record of X_test
# X_test.iloc[0].values  # array form 1D
#will convert this array to a 2d form

#test1
print("Predicted Label: ",svc.predict(X_test.iloc[0].values.reshape(1,-1)))
print("Actual Label: ", y_test[0])

Predicted Label:  [40]
Actual Label:  40




In [23]:
y_test


array([40,  6, 16, ...,  8, 10, 33])

In [24]:
# test2
print("Predicted Label: ",svc.predict(X_test.iloc[10].values.reshape(1,-1)))
print("Actual Label: ",y_test[10])

Predicted Label:  [20]
Actual Label:  20




## Recommendation System and Prediction

## Load database and use logic for recommendations

In [25]:
sym_des = pd.read_csv('Datasets/symtoms_df.csv')
precautions = pd.read_csv('Datasets/precautions_df.csv')
workout = pd.read_csv('Datasets/workout_df.csv')
description = pd.read_csv('Datasets/description.csv')
medications = pd.read_csv('Datasets/medications.csv')
diets = pd.read_csv('Datasets/diets.csv')

In [26]:
# sym_des

In [27]:
# precautions

In [28]:
# workout

In [29]:
# description

In [30]:
# medications

In [31]:
# dietsx

In [32]:
#============================================================
# custome and helping functions
#==========================helper funtions================
def helper(dis):
    desc = description[description['Disease'] == predicted_disease]['Description']
    desc = " ".join([w for w in desc])

    pre = precautions[precautions['Disease'] == dis][['Precaution_1', 'Precaution_2', 'Precaution_3', 'Precaution_4']]
    pre = [col for col in pre.values]

    med = medications[medications['Disease'] == dis]['Medication']
    med = [med for med in med.values]

    die = diets[diets['Disease'] == dis]['Diet']
    die = [die for die in die.values]

    wrkout = workout[workout['disease'] == dis] ['workout']


    return desc,pre,med,die,wrkout


# will create a function that will predict the disease
# model prediction function

symptoms_dict = {'itching': 0, 'skin_rash': 1, 'nodal_skin_eruptions': 2, 'continuous_sneezing': 3, 'shivering': 4, 'chills': 5, 'joint_pain': 6, 'stomach_pain': 7, 'acidity': 8, 'ulcers_on_tongue': 9, 'muscle_wasting': 10, 'vomiting': 11, 'burning_micturition': 12, 'spotting_ urination': 13, 'fatigue': 14, 'weight_gain': 15, 'anxiety': 16, 'cold_hands_and_feets': 17, 'mood_swings': 18, 'weight_loss': 19, 'restlessness': 20, 'lethargy': 21, 'patches_in_throat': 22, 'irregular_sugar_level': 23, 'cough': 24, 'high_fever': 25, 'sunken_eyes': 26, 'breathlessness': 27, 'sweating': 28, 'dehydration': 29, 'indigestion': 30, 'headache': 31, 'yellowish_skin': 32, 'dark_urine': 33, 'nausea': 34, 'loss_of_appetite': 35, 'pain_behind_the_eyes': 36, 'back_pain': 37, 'constipation': 38, 'abdominal_pain': 39, 'diarrhoea': 40, 'mild_fever': 41, 'yellow_urine': 42, 'yellowing_of_eyes': 43, 'acute_liver_failure': 44, 'fluid_overload': 45, 'swelling_of_stomach': 46, 'swelled_lymph_nodes': 47, 'malaise': 48, 'blurred_and_distorted_vision': 49, 'phlegm': 50, 'throat_irritation': 51, 'redness_of_eyes': 52, 'sinus_pressure': 53, 'runny_nose': 54, 'congestion': 55, 'chest_pain': 56, 'weakness_in_limbs': 57, 'fast_heart_rate': 58, 'pain_during_bowel_movements': 59, 'pain_in_anal_region': 60, 'bloody_stool': 61, 'irritation_in_anus': 62, 'neck_pain': 63, 'dizziness': 64, 'cramps': 65, 'bruising': 66, 'obesity': 67, 'swollen_legs': 68, 'swollen_blood_vessels': 69, 'puffy_face_and_eyes': 70, 'enlarged_thyroid': 71, 'brittle_nails': 72, 'swollen_extremeties': 73, 'excessive_hunger': 74, 'extra_marital_contacts': 75, 'drying_and_tingling_lips': 76, 'slurred_speech': 77, 'knee_pain': 78, 'hip_joint_pain': 79, 'muscle_weakness': 80, 'stiff_neck': 81, 'swelling_joints': 82, 'movement_stiffness': 83, 'spinning_movements': 84, 'loss_of_balance': 85, 'unsteadiness': 86, 'weakness_of_one_body_side': 87, 'loss_of_smell': 88, 'bladder_discomfort': 89, 'foul_smell_of urine': 90, 'continuous_feel_of_urine': 91, 'passage_of_gases': 92, 'internal_itching': 93, 'toxic_look_(typhos)': 94, 'depression': 95, 'irritability': 96, 'muscle_pain': 97, 'altered_sensorium': 98, 'red_spots_over_body': 99, 'belly_pain': 100, 'abnormal_menstruation': 101, 'dischromic _patches': 102, 'watering_from_eyes': 103, 'increased_appetite': 104, 'polyuria': 105, 'family_history': 106, 'mucoid_sputum': 107, 'rusty_sputum': 108, 'lack_of_concentration': 109, 'visual_disturbances': 110, 'receiving_blood_transfusion': 111, 'receiving_unsterile_injections': 112, 'coma': 113, 'stomach_bleeding': 114, 'distention_of_abdomen': 115, 'history_of_alcohol_consumption': 116, 'fluid_overload.1': 117, 'blood_in_sputum': 118, 'prominent_veins_on_calf': 119, 'palpitations': 120, 'painful_walking': 121, 'pus_filled_pimples': 122, 'blackheads': 123, 'scurring': 124, 'skin_peeling': 125, 'silver_like_dusting': 126, 'small_dents_in_nails': 127, 'inflammatory_nails': 128, 'blister': 129, 'red_sore_around_nose': 130, 'yellow_crust_ooze': 131}
diseases_list = {15: 'Fungal infection', 4: 'Allergy', 16: 'GERD', 9: 'Chronic cholestasis', 14: 'Drug Reaction', 33: 'Peptic ulcer diseae', 1: 'AIDS', 12: 'Diabetes ', 17: 'Gastroenteritis', 6: 'Bronchial Asthma', 23: 'Hypertension ', 30: 'Migraine', 7: 'Cervical spondylosis', 32: 'Paralysis (brain hemorrhage)', 28: 'Jaundice', 29: 'Malaria', 8: 'Chicken pox', 11: 'Dengue', 37: 'Typhoid', 40: 'hepatitis A', 19: 'Hepatitis B', 20: 'Hepatitis C', 21: 'Hepatitis D', 22: 'Hepatitis E', 3: 'Alcoholic hepatitis', 36: 'Tuberculosis', 10: 'Common Cold', 34: 'Pneumonia', 13: 'Dimorphic hemmorhoids(piles)', 18: 'Heart attack', 39: 'Varicose veins', 26: 'Hypothyroidism', 24: 'Hyperthyroidism', 25: 'Hypoglycemia', 31: 'Osteoarthristis', 5: 'Arthritis', 0: '(vertigo) Paroymsal  Positional Vertigo', 2: 'Acne', 38: 'Urinary tract infection', 35: 'Psoriasis', 27: 'Impetigo'}

# symptoms
# prognosis using label encoding

def get_predicted_value(patient_symptoms):
    input_vector = np.zeros(len(symptoms_dict)) # will create array containing all the values as 0 of sizeof(symptoms_dict)  (132 empty zeros)
    
    for item in patient_symptoms: 
        input_vector[symptoms_dict[item]] = 1   # will take input of the symptoms one by one and store it in the array input_vector
        
        # first we had a vector containing 0's and the length of the vector was 132 which was exactly equal to number of symptoms
        # whenever the user inputs a symptom we will assign the value as 1 for that particular symptom
        # likewise doing so we can easily predict the disease
    return diseases_list[svc.predict([input_vector])[0]] # [0] is to discard the [] at starting and only get the pure values
        

In [33]:
# Test 1
# Split the user's input into a list of symptoms (assuming they are comma-separated) # itching,skin_rash,nodal_skin_eruptions
symptoms = input("Enter your symptoms.......")
user_symptoms = [s.strip() for s in symptoms.split(',')]
# Remove any extra characters, if any
user_symptoms = [symptom.strip("[]' ") for symptom in user_symptoms]
predicted_disease = get_predicted_value(user_symptoms)

desc, pre, med, die, wrkout = helper(predicted_disease)

print("=================predicted disease============")
print(predicted_disease)
print("=================description==================")
print(desc)
print("=================precautions==================")
i = 1
for p_i in pre[0]:
    print(i, ": ", p_i)
    i += 1

print("=================medications==================")
for m_i in med:
    print(i, ": ", m_i)
    i += 1

print("=================workout==================")
for w_i in wrkout:
    print(i, ": ", w_i)
    i += 1

print("=================diets==================")
for d_i in die:
    print(i, ": ", d_i)
    i += 1

Enter your symptoms.......itching,skin_rash
Fungal infection
Fungal infection is a common skin condition caused by fungi.
1 :  bath twice
2 :  use detol or neem in bathing water
3 :  keep infected area dry
4 :  use clean cloths
5 :  ['Antifungal Cream', 'Fluconazole', 'Terbinafine', 'Clotrimazole', 'Ketoconazole']
6 :  Avoid sugary foods
7 :  Consume probiotics
8 :  Increase intake of garlic
9 :  Include yogurt in diet
10 :  Limit processed foods
11 :  Stay hydrated
12 :  Consume green tea
13 :  Eat foods rich in zinc
14 :  Include turmeric in diet
15 :  Eat fruits and vegetables
16 :  ['Antifungal Diet', 'Probiotics', 'Garlic', 'Coconut oil', 'Turmeric']




In [34]:
# predicted_disease
descr = description[description['Disease'] == predicted_disease]['Description']
# descr
# descr = " ".join([w for w in descr])

In [35]:
# let's use pycharm flask app
# but install this version in pycharm
import sklearn
print(sklearn.__version__)

1.2.1
