### *Imports*

In [1]:
# imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score, classification_report
import warnings
warnings.filterwarnings('ignore')

### *Data Preparation*

In [2]:
# loading dataset
df = pd.read_csv('/content/disease_dataset.csv')
df.head()

Unnamed: 0,itching,skin_rash,nodal_skin_eruptions,continuous_sneezing,shivering,chills,joint_pain,stomach_pain,acidity,ulcers_on_tongue,...,blackheads,scurring,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,blister,red_sore_around_nose,yellow_crust_ooze,disease
0,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
1,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
2,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
3,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
4,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection


In [3]:
# checking the shape
df.shape

(4961, 133)

In [4]:
# checking for missing values
df.dropna().shape

(4961, 133)

In [5]:
# converting target values to lower case
df['disease'] = df['disease'].str.lower()

In [6]:
# removing unnecessary spaces  
df['disease'] = df['disease'].str.strip()

In [7]:
# checking if the dataset is imbalanced 
df['disease'].value_counts()

fungal infection                           121
hepatitis c                                121
hepatitis e                                121
alcoholic hepatitis                        121
tuberculosis                               121
common cold                                121
pneumonia                                  121
dimorphic hemmorhoids(piles)               121
heart attack                               121
varicose veins                             121
hypothyroidism                             121
hyperthyroidism                            121
hypoglycemia                               121
osteoarthristis                            121
arthritis                                  121
(vertigo) paroymsal  positional vertigo    121
acne                                       121
urinary tract infection                    121
psoriasis                                  121
hepatitis d                                121
hepatitis b                                121
allergy      

In [8]:
# checking the diseases
df['disease'].unique()

array(['fungal infection', 'allergy', 'gerd', 'chronic cholestasis',
       'drug reaction', 'peptic ulcer diseae', 'aids', 'diabetes',
       'gastroenteritis', 'bronchial asthma', 'hypertension', 'migraine',
       'cervical spondylosis', 'paralysis (brain hemorrhage)', 'jaundice',
       'malaria', 'chicken pox', 'dengue', 'typhoid', 'hepatitis a',
       'hepatitis b', 'hepatitis c', 'hepatitis d', 'hepatitis e',
       'alcoholic hepatitis', 'tuberculosis', 'common cold', 'pneumonia',
       'dimorphic hemmorhoids(piles)', 'heart attack', 'varicose veins',
       'hypothyroidism', 'hyperthyroidism', 'hypoglycemia',
       'osteoarthristis', 'arthritis',
       '(vertigo) paroymsal  positional vertigo', 'acne',
       'urinary tract infection', 'psoriasis', 'impetigo'], dtype=object)

In [9]:
# checking the number of diseases
df['disease'].nunique()

41

In [10]:
# fetching data for common diseases
filt = ['allergy', 'common cold', 'malaria', 'chicken pox', 'dengue', 'typhoid', 'pneumonia', 'heart attack', 'diabetes', 
        'acne', 'tuberculosis', 'fungal infection', 'drug reaction', 'hypertension', 'migraine', 'jaundice', 'gerd', 'aids']

In [11]:
# applying the filter to fetch the required data
df = df.loc[df['disease'].isin(filt)]

In [12]:
# checking the shape
df.shape

(2178, 133)

In [13]:
# encoding the target feature
encoder = LabelEncoder()
df['class'] = encoder.fit_transform(df['disease'])

In [14]:
# checking the total number of common diseases
df['disease'].nunique()

18

In [15]:
# checking the total number of classes newly created
df['class'].nunique()

18

### *Model Building*

In [16]:
# dividing the data into X & y
X = df.drop(columns=['disease', 'class'], axis=1)
y = df['class']

In [17]:
# splitting the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=90)

In [18]:
# checking the shape of splitted sets
print("X_train Shape: ", X_train.shape)
print("X_test Shape: ", X_test.shape)
print("y_train Shape: ", y_train.shape)
print("y_test Shape: ", y_test.shape)

X_train Shape:  (1742, 132)
X_test Shape:  (436, 132)
y_train Shape:  (1742,)
y_test Shape:  (436,)


In [19]:
# creating an object for RFC
rf = RandomForestClassifier(random_state=10)
rf.fit(X_train, y_train)

RandomForestClassifier(random_state=10)

In [20]:
# feature selection with RF
imp_features = pd.DataFrame(rf.feature_importances_* 100, X_train.columns, columns=['importance'])
imp_features.sort_values(by='importance', ascending=False, inplace=True)
imp_features

Unnamed: 0,importance
dischromic _patches,2.360297
shivering,2.308106
dark_urine,2.164143
loss_of_balance,2.147337
nodal_skin_eruptions,2.053836
...,...
drying_and_tingling_lips,0.000000
swollen_extremeties,0.000000
brittle_nails,0.000000
enlarged_thyroid,0.000000


In [21]:
# finding the count of features with 0% importance 
zeros = np.array(imp_features[imp_features['importance'] == 0.000000].index)
print("There are {} features with 0% importance".format(len(zeros)))

There are 57 features with 0% importance


In [22]:
# selecting 75 features after finding out 57 features have 0% importance
select = SelectFromModel(estimator=rf, max_features=75)
select.fit(X_train, y_train)

SelectFromModel(estimator=RandomForestClassifier(random_state=10),
                max_features=75)

In [23]:
# transforming the data
X_train = select.transform(X_train)
X_test = select.transform(X_test)

In [24]:
# training a model
rf.fit(X_train, y_train)

RandomForestClassifier(random_state=10)

In [25]:
# making predictions
y_pred = rf.predict(X_test)

In [26]:
 # actual data
y_test.ravel()[:20]

array([10, 13,  6, 11, 12,  5,  2, 14,  3, 13, 13, 12, 17,  0, 17, 15,  0,
       16, 12,  2])

In [27]:
# predicted data
y_pred.ravel()[:20]

array([10, 13,  6, 11, 12,  5,  2, 14,  3, 13, 13, 12, 17,  0, 17, 15,  0,
       16, 12,  2])

In [28]:
# checking the accuracy
print("Accuracy: ", (accuracy_score(y_test, y_pred) * 100).round(2))

Accuracy:  100.0


In [29]:
# classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        31
           1       1.00      1.00      1.00        23
           2       1.00      1.00      1.00        20
           3       1.00      1.00      1.00        37
           4       1.00      1.00      1.00        16
           5       1.00      1.00      1.00        26
           6       1.00      1.00      1.00        18
           7       1.00      1.00      1.00        22
           8       1.00      1.00      1.00        20
           9       1.00      1.00      1.00        27
          10       1.00      1.00      1.00        31
          11       1.00      1.00      1.00        23
          12       1.00      1.00      1.00        20
          13       1.00      1.00      1.00        26
          14       1.00      1.00      1.00        21
          15       1.00      1.00      1.00        23
          16       1.00      1.00      1.00        28
          17       1.00    