# Basic Boruta Check

In [1]:
import pandas as pd
import numpy as np
from boruta import BorutaPy
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve 


import copy
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

In [2]:
# !pip install Boruta


In [3]:
# Groups of column names; `droped_col` is simply all three groups concatenated.
unneeded_data = [
    'Place_of_isolation',
    'Patient_In',
    'Patient_out',
]
catigorical_data = [
    'Blood_Group',
    'CONTROL_blood_group',
    'Risk_Factor',
]
labels = [
    'Severeity',
    'Outcome',
]
droped_col = [*unneeded_data, *catigorical_data, *labels]

In [4]:
# Load the raw table; header=1 takes the column names from the second line of the file.
df = pd.read_csv("Covid-19_dataset.csv",header=1)

# One-hot encode gender into a single indicator column: set for 'MALE', unset otherwise.
df['is_Male'] = pd.get_dummies(df['Gender'])['MALE']

# Names of the dummy columns generated for the blood-group fields (used further down to
# build the frame without blood data).
blood_types = pd.get_dummies(df['Blood_Group']).columns
blood_group_columns = pd.get_dummies(df[['Blood_Group','CONTROL_blood_group']]).columns

# Categorical columns that are replaced by their one-hot encoding.
encoded_cols = ['Blood_Group','CONTROL_blood_group','Risk_Factor','Infection_type']
# NOTE(review): the resulting frame contains both 'Infection_type_Asymptomatic' and
# 'Infection_type_Asymtomatic' (likewise 'Symptomatic'/'Symtomatic'), i.e. misspelled
# category values in the raw data become separate dummy columns. Consider normalising
# the spelling before encoding; left unchanged here so existing column names stay valid.
df = pd.concat([df, pd.get_dummies(df[encoded_cols])], axis=1)

# Drop the now-encoded originals together with the columns that are not used as features.
df = df.drop(encoded_cols + ['Gender','Place_of_isolation','Patient_In','Patient_out'],axis=1)

columns = df.columns

# Getting rid of blank values: keep only rows where no column holds the single-space
# placeholder, then drop rows with missing values. Done once for the whole frame instead
# of once per column.
print("before",df.shape)
df = df[(df != " ").all(axis=1)].dropna()

# Split the two label columns off the feature table (taken after filtering so the rows
# stay aligned with `df`).
label_Severeity = df['Severeity'] #Severeity vs Outcome
label_Outcome = df['Outcome']
df = df.drop(['Severeity','Outcome'],axis=1)

# Strings to numeric, column by column. errors='coerce' turns anything unparsable into
# NaN; this runs after dropna(), so such NaNs would remain in `df`.
df = df.apply(pd.to_numeric, errors='coerce')
all_columns = df.columns

print("after",df.shape)

# `drop` already returns a new frame, so no extra copy is needed.
df_no_blood_data = df.drop(blood_group_columns,axis=1)
df_no_blood_data


before (5668, 39)
after (5641, 37)


Unnamed: 0,Age,Incubation period (days),Fever,Chills,Cough,Dyspnea,Anosmia_Ageusia,Loss_of_appetite,Asthenia,Headache,...,Cyanosis,Rhinorrhea,Sore_throat,Diarrhea,Nausea_vomiting,is_Male,Infection_type_Asymptomatic,Infection_type_Asymtomatic,Infection_type_Symptomatic,Infection_type_Symtomatic
0,61.0,14,1,1,1,1,1,1,1,1,...,0,0,1,1,0,1,0,0,1,0
1,49.0,28,1,0,1,0,1,1,1,1,...,0,0,1,1,1,1,0,0,1,0
2,88.0,1,1,1,1,1,0,0,1,1,...,1,0,1,0,0,1,0,0,1,0
3,56.0,17,1,0,1,0,1,1,1,1,...,0,1,0,1,0,1,0,0,1,0
4,41.0,1,0,0,1,0,1,1,1,1,...,0,1,1,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5663,65.0,14,1,0,1,1,1,1,1,1,...,0,1,0,0,1,1,0,0,1,0
5664,32.0,7,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,0,0,1,0
5665,28.0,4,1,1,1,1,1,1,0,1,...,1,1,1,1,1,1,0,0,1,0
5666,40.0,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,0,0,1,0


In [5]:
# X = df
# y = label_Severeity

def borutaSupportFeatures(X,y,verbose_level=1,print_fail=True,print_pass=True,tentive=False):
    """Run Boruta feature selection on ``X`` against ``y`` and return the selected column names.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table. Values are passed to Boruta via ``np.array(X)``; feature names
        are read from ``X.columns``.
    y : array-like
        Target labels, passed to Boruta via ``np.array(y)``.
    verbose_level : int
        Forwarded to ``BorutaPy(verbose=...)``.
    print_fail : bool
        Print every feature whose ``support_`` flag is False, with its ranking.
    print_pass : bool
        Print every confirmed feature (``support_`` True), with its ranking.
        Only affects printing, not the returned list.
    tentive : bool
        Also print and return the features flagged in ``support_weak_``.

    Returns
    -------
    list
        Confirmed column names in column order, followed by the ``support_weak_``
        column names when ``tentive`` is True.
    """
    np.random.seed(seed=1)
    # Shallow, class-balanced random forest used by Boruta as the importance estimator.
    model = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5,random_state=1)

    feat_selector = BorutaPy(
            verbose=verbose_level,
            estimator=model,
            n_estimators='auto',
            max_iter=250  # number of iterations to perform
        )

    # train Boruta
    feat_selector.fit(np.array(X), np.array(y))

    names = list(X.columns)
    confirmed_mask = feat_selector.support_
    ranking = feat_selector.ranking_

    if print_fail:
        # print ranking for each feature that was not confirmed
        print("\n------Support and Ranking for each feature------")
        for name, is_confirmed, rank in zip(names, confirmed_mask, ranking):
            if not is_confirmed:
                print("Doesn't pass the test: ", name,
                      " - Ranking: ", rank)

    # Collect confirmed features regardless of print_pass, so that silencing the
    # output does not silently empty the result.
    support = []
    for name, is_confirmed, rank in zip(names, confirmed_mask, ranking):
        if is_confirmed:
            if print_pass:
                print("Passes the test: ", name,
                      " - Ranking: ", rank)
            support.append(name)

    if tentive:
        # Test the weak-support mask itself; testing support_ here would report the
        # confirmed features a second time and never the tentative ones.
        for name, is_tentative, rank in zip(names, feat_selector.support_weak_, ranking):
            if is_tentative:
                print("May pass the test: ", name,
                      " - Ranking: ", rank)
                support.append(name)

    return support
    


# Multiple iterations of Boruta, dropping the confirmed features before each subsequent iteration.

In [6]:
# Repeatedly run Boruta on the remaining candidate columns, removing whatever was
# selected in a round before starting the next one.
#   useful   - every confirmed feature, in the order the rounds found them
#   saved    - round number -> features confirmed in that round
#   tentive  - features picked up by the tentative pass
# A plain list (not a pandas Index) so that `use_features + tentive` concatenates.
use_features = list(df.columns)
iteration = 1
saved={}
useful = []
tentive = []
# Stop once the candidate pool is empty; Boruta cannot be fitted on zero columns.
while use_features:
    print("iteration: ",iteration)

    positive_features = borutaSupportFeatures(df[use_features],label_Severeity,verbose_level=1,print_fail=False)
    if positive_features:
        useful = useful + positive_features
        saved[iteration] = positive_features
        print(positive_features)
        # Filter in place of a set difference: keeps the column order stable between
        # kernel restarts, which a set of strings does not, so the seeded runs stay
        # reproducible.
        confirmed = set(positive_features)
        use_features = [name for name in use_features if name not in confirmed]
    else:
        print("No new features")
        temp = borutaSupportFeatures(df[use_features+tentive],label_Severeity,verbose_level=1,print_fail=False,tentive=True)
        # Everything this pass returns is recorded as tentative; keep only names not
        # seen before (dict.fromkeys drops repeats while preserving order).
        new_tentative = [name for name in dict.fromkeys(temp) if name not in tentive]
        if not new_tentative:
            break
        print("temp: ",temp)
        tentive = tentive + new_tentative
        # Take them out of the pool so the next round works on a strictly smaller set
        # and `use_features + tentive` never contains a column twice.
        use_features = [name for name in use_features if name not in new_tentative]
    iteration = iteration+1
    

iteration:  1
Iteration: 1 / 250
Iteration: 2 / 250
Iteration: 3 / 250
Iteration: 4 / 250
Iteration: 5 / 250
Iteration: 6 / 250
Iteration: 7 / 250
Iteration: 8 / 250


BorutaPy finished running.

Iteration: 	9 / 250
Confirmed: 	3
Tentative: 	0
Rejected: 	34
Passes the test:  Age  - Ranking:  1
Passes the test:   Incubation period (days)  - Ranking:  1
Passes the test:  Cyanosis  - Ranking:  1
['Age', ' Incubation period (days)', 'Cyanosis']
iteration:  2
Iteration: 1 / 250
Iteration: 2 / 250
Iteration: 3 / 250
Iteration: 4 / 250
Iteration: 5 / 250
Iteration: 6 / 250
Iteration: 7 / 250
Iteration: 8 / 250
Iteration: 9 / 250
Iteration: 10 / 250
Iteration: 11 / 250
Iteration: 12 / 250
Iteration: 13 / 250
Iteration: 14 / 250
Iteration: 15 / 250
Iteration: 16 / 250
Iteration: 17 / 250
Iteration: 18 / 250
Iteration: 19 / 250
Iteration: 20 / 250
Iteration: 21 / 250
Iteration: 22 / 250
Iteration: 23 / 250
Iteration: 24 / 250
Iteration: 25 / 250
Iteration: 26 / 250
Iteration: 27 / 250
Iteration:

In [7]:
# Candidate features still in the pool when the selection loop ended (never confirmed).
use_features

['Loss_of_appetite',
 'CONTROL_blood_group_A-',
 'Infection_type_Symptomatic',
 'Blood_Group_O+',
 'Diarrhea',
 'CONTROL_blood_group_O-',
 'Fever ',
 'Muscle_ache',
 'Dyspnea',
 'Rhinorrhea',
 'Chills',
 'Cough',
 'Asthenia',
 'Blood_Group_AB+',
 'Blood_Group_B-',
 'Sore_throat',
 'Blood_Group_B+',
 'Anosmia_Ageusia',
 'Blood_Group_A-',
 'Headache',
 'CONTROL_blood_group_B-',
 'Blood_Group_O-',
 'Blood_Group_AB-',
 'Infection_type_Symtomatic',
 'CONTROL_blood_group_A+',
 'Infection_type_Asymptomatic',
 'CONTROL_blood_group_O+',
 'Infection_type_Asymtomatic',
 'CONTROL_blood_group_AB-',
 'CONTROL_blood_group_B+']

In [8]:
# Features collected by the loop's tentative pass.
tentive

[]

In [9]:
# Number of features accumulated in `useful` across all rounds.
len(useful)

7

In [10]:
# Features returned as confirmed by the selection rounds, in the order they were found.
useful

['Age',
 ' Incubation period (days)',
 'Cyanosis',
 'Blood_Group_A+',
 'CONTROL_blood_group_AB+',
 'is_Male',
 'Nausea_vomiting']

In [11]:
# NOTE(review): `STOP` is not defined in any cell shown here, so this cell raises NameError.
# Presumably a deliberate way to halt "Run All" at this point; confirm, and prefer an
# explicit `raise` with a message if so.
STOP

NameError: name 'STOP' is not defined