<a href="https://colab.research.google.com/github/OmriMan/EnsembleLearning_EnsembleDiversity_Assignment4/blob/main/Ensemble_Ass4_Stroke.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Packages

In [371]:
import numpy as np  
import pandas as pd  

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from matplotlib import pyplot as plt  
from sklearn.tree import plot_tree 

from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score

from sklearn.metrics import precision_score

import random

from sklearn.model_selection import RandomizedSearchCV

from tabulate import tabulate

#Dataset Stroke Prediction#
source: https://www.kaggle.com/datasets/fedesoriano/stroke-prediction-dataset

Classify / predict whether a patient can suffer a stroke.

Dataset Attributes :

id : unique identifier

gender : "Male", "Female" or "Other"

age : age of the patient

hypertension: 0 if the patient doesn't have hypertension, 1 if the patient has hypertension

heart_disease : 0 if the patient doesn't have any heart diseases, 1 if the patient has a heart disease

ever_married : "No" or "Yes"

work_type : "children", "Govt_job", "Never_worked", "Private" or "Self-employed"

Residence_type : "Rural" or "Urban"

avg_glucose_level : average glucose level in blood

bmi : body mass index

smoking_status : "formerly smoked", "never smoked", "smokes" or "Unknown"*

stroke : 1 if the patient had a stroke or 0 if not

The data contains 5110 observations with 12 attributes.

source: https://www.kaggle.com/datasets/fedesoriano/stroke-prediction-dataset

In [372]:
def preprocess(df):
    # Gender
    change = { 'Other': 2,'Female':1,'Male':0}
    df.gender = df.gender.map(change)

    # BMI
    df['bmi'] = df['bmi'].replace(to_replace = np.nan, value =df['bmi'].mean())

    # ever_married
    change = {'No':0,'Yes':1}
    df.ever_married = df.ever_married.map(change)

    # Residence_type	
    change = {"Rural":1,"Urban":2}
    df.Residence_type	 = df.Residence_type.map(change)

    #work_type
    change = {"children":0, "Govt_job":1, "Never_worked":2, "Private":3 , "Self-employed":4}
    df.work_type = df.work_type.map(change)

    #smoking_status
    change = {"never smoked":0, "formerly smoked":1, "smokes":2, "Unknown":3}
    df.smoking_status	 = df.smoking_status.map(change)

    df = df.drop(columns=['id'])
    return df

Load the data set

In [373]:
df = pd.read_csv('/content/sample_data/stroke.csv')
df = preprocess(df)
features = ['gender',	'age',	'hypertension',	'heart_disease',	'ever_married',	'work_type',	'Residence_type',	'avg_glucose_level',	'bmi',	'smoking_status']
X = df.loc[:, features]
y = df.loc[:, ['stroke']]

Split the data into training and testin sets

In [374]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, train_size = .75)
X_train

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
2954,1,18.0,0,0,0,3,2,70.54,23.5,3
1933,1,70.0,0,0,1,3,2,91.25,36.0,3
4311,1,65.0,0,0,1,3,1,205.78,41.7,0
2365,1,36.0,0,0,0,3,2,216.96,34.5,3
927,0,64.0,0,0,1,3,2,86.05,23.0,3
...,...,...,...,...,...,...,...,...,...,...
4931,1,53.0,1,0,1,1,1,98.61,38.8,2
3264,1,61.0,0,0,1,3,1,72.01,26.0,1
1653,0,44.0,0,0,1,3,1,94.71,28.4,2
2607,0,21.0,0,0,0,3,2,120.94,29.7,1


In [375]:
train = X_train

In [376]:
df = pd.read_csv('/content/sample_data/stroke.csv')
df.isna().sum()

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

#Target class distribution#

In [377]:
target = df['stroke']
target.value_counts()

0    4861
1     249
Name: stroke, dtype: int64

#Define and train the models #

Random Forest Classifier

In [378]:
model_rf = RandomForestClassifier(n_estimators=100)
model_rf.fit(X_train,np.ravel(y_train))

RandomForestClassifier()

In [379]:
inputs = pd.DataFrame(X_test)
prediction_RandomForestClassifier = model_rf.predict(
    inputs
)
accuracy_RandomForestClassifier = accuracy_score(y_test,prediction_RandomForestClassifier)
print(f"Accuracy score of RandomForestClassifier = {accuracy_RandomForestClassifier}")

precision_RandomForestClassifier = precision_score(y_test,prediction_RandomForestClassifier,average='weighted')
print(f"Precision score of RandomForestClassifier = {precision_RandomForestClassifier}")

Accuracy score of RandomForestClassifier = 0.9483568075117371
Precision score of RandomForestClassifier = 0.9008275704868732


Extremely Randomized Trees

In [380]:
model_ExtraTreesClassifier = ExtraTreesClassifier(n_estimators=100)
model_ExtraTreesClassifier.fit(
    X_train,
    np.ravel(y_train)
)

ExtraTreesClassifier()

In [381]:
inputs = pd.DataFrame(X_test)
prediction_ExtraTreesClassifier = model_ExtraTreesClassifier.predict(
    inputs
)
accuracy_ExtraTreesClassifier = accuracy_score(y_test,prediction_ExtraTreesClassifier)
print(f"Accuracy score of ExtraTreesClassifier = {accuracy_ExtraTreesClassifier}")

precision_ExtraTreesClassifier = precision_score(y_test,prediction_ExtraTreesClassifier,average='weighted')
print(f"Precision score of ExtraTreesClassifier = {precision_ExtraTreesClassifier}")

Accuracy score of ExtraTreesClassifier = 0.9413145539906104
Precision score of ExtraTreesClassifier = 0.9004846640304497


Summary - Stroke dataset

In [382]:
print(tabulate([["Accuracy",accuracy_RandomForestClassifier,accuracy_ExtraTreesClassifier],["Precision",precision_RandomForestClassifier,precision_ExtraTreesClassifier]], headers=["Metric\Model","RandomForestClassifier","ExtraTreesClassifier"], tablefmt="grid"))

+----------------+--------------------------+------------------------+
| Metric\Model   |   RandomForestClassifier |   ExtraTreesClassifier |
| Accuracy       |                 0.948357 |               0.941315 |
+----------------+--------------------------+------------------------+
| Precision      |                 0.900828 |               0.900485 |
+----------------+--------------------------+------------------------+


In [383]:
# i=0
# print(len(model_rf.estimators_))
# for t in model_rf.estimators_:
#   if i%2==0:
#     model_rf.estimators_.remove(t)
#   i+=0
# print("DHODI")
# print(len(model_rf.estimators_))

In [384]:
a = [5, 7, 11, 4, 5]
for i in range(1,len(a)):
  for previous, current in zip(a, a[i:]):
      print(previous, current)

5 7
7 11
11 4
4 5
5 11
7 4
11 5
5 4
7 5
5 5


In [411]:
from sklearn import tree
inputs_train = pd.DataFrame(X_train)
def DoubleFaultMeasure(s, set_of_classifiers):
  train_set=s[0]
  sum = 0
  i_index = 0
  for i in set_of_classifiers.estimators_:
    # i.fit(X_train,np.ravel(y_train))
    preds_i = i.predict(inputs_train.values)
    for j in set_of_classifiers.estimators_[i_index:]:
      num_of_instances = 0
      if i != j: # if not the same tree
        preds_j = j.predict(inputs_train.values)
        for k in range(len(np.ravel(y_train))):
          if (preds_i[k] != np.ravel(y_train)[k] and preds_j[k] != np.ravel(y_train)[k]): # if both trees misclasified then add 1 to num of instances...
            num_of_instances += 1

        sum += num_of_instances
    i_index += 1
    # print(i_index)

  res = (2*sum)/(len(train_set)*len(set_of_classifiers.estimators_)*(len(set_of_classifiers.estimators_)-1))
  return res

In [412]:
DoubleFaultMeasure((X_train,y_train),model_rf)

0.00677076676181955

In [387]:
def kohavi_wolpert(s,m):
  '''
  s - the original training set tuple of (X_train,y_train) --> ((dataframe),((dataframe))
  s[0] = x train
  s[1] = y train
  m - list of classifiers examined
  '''
  x = s[0]
  y=s[1]
  predictions =[]
  sum =0
  # For each classifier, insert all its predictions (depending on the index of x) into the list .
  for classifier in m:
    preds = classifier.predict(x.values)
    predictions.append(preds)
  # Run through all the records in the training set and for each record : count the number of classifiers that made an incorrect classification(misclassified)
  l_i=0
  for (index_x, x_i),(index_y, y_i),pred_index in zip(x.iterrows(),y.iterrows(),range(len(predictions[0]))):
    l_i=0
    for model_i_prediction in predictions:
      # l_i = number of classifiers misclassified x_i,y_i
      if model_i_prediction[pred_index] != y_i.values:
        l_i+=1
      #end sum all misclassification for x_i,y_i
    #sum = sum + l_i * (len(m) - l_i)
    sum = sum + ( l_i * ( len(m)-l_i ) ) 
  # print(f"sum/(len(x)*(len(m)**2)) : {sum/(len(x)*(len(m)**2))}")
  return (sum) / ( len(y) * (len(m)**2) )



In [388]:
kohavi_wolpert((X_train,y_train),model_rf.estimators_)

0.02539151878914405

In [389]:
def inter_rater_measure(s,m):
  '''
  s - the original training set tuple of (X_train,y_train) --> ((dataframe),((dataframe))
  s[0] = x train
  s[1] = y train
  m - list of classifiers examined
  '''
  x = s[0]
  y=s[1]
  predictions =[]
  sum =0
  # For each classifier, insert all its predictions (depending on the index of x) into the list .
  for classifier in m:
    preds = classifier.predict(x.values)
    predictions.append(preds)
  # Run through all the records in the training set and for each record : count the number of classifiers that made an incorrect classification(misclassified)
  l=[]
  for (index_x, x_i),(index_y, y_i),pred_index in zip(x.iterrows(),y.iterrows(),range(len(predictions[0]))):
    l_i=0
    for model_i_prediction in predictions:
      # l_i = number of classifiers misclassified x_i,y_i
      if model_i_prediction[pred_index] != y_i.values:
        l_i+=1
      #end sum all misclassification for x_i,y_i
    l.append(l_i)
    #sum = sum + l_i * (len(m) - l_i)
    sum = sum + ( l_i * ( len(m) - l_i) )
  # p = 1 - ( sum(l) / (|s| *|m|) )
  sum_l=0
  for val in l:
    sum_l+=val
  p = 1 - ( sum_l / (len(x)*len(m)) )
  result = 1 - (sum / ( len(x)*len(m)*(len(m)-1)*p*(1-p)  ) )
  return result

In [390]:
inter_rater_measure((X_train,y_train),model_rf.estimators_)

0.1586347431252514

In [391]:
import statistics
def general_diversity(s,m):
  '''
  s - the original training set tuple of (X_train,y_train) --> ((dataframe),((dataframe))
  s[0] = x train
  s[1] = y train
  m - list of classifiers examined
  '''
  x = s[0]
  y=s[1]
  predictions =[]
  # For each classifier, insert all its predictions (depending on the index of x) into the list .
  for classifier in m:
    preds = classifier.predict(x.values)
    predictions.append(preds)
  # Run through all the records in the training set and for each record : count the number of classifiers that made an incorrect classification(misclassified)
  v=[]
  for (index_x, x_i),(index_y, y_i),pred_index in zip(x.iterrows(),y.iterrows(),range(len(predictions[0]))):
    l_i=0
    for model_i_prediction in predictions:
      # l_i = number of classifiers misclassified x_i,y_i
      if model_i_prediction[pred_index] != y_i.values:
        l_i+=1
      #end sum all misclassification for x_i,y_i
    v_i= (len(m)-l_i) / len(m)
    v.append(v_i)
  # end of run through all the records in the training set and for each record : count the number of classifiers that made an incorrect classification(misclassified)
  # compute variance of v using statistics.variance
  variance_v = statistics.variance(v)
  return variance_v

In [392]:
general_diversity((X_train,y_train),model_rf.estimators_)

0.005093596839920896

In [393]:
X_train

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
2954,1,18.0,0,0,0,3,2,70.54,23.5,3
1933,1,70.0,0,0,1,3,2,91.25,36.0,3
4311,1,65.0,0,0,1,3,1,205.78,41.7,0
2365,1,36.0,0,0,0,3,2,216.96,34.5,3
927,0,64.0,0,0,1,3,2,86.05,23.0,3
...,...,...,...,...,...,...,...,...,...,...
4931,1,53.0,1,0,1,1,1,98.61,38.8,2
3264,1,61.0,0,0,1,3,1,72.01,26.0,1
1653,0,44.0,0,0,1,3,1,94.71,28.4,2
2607,0,21.0,0,0,0,3,2,120.94,29.7,1


In [394]:
# X = df.loc[:, features]
# y = df.loc[:, ['stroke']]
train_set_x_and_y = X_train.loc[:, features]
train_set_x_and_y['stroke'] = y_train['stroke']

In [395]:
train_set_x_and_y

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
2954,1,18.0,0,0,0,3,2,70.54,23.5,3,0
1933,1,70.0,0,0,1,3,2,91.25,36.0,3,0
4311,1,65.0,0,0,1,3,1,205.78,41.7,0,0
2365,1,36.0,0,0,0,3,2,216.96,34.5,3,0
927,0,64.0,0,0,1,3,2,86.05,23.0,3,0
...,...,...,...,...,...,...,...,...,...,...,...
4931,1,53.0,1,0,1,1,1,98.61,38.8,2,0
3264,1,61.0,0,0,1,3,1,72.01,26.0,1,0
1653,0,44.0,0,0,1,3,1,94.71,28.4,2,0
2607,0,21.0,0,0,0,3,2,120.94,29.7,1,0


In [396]:
from sklearn.metrics import classification_report, confusion_matrix
from functools import reduce

def bagging_using_Diversity(i,t,s,d):
  '''
  i - a base inducer
  t - number of iterations
  s - the original training set
  d- diversity measure - { ""  :kohavi_wolpert , "" :inter_rater_measure ,"general diversity": general_diversity }
  '''
  diversity_measure = {"Double Fault Measure": DoubleFaultMeasure, "kohavi wolpert"  :kohavi_wolpert , "inter rater measure" :inter_rater_measure ,"general diversity": general_diversity }
  m_ts =[]
  #m_ts - list of tuples - each tuple in the following format : (model,number_of_misclassifieds)
  # m_ts --> [(model1,number_of_misclassifieds1),(model2,number_of_misclassifieds2),...,(model_t,number_of_misclassifieds_t)]
  for iter in range(t):
    s_tag = s.sample(frac = 1)
    m_t = i.estimators_[iter]
    x = s_tag.loc[:, features]
    y = s_tag.loc[:, ['stroke']]
    preds = m_t.predict(x.values)
    # confusion_matrix(y,preds) - [[ture_positive, false_positive][false_neg,true_neg]]
    number_of_misclassifieds = confusion_matrix(y,preds)[0][1] + confusion_matrix(y,preds)[1][0]
    m_ts.append((m_t,number_of_misclassifieds))
  # end of rows 1-3
  x = s.loc[:, features]
  y = s.loc[:, ['stroke']]
  # 5
  m_tag=[]
  # use reduce() and min() to find the minimum value in the second element of each tuple
  min_misclassifields = reduce(lambda x, y: min(x, y[1]), m_ts, float("inf"))
  # Find an element (model,number_of_misclassifieds) in m_ts(list of tuples).
  m_tags_optinals = [item for item in m_ts if item[1] == min_misclassifields]
  # m_tags_optinals - list of tuples [(model1,min_misclassifields),...]
  m_tag.append(m_tags_optinals[0][0])
  m_ts.remove(m_tags_optinals[0])
  # 6
  for i in range(1,int((t/2)-1)):
    results_of_d =[]
    # results_of_d - list of tuples - each tuple in the following format : (model,diversity measure value)
    if (d=="Double Fault Measure" or d=="kohavi wolpert" or d=="general diversity"):
      # need to take the model that return maximum diversity measure
      print(i)
      for temp_model,number_of_misclassifieds in m_ts:
        # m_tag_only_models = [tup[0] for tup in m_tag]
        results_of_d.append( (temp_model, diversity_measure[d]((x,y),m_tag+[temp_model])) )
      # need to find maximum value of diversity measure
      # use reduce() and min() to find the minimum value in the second element of each tuple
      # results_of_d[0] = (model,diversity measure value)
      max_d = reduce(lambda x, y: max(x, y[1]), results_of_d, float("-inf"))
      # Find an element (model,diversity measure value) in results_of_d(list of tuples).
      m_tags_optinals = [item for item in  results_of_d if item[1] == max_d]
      # m_tags_optinals[0] = (model,diversity measure value), e.g: (DecisionTreeClassifier(max_features='auto', random_state=1913246406), 0.014287578288100209)
      # m_tags_optinals[0][0] - temp_model
      # m_tags_optinals[0][1] - maximum diversity measure value , and its for m_tag with temp_model
      # we want to add only the model to m_tag so we add m_tags_optinals[0][0]
      m_tag.append(m_tags_optinals[0][0])
      # need to find m_tags_optinals[0][0] (the model) in m_ts and remove it from m_ts
      model_num_of_mis_tuple_to_remove_from_m_ts = [item for item in m_ts if item[0] == m_tags_optinals[0][0]]
      m_ts.remove(model_num_of_mis_tuple_to_remove_from_m_ts[0])
    else:
      # need to take the model that return minimum diversity measure
      print(i)
      for temp_model,number_of_misclassifieds in m_ts:
        # m_tag_only_models = [tup[0] for tup in m_tag]
        results_of_d.append( (temp_model, diversity_measure[d]((x,y),m_tag+[temp_model])) )
      # need to find maximum value of diversity measure
      # use reduce() and min() to find the minimum value in the second element of each tuple
      # results_of_d[0] = (model,diversity measure value)
      min_d = reduce(lambda x, y: min(x, y[1]), results_of_d, float("inf"))
      # Find an element (model,diversity measure value) in results_of_d(list of tuples).
      m_tags_optinals = [item for item in  results_of_d if item[1] == min_d]
      # m_tags_optinals[0] = (model,diversity measure value), e.g: (DecisionTreeClassifier(max_features='auto', random_state=1913246406), 0.014287578288100209)
      # m_tags_optinals[0][0] - temp_model
      # m_tags_optinals[0][1] - minimum diversity measure value , and its for m_tag with temp_model
      # we want to add only the model to m_tag so we add m_tags_optinals[0][0]
      m_tag.append(m_tags_optinals[0][0])
      # need to find m_tags_optinals[0][0] (the model) in m_ts and remove it from m_ts
      model_num_of_mis_tuple_to_remove_from_m_ts = [item for item in m_ts if item[0] == m_tags_optinals[0][0]]
      m_ts.remove(model_num_of_mis_tuple_to_remove_from_m_ts[0])

  return m_tag
  


In [397]:
RandomForestClassifier_bagging_using_Diversity_Double_Fault_Measure = model_rf
RandomForestClassifier_bagging_using_Diversity_kohavi_wolpert = model_rf
RandomForestClassifier_bagging_using_Diversity_inter_rater_measure = model_rf
RandomForestClassifier_bagging_using_Diversity_general_diversity = model_rf

In [398]:
models_RandomForestClassifier_bagging_using_Diversity_kohavi_wolpert = bagging_using_Diversity(RandomForestClassifier_bagging_using_Diversity_kohavi_wolpert,RandomForestClassifier_bagging_using_Diversity_kohavi_wolpert.n_estimators,train_set_x_and_y,"kohavi wolpert")

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48


In [399]:
models_RandomForestClassifier_bagging_using_Diversity_inter_rater_measure = bagging_using_Diversity(RandomForestClassifier_bagging_using_Diversity_inter_rater_measure,RandomForestClassifier_bagging_using_Diversity_inter_rater_measure.n_estimators,train_set_x_and_y,"inter rater measure")

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48


In [400]:
models_RandomForestClassifier_bagging_using_Diversity_general_diversity = bagging_using_Diversity(RandomForestClassifier_bagging_using_Diversity_general_diversity,RandomForestClassifier_bagging_using_Diversity_general_diversity.n_estimators,train_set_x_and_y,"general diversity")

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48


In [None]:
# Double_Fault_Measure
models_RandomForestClassifier_bagging_using_Diversity_Double_Fault_Measure = bagging_using_Diversity(RandomForestClassifier_bagging_using_Diversity_Double_Fault_Measure,RandomForestClassifier_bagging_using_Diversity_Double_Fault_Measure.n_estimators,train_set_x_and_y,"Double Fault Measure")

Double Fault Measure

In [422]:
RandomForestClassifier_bagging_using_Diversity_Double_Fault_Measure.estimators_ = models_RandomForestClassifier_bagging_using_Diversity_Double_Fault_Measure
print("Random Forest Classifier - bagging using diversity Double Fault Measure")
inputs = pd.DataFrame(X_test)
prediction_RandomForestClassifier_bagging_using_Diversity_Double_Fault_Measure = RandomForestClassifier_bagging_using_Diversity_Double_Fault_Measure.predict(inputs)
accuracy_RandomForestClassifier_bagging_using_Diversity_Double_Fault_Measure = accuracy_score(y_test,prediction_RandomForestClassifier_bagging_using_Diversity_Double_Fault_Measure)
print(f"Accuracy score of RandomForestClassifier using diversity Double Fault Measure = {accuracy_RandomForestClassifier_bagging_using_Diversity_Double_Fault_Measure}")

precision_RandomForestClassifier_bagging_using_Diversity_Double_Fault_Measure = precision_score(y_test,prediction_RandomForestClassifier_bagging_using_Diversity_Double_Fault_Measure,average='weighted')
print(f"Precision score of RandomForestClassifier using diversity Double Fault Measure = {precision_RandomForestClassifier_bagging_using_Diversity_Double_Fault_Measure}")

NameError: ignored

Bagging Using Diversity-Kohavi-Wolpert

In [403]:
RandomForestClassifier_bagging_using_Diversity_kohavi_wolpert.estimators_ = models_RandomForestClassifier_bagging_using_Diversity_kohavi_wolpert
print("Random Forest Classifier - bagging using diversity kohavi-wolpert")
inputs = pd.DataFrame(X_test)
prediction_RandomForestClassifier_bagging_using_Diversity_kohavi_wolpert = RandomForestClassifier_bagging_using_Diversity_kohavi_wolpert.predict(inputs)
accuracy_RandomForestClassifier_bagging_using_Diversity_kohavi_wolpert = accuracy_score(y_test,prediction_RandomForestClassifier_bagging_using_Diversity_kohavi_wolpert)
print(f"Accuracy score of RandomForestClassifier using diversity kohavi-wolpert = {accuracy_RandomForestClassifier_bagging_using_Diversity_kohavi_wolpert}")

precision_RandomForestClassifier_bagging_using_Diversity_kohavi_wolpert = precision_score(y_test,prediction_RandomForestClassifier_bagging_using_Diversity_kohavi_wolpert,average='weighted')
print(f"Precision score of RandomForestClassifier using diversity kohavi-wolpert = {precision_RandomForestClassifier_bagging_using_Diversity_kohavi_wolpert}")

Random Forest Classifier - bagging using diversity kohavi-wolpert
Accuracy score of RandomForestClassifier using diversity kohavi-wolpert = 0.94679186228482
Precision score of RandomForestClassifier using diversity kohavi-wolpert = 0.9007517874129306


Inter-Rater Measure

In [404]:
RandomForestClassifier_bagging_using_Diversity_inter_rater_measure.estimators_ = models_RandomForestClassifier_bagging_using_Diversity_inter_rater_measure
print("Random Forest Classifier - bagging using diversity Inter-Rater-Measure")
inputs = pd.DataFrame(X_test)
prediction_RandomForestClassifier_bagging_using_Diversity_inter_rater_measure = RandomForestClassifier_bagging_using_Diversity_inter_rater_measure.predict(inputs)
accuracy_RandomForestClassifier_bagging_using_Diversity_inter_rater_measure = accuracy_score(y_test,prediction_RandomForestClassifier_bagging_using_Diversity_inter_rater_measure)
print(f"Accuracy score of RandomForestClassifier using diversity Inter-Rater-Measure = {accuracy_RandomForestClassifier_bagging_using_Diversity_inter_rater_measure}")

precision_RandomForestClassifier_bagging_using_Diversity_inter_rater_measure = precision_score(y_test,prediction_RandomForestClassifier_bagging_using_Diversity_inter_rater_measure,average='weighted')
print(f"Precision score of RandomForestClassifier using diversity Inter-Rater-Measure = {precision_RandomForestClassifier_bagging_using_Diversity_inter_rater_measure}")

Random Forest Classifier - bagging using diversity Inter-Rater-Measure
Accuracy score of RandomForestClassifier using diversity Inter-Rater-Measure = 0.9483568075117371
Precision score of RandomForestClassifier using diversity Inter-Rater-Measure = 0.9184497836693364


General Diversity

In [405]:
RandomForestClassifier_bagging_using_Diversity_general_diversity.estimators_ = models_RandomForestClassifier_bagging_using_Diversity_general_diversity
print("Random Forest Classifier - bagging using diversity General Diversity")
inputs = pd.DataFrame(X_test)
prediction_RandomForestClassifier_bagging_using_Diversity_general_diversity = RandomForestClassifier_bagging_using_Diversity_general_diversity.predict(inputs)
accuracy_RandomForestClassifier_bagging_using_Diversity_general_diversity = accuracy_score(y_test,prediction_RandomForestClassifier_bagging_using_Diversity_general_diversity)
print(f"Accuracy score of RandomForestClassifier using diversity General Diversity = {accuracy_RandomForestClassifier_bagging_using_Diversity_general_diversity}")

precision_RandomForestClassifier_bagging_using_Diversity_general_diversity = precision_score(y_test,prediction_RandomForestClassifier_bagging_using_Diversity_general_diversity,average='weighted')
print(f"Precision score of RandomForestClassifier using diversity General Diversity = {precision_RandomForestClassifier_bagging_using_Diversity_inter_rater_measure}")

Random Forest Classifier - bagging using diversity General Diversity
Accuracy score of RandomForestClassifier using diversity General Diversity = 0.9491392801251957
Precision score of RandomForestClassifier using diversity General Diversity = 0.9184497836693364


#Summary

In [407]:
print(tabulate([["Accuracy",accuracy_RandomForestClassifier,accuracy_ExtraTreesClassifier,accuracy_RandomForestClassifier_bagging_using_Diversity_kohavi_wolpert,accuracy_RandomForestClassifier_bagging_using_Diversity_inter_rater_measure,accuracy_RandomForestClassifier_bagging_using_Diversity_general_diversity],["Precision",precision_RandomForestClassifier,precision_ExtraTreesClassifier,precision_RandomForestClassifier_bagging_using_Diversity_kohavi_wolpert,precision_RandomForestClassifier_bagging_using_Diversity_inter_rater_measure,precision_RandomForestClassifier_bagging_using_Diversity_inter_rater_measure]], headers=["Metric\Model","RandomForestClassifier","ExtraTreesClassifier","Random Forest-\nbagging using diversity\nkohavi-wolpert","Random Forest-\nbagging using diversity\nInter-Rater-Measure","Random Forest-\nbagging using diversity\nGeneral Diversity"], tablefmt="grid"))

+----------------+--------------------------+------------------------+---------------------------+---------------------------+---------------------------+
| Metric\Model   |   RandomForestClassifier |   ExtraTreesClassifier |            Random Forest- |            Random Forest- |            Random Forest- |
|                |                          |                        |   bagging using diversity |   bagging using diversity |   bagging using diversity |
|                |                          |                        |            kohavi-wolpert |       Inter-Rater-Measure |         General Diversity |
| Accuracy       |                 0.948357 |               0.941315 |                  0.946792 |                  0.948357 |                  0.949139 |
+----------------+--------------------------+------------------------+---------------------------+---------------------------+---------------------------+
| Precision      |                 0.900828 |               0.900485 |

In [421]:
print(tabulate([["Accuracy",accuracy_RandomForestClassifier,accuracy_ExtraTreesClassifier,accuracy_RandomForestClassifier_bagging_using_Diversity_kohavi_wolpert,accuracy_RandomForestClassifier_bagging_using_Diversity_inter_rater_measure,accuracy_RandomForestClassifier_bagging_using_Diversity_general_diversity,accuracy_RandomForestClassifier_bagging_using_Diversity_Double_Fault_Measure],["Precision",precision_RandomForestClassifier,precision_ExtraTreesClassifier,precision_RandomForestClassifier_bagging_using_Diversity_kohavi_wolpert,precision_RandomForestClassifier_bagging_using_Diversity_inter_rater_measure,precision_RandomForestClassifier_bagging_using_Diversity_inter_rater_measure,precision_RandomForestClassifier_bagging_using_Diversity_Double_Fault_Measure]], headers=["Metric\Model","RandomForestClassifier","ExtraTreesClassifier","Random Forest-\nbagging using diversity\nkohavi-wolpert","Random Forest-\nbagging using diversity\nInter-Rater-Measure","Random Forest-\nbagging using diversity\nGeneral Diversity","Random Forest-\nbagging using diversity\nDouble Fault Measure"], tablefmt="grid"))

+----------------+--------------------------+------------------------+---------------------------+---------------------------+---------------------------+---------------------------+
| Metric\Model   |   RandomForestClassifier |   ExtraTreesClassifier |            Random Forest- |            Random Forest- |            Random Forest- |            Random Forest- |
|                |                          |                        |   bagging using diversity |   bagging using diversity |   bagging using diversity |   bagging using diversity |
|                |                          |                        |            kohavi-wolpert |       Inter-Rater-Measure |         General Diversity |      Double Fault Measure |
| Accuracy       |                 0.948357 |               0.941315 |                  0.946792 |                  0.948357 |                  0.949139 |                  0.949139 |
+----------------+--------------------------+------------------------+---------------