In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import itertools as it
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn import utils, metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

In [2]:
def gini_impurity(y):
  '''
  Given a Pandas Series, it calculates the Gini Impurity.
  y: variable with which calculate Gini Impurity.
  '''
  if isinstance(y, pd.Series):
    p = y.value_counts()/y.shape[0]
    gini = 1-np.sum(p**2)
    return(gini)

  else:
    raise('Object must be a Pandas Series.')

In [3]:
def make_prediction(data, target_factor):
  '''
  Given the target variable, make a prediction.
  data: pandas series for target variable
  target_factor: boolean considering if the variable is a factor or not
  '''

  # Make predictions
  if target_factor:
    pred = data.value_counts().idxmax()
  else:
    pred = data.mean()

  return pred

In [4]:
def make_split(variable, value, data, is_numeric):
  '''
  Given a data and a split conditions, do the split.
  variable: variable with which make the split.
  value: value of the variable to make the split.
  data: data to be splitted.
  is_numeric: boolean considering if the variable to be splitted is numeric or not.
  '''
  if is_numeric:
    data_1 = data[data[variable] < value]
    data_2 = data[(data[variable] < value) == False]

  else:
    data_1 = data[data[variable].isin(value)]
    data_2 = data[(data[variable].isin(value)) == False]

  return(data_1,data_2)

In [5]:
def entropy(y):
  '''
  Given a Pandas Series, it calculates the entropy.
  y: variable with which calculate entropy.
  '''
  if isinstance(y, pd.Series):
    a = y.value_counts()/y.shape[0]
    entropy = np.sum(-a*np.log2(a+1e-9))
    return(entropy)

  else:
    raise('Object must be a Pandas Series.')

In [6]:
def variance(y):
  '''
  Function to help calculate the variance avoiding nan.
  y: variable to calculate variance to. It should be a Pandas Series.
  '''
  if(len(y) == 1):
    return 0
  else:
    return y.var()

In [7]:
def categorical_options(a):
  '''
  Creates all possible combinations from a Pandas Series.
  a: Pandas Series from where to get all possible combinations.
  '''
  a = a.unique()

  opt = []
  for L in range(0, len(a)+1):
      for subset in it.combinations(a, L):
          subset = list(subset)
          opt.append(subset)

  return opt[1:-1]

In [8]:
def information_gain_ent(y, mask, func=entropy):
  '''
  It returns the Information Gain of a variable given a loss function.
  y: target variable.
  mask: split choice.
  func: function to be used to calculate Information Gain in case os classification.
  '''

  a = sum(mask)
  b = mask.shape[0] - a

  if(a == 0 or b ==0):
    ig = 0

  else:
    ig = func(y)-a/(a+b)*func(y[mask])-b/(a+b)*func(y[-mask])

  return ig

In [9]:
def information_gain_gini(y, mask, func=gini_impurity):
  '''
  It returns the Information Gain of a variable given a loss function.
  y: target variable.
  mask: split choice.
  func: function to be used to calculate Information Gain in case os classification.
  '''

  a = sum(mask)
  b = mask.shape[0] - a

  if(a == 0 or b ==0):
    ig = 0

  else:
    ig = func(y)-a/(a+b)*func(y[mask])-b/(a+b)*func(y[-mask])

  return ig

In [10]:
def max_information_gain_split_ent(x, y, func=entropy):
  '''
  Given a predictor & target variable, returns the best split, the error and the type of variable based on a selected cost function.
  x: predictor variable as Pandas Series.
  y: target variable as Pandas Series.
  func: function to be used to calculate the best split.
  '''

  split_value = []
  ig = []

  numeric_variable = True if x.dtypes != 'O' else False

  # Create options according to variable type
  if numeric_variable:
    options = x.sort_values().unique()[1:]
  else:
    options = categorical_options(x)

  # Calculate ig for all values
  for val in options:
    mask =   x < val if numeric_variable else x.isin(val)
    val_ig = information_gain_ent(y, mask, func)
    # Append results
    ig.append(val_ig)
    split_value.append(val)

  # Check if there are more than 1 results if not, return False
  if len(ig) == 0:
    return(None,None,None, False)

  else:
  # Get results with highest IG
    best_ig = max(ig)
    best_ig_index = ig.index(best_ig)
    best_split = split_value[best_ig_index]
    return(best_ig,best_split,numeric_variable, True)

In [11]:
def max_information_gain_split_gini(x, y, func=gini_impurity):
  '''
  Given a predictor & target variable, returns the best split, the error and the type of variable based on a selected cost function.
  x: predictor variable as Pandas Series.
  y: target variable as Pandas Series.
  func: function to be used to calculate the best split.
  '''

  split_value = []
  ig = []

  numeric_variable = True if x.dtypes != 'O' else False

  # Create options according to variable type
  if numeric_variable:
    options = x.sort_values().unique()[1:]
  else:
    options = categorical_options(x)

  # Calculate ig for all values
  for val in options:
    mask =   x < val if numeric_variable else x.isin(val)
    val_ig = information_gain_gini(y, mask, func)
    # Append results
    ig.append(val_ig)
    split_value.append(val)

  # Check if there are more than 1 results if not, return False
  if len(ig) == 0:
    return(None,None,None, False)

  else:
  # Get results with highest IG
    best_ig = max(ig)
    best_ig_index = ig.index(best_ig)
    best_split = split_value[best_ig_index]
    return(best_ig,best_split,numeric_variable, True)

In [12]:
def get_best_split_ent(y, data):
  '''
  Given a data, select the best split and return the variable, the value, the variable type and the information gain.
  y: name of the target variable
  data: dataframe where to find the best split.
  '''
  masks = data.drop(y, axis= 1).apply(max_information_gain_split_ent, y = data[y])
  if sum(masks.loc[3,:]) == 0:
    return(None, None, None, None)

  else:
    # Get only masks that can be splitted
    masks = masks.loc[:,masks.loc[3,:]]

    # Get the results for split with highest IG
    split_variable = masks.iloc[0].astype(np.float32).idxmax()
    #split_valid = masks[split_variable][]
    split_value = masks[split_variable][1]
    split_ig = masks[split_variable][0]
    split_numeric = masks[split_variable][2]

    return(split_variable, split_value, split_ig, split_numeric)

In [13]:
def get_best_split_gini(y, data):
  '''
  Given a data, select the best split and return the variable, the value, the variable type and the information gain.
  y: name of the target variable
  data: dataframe where to find the best split.
  '''
  masks = data.drop(y, axis= 1).apply(max_information_gain_split_gini, y = data[y])
  if sum(masks.loc[3,:]) == 0:
    return(None, None, None, None)

  else:
    # Get only masks that can be splitted
    masks = masks.loc[:,masks.loc[3,:]]

    # Get the results for split with highest IG
    split_variable = masks.iloc[0].astype(np.float32).idxmax()
    #split_valid = masks[split_variable][]
    split_value = masks[split_variable][1]
    split_ig = masks[split_variable][0]
    split_numeric = masks[split_variable][2]

    return(split_variable, split_value, split_ig, split_numeric)

In [14]:
def train_tree_ent(data,y, target_factor, max_depth = None,min_samples_split = None, min_information_gain = 1e-20, counter=0, max_categories = 20):
  '''
  Trains a Decission Tree
  data: Data to be used to train the Decission Tree
  y: target variable column name
  target_factor: boolean to consider if target variable is factor or numeric.
  max_depth: maximum depth to stop splitting.
  min_samples_split: minimum number of observations to make a split.
  min_information_gain: minimum ig gain to consider a split to be valid.
  max_categories: maximum number of different values accepted for categorical values. High number of values will slow down learning process. R
  '''

  # Check that max_categories is fulfilled
  if counter==0:
    types = data.dtypes
    check_columns = types[types == "object"].index
    for column in check_columns:
      var_length = len(data[column].value_counts())
      if var_length > max_categories:
        raise ValueError('The variable ' + column + ' has '+ str(var_length) + ' unique values, which is more than the accepted ones: ' +  str(max_categories))

  # Check for depth conditions
  if max_depth == None:
    depth_cond = True

  else:
    if counter < max_depth:
      depth_cond = True

    else:
      depth_cond = False

  # Check for sample conditions
  if min_samples_split == None:
    sample_cond = True

  else:
    if data.shape[0] > min_samples_split:
      sample_cond = True

    else:
      sample_cond = False

  # Check for ig condition
  if depth_cond & sample_cond:

    var,val,ig,var_type = get_best_split_ent(y, data)

    # If ig condition is fulfilled, make split
    if ig is not None and ig >= min_information_gain:

      counter += 1

      left,right = make_split(var, val, data,var_type)

      # Instantiate sub-tree
      split_type = "<=" if var_type else "in"
      question =   "{} {}  {}".format(var,split_type,val)
      # question = "\n" + counter*" " + "|->" + var + " " + split_type + " " + str(val)
      subtree = {question: []}

      # Find answers (recursion)
      yes_answer = train_tree_ent(left, y, target_factor, max_depth, min_samples_split, min_information_gain, counter)

      no_answer = train_tree_ent(right, y, target_factor, max_depth, min_samples_split, min_information_gain, counter)

      if yes_answer == no_answer:
        subtree = yes_answer

      else:
        subtree[question].append(yes_answer)
        subtree[question].append(no_answer)

    # If it doesn't match IG condition, make prediction
    else:
      pred = make_prediction(data[y],target_factor)
      return pred

   # Drop dataset if doesn't match depth or sample conditions
  else:
    pred = make_prediction(data[y],target_factor)
    return pred

  return subtree

In [15]:
def train_tree_gini(data,y, target_factor, max_depth = None,min_samples_split = None, min_information_gain = 1e-20, counter=0, max_categories = 20):
  '''
  Trains a Decission Tree
  data: Data to be used to train the Decission Tree
  y: target variable column name
  target_factor: boolean to consider if target variable is factor or numeric.
  max_depth: maximum depth to stop splitting.
  min_samples_split: minimum number of observations to make a split.
  min_information_gain: minimum ig gain to consider a split to be valid.
  max_categories: maximum number of different values accepted for categorical values. High number of values will slow down learning process. R
  '''

  # Check that max_categories is fulfilled
  if counter==0:
    types = data.dtypes
    check_columns = types[types == "object"].index
    for column in check_columns:
      var_length = len(data[column].value_counts())
      if var_length > max_categories:
        raise ValueError('The variable ' + column + ' has '+ str(var_length) + ' unique values, which is more than the accepted ones: ' +  str(max_categories))

  # Check for depth conditions
  if max_depth == None:
    depth_cond = True

  else:
    if counter < max_depth:
      depth_cond = True

    else:
      depth_cond = False

  # Check for sample conditions
  if min_samples_split == None:
    sample_cond = True

  else:
    if data.shape[0] > min_samples_split:
      sample_cond = True

    else:
      sample_cond = False

  # Check for ig condition
  if depth_cond & sample_cond:

    var,val,ig,var_type = get_best_split_gini(y, data)

    # If ig condition is fulfilled, make split
    if ig is not None and ig >= min_information_gain:

      counter += 1

      left,right = make_split(var, val, data,var_type)

      # Instantiate sub-tree
      split_type = "<=" if var_type else "in"
      question =   "{} {}  {}".format(var,split_type,val)
      # question = "\n" + counter*" " + "|->" + var + " " + split_type + " " + str(val)
      subtree = {question: []}

      # Find answers (recursion)
      yes_answer = train_tree_gini(left, y, target_factor, max_depth, min_samples_split, min_information_gain, counter)

      no_answer = train_tree_gini(right, y, target_factor, max_depth, min_samples_split, min_information_gain, counter)

      if yes_answer == no_answer:
        subtree = yes_answer

      else:
        subtree[question].append(yes_answer)
        subtree[question].append(no_answer)

    # If it doesn't match IG condition, make prediction
    else:
      pred = make_prediction(data[y],target_factor)
      return pred

   # Drop dataset if doesn't match depth or sample conditions
  else:
    pred = make_prediction(data[y],target_factor)
    return pred

  return subtree

In [16]:
def classifier_data(observation, arbol):
  question = list(arbol.keys())[0]

  if question.split()[1] == '<=':

    if observation[question.split()[0]] <= float(question.split()[2]):
      answer = arbol[question][0]
    else:
      answer = arbol[question][1]

  else:

    if observation[question.split()[0]] in (question.split()[2]):
      answer = arbol[question][0]
    else:
      answer = arbol[question][1]

  # If the answer is not a dictionary
  if not isinstance(answer, dict):
    return answer
  else:
    residual_tree = answer
    return classifier_data(observation, answer)

In [17]:
Names = ['sepal length', 'sepal width', 'petal length', 'petal width', 'class']
data = pd.read_csv('iris.data', header = None, names = Names)
df = pd.DataFrame(data)
df

Unnamed: 0,sepal length,sepal width,petal length,petal width,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


In [18]:
x = df[['sepal length', 'petal length']]
y = df[['sepal width', 'petal width']]

In [19]:
X_train, X_test, y_train, y_test = train_test_split(
...     x, y, test_size=0.3, train_size=0.7, random_state=50)

In [20]:
X_train , X_test

(     sepal length  petal length
 17            5.1           1.4
 42            4.4           1.3
 18            5.7           1.7
 146           6.3           5.0
 114           5.8           5.1
 ..            ...           ...
 70            5.9           4.8
 132           6.4           5.6
 33            5.5           1.4
 109           7.2           6.1
 139           6.9           5.4
 
 [105 rows x 2 columns],
      sepal length  petal length
 88            5.6           4.1
 72            6.3           4.9
 20            5.4           1.7
 16            5.4           1.3
 147           6.5           5.2
 140           6.7           5.6
 113           5.7           5.0
 23            5.1           1.7
 12            4.8           1.4
 68            6.2           4.5
 39            5.1           1.5
 130           7.4           6.1
 34            4.9           1.5
 112           6.8           5.5
 55            5.7           4.5
 25            5.0           1.6
 82            5

In [21]:
y_train, y_test

(     sepal width  petal width
 17           3.5          0.3
 42           3.2          0.2
 18           3.8          0.3
 146          2.5          1.9
 114          2.8          2.4
 ..           ...          ...
 70           3.2          1.8
 132          2.8          2.2
 33           4.2          0.2
 109          3.6          2.5
 139          3.1          2.1
 
 [105 rows x 2 columns],
      sepal width  petal width
 88           3.0          1.3
 72           2.5          1.5
 20           3.4          0.2
 16           3.9          0.4
 147          3.0          2.0
 140          3.1          2.4
 113          2.5          2.0
 23           3.3          0.5
 12           3.0          0.1
 68           2.2          1.5
 39           3.4          0.2
 130          2.8          1.9
 34           3.1          0.1
 112          3.0          2.1
 55           2.8          1.3
 25           3.0          0.2
 82           2.7          1.2
 48           3.7          0.2
 81         

In [22]:
max_depth = 5
min_samples_split = 20
min_information_gain  = 1e-5

In [23]:
decision_XP_ent = train_tree_ent(X_train, 'petal length',True, max_depth,min_samples_split,min_information_gain)
decision_XP_ent

{'sepal length <=  5.8': [{'sepal length <=  5.4': [{'sepal length <=  4.8': [1.3,
      1.4]},
    1.5]},
  {'sepal length <=  6.6': [{'sepal length <=  6.3': [5.1, 5.6]}, 4.4]}]}

In [24]:
decision_XP_gini = train_tree_gini(X_train, 'petal length',True, max_depth,min_samples_split,min_information_gain)
decision_XP_gini

{'sepal length <=  5.6': [{'sepal length <=  4.6': [1.3,
    {'sepal length <=  5.2': [{'sepal length <=  4.7': [1.4,
        {'sepal length <=  4.9': [1.6, 1.4]}]},
      1.5]}]},
  {'sepal length <=  6.1': [{'sepal length <=  5.8': [4.2, 5.1]},
    {'sepal length <=  7.2': [{'sepal length <=  6.5': [5.6, 4.4]}, 6.7]}]}]}

In [25]:
decision_XS_ent = train_tree_ent(X_train, 'sepal length',True, max_depth,min_samples_split,min_information_gain)
decision_XS_ent

{'petal length <=  3.6': [{'petal length <=  1.4': [4.4,
    {'petal length <=  1.6': [5.1, 5.0]}]},
  {'petal length <=  5.2': [{'petal length <=  4.7': [{'petal length <=  4.3': [5.7,
        6.6]},
      6.3]},
    {'petal length <=  5.8': [6.4, 7.7]}]}]}

In [26]:
decision_XS_gini = train_tree_gini(X_train, 'sepal length',True, max_depth,min_samples_split,min_information_gain)
decision_XS_gini

{'petal length <=  3.6': [{'petal length <=  1.4': [4.4,
    {'petal length <=  1.6': [5.1, 5.0]}]},
  {'petal length <=  5.8': [{'petal length <=  5.2': [{'petal length <=  4.0': [5.6,
        {'petal length <=  5.0': [6.1, 5.8]}]},
      6.4]},
    7.7]}]}

In [27]:
decision_yP_ent = train_tree_ent(y_train, 'petal width',True, max_depth, min_samples_split, min_information_gain)
decision_yP_ent

{'sepal width <=  3.0': [{'sepal width <=  2.8': [{'sepal width <=  2.5': [1.0,
      1.9]},
    1.3]},
  {'sepal width <=  3.4': [{'sepal width <=  3.2': [{'sepal width <=  3.1': [1.8,
        0.1]},
      0.2]},
    {'sepal width <=  3.8': [0.2, 0.4]}]}]}

In [28]:
decision_yP_gini = train_tree_gini(y_train, 'petal width',True, max_depth, min_samples_split, min_information_gain)
decision_yP_gini

{'sepal width <=  3.2': [{'sepal width <=  3.0': [{'sepal width <=  2.3': [1.0,
      {'sepal width <=  2.8': [1.9, 1.3]}]},
    {'sepal width <=  3.1': [1.8, 0.1]}]},
  {'sepal width <=  3.8': [0.2, 0.4]}]}

In [29]:
decision_yS_ent = train_tree_ent(y_train, 'sepal width',True, max_depth,min_samples_split,min_information_gain)
decision_yS_ent

{'petal width <=  1.0': [{'petal width <=  0.3': [{'petal width <=  0.2': [3.1,
      3.2]},
    3.5]},
  {'petal width <=  1.4': [2.9,
    {'petal width <=  2.0': [{'petal width <=  1.6': [3.0, 2.7]}, 2.8]}]}]}

In [30]:
decision_yS_gini = train_tree_gini(y_train, 'sepal width',True, max_depth,min_samples_split,min_information_gain)
decision_yS_gini

{'petal width <=  1.0': [{'petal width <=  0.2': [3.1,
    {'petal width <=  0.3': [3.2, 3.5]}]},
  {'petal width <=  1.4': [2.9,
    {'petal width <=  2.0': [{'petal width <=  1.9': [3.0, 2.7]}, 2.8]}]}]}

In [31]:
# petal and sepal length prediction
DT_model = DecisionTreeRegressor(max_depth=5).fit(X_train,y_train)
DT_predict = DT_model.predict(X_test)
pd.DataFrame(DT_predict)

Unnamed: 0,0,1
0,2.6375,1.15
1,2.881818,1.745455
2,3.9,0.4
3,3.633333,0.233333
4,3.055,2.07
5,3.055,2.07
6,2.8,2.0
7,3.9,0.4
8,3.12,0.19
9,3.0625,1.45


In [32]:
# petal and sepal width prediction
DT_model_y = DecisionTreeRegressor(max_depth=5).fit(y_train,X_train)
DT_predict_y = DT_model_y.predict(y_test)
pd.DataFrame(DT_predict_y)

Unnamed: 0,0,1
0,5.7,4.2
1,6.1,5.6
2,4.971429,1.542857
3,5.4,1.7
4,6.757143,5.6
5,6.05,5.35
6,6.757143,5.6
7,4.971429,1.542857
8,4.3,1.1
9,6.1,5.6


In [33]:
Accuracy = metrics.accuracy_score((y_test['petal width'].astype('int')), (DT_predict[:,0].astype('int')))
# Menghitung Presisi
Precision = metrics.precision_score((y_test['petal width'].astype('int')), (DT_predict[:,0].astype('int')), average = 'micro', zero_division= 1)
# Menghitung Sensitivitas
Sensitivity_recall = metrics.recall_score((y_test['petal width'].astype('int')), (DT_predict[:,0].astype('int')), average = 'micro', zero_division= 1)
# Menghitung Spesifisitas
Specificity = metrics.recall_score((y_test['petal width'].astype('int')), (DT_predict[:,0].astype('int')), average = 'micro', zero_division= 1)
# Menghitung F-Score
F1_score = metrics.f1_score((y_test['petal width'].astype('int')), (DT_predict[:,0].astype('int')), average = 'micro', zero_division= 1)

print("Accuracy Petal Width : ", Accuracy)
print("Precision Petal Width : ", Precision)
print("Sensitivity_recall Petal Width : ", Sensitivity_recall)
print("Specificity Petal Width : ", Specificity) 
print("F1_score Petal Width : ", F1_score)

Accuracy Petal Width :  0.022222222222222223
Precision Petal Width :  0.022222222222222223
Sensitivity_recall Petal Width :  0.022222222222222223
Specificity Petal Width :  0.022222222222222223
F1_score Petal Width :  0.022222222222222223


In [34]:
Accuracy = metrics.accuracy_score((y_test['sepal width'].astype('int')), (DT_predict[:,1].astype('int')))
# Menghitung Presisi
Precision = metrics.precision_score((y_test['sepal width'].astype('int')), (DT_predict[:,1].astype('int')), average = 'micro', zero_division= 1)
# Menghitung Sensitivitas
Sensitivity_recall = metrics.recall_score((y_test['sepal width'].astype('int')), (DT_predict[:,1].astype('int')), average = 'micro', zero_division= 1)
# Menghitung Spesifisitas
Specificity = metrics.recall_score((y_test['sepal width'].astype('int')), (DT_predict[:,1].astype('int')), average = 'micro', zero_division= 1)
# Menghitung F-Score
F1_score = metrics.f1_score((y_test['sepal width'].astype('int')), (DT_predict[:,1].astype('int')), average = 'micro', zero_division= 1)

print("Accuracy Sepal Width : ", Accuracy)
print("Precision Sepal Width : ", Precision)
print("Sensitivity_recall Sepal Width : ", Sensitivity_recall)
print("Specificity Sepal Width : ", Specificity) 
print("F1_score Sepal Width : ", F1_score) 

Accuracy Sepal Width :  0.06666666666666667
Precision Sepal Width :  0.06666666666666667
Sensitivity_recall Sepal Width :  0.06666666666666667
Specificity Sepal Width :  0.06666666666666667
F1_score Sepal Width :  0.06666666666666667


In [35]:
Accuracy = metrics.accuracy_score((X_test['petal length'].astype('int')), (DT_predict_y[:,0].astype('int')))
# Menghitung Presisi
Precision = metrics.precision_score((X_test['petal length'].astype('int')), (DT_predict_y[:,0].astype('int')), average = 'micro', zero_division= 1)
# Menghitung Sensitivitas
Sensitivity_recall = metrics.recall_score((X_test['petal length'].astype('int')), (DT_predict_y[:,0].astype('int')), average = 'micro', zero_division= 1)
# Menghitung Spesifisitas
Specificity = metrics.recall_score((X_test['petal length'].astype('int')), (DT_predict_y[:,0].astype('int')), average = 'micro', zero_division= 1)
# Menghitung F-Score
F1_score = metrics.f1_score((X_test['petal length'].astype('int')), (DT_predict_y[:,0].astype('int')), average = 'micro', zero_division= 1)

print("Accuracy Petal length : ", Accuracy)
print("Precision Petal length : ", Precision)
print("Sensitivity_recall Petal length : ", Sensitivity_recall)
print("Specificity Petal length : ", Specificity) 
print("F1_score Petal length : ", F1_score) 

Accuracy Petal length :  0.06666666666666667
Precision Petal length :  0.06666666666666667
Sensitivity_recall Petal length :  0.06666666666666667
Specificity Petal length :  0.06666666666666667
F1_score Petal length :  0.06666666666666667


In [36]:
Accuracy = metrics.accuracy_score((X_test['sepal length'].astype('int')), (DT_predict_y[:,1].astype('int')))
# Menghitung Presisi
Precision = metrics.precision_score((X_test['sepal length'].astype('int')), (DT_predict_y[:,1].astype('int')), average = 'micro', zero_division= 1)
# Menghitung Sensitivitas
Sensitivity_recall = metrics.recall_score((X_test['sepal length'].astype('int')), (DT_predict_y[:,1].astype('int')), average = 'micro', zero_division= 1)
# Menghitung Spesifisitas
Specificity = metrics.recall_score((X_test['sepal length'].astype('int')), (DT_predict_y[:,1].astype('int')), average = 'micro', zero_division= 1)
# Menghitung F-Score
F1_score = metrics.f1_score((X_test['sepal length'].astype('int')), (DT_predict_y[:,1].astype('int')), average = 'micro', zero_division= 1)

print("Accuracy Sepal length : ", Accuracy)
print("Precision Sepal length : ", Precision)
print("Sensitivity_recall Sepal length : ", Sensitivity_recall)
print("Specificity Sepal length : ", Specificity) 
print("F1_score Sepal length : ", F1_score) 

Accuracy Sepal length :  0.022222222222222223
Precision Sepal length :  0.022222222222222223
Sensitivity_recall Sepal length :  0.022222222222222223
Specificity Sepal length :  0.022222222222222223
F1_score Sepal length :  0.022222222222222223


In [37]:
confusion_matrix(y_true = (y_test['petal width'].astype('int')), y_pred = (DT_predict[:,0].astype('int')))

array([[ 0,  0,  0, 13,  1],
       [ 0,  0, 12,  8,  0],
       [ 0,  0,  1, 10,  0],
       [ 0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0]], dtype=int64)

In [38]:
confusion_matrix(y_true = (y_test['sepal width'].astype('int')), y_pred = (DT_predict[:,1].astype('int')))

array([[ 0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0],
       [ 2, 13,  3,  0,  0],
       [11,  5, 10,  0,  0],
       [ 1,  0,  0,  0,  0]], dtype=int64)

In [39]:
confusion_matrix(y_true = (X_test['petal length'].astype('int')), y_pred = (DT_predict_y[:,0].astype('int')))

array([[ 0,  0, 10,  4,  0,  0],
       [ 0,  0,  0,  5,  1,  0],
       [ 0,  0,  0,  2,  8,  0],
       [ 0,  0,  0,  0, 10,  0],
       [ 0,  0,  0,  1,  3,  1],
       [ 0,  0,  0,  0,  0,  0]], dtype=int64)

In [40]:
confusion_matrix(y_true = (X_test['sepal length'].astype('int')), y_pred = (DT_predict_y[:,1].astype('int')))

array([[ 0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0],
       [ 5,  1,  0,  0,  0,  0],
       [ 9,  3,  5,  1,  0,  0],
       [ 0,  0,  5, 11,  0,  0],
       [ 0,  0,  1,  3,  1,  0]], dtype=int64)