# **Decision Tree from scratch**

---



In [1]:
# Column names for the toy dataset, in the same order as the values in each row.
headers = ["color", "diameter", "label"]
# Each row is [color, diameter, label]; the label is always the last element.
# Note that the two ['Yellow', 3, ...] rows carry different labels.
training_data = [
    ['Green', 3, 'Apple'],
    ['Yellow', 3, 'Apple'],
    ['Red', 1, 'Grape'],
    ['Red', 1, 'Grape'],
    ['Yellow', 3, 'Lemon'],
]

### Some helper functions

In [29]:
def count_classes(rows):
    """Tally how many rows carry each class label.

    The label of a row is its last element.

    Returns:
        A dict mapping each label to its number of occurrences, keyed in
        order of first appearance.
    """
    tally = {}
    for label in (row[-1] for row in rows):
        tally[label] = tally.get(label, 0) + 1
    return tally


def is_numeric(value):
    """Return True when `value` is an int or a float, False otherwise."""
    return isinstance(value, (int, float))

In [31]:
# Quick check of the helpers: label counts for the toy data, and an int counts as numeric.
print(count_classes(training_data))
print(is_numeric(22))

{'Apple': 2, 'Grape': 2, 'Lemon': 1}
True


### Question class

In [81]:
class Question:
    """A yes/no test on a single column, used to split rows.

    Example: ``Question(0, 'Red')`` asks "Is color == Red?" and is true for
    the row ``['Red', 1, 'Grape']``.

    Attributes:
        column: index of the column the question inspects.
        value: the value the row's entry in that column is compared against.
    """

    def __init__(self, column, value):
        self.column = column
        self.value = value

    def evaluate(self, row):
        """Return whether `row` satisfies this question.

        If the row's entry is numeric the test is ``entry >= value``;
        otherwise it is ``entry == value``.
        """
        val = row[self.column]
        if is_numeric(val):
            return val >= self.value
        return val == self.value

    def __repr__(self):
        """Render the question as text, e.g. ``Is diameter >= 3?``."""
        condition = "=="
        if is_numeric(self.value):
            condition = ">="
        # The column names live in the module-level `headers` list; the
        # previous spelling `header` is not defined anywhere in the notebook.
        return "Is %s %s %s?" % (headers[self.column], condition, str(self.value))

In [85]:
# Ask whether column 0 equals 'Green' for a single example row.
q = Question(0, "Green")
q.evaluate(['Green', 1, 'Grape'])

True

### Split the data in two: rows that satisfy the question and rows that do not

In [86]:
def partition(rows, question):
    """Split `rows` by the answer `question.evaluate(row)` gives for each row.

    Returns:
        A pair ``(true_rows, false_rows)``; each list keeps the rows in their
        original order.
    """
    matched = []
    unmatched = []
    for row in rows:
        # Evaluate once per row and route it to the matching bucket.
        bucket = matched if question.evaluate(row) else unmatched
        bucket.append(row)
    return matched, unmatched

In [89]:
# Split the toy data on column 1 (diameter) against the value 3 and show the rows that satisfy it.
true_rows, false_rows = partition(training_data, Question(1, 3))
print(true_rows)

[['Green', 3, 'Apple'], ['Yellow', 3, 'Apple'], ['Yellow', 3, 'Lemon']]


### Calculate Gini impurity and information gain for rows

In [90]:
 def gini(rows):
     """Compute the Gini impurity of `rows`.

     Starts from 1 and subtracts the squared share of every class label
     found by `count_classes`. Returns 1 when `rows` is empty, because no
     label contributes a share.
     """
     n_rows = float(len(rows))
     remaining = 1
     for occurrences in count_classes(rows).values():
         share = occurrences / n_rows
         remaining -= share ** 2
     return remaining

In [93]:
# Two labels with two rows each: impurity is 1 - 0.5**2 - 0.5**2.
arr = [["Apple"],["Grape"],["Grape"],["Apple"]]
gini(arr)

0.5

In [94]:
def info_gain(true_rows, false_rows, current_uncertainty):
    """Return how much a split reduces impurity.

    The result is `current_uncertainty` minus the Gini impurity of the two
    child groups, each weighted by its share of the rows.
    """
    n_true = len(true_rows)
    n_false = len(false_rows)
    p = float(n_true) / (n_true + n_false)
    weighted_true = p * gini(true_rows)
    weighted_false = (1 - p) * gini(false_rows)
    return current_uncertainty - weighted_true - weighted_false


### Finding best possible question

In [100]:
def find_best_split(rows):
    """Find the question that yields the highest information gain on `rows`.

    Every distinct value of every feature column (all columns except the
    last, which holds the label) is tried as a question. Questions that
    leave one side of the split empty are skipped.

    Returns:
        A pair ``(best_gain, best_question)``. When no usable split exists,
        or `rows` is empty, the result is ``(0, None)``.
    """
    best_gain = 0
    best_question = None

    # Nothing to split; also avoids indexing rows[0] on empty input.
    if len(rows) == 0:
        return best_gain, best_question

    current_uncertainty = gini(rows)
    n_features = len(rows[0]) - 1  # the last column is the label

    for col in range(n_features):
        values = {row[col] for row in rows}
        for value in values:
            question = Question(col, value)
            true_rows, false_rows = partition(rows, question)
            if len(true_rows) == 0 or len(false_rows) == 0:
                continue  # this question does not divide the rows

            gain = info_gain(true_rows, false_rows, current_uncertainty)
            # `>=` means that among equally good questions the last one tried wins.
            if gain >= best_gain:
                best_gain, best_question = gain, question

    return best_gain, best_question

In [101]:
# bg is the best gain found on the toy data and bq the question that achieved it.
bg, bq = find_best_split(training_data)
bq

Is diameter >= 3?

### Tree classes


In [106]:
class Node:
    """An internal tree node: a question plus the subtrees for each answer."""

    def __init__(self, question, true_branch, false_branch):
        # Subtree followed when the question is satisfied / not satisfied.
        self.question, self.true_branch, self.false_branch = (
            question,
            true_branch,
            false_branch,
        )

class Leaf(Node):
    """A terminal node holding the label counts of the rows that reached it."""

    def __init__(self, rows):
        # A leaf asks no question and has no children.
        self.question = self.true_branch = self.false_branch = None
        # Maps each label to how many of `rows` carry it.
        self.predictions = count_classes(rows)

### Building the tree

In [107]:
def build_tree(rows):
    """Recursively build a decision tree for `rows`.

    A `Leaf` is returned when the best split has zero gain; otherwise the
    rows are partitioned by the best question and a `Node` is built from the
    subtrees of both partitions.
    """
    gain, question = find_best_split(rows)

    if gain != 0:
        true_rows, false_rows = partition(rows, question)
        # Arguments are evaluated left to right: true branch first, then false.
        return Node(question, build_tree(true_rows), build_tree(false_rows))

    return Leaf(rows)

In [108]:
def print_tree(node, spacing=""):
    """Print the tree rooted at `node`, indenting each level by two spaces."""
    if isinstance(node, Leaf):
        print(spacing + "Predict", node.predictions)
        return

    print(spacing + str(node.question))

    deeper = spacing + "  "
    branches = (
        ('--> True:', node.true_branch),
        ('--> False:', node.false_branch),
    )
    for heading, child in branches:
        print(spacing + heading)
        print_tree(child, deeper)

In [109]:
# Grow a tree from the toy dataset.
tree = build_tree(training_data)

In [110]:
# Show the questions and leaf label counts of the tree built above.
print_tree(tree)

Is diameter >= 3?
--> True:
  Is color == Yellow?
  --> True:
    Predict {'Apple': 1, 'Lemon': 1}
  --> False:
    Predict {'Apple': 1}
--> False:
  Predict {'Grape': 2}


### Predict function

In [113]:
def predict(row, tree):
    """Walk `tree` with `row` and return the label counts of the leaf reached.

    At every internal node the node's question decides whether the true or
    the false branch is followed.
    """
    node = tree
    while not isinstance(node, Leaf):
        node = node.true_branch if node.question.evaluate(row) else node.false_branch
    return node.predictions

def print_leaf(counts):
    """Convert a leaf's label counts into percentage strings.

    Args:
        counts: dict mapping each label to its count.

    Returns:
        A dict mapping each label to its share of the total as a string such
        as ``'50%'``. Shares are truncated to whole percents. Despite the
        name, nothing is printed.
    """
    total = sum(counts.values()) * 1.0
    probs = {}
    for label, count in counts.items():
        # Multiply before dividing: count / total * 100 can land just below
        # the exact value (29 / 100.0 * 100 == 28.999999999999996), which
        # int() would truncate to the wrong percent.
        probs[label] = str(int(count * 100 / total)) + "%"
    return probs

In [118]:
# Classify an unlabeled row (color, diameter) and show the leaf's label shares as percentages.
print_leaf(predict( ['Yellow', 3], tree))

{'Apple': '50%', 'Lemon': '50%'}

# **Decision Tree using sklearn**

---



In [47]:
import pandas as pd
import numpy as np
# google.colab is only importable inside Google Colab, so this cell is Colab-specific.
from google.colab import files
import io
# Prompts for a file upload; the next cell looks the content up by file name,
# so `uploaded` is presumably a mapping of file name -> file bytes (confirm against the Colab docs).
uploaded = files.upload()
#Dataset link: https://drive.google.com/file/d/13nw-uRXPY8XIZQxKRNZ3yYlho-CYm_Qt/view

Saving bill_authentication.csv to bill_authentication (4).csv


In [49]:
# Parse the uploaded CSV content from memory into a DataFrame and preview the first rows.
# NOTE(review): the key is hard-coded; the upload must be named exactly 'bill_authentication.csv'.
dataset = pd.read_csv(io.BytesIO(uploaded['bill_authentication.csv']))
dataset.head()

Unnamed: 0,Variance,Skewness,Curtosis,Entropy,Class
0,3.6216,8.6661,-2.8073,-0.44699,0
1,4.5459,8.1674,-2.4586,-1.4621,0
2,3.866,-2.6383,1.9242,0.10645,0
3,3.4566,9.5228,-4.0112,-3.5944,0
4,0.32924,-4.4552,4.5718,-0.9888,0


In [74]:
# NOTE(review): StandardScaler is imported but not used in any cell visible here.
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Features are the first four columns; the target is the column at index 4.
X = dataset.iloc[:, 0:4].values
y = dataset.iloc[:, 4].values

# Hold out 20% of the rows for testing; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)


In [75]:
from sklearn.tree import DecisionTreeClassifier

# The variable keeps its original name `regressor`, but the model is a classifier.
# random_state is fixed so that re-running the notebook grows the same tree.
regressor = DecisionTreeClassifier(random_state=0)
regressor = regressor.fit(X_train, y_train)
# predict() already returns class labels, so no 0.5 threshold is applied.
y_pred = regressor.predict(X_test)

In [76]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Compare the held-out labels with the predictions: confusion matrix,
# per-class report, and overall accuracy.
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))

[[155   2]
 [  0 118]]
              precision    recall  f1-score   support

           0       1.00      0.99      0.99       157
           1       0.98      1.00      0.99       118

    accuracy                           0.99       275
   macro avg       0.99      0.99      0.99       275
weighted avg       0.99      0.99      0.99       275

0.9927272727272727


# **Random forest using sklearn**

---



In [79]:
from sklearn.ensemble import RandomForestRegressor

# NOTE(review): a regressor is fitted on the class labels and its numeric
# output is thresholded below to get labels; RandomForestClassifier may be the
# more direct choice — confirm this is intentional.
regressor = RandomForestRegressor(n_estimators=100, random_state=0)
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)
# Turn the predicted scores into booleans: True where the score exceeds 0.5.
y_pred = (y_pred > 0.5)

In [80]:

# Same evaluation as for the decision tree, now on the thresholded forest predictions.
# Relies on the metric functions imported in an earlier cell.
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))

[[155   2]
 [  0 118]]
              precision    recall  f1-score   support

           0       1.00      0.99      0.99       157
           1       0.98      1.00      0.99       118

    accuracy                           0.99       275
   macro avg       0.99      0.99      0.99       275
weighted avg       0.99      0.99      0.99       275

0.9927272727272727
