In [22]:
import csv
import sys
import pandas as pd
import numpy as np
from random import shuffle

In [23]:
def variety(rows):
    counts = {}
    for row in rows:
        label = row[-1]
        if label not in counts:
            counts[label] = 0
        counts[label] += 1
    return counts


def is_numeric(value):
    return isinstance(value, int) or isinstance(value, float)
  

def partition(rows, question):
    true, false = [], []
    for row in rows:
        if question.match(row):
            true.append(row)
        else:
            false.append(row)
    return true, false


def entropy(rows):
    counts = variety(rows)
    entropy = 0
    for label in counts:
        prob_of_lbl = counts[label] / float(len(rows))
        entropy += -prob_of_lbl * np.log(prob_of_lbl)
    return entropy


def information_gain(left, right, current_uncertainty):
    p = float(len(left)) / (len(left) + len(right))
    return current_uncertainty - p * entropy(left) - (1 - p) * entropy(right), - p * entropy(left) - (1 - p) * entropy(
        right)


def find_best_split(rows):
    best_gain = 0
    best_question = None
    current_uncertainty = entropy(rows)
    features_count = len(rows[0]) - 1

    # print("Entropy: ", current_uncertainty)

    for col in range(features_count):

        values = set([row[col] for row in rows])

        for val in values:

            question = Question(col, val)

            true, false = partition(rows, question)

            if len(true) == 0 or len(false) == 0:
                continue

            gain, remainder = information_gain(true, false, current_uncertainty)
            # print("Remainder: ", remainder)

            if gain >= best_gain:
                best_gain, best_question = gain, question

    return best_gain, best_question


def build_tree(rows):
    gain, question = find_best_split(rows)

    # print("Gain: ", gain)

    if gain == 0:
        return Leaf(rows)

    true, false = partition(rows, question)

    true_branch = build_tree(true)

    false_branch = build_tree(false)

    return DecisionNode(question, true_branch, false_branch)


def print_tree(node, spacing=""):
    if isinstance(node, Leaf):
        print(spacing + "Prediction", node.predictions)
        return

    print(spacing + str(node.question))

    print(spacing + ' Yes:')
    print_tree(node.true_branch, spacing + "  ")

    print(spacing + ' No:')
    print_tree(node.false_branch, spacing + "  ")


def classify(row, node):
    if isinstance(node, Leaf):
        return node.predictions

    if node.question.match(row):
        return classify(row, node.true_branch)
    else:
        return classify(row, node.false_branch)


def print_leaf(counts):
    total = sum(counts.values()) * 1.0
    probs = {}
    for label in counts.keys():
        probs[label] = str(int(counts[label] / total * 100)) + "%"
    return probs


def radius_finder(continuous):
    continuous = list(continuous)
    max = sys.maxsize * (-1)
    min = sys.maxsize
    for i in range(0, continuous.__len__()):
        if float(continuous[i]) < min:
            min = float(continuous[i])
        elif float(continuous[i]) > max:
            max = float(continuous[i])
    radius = (max - min) / 3

    return radius


def map_continuous_discrete(continuous, radius):
    continuous = list(continuous)

    for i in range(0, continuous.__len__()):
        continuous[i] = float(continuous[i]) // radius
    return continuous


def combiner(lists):
    lists = list(lists)
    combined = []
    newList = []
    for i in range(0, lists[0].__len__() - 1):
        for j in range(0, lists.__len__()):
            newList.append(lists[j][i])
        combined.append(newList)
        newList = []
    return combined


def replacer(data, discrete, index):
    data = list(data)
    discrete = list(discrete)
    index = int(index)

    for i in range(0, data.__len__()):
        data[i][index] = discrete[i]
    return data

In [24]:
class Question:

    def __init__(self, column, value):
        self.column = column
        self.value = value

    def match(self, example):
        val = example[self.column]
        if is_numeric(val):
            return val >= self.value
        else:
            return val == self.value

    def __repr__(self):
        condition = "=="
        if is_numeric(self.value):
            condition = ">="
        return "Is %s %s %s?" % (
            attributes[self.column], condition, str(self.value))
        
class Leaf:
    def __init__(self, rows):
        self.predictions = variety(rows)


class DecisionNode:

    def __init__(self, question, true_branch, false_branch):
        self.question = question
        self.true_branch = true_branch
        self.false_branch = false_branch


In [25]:
from google.colab import drive
drive.mount('/content/drive')

attributes = []
with open("/content/drive/MyDrive/Datasets/diabetes.csv", newline='') as f:
    reader = csv.reader(f)
    data = list(reader)
    attributes = data.pop(0)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [26]:
combined = combiner(data)
for i in range(0, data[0].__len__() - 1):
    data = replacer(data, map_continuous_discrete(combined[i], radius_finder(combined[i])), i)

shuffle(data)

tree = build_tree(data[0:600])
print()
print("Decision Tree:")
print()
print_tree(tree)


Decision Tree:

Is Glucose >= 2.0?
 Yes:
  Is BMI >= 1.0?
   Yes:
    Is BloodPressure >= 1.0?
     Yes:
      Is Age >= 4.0?
       Yes:
        Prediction {'0': 1}
       No:
        Is Age >= 2.0?
         Yes:
          Is Pregnancies >= 2.0?
           Yes:
            Prediction {'1': 4}
           No:
            Is Insulin >= 1.0?
             Yes:
              Is SkinThickness >= 1.0?
               Yes:
                Is BloodPressure >= 2.0?
                 Yes:
                  Prediction {'1': 2}
                 No:
                  Is Age >= 3.0?
                   Yes:
                    Prediction {'1': 1}
                   No:
                    Is BMI >= 2.0?
                     Yes:
                      Prediction {'1': 1}
                     No:
                      Prediction {'0': 1, '1': 1}
               No:
                Prediction {'1': 3}
             No:
              Is Age >= 3.0?
               Yes:
                Is Pregnancies >= 1.0?
 

In [27]:
testing_data = data[600:]

counter = 0
test_num = 1

for row in testing_data:
    print(
        "Test Number %d : \nReal Answer: %s , Prediction: %s" % (
            test_num, row[-1], print_leaf(classify(row, tree))), end=", ")
    test_num += 1
    if str(print_leaf(classify(row, tree)))[2].__eq__(str(row[-1])):
        print("Result : True")
        counter += 1
    else:
        print("Result : False")

Test Number 1 : 
Real Answer: 0 , Prediction: {'1': '100%'}, Result : False
Test Number 2 : 
Real Answer: 1 , Prediction: {'0': '33%', '1': '66%'}, Result : False
Test Number 3 : 
Real Answer: 1 , Prediction: {'0': '72%', '1': '27%'}, Result : False
Test Number 4 : 
Real Answer: 0 , Prediction: {'0': '88%', '1': '11%'}, Result : True
Test Number 5 : 
Real Answer: 0 , Prediction: {'1': '64%', '0': '35%'}, Result : False
Test Number 6 : 
Real Answer: 0 , Prediction: {'1': '66%', '0': '33%'}, Result : False
Test Number 7 : 
Real Answer: 1 , Prediction: {'0': '100%'}, Result : False
Test Number 8 : 
Real Answer: 0 , Prediction: {'1': '100%'}, Result : False
Test Number 9 : 
Real Answer: 1 , Prediction: {'1': '83%', '0': '16%'}, Result : True
Test Number 10 : 
Real Answer: 0 , Prediction: {'1': '50%', '0': '50%'}, Result : False
Test Number 11 : 
Real Answer: 1 , Prediction: {'1': '100%'}, Result : True
Test Number 12 : 
Real Answer: 0 , Prediction: {'1': '9%', '0': '90%'}, Result : False
T

In [28]:
print("Number Of Right Answers: ", counter)
print("Number Of Tests: ", testing_data.__len__())
print("Accuracy: ", (counter / testing_data.__len__()) * 100, "%")

Number Of Right Answers:  109
Number Of Tests:  168
Accuracy:  64.88095238095238 %
