<a href="https://colab.research.google.com/github/Rufidatul726/5th-Semeter/blob/main/DBMSII/DecisionTree/newDecisionTree.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [127]:
import csv

def load_csv(filename):
    with open(filename, 'r') as csvfile:
        csvreader = csv.reader(csvfile)
        headers = next(csvreader)
        for row in csvreader:
            dataset.append(row)
    return dataset, headers
    
filename = '/content/DecisionTreee.csv'
dataset, headers = load_csv(filename)


In [136]:
import random

def split_dataset(dataset, split_ratio):
    train_size = int(len(dataset) * split_ratio)
    train_set = []
    test_set = list(dataset)
    while len(train_set) < train_size:
        index = random.randrange(len(test_set))
        train_set.append(test_set.pop(index))
    return train_set, test_set

split_ratio = 0.8
train_set, test_set = split_dataset(dataset, split_ratio)


In [137]:
import math

def entropy(rows):
    counts = {}
    for row in rows:
        target = row[-1]
        if target not in counts:
            counts[target] = 0
        counts[target] += 1
    entropy = 0.0
    for target in counts:
        probability = float(counts[target]) / len(rows)
        entropy -= probability * math.log2(probability)
    return entropy

In [138]:
def split_rows(rows, column, value):
    left_rows = []
    right_rows = []
    for row in rows:
        if row[column] == value:
            left_rows.append(row)
        else:
            right_rows.append(row)
    return left_rows, right_rows

In [139]:
def information_gain(current_entropy, left_rows, right_rows):
    p = float(len(left_rows)) / (len(left_rows) + len(right_rows))
    return current_entropy - p * entropy(left_rows) - (1 - p) * entropy(right_rows)


In [140]:
def find_best_split(rows):
    current_entropy = entropy(rows)
    best_information_gain = 0.0
    best_attribute = None
    best_value = None
    for column in range(len(rows[0]) - 1):
        values = set([row[column] for row in rows])
        for value in values:
            left_rows, right_rows = split_rows(rows, column, value)
            if len(left_rows) == 0 or len(right_rows) == 0:
                continue
            gain = information_gain(current_entropy, left_rows, right_rows)
            if gain > best_information_gain:
                best_information_gain = gain
                best_attribute = column
                best_value = value
    return best_attribute, best_value

In [141]:
def build_tree(rows):
    if len(rows) == 0:
        return None
    target_values = set([row[-1] for row in rows])
    if len(target_values) == 1:
        return target_values.pop()
    best_attribute, best_value = find_best_split(rows)
    if best_attribute is None:
        return None
    left_rows, right_rows = split_rows(rows, best_attribute, best_value)
    left_subtree = build_tree(left_rows)
    right_subtree = build_tree(right_rows)
    return (best_attribute, best_value, left_subtree, right_subtree)

tree = build_tree(train_set)

In [142]:
def print_tree(tree, indent=''):
    if tree is None:
        return None
    if isinstance(tree, str):
        print(indent + tree)
    else:
        attribute, value, left_subtree, right_subtree = tree
        print(indent + f'Attribute {attribute} = {value}')
        print_tree(left_subtree, indent + '  ')
        print_tree(right_subtree, indent + '  ')


In [143]:
def predict(tree, row):
    if tree is None:
        return None
    if isinstance(tree, str):
        return tree
    attribute, value, left_subtree, right_subtree = tree
    if row[attribute] == value:
        return predict(left_subtree, row)
    else:
        return predict(right_subtree, row)

print_tree(tree)
predictions = []
for row in test_set:
    prediction = predict(tree, row)
    predictions.append(prediction)

accuracy = sum([1 for i in range(len(test_set)) if test_set[i][-1] == predictions[i]]) / float(len(test_set))
print('Accuracy: %.3f%%' % (accuracy * 100.0))


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
                Attribute 5 = slightly_prob
                  Attribute 4 = critical
                    Attribute 1 = critical
                    inconv
                  Attribute 4 = critical
                    Attribute 1 = critical
                Attribute 1 = very_crit
                  Attribute 5 = slightly_prob
                    Attribute 0 = great_pret
                    Attribute 4 = critical
                      Attribute 0 = great_pret
                  Attribute 5 = slightly_prob
                    Attribute 0 = pretentious
                      Attribute 4 = critical
                        Attribute 1 = critical
                      Attribute 1 = critical
                        Attribute 4 = critical
                        Attribute 4 = critical
                          Attribute 1 = improper
                    Attribute 0 = great_pret
                      Attribute 1 = improper
             