In [None]:
import csv

def parse(filename):
    """Read a CSV file with a header row into a list of row dictionaries.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list with one dict per data row, mapping each header name to the
        string found in that column. An empty list is returned when the file
        contains no header row at all (i.e. the file is empty).
    """
    # newline='' hands newline handling to the csv module, which its
    # documentation requires so that quoted fields containing line breaks
    # are parsed correctly.
    with open(filename, 'r', newline='') as file:
        reader = csv.reader(file)
        headers = next(reader, None)
        if headers is None:
            # Empty file: nothing to parse (previously a bare StopIteration).
            return []
        # zip() pairs header and cell by position, so a row that is longer
        # than the header is truncated and a shorter row yields fewer keys.
        return [dict(zip(headers, row)) for row in reader]

# Parse the dataset, echo every row (one dict per line), then a blank line
# followed by the number of rows.
name_of_file = "/content/imageseg.csv"
data = parse(name_of_file)
print("\n".join(str(record) for record in data))
print()
print(len(data))

{'Class': 'BRICKFACE', 'REGION-CENTROID-ROW': '3', 'REGION-PIXEL-COUNT': '3', 'SHORT-LINE-DENSITY-5': '1', 'SHORT-LINE-DENSITY-2': '1', 'VEDGE-MEAN': '1', 'VEDGE-SD': '1', 'HEDGE-MEAN': '1', 'HEDGE-SD': '1', 'INTENSITY-MEAN': '1', 'RAWRED-MEAN': '1', 'RAWBLUE-MEAN': '2', 'RAWGREEN-MEAN': '1', 'EXRED-MEAN': '1', 'EXBLUE-MEAN': '4', 'EXGREEN-MEAN': '2', 'VALUE-MEAN': '3', 'SATURATION-MEAN': '1', 'HUE-MEAN': '4'}
{'Class': 'BRICKFACE', 'REGION-CENTROID-ROW': '3', 'REGION-PIXEL-COUNT': '3', 'SHORT-LINE-DENSITY-5': '1', 'SHORT-LINE-DENSITY-2': '1', 'VEDGE-MEAN': '1', 'VEDGE-SD': '1', 'HEDGE-MEAN': '1', 'HEDGE-SD': '1', 'INTENSITY-MEAN': '1', 'RAWRED-MEAN': '2', 'RAWBLUE-MEAN': '2', 'RAWGREEN-MEAN': '1', 'EXRED-MEAN': '2', 'EXBLUE-MEAN': '4', 'EXGREEN-MEAN': '1', 'VALUE-MEAN': '3', 'SATURATION-MEAN': '1', 'HUE-MEAN': '3'}
{'Class': 'BRICKFACE', 'REGION-CENTROID-ROW': '2', 'REGION-PIXEL-COUNT': '3', 'SHORT-LINE-DENSITY-5': '1', 'SHORT-LINE-DENSITY-2': '1', 'VEDGE-MEAN': '1', 'VEDGE-SD': '1', 

In [None]:
import csv
import random
from collections import Counter
from math import log

class Node:
    """A single node of the decision tree.

    Every field except ``label`` starts out empty; the tree-building and
    pruning code fills them in after construction.
    """

    def __init__(self, label):
        # Class label stored on this node and whether it is marked as pruned.
        self.label = label
        self.pruned = False

        # Attribute this node splits on, the values seen for it, and the
        # child node reached for each of those values.
        self.attribute = None
        self.attribute_values = []
        self.children = {}

        # Link back to the split in the parent that leads to this node.
        self.parent_attribute = None
        self.parent_attribute_value = None

        # Training instances routed to this node.
        self.instances_labeled = []

def parse(filename):
    """Read a CSV file with a header row into a list of row dictionaries.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list with one dict per data row, mapping each header name to the
        string found in that column. An empty list is returned when the file
        contains no header row at all (i.e. the file is empty).
    """
    # newline='' hands newline handling to the csv module, which its
    # documentation requires so that quoted fields containing line breaks
    # are parsed correctly.
    with open(filename, 'r', newline='') as file:
        reader = csv.reader(file)
        headers = next(reader, None)
        if headers is None:
            # Empty file: nothing to parse (previously a bare StopIteration).
            return []
        # zip() pairs header and cell by position, so a row that is longer
        # than the header is truncated and a shorter row yields fewer keys.
        return [dict(zip(headers, row)) for row in reader]

def get_five_folds(instances):
    """Split ``instances`` into five class-stratified folds.

    The input list is shuffled in place. Then, class by class, the members
    of that class are dealt round-robin into the five folds, restarting at
    the first fold for every class. Each fold is shuffled before returning.

    Args:
        instances: List of dicts, each with a 'Class' key. Shuffled in place.

    Returns:
        A 5-tuple of lists ``(fold0, fold1, fold2, fold3, fold4)``.
    """
    folds = [[] for _ in range(5)]

    random.shuffle(instances)

    # Distinct class labels in the order they first appear after shuffling.
    distinct_classes = list(dict.fromkeys(item['Class'] for item in instances))

    for label in distinct_classes:
        members = [item for item in instances if label == item['Class']]
        for position, item in enumerate(members):
            folds[position % 5].append(item)

    for fold in folds:
        random.shuffle(fold)

    return tuple(folds)

def ID3(instances, default):
    """Recursively build a decision tree over ``instances``.

    At each step the attribute chosen by ``most_informative_attribute`` is
    used to partition the instances, and a subtree is built per value.

    Args:
        instances: List of dicts; each must contain a 'Class' key plus the
            attribute keys that are candidates for splitting.
        default: Label given to the returned leaf when ``instances`` is empty.

    Returns:
        The root ``Node`` of the tree built for ``instances``. Every node is
        labelled with the most common class among its instances.
    """
    if len(instances) == 0:
        return Node(default)

    majority_label = mode_class(instances)
    tree = Node(majority_label)

    # Only one distinct class left: this node is a leaf.
    if len({instance['Class'] for instance in instances}) == 1:
        return tree

    best_attribute = most_informative_attribute(instances)
    tree.attribute = best_attribute
    if best_attribute is None:
        # No attribute was selected, so there is nothing to split on. The
        # node keeps no children and therefore acts as a leaf. (This case
        # used to be absorbed by a bare ``except`` around the value lookup.)
        return tree

    # Group the instances by their value for the chosen attribute in a single
    # pass. A dict keeps first-seen order, so the order of attribute_values
    # and children is the same on every run instead of depending on set
    # iteration order.
    partitions = {}
    for instance in instances:
        partitions.setdefault(instance[best_attribute], []).append(instance)
    tree.attribute_values = list(partitions)

    for value, subset in partitions.items():
        subtree = ID3(subset, majority_label)
        subtree.instances_labeled = subset
        subtree.parent_attribute = best_attribute
        subtree.parent_attribute_value = value
        tree.children[value] = subtree
    return tree

def mode_class(instances):
    """Return the most frequent 'Class' value among ``instances``.

    Args:
        instances: Non-empty list of dicts, each with a 'Class' key.

    Returns:
        The class label with the highest count.
    """
    label_counts = Counter(instance['Class'] for instance in instances)
    (top_label, _count), = label_counts.most_common(1)[:1] or [None]
    return top_label

def prior_entropy(instances):
    """Entropy (log base 2) of the 'Class' distribution over ``instances``.

    Args:
        instances: List of dicts, each with a 'Class' key.

    Returns:
        0 when exactly one class is present; otherwise the negated sum of
        p * log2(p) over the observed classes.
    """
    label_counts = Counter(instance['Class'] for instance in instances)
    if len(label_counts) == 1:
        return 0

    total = sum(label_counts.values())
    weighted_log_sum = 0
    for count in label_counts.values():
        share = count / total
        weighted_log_sum += share * (log(share, 2))
    return -weighted_log_sum

def entropy(instances, attribute, attribute_value):
    """Class entropy (log base 2) of the instances matching one attribute value.

    Args:
        instances: List of dicts with a 'Class' key and an ``attribute`` key.
        attribute: Name of the attribute to filter on.
        attribute_value: Only instances whose ``attribute`` equals this value
            are counted.

    Returns:
        0 when the matching instances contain exactly one class; otherwise
        the negated sum of p * log2(p) over the classes among them.
    """
    label_counts = Counter(
        instance['Class']
        for instance in instances
        if instance[attribute] == attribute_value
    )
    if len(label_counts) == 1:
        return 0

    matched = sum(label_counts.values())
    weighted_log_sum = 0
    for count in label_counts.values():
        share = count / matched
        weighted_log_sum += share * (log(share, 2))
    return -weighted_log_sum

def gain_ratio(instances, attribute):
    """Gain ratio of splitting ``instances`` on ``attribute``.

    Gain ratio is the information gain (prior entropy minus the weighted
    entropy after the split) divided by the split information (entropy of
    the attribute's own value distribution).

    Args:
        instances: List of dicts with a 'Class' key and an ``attribute`` key.
        attribute: Name of the attribute to evaluate.

    Returns:
        The gain ratio, or the sentinel -1000 when the split information is
        zero and the ratio cannot be computed.
    """
    baseline_entropy = prior_entropy(instances)
    value_counts = Counter(instance[attribute] for instance in instances)
    total = len(instances)

    conditional_entropy = 0
    weighted_log_sum = 0
    for value, count in value_counts.items():
        share = count / total
        conditional_entropy += (share * entropy(instances, attribute, value))
        weighted_log_sum += share * (log(share, 2))

    split_information = -weighted_log_sum
    if split_information == 0:
        return -1000
    return (baseline_entropy - conditional_entropy) / split_information

def most_informative_attribute(instances):
    """Pick the attribute with the highest gain ratio.

    Candidate attributes are the keys of the first instance, minus 'Class'.
    An attribute is only selected if its gain ratio is strictly greater than
    -1000, and on ties the earliest candidate wins.

    Args:
        instances: Non-empty list of dicts sharing the same keys.

    Returns:
        The selected attribute name, or None if no candidate scored above
        -1000.
    """
    candidates = list(instances[0])
    candidates.remove('Class')

    best_name, best_score = None, -1000
    for name in candidates:
        score = gain_ratio(instances, name)
        if score > best_score:
            best_name, best_score = name, score
    return best_name

def accuracy(trained_tree, test_instances):
    """Fraction of ``test_instances`` whose 'Class' the tree predicts correctly.

    Args:
        trained_tree: Root node passed to ``predict``.
        test_instances: Non-empty list of dicts, each with a 'Class' key.

    Returns:
        Number of correct predictions divided by the number of instances.
    """
    hits = sum(
        1
        for example in test_instances
        if predict(trained_tree, example) == example['Class']
    )
    return hits / len(test_instances)

def predict(node, test_instance):
    """Predict the class of ``test_instance`` by walking the tree from ``node``.

    Args:
        node: Node to start from.
        test_instance: Dict holding a value for every attribute tested on
            the path taken.

    Returns:
        The label of the leaf that is reached. If the instance's value has
        no child under a node, or that child is marked pruned, the most
        common class among the instances of all of that node's children is
        returned instead.
    """
    if not node.children:
        return node.label

    branch = node.children.get(test_instance[node.attribute])
    if branch is not None and not branch.pruned:
        return predict(branch, test_instance)

    # Unseen value or pruned branch: fall back to the majority class of
    # everything that was routed below this node.
    pooled = [
        instance
        for attr_value in node.attribute_values
        for instance in node.children[attr_value].instances_labeled
    ]
    return mode_class(pooled)

# Root of the tree currently being pruned; set by prune() so the nested
# helper can score the whole tree while toggling individual nodes.
TREE = None
def prune(node, val_instances):
    """Mark nodes of the tree rooted at ``node`` as pruned where that helps.

    Nodes are visited children-first. For each node, whole-tree accuracy on
    ``val_instances`` is measured, the node is marked pruned, and accuracy
    is measured again; the mark is kept only if accuracy strictly increased.

    Args:
        node: Root of the tree to prune; also stored in the global ``TREE``.
        val_instances: Validation instances used to score the tree.
    """
    global TREE
    TREE = node

    def _visit(current, val_instances):
        # Handle everything below this node first (a leaf has no children,
        # so the loop is simply skipped).
        for child_node in current.children.values():
            _visit(child_node, val_instances)

        baseline = accuracy(TREE, val_instances)
        current.pruned = True
        if baseline >= accuracy(TREE, val_instances):
            # No strict improvement: undo the pruning mark.
            current.pruned = False

    _visit(TREE, val_instances)


In [None]:
#import id3
#import parse
#import random
#import five_fold_stratified_cv
from matplotlib import pyplot as plt

ALGORITHM_NAME = "Iterative Dichotomiser 3"

def main():
    """Run the interactive pruned-vs-unpruned ID3 learning-curve experiment.

    Prompts on stdin for the input CSV file, a trace-file name and an image
    file name. The data is shuffled; the first tenth is held out as the
    pruning validation set and the rest is split into five stratified folds.
    For each training-set size, a pruned and an unpruned tree are trained on
    a prefix of each fold's training split and scored on that fold's test
    split. The per-size averages are printed, written to the trace file and
    plotted; the plot is saved to the image file and shown.
    """
    print("Welcome to the " +  ALGORITHM_NAME + " Program!")
    print()
    file_name = input("Enter the name of your input file (e.g. car.txt): ")

    trace_runs_file = input("Enter the name of your trace runs file (e.g. car_id3_trace_runs.txt): ")
    imagefile = input("Enter the name of the graphed results file (e.g. foo.png): ")

    pruned_accuracies_avgs = []
    unpruned_accuracies_avgs = []

    # The context manager closes the trace file even if parsing or training
    # raises part-way through.
    with open(trace_runs_file, "w") as outfile_tr:
        outfile_tr.write("Welcome to the " +  ALGORITHM_NAME + " Program!" + "\n")
        outfile_tr.write("\n")

        # parse, mode_class, get_five_folds, ID3, prune and accuracy are
        # defined in this notebook, so they are called directly; there are
        # no `parse` / `id3` / `five_fold_stratified_cv` modules in scope.
        data = parse(file_name)
        random.shuffle(data)

        # Largest training size to try: 72% of the data rounded down to a
        # multiple of 10, plus 10 (with a floor of 50 for tiny datasets).
        scaled_size = round(len(data) * 0.9 * 0.8)
        upper_limit = (scaled_size - scaled_size % 10) + 10
        if upper_limit <= 10:
            upper_limit = 50

        default = mode_class(data)
        holdout_count = 1 * len(data) // 10
        validation_set = data[:holdout_count]
        data = data[holdout_count:len(data)]

        folds = get_five_folds(data)

        # Experiment i tests on fold i and trains on the other four folds,
        # concatenated in fold order.
        testset = list(folds)
        trainset = [
            [instance
             for other_index, fold in enumerate(folds) if other_index != test_index
             for instance in fold]
            for test_index in range(len(folds))
        ]

        # A step of 0 (fewer than 20 training instances) would make range()
        # raise ValueError, so never step by less than 1.
        step_size = max(1, len(trainset[0]) // 20)
        training_sizes = range(10, upper_limit, step_size)

        for length in training_sizes:
            print('Number of Training Instances:', length)
            outfile_tr.write('Number of Training Instances:' + str(length) +"\n")
            pruned_accuracies = []
            unpruned_accuracies = []

            for experiment in range(5):
                train = trainset[experiment][: length]
                test = testset[experiment]

                tree = ID3(train, default)
                prune(tree, validation_set)
                pruned_accuracies.append(accuracy(tree, test))

                # prune() marks nodes on the tree it is given, so the
                # unpruned score needs a freshly built tree.
                tree = ID3(train, default)
                unpruned_accuracies.append(accuracy(tree, test))

            avg_pruned_accuracies = sum(pruned_accuracies) / len(pruned_accuracies)
            avg_unpruned_accuracies = sum(unpruned_accuracies) / len(unpruned_accuracies)

            print("Classification Accuracy for Pruned Tree:", avg_pruned_accuracies)
            print("Classification Accuracy for Unpruned Tree:", avg_unpruned_accuracies)
            print()
            outfile_tr.write("Classification Accuracy for Pruned Tree:" + str(
                avg_pruned_accuracies) + "\n")
            outfile_tr.write("Classification Accuracy for Unpruned Tree:" + str(avg_unpruned_accuracies) +"\n\n")
            pruned_accuracies_avgs.append(avg_pruned_accuracies)
            unpruned_accuracies_avgs.append(avg_unpruned_accuracies)

    plt.plot(training_sizes, pruned_accuracies_avgs, label='pruned tree')
    plt.plot(training_sizes, unpruned_accuracies_avgs, label='unpruned tree')
    plt.xlabel('Number of Training Instances')
    plt.ylabel('Classification Accuracy on Test Instances')
    plt.grid(True)
    plt.title("Learning Curve for " +  str(file_name))
    plt.legend()
    plt.savefig(imagefile)
    plt.show()


# Start the experiment; main() prompts on stdin for the file names it needs.
main()

Welcome to the Iterative Dichotomiser 3 Program!

Enter the name of your input file (e.g. car.txt): imageseg.csv
Enter the name of your trace runs file (e.g. car_id3_trace_runs.txt): imagetrace.csv
Enter the name of the graphed results file (e.g. foo.png): food.png


AttributeError: ignored