# ID3 Decision Tree

In [191]:
import math
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from sklearn.preprocessing import KBinsDiscretizer

## Data Processing

### Reading Data

##### Wine Quality Dataset

In [192]:
def filterColumnNames(df):
    """
    Filter column names to remove any non-numeric characters
    """
    toRemove = ['"']
    columnNames = df.columns.values.tolist()
    filteredColumnNames = [''.join(letter for letter in column if letter not in toRemove) for column in columnNames]
    df.set_axis(filteredColumnNames, axis=1, inplace=True)
    return df

In [193]:
def getWineQualityData(datatype, path):
    """
    Get the wine quality data from the given path, having the option to process only red wine, only white wine, or both
    """    
    if datatype == "red":
        df = pd.read_csv(path + 'winequality-red.csv', sep='[,;]', engine='python', index_col=None, header=0)
        pass
    elif datatype == "white":
        df = pd.read_csv(path + 'winequality-white.csv', sep='[,;]', engine='python')
        pass
    else:
        wineList = []
        whiteDF = pd.read_csv(path + 'winequality-white.csv', sep='[,;]', engine='python')
        wineList.append(whiteDF)
        
        redDF = pd.read_csv(path + 'winequality-red.csv', sep='[,;]', engine='python')
        wineList.append(redDF)
        
        df = pd.concat(wineList, axis=0, ignore_index=True)
        pass
    
    df = filterColumnNames(df)
    #print(df)
    return df

##### Wine Dataset

In [194]:
def getWineData(path):
    """
    Get the wine data from the given path
    """
    #fill the header list with the values above
    headers = ['Class', 'Alcohol', 'Malic Acid', 'Ash', 'Alcalinity of Ash', 'Magnesium', 'Total Phenols', 'Flavanoids', 'Nonflavanoid Phenols', 'Proanthocyanins', 'Color Intensity', 'Hue', 'OD280/OD315 of diluted wines', 'Proline']
    
    df = pd.read_csv(path, sep='[,;]', engine='python', index_col=None, header=None, names=headers)
    
    #df = filterColumnNames(df)
    #data.set_index('Locality', inplace=True)
    return df

##### Zoo Dataset

In [195]:
def getZooData(path):
    """
    Get the zoo data from the given path
    """
    headers = ['animal name', 'hair', 'feathers', 'eggs', 'milk', 'airborne', 'aquatic', 'predator', 'toothed', 'backbone', 'breathes', 'venomous', 'fins', 'legs', 'tail', 'domestic', 'catsize', 'type']
    
    df = pd.read_csv(path, engine='python', index_col=0, header=None, names=headers)
    
    #df = filterColumnNames(df)
    #data.set_index('Locality', inplace=True)
    return df

### Discretisation

In [196]:
def discretiseDataSKLearn(data):
    """
    :n_bins: The number of bins to produce. Raises ValueError if n_bins < 2.
    :encode: Method used to encode the transformed result.
        onehot:
            Encode the transformed result with one-hot encoding and return a sparse matrix. Ignored features are always stacked to the right.

        onehot-dense:
            Encode the transformed result with one-hot encoding and return a dense array. Ignored features are always stacked to the right.

        ordinal:
            Return the bin identifier encoded as an integer value.
    :strategy: Strategy used to define the widths of the bins.
        uniform:
            All bins in each feature have identical widths.

        quantile:
            All bins in each feature have the same number of points.

        kmeans:
            Values in each bin have the same nearest center of a 1D k-means cluster.
    """
    enc = KBinsDiscretizer(n_bins=4, encode='ordinal', strategy='uniform')
    enc.fit(data)
    print(enc.transform(data))
    return data

In [197]:
def discretiseDatawithBoundaries(dataCol, boundaries, strategy='uniform'):
    """
    Discretise the data using the boundaries
    
    :dataCol: The column to discretise
    :boundaries: The boundaries to use
    :strategy: The strategy to use for discretisation, can be 'uniform', 'quantile' or 'kmeans'
    """
    dataCol = dataCol.to_numpy()
    # add min and max values of your data
    boundaries = sorted({dataCol.min(), dataCol.max() + 1} | set(boundaries))
    
    #print("data", '\n', dataCol)
    #print("min data", '\n', dataCol.min())
    #print("max data", '\n', dataCol.max() + 1)

    if strategy == 'uniform':
        #Uniformly-sized bins
        uniformBins = pd.cut(dataCol, bins=boundaries, right=False)
        
        print('\n', "Uniformly-sized bins")
        print(uniformBins, '\n', uniformBins.dtype, '\n')
        
        return uniformBins
    
    elif strategy == 'quantile':
        #Bins with "equal" numbers of samples inside
        equalBins = pd.cut(dataCol, bins=boundaries, labels=range(len(boundaries) - 1), right=False)
        
        #print('\n', "Bins with 'equal' numbers of samples inside")
        #print(equalBins, '\n', equalBins.dtype, '\n')
        
        return equalBins
    
    elif strategy == 'kmeans':
        #Bins based on K-means clustering
        clusteringBins = pd.cut(dataCol, bins=boundaries, labels=range(len(boundaries) - 1), right=False).astype(int)
        
        #print('\n', "Bins based on K-means clustering")
        #print(clusteringBins, '\n', len(list(set(clusteringBins))), '\n', clusteringBins.dtype, '\n')
        
        return clusteringBins
    
    else:
        print("Invalid bin width strategy specified.\n", "Please choose from 'uniform', 'quantile' or 'kmeans'.")


In [198]:
def discretize(dataset, cont_attributes, target_attribute_name, strategy='uniform'):
    """
    Finding the boundaries for each continuous attribute, and then discretizing the data

    :dataset: The dataset to discretize
    :cont_attributes: The continuous attributes to discretize
    :target_attribute_name: The target attribute name
    :strategy: The strategy to use for discretisation, can be 'uniform', 'quantile' or 'kmeans'
    """
    
    for cont_att in cont_attributes:
        # print(cont_att)
        pd.to_numeric(dataset[cont_att])
        dataset = dataset.sort_values(by=[cont_att])
        dataset['Changes']  = dataset[target_attribute_name].shift() != dataset[target_attribute_name]
        
        changes_lst = dataset['Changes'].to_numpy()
        
        dataChanges = []
        
        for i in range(len(changes_lst)):
            if changes_lst[i] and i!=0:
                dataChanges.append(dataset[cont_att][i])
                
        # removing duplicate entries
        dataChanges = list(set(dataChanges))
        dataChanges.sort()
        averageDiff = (max(dataChanges)-min(dataChanges))/(len(dataChanges)-1)
        
        boundaries = []
        previousBound = dataChanges[0]
        
        for boundary in dataChanges:        
            diff = boundary - previousBound
            
            #print("Difference: ", diff, 'Avg Diff: ', averageDiff)
            #print("Previous Bound: ", previousBound)
            
            if diff > averageDiff:
                # print("Added to boundaries: ", boundary , '\n')
                boundaries.append(boundary)
                previousBound = boundary
        
        boundaries.sort()
        # print(boundaries)
        
        # restoring dataset to original format
        dataset = dataset.drop('Changes',axis=1)
        dataset = dataset.sort_index()
        
        dataset[cont_att] = discretiseDatawithBoundaries(dataset[cont_att], boundaries, strategy)
    return dataset

## ID3 Decision Tree

In [199]:
class ID3Classifier:
    def fit(self, input_data, output):
        """
        Method to fit the model

        :input_data: The training data
        :output: The training labels
        """

        data = input_data.copy()
        data[output.name] = output
        self.tree = self.decisionTree(
            data, data, input_data.columns, output.name)

    def predict(self, input_data):
        """
        Method to predict the labels

        :input_data: The data to predict
        """

        predictions = []

        # convert input_data data into a list of dictionaries
        samples = input_data.to_dict(orient='records')

        # predict the label for each dictionary in the list of samples
        for sample in samples:
            predictions.append(self.getPrediction(sample, self.tree, 1.0))

        return predictions

    def getEntropy(self, attribute_column):
        """
        :attribute_column: Data from the attribute column
        """

        # get the unique values and their counts for the given attribute
        values, counts = np.unique(attribute_column, return_counts=True)

        # storing the getEntropy values for each unique value
        getEntropy_list = []

        for i in range(len(values)):
            # calculate the probability of the value
            probability = counts[i]/np.sum(counts)
            # calculate the getEntropy for the value
            getEntropy_list.append(-probability*np.log2(probability))

        # calculate the total getEntropy
        total_getEntropy = np.sum(getEntropy_list)

        return total_getEntropy

    def getInformationGain(self, data, feature_attribute_name, target_attribute_name):
        """
        :data: Dataframe containing data
        :feature_attribute_name: String containing the name of the feature attribute
        :target_attribute_name: String containing the name of the target attribute
        """

        # get total getEntropy of given subset
        total_getEntropy = self.getEntropy(data[target_attribute_name])

        # get the unique values and their counts for the given attribute to be split
        values, counts = np.unique(
            data[feature_attribute_name], return_counts=True)

        # storing the weighted getEntropy values for each unique value
        weighted_getEntropy_list = []

        for i in range(len(values)):
            # calculate the probability of the value
            subset_probability = counts[i]/np.sum(counts)
            subset_getEntropy = self.getEntropy(data.where(
                data[feature_attribute_name] == values[i]).dropna()[target_attribute_name])  # calculate the getEntropy for the value
            # calculate the weighted getEntropy for the value
            weighted_getEntropy_list.append(
                subset_probability*subset_getEntropy)

        # calculate the weighted total getEntropy
        total_weighted_getEntropy = np.sum(weighted_getEntropy_list)

        # calculate information gain
        getInformationGain = total_getEntropy - total_weighted_getEntropy

        return getInformationGain

    def decisionTree(self, data, original_data, feature_attribute_names, target_attribute_name, parent_node_class=None, max_depth=6, min_samples_split=None):
        """
        :data: Dataframe containing data
        :original_data: Dataframe containing the original data
        :feature_attribute_names: List containing the names of the feature attributes
        :target_attribute_name: String containing the name of the target attribute
        :parent_node_class: String containing the class of the parent node
        """

        if (max_depth is not None) and (min_samples_split is not None):
            if (max_depth == 0) or (len(data) < min_samples_split):
                return parent_node_class

        # get the unique classes of the target attribute
        unique_classes = np.unique(data[target_attribute_name])

        # if all the classes are the same, return the class
        if len(unique_classes) <= 1:
            return unique_classes[0]

        # if there are no samples, return majority class of original data
        elif len(data) == 0:
            majority_class_index = np.argmax(
                np.unique(original_data[target_attribute_name], return_counts=True)[1])

            return np.unique(original_data[target_attribute_name])[majority_class_index]

        # if data set contains no features to train with, return parent node class
        elif len(feature_attribute_names) == 0:
            return parent_node_class
        # if none of the above are true, construct a branch:
        else:
            # determine parent node class of current branch
            majority_class_index = np.argmax(
                np.unique(data[target_attribute_name], return_counts=True)[1])
            parent_node_class = unique_classes[majority_class_index]

            # determine information gain values for each feature
            # choose feature which best splits the data, ie. highest value
            ig_values = [self.getInformationGain(
                data, feature, target_attribute_name) for feature in feature_attribute_names]
            best_feature_index = np.argmax(ig_values)
            best_feature = feature_attribute_names[best_feature_index]

            # create tree structure, empty at first
            tree = {best_feature: {}}

            # remove best feature from available features, it will become the parent node
            feature_attribute_names = [
                i for i in feature_attribute_names if i != best_feature]

            # create nodes under parent node
            parent_attribute_values = np.unique(data[best_feature])
            for value in parent_attribute_values:
                sub_data = data.where(data[best_feature] == value).dropna()

                # call the algorithm recursively
                subtree = self.decisionTree(
                    sub_data, original_data, feature_attribute_names, target_attribute_name, parent_node_class, max_depth=max_depth-1, min_samples_split=min_samples_split)

                # add subtree to original tree
                tree[best_feature][value] = subtree

            return tree

    def getPrediction(self, sample, tree, default=1):
        """
        :sample: Dictionary containing the sample to predict
        :tree: Dictionary containing the decision tree
        :default: Default value to return if no prediction can be made
        """
        # map sample data to tree
        for attribute in list(sample.keys()):
            # check if feature exists in tree
            if attribute in list(tree.keys()):
                try:
                    result = tree[attribute][sample[attribute]]
                except:
                    return default

                result = tree[attribute][sample[attribute]]

                # if more attributes exist within result, recursively find best result
                if isinstance(result, dict):
                    return self.getPrediction(sample, result)
                else:
                    return result

    # perform PostPruning on the tree
    def postPrune(self, tree, validation_data, validation_labels, max_depth=None):
        """
        :tree: Dictionary containing the decision tree
        :validation_data: Dataframe containing the validation data
        :validation_labels: Dataframe containing the validation labels
        :max_depth: Integer containing the maximum depth of the tree
        """
        # if tree is a leaf, return the label
        if not isinstance(tree, dict):
            return tree
        # if tree is a node, postPrune the branches
        else:

            # determine majority class of validation data
            majority_class_index = np.argmax(
                np.unique(validation_labels, return_counts=True)[1])
            majority_class = np.unique(validation_labels)[majority_class_index]

            # calcluate the nested length of the tree
            nested_length = 0
            for attribute, branches in tree.items():
                nested_length += len(branches)

            if nested_length == 1:
                return majority_class

            # determine the depth of the tree
            depth = 0
            current_node = tree
            while isinstance(current_node, dict):
                current_node = list(current_node.values())[0]
                depth += 1

            # if max depth is not reached, postPrune the branches
            if max_depth is None or depth < max_depth:
                for attribute, branches in tree.items():
                    for value, subtree in branches.items():
                        # postPrune the subtree
                        subtree = self.postPrune(
                            subtree, validation_data, validation_labels, max_depth)

                        # if subtree is a leaf, replace the branch with the majority class
                        if not isinstance(subtree, dict):
                            tree[attribute][value] = majority_class
                        # if subtree is a node, replace the branch with the subtree
                        else:
                            tree[attribute][value] = subtree
                return tree
            else:
                return majority_class

    # print the id3 tree in a readable horizontal format
    def printTree(self, tree, depth=0):
        """
        :tree: Dictionary containing the decision tree
        :depth: Integer containing the depth of the tree
        """
        # if tree is a leaf, print the label
        if not isinstance(tree, dict):
            print('|\t' * depth + str(tree))
        # if tree is a node, print the feature and recursively print the branches
        else:
            for attribute, branches in tree.items():
                print('|\t' * depth + str(attribute))
                for value, subtree in branches.items():
                    print('|\t' * depth + '-->', str(value))
                    self.printTree(subtree, depth + 1)


In [200]:
def runModelWithPostPruning(dataDf, classes, test_size=0.2, validation_size=0.2):
    # create an empty dataframe
    data_df = dataDf

    # drop rows with missing values, missing = ?
    data_df = data_df.replace("?", np.nan)
    data_df = data_df.dropna()

    # organize data into input and output
    X = data_df.drop(columns=classes)
    y = data_df[classes]

    # split data into training, testing and validation sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=5)
    X_train, X_validation, y_train, y_validation = train_test_split(X_train, y_train, test_size=validation_size, random_state=5)
    
    # initialize and fit model
    model = ID3Classifier()
    model.fit(X_train, y_train)

    # print the tree
    model.printTree(model.tree)

    # make predictions
    predictions = model.predict(X_test)

    # calculate accuracy
    accuracy = accuracy_score(y_test, predictions)
    print("Accuracy:", accuracy)

    # postPrune the tree
    # model.postPrune(model.tree, X_validation, y_validation)
    model.postPrune(model.tree, X_test, y_test, max_depth=3)

    # print the tree
    model.printTree(model.tree)

    # make predictions
    predictions = model.predict(X_test)

    # calculate accuracy
    accuracy_after_postPrune = accuracy_score(y_test, predictions)
    print("Accuracy after PostPruning:", accuracy_after_postPrune)

    return accuracy, accuracy_after_postPrune

In [201]:
def runModel(dataDf, classes, testSize=0.33):
    # create an empty dataframe
    data_df = dataDf

    # drop rows with missing values, missing = ?
    data_df = data_df.replace("?", np.nan)
    data_df = data_df.dropna()

    # organize data into input and output
    X = data_df.drop(columns=classes)
    y = data_df[classes]

    x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=testSize)

    # initialize and fit model
    model = ID3Classifier()
    model.fit(x_train, y_train)

    # print the tree
    model.printTree(model.tree)

    # make predictions
    predictions = model.predict(x_test)

    # calculate accuracy
    accuracy = accuracy_score(y_test, predictions)
    print("Accuracy:", accuracy)

    return accuracy

# Runner Code

In [202]:
zooData = getZooData("Datasets/Zoo/zoo.data")
display(zooData)

accuracy = runModelWithPostPruning(zooData, 'type')

Unnamed: 0_level_0,hair,feathers,eggs,milk,airborne,aquatic,predator,toothed,backbone,breathes,venomous,fins,legs,tail,domestic,catsize,type
animal name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
aardvark,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,1
antelope,1,0,0,1,0,0,0,1,1,1,0,0,4,1,0,1,1
bass,0,0,1,0,0,1,1,1,1,0,0,1,0,1,0,0,4
bear,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,1
boar,1,0,0,1,0,0,1,1,1,1,0,0,4,1,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
wallaby,1,0,0,1,0,0,0,1,1,1,0,0,2,1,0,1,1
wasp,1,0,1,0,1,0,0,0,0,1,1,0,6,0,0,0,6
wolf,1,0,0,1,0,0,1,1,1,1,0,0,4,1,0,1,1
worm,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,7


legs
--> 0
|	fins
|	--> 0.0
|	|	toothed
|	|	--> 0.0
|	|	|	7.0
|	|	--> 1.0
|	|	|	3.0
|	--> 1.0
|	|	eggs
|	|	--> 0.0
|	|	|	1.0
|	|	--> 1.0
|	|	|	4.0
--> 2
|	hair
|	--> 0.0
|	|	2.0
|	--> 1.0
|	|	1.0
--> 4
|	hair
|	--> 0.0
|	|	aquatic
|	|	--> 0.0
|	|	|	3.0
|	|	--> 1.0
|	|	|	toothed
|	|	|	--> 0.0
|	|	|	|	7.0
|	|	|	--> 1.0
|	|	|	|	5.0
|	--> 1.0
|	|	1.0
--> 6
|	6.0
--> 8
|	7.0
Accuracy: 0.9523809523809523
legs
--> 0
|	fins
|	--> 0.0
|	|	toothed
|	|	--> 0.0
|	|	|	7.0
|	|	--> 1.0
|	|	|	3.0
|	--> 1.0
|	|	eggs
|	|	--> 0.0
|	|	|	1.0
|	|	--> 1.0
|	|	|	4.0
--> 2
|	hair
|	--> 0.0
|	|	2.0
|	--> 1.0
|	|	1.0
--> 4
|	hair
|	--> 0.0
|	|	aquatic
|	|	--> 0.0
|	|	|	3.0
|	|	--> 1.0
|	|	|	toothed
|	|	|	--> 0.0
|	|	|	|	7.0
|	|	|	--> 1.0
|	|	|	|	5.0
|	--> 1.0
|	|	1.0
--> 6
|	6.0
--> 8
|	7.0
Accuracy after PostPruning: 0.9523809523809523


---------------------------------------------------------------------------------------------------------

In [203]:
wineData = getWineData("Datasets/Wine/wine.data")
wineData = discretize(wineData, cont_attributes= ['Alcohol', 'Malic Acid', 'Ash', 'Alcalinity of Ash', 'Magnesium', 'Total Phenols', 'Flavanoids', 'Nonflavanoid Phenols', 'Proanthocyanins', 'Color Intensity', 'Hue', 'OD280/OD315 of diluted wines', 'Proline'], target_attribute_name='Class', strategy="kmeans")
display(wineData)

accuracy = runModel(wineData, 'Class')

Unnamed: 0,Class,Alcohol,Malic Acid,Ash,Alcalinity of Ash,Magnesium,Total Phenols,Flavanoids,Nonflavanoid Phenols,Proanthocyanins,Color Intensity,Hue,OD280/OD315 of diluted wines,Proline
0,1,29,10,17,5,16,20,12,5,19,16,14,32,23
1,1,20,11,8,1,8,18,11,4,7,11,14,29,22
2,1,20,16,25,10,8,20,13,6,20,16,13,25,26
3,1,29,13,19,7,12,27,14,3,18,20,8,29,28
4,1,21,18,29,15,13,20,10,10,14,11,14,21,14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,3,25,35,18,14,6,7,1,15,5,20,3,6,14
174,3,22,28,19,19,9,8,1,11,9,20,4,4,15
175,3,21,30,12,13,14,5,1,11,8,26,2,4,16
176,3,20,18,15,13,14,6,1,15,10,24,2,5,16


Flavanoids
--> 0
|	Alcohol
|	--> 7.0
|	|	3.0
|	--> 8.0
|	|	3.0
|	--> 10.0
|	|	2.0
|	--> 15.0
|	|	3.0
|	--> 17.0
|	|	3.0
|	--> 20.0
|	|	3.0
|	--> 23.0
|	|	3.0
|	--> 25.0
|	|	3.0
|	--> 27.0
|	|	3.0
--> 1
|	3.0
--> 2
|	Alcohol
|	--> 8.0
|	|	3.0
|	--> 9.0
|	|	2.0
|	--> 21.0
|	|	3.0
|	--> 22.0
|	|	3.0
|	--> 24.0
|	|	3.0
|	--> 26.0
|	|	3.0
|	--> 27.0
|	|	3.0
--> 3
|	Alcohol
|	--> 9.0
|	|	2.0
|	--> 14.0
|	|	3.0
|	--> 15.0
|	|	3.0
|	--> 22.0
|	|	3.0
--> 4
|	Alcohol
|	--> 5.0
|	|	2.0
|	--> 6.0
|	|	2.0
|	--> 8.0
|	|	2.0
|	--> 29.0
|	|	3.0
--> 5
|	Ash
|	--> 2.0
|	|	2.0
|	--> 3.0
|	|	2.0
|	--> 5.0
|	|	2.0
|	--> 6.0
|	|	2.0
|	--> 13.0
|	|	2.0
|	--> 14.0
|	|	2.0
|	--> 15.0
|	|	3.0
|	--> 16.0
|	|	2.0
|	--> 19.0
|	|	3.0
|	--> 20.0
|	|	2.0
--> 6
|	2.0
--> 7
|	2.0
--> 8
|	Alcohol
|	--> 3.0
|	|	2.0
|	--> 6.0
|	|	2.0
|	--> 7.0
|	|	2.0
|	--> 11.0
|	|	2.0
|	--> 16.0
|	|	1.0
|	--> 17.0
|	|	1.0
|	--> 27.0
|	|	1.0
--> 9
|	Alcohol
|	--> 1.0
|	|	2.0
|	--> 9.0
|	|	2.0
|	--> 10.0
|	|	2.0
|	--> 23.0
|	|	1.0
|	--> 2

In [204]:
data = getWineQualityData("both", "Datasets/Wine Quality/")
wineQualityData = discretize(data, cont_attributes=['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol'], target_attribute_name='quality', strategy="kmeans")
display(wineQualityData)

accuracy = runModel(wineQualityData, 'quality')

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,15,18,18,73,8,15,82,170,11,10,5,6
1,11,21,17,3,9,4,63,83,26,12,12,6
2,21,19,20,24,9,10,45,98,24,9,18,6
3,16,14,16,30,11,16,90,105,21,7,16,6
4,16,14,16,30,11,16,90,105,21,7,16,6
...,...,...,...,...,...,...,...,...,...,...,...,...
6492,10,51,4,5,19,10,19,95,34,16,22,5
6493,8,46,5,6,12,13,22,98,37,25,29,6
6494,11,42,6,6,16,9,17,107,32,25,27,6
6495,8,55,6,5,15,10,19,103,40,23,19,5


density
--> 0
|	8.0
--> 1
|	5.0
--> 2
|	fixed acidity
|	--> 2.0
|	|	7.0
|	--> 4.0
|	|	5.0
|	--> 5.0
|	|	7.0
|	--> 6.0
|	|	7.0
--> 4
|	7.0
--> 6
|	fixed acidity
|	--> 1.0
|	|	6.0
|	--> 13.0
|	|	7.0
--> 7
|	fixed acidity
|	--> 7.0
|	|	7.0
|	--> 10.0
|	|	6.0
--> 8
|	7.0
--> 9
|	6.0
--> 10
|	fixed acidity
|	--> 6.0
|	|	8.0
|	--> 9.0
|	|	6.0
|	--> 14.0
|	|	6.0
--> 11
|	fixed acidity
|	--> 4.0
|	|	7.0
|	--> 6.0
|	|	7.0
|	--> 8.0
|	|	6.0
|	--> 11.0
|	|	7.0
|	--> 12.0
|	|	6.0
|	--> 13.0
|	|	7.0
--> 12
|	fixed acidity
|	--> 6.0
|	|	6.0
|	--> 9.0
|	|	7.0
|	--> 11.0
|	|	6.0
--> 13
|	fixed acidity
|	--> 5.0
|	|	7.0
|	--> 6.0
|	|	6.0
|	--> 8.0
|	|	7.0
|	--> 11.0
|	|	8.0
|	--> 13.0
|	|	8.0
--> 14
|	fixed acidity
|	--> 6.0
|	|	6.0
|	--> 7.0
|	|	8.0
|	--> 8.0
|	|	7.0
|	--> 9.0
|	|	6.0
|	--> 10.0
|	|	6.0
|	--> 12.0
|	|	7.0
|	--> 13.0
|	|	7.0
|	--> 14.0
|	|	6.0
|	--> 16.0
|	|	6.0
--> 15
|	fixed acidity
|	--> 0.0
|	|	8.0
|	--> 1.0
|	|	8.0
|	--> 8.0
|	|	6.0
|	--> 9.0
|	|	6.0
|	--> 10.0
|	|	8.0
|	--> 12.0


Some of the lack of accuracy on the 'Wine Quality' dataset can be explained because of the way the dataset is set up. Since the result catagories are for a range for quality from 1 to 10, the values for a wine with quality 8 and another with quality 9 may not be very different, but it will still be counted as wrong and equivalent to if they were predicted as lets say a 1.