# 0. Preparations

## 0.1. Import required packages

In [3]:
from time import time

import pandas as pd
import numpy as np

# Import the normal distribution function from the stats package
from scipy.stats import norm

# Import the accuracy_score method from sklearn to compute the accuracies of predictions
from sklearn.metrics import accuracy_score

## 0.2. Define general functions

In [4]:
# Define which indices and headings to use for the timing DataFrames
timing_index = ['train', 'test', 'total']
timing_columns = ['fold 1', 'fold 2', 'fold 3', 'fold 4', 'fold 5', 'fold 6', 'fold 7', 'fold 8', 'fold 9', 'fold 10', 'mean', 'std']

In [5]:
def cross_validation_train_test_split(data_set, buckets_location, bucket_number):
    # Read all the bucket indices for the specified bucket number from the provided CSV file
    bucket_indices = pd.read_csv(buckets_location).iloc[bucket_number].dropna().values

    # Split the entire data set into testing and training data
    data_test = data_set.iloc[bucket_indices]
    data_train = data_set.drop(bucket_indices, axis='index')

    return data_train, data_test

def cross_validation(all_data, target, buckets_location, e, enhanced):
   
    # Define an empty list to store the accuracies in
    accuracies = []
    
    # Define empty lists and a DataFrame to store the timing measurements in
    training_times = []
    testing_times = []
    timings = pd.DataFrame(index=timing_index, columns=timing_columns)
    
    # Run the classification with 10-fold cross-validation
    for i in range(0, 10):

        # Retrieve the training and testing data according to a specific bucket number (starts from 0)
        training_data, testing_data = cross_validation_train_test_split(all_data, buckets_location=buckets_location, bucket_number=i)
        
        # Split the data into features and labels
        features_train = training_data.drop(target, axis='columns')
        features_test = testing_data.drop(target, axis='columns')
        
        labels_train = pd.DataFrame(training_data[target])
        labels_test = pd.DataFrame(testing_data[target])
        
        # Instantiate a new instance of the classifier
        classifier = CustomNaiveBayes(e, enhanced)
        
        
        
        # --- Start training time --- #
        t_train_start = time()

        # Train the classifier with the given data
        classifier.fit(features_train, labels_train)

        t_train_end = time()
        # --- End training time --- #  
        
        
        
        # --- Start prediction time ---#
        t_predict_start = time()

        # Make predictions with the classifier
        predictions = classifier.predict(features_test)

        t_predict_end = time()
        # --- End prediction time --- #

        
    
        # Compute the accuracy for the classifier
        accuracy = accuracy_score(labels_test, predictions)

        # Add the accuracy to the list of accuracies
        accuracies.append(accuracy)
        
        # Add the timing measurements
        training_times.append(t_train_end - t_train_start)
        testing_times.append(t_predict_end - t_predict_start)
        
        timings.loc['train'][i] = t_train_end - t_train_start
        timings.loc['test'][i] = t_predict_end - t_predict_start
        
    timings.loc['total'] = timings.loc['train'] + timings.loc['test']
    timings['mean'] = timings.mean(axis='columns')
    timings['std'] = timings.std(axis='columns')

    return accuracies, timings

def improve_accuracy_output(df_accuracies):
    
    # Move the regular version of the classifier to the top
    original = pd.DataFrame(df_accuracies.loc['None']).T
    df_accuracies = pd.concat([original, df_accuracies.drop('None', axis='index').sort_index(axis='index', ascending=False)])
    df_accuracies.aggregate(['mean', 'min', 'max']).T.reset_index(drop=True)

    df_accuracies['min'] = df_accuracies.min(axis='columns')
    df_accuracies['max'] = df_accuracies.max(axis='columns')
    df_accuracies['mean'] = df_accuracies.mean(axis='columns')
    df_accuracies['mean_diff'] = (df_accuracies['mean'] - df_accuracies.shift(periods=1, axis='index')['mean'])    # Compute how much the mean has changed
    df_accuracies['std'] = df_accuracies.std(axis='columns')
    
    return df_accuracies

## 0.3. Define classifier class

In [6]:
class CustomNaiveBayes:
    """
    Custom version of the Naive Bayes classifier.

    If a Privacy Parameter e is set, this class will be differentially private.
    If no Privacy Parameter is set, the class will act as a regular Naive Bayes classifier.
    """

    # Set the value that should be used instead of zero
    _zero_value = 0.00001

    # Define some class variables that should be remembered between the fitting and prediction methods
    _target = None
    _target_values = None
    _num_features_train = None
    _cat_features_train = None
    _labels_train = None
    _priors = None
    _means = None
    _stds = None
    
    # Add a variable to indicate whether or not the algorithm should output some intermediate steps
    _output = False
    
    # Define whether or not the "enhanced" algorithm should be used.
    # The original implementation is based on the paper by Vaidya, Shafiq, Basu & Hong (see README).
    # The enhanced implementation includes some improvements from the paper by Zafarani & Clifton (see README).
    _enhanced_algorithm = True

    def __init__(self, e=None, enhanced=True):
        """Initialize a new instance of the class."""

        if self._output:
            # Print which Privacy Parameter was set
            self.print_privacy_parameter(e)

        # Set the Privacy Parameter
        self._e = e
        
        self._enhanced_algorithm = enhanced

        return

    @staticmethod
    def print_privacy_parameter(e):
        """Print the Privacy Parameter that was set."""

        if e is None:
            print("No privacy parameter was set")
            print()
        else:
            print("The following privacy parameter was set:", e)
            print()

        return

    @staticmethod
    def laplace(mean, scale):
        """Generate some noise by sampling the Laplace distribution with the given mean and scale."""

        # Sample the Laplace distribution to generate some noise
        return np.random.laplace(loc=mean, scale=scale)

    def get_noise(self, scale_factor):
        """Generate some noise for a given scale factor."""
        
        return self.laplace(0, scale_factor)

    @staticmethod
    def split_data_on_type(features, labels=None):
        """Split the data based on its feature types."""

        # Split features into numerical and categorical features
        num_features = features.select_dtypes(exclude=['object'])
        cat_features = features.select_dtypes(include=['object'])

        # If no labels were specified
        if labels is None:

            # Return just the features
            return num_features, cat_features

        else:

            # Join features and labels to get the full data set
            num_features_incl_labels = num_features.join(labels)
            cat_features_incl_labels = cat_features.join(labels)

            # Return the features AND the labels
            return num_features, cat_features, num_features_incl_labels, cat_features_incl_labels

    def compute_priors(self, training_data, feature):
        """Compute the prior probability for a given feature."""

        # Get the counts for the specified feature
        counts = training_data[feature].value_counts()

        # Add noise to the counts if a privacy parameter was set
        if self._e is not None:
            # Corresponds to LINE 18 of the pseudocode in the original paper
            if self._enhanced_algorithm:
                counts = counts.add(self.laplace(mean=0, scale=(1 / self._e)))
            else:
                counts = counts.add(self.laplace(mean=0, scale=1))

        # Ensure all counts are positive
        counts[counts <= 0] = self._zero_value

        # Compute priors
        priors = counts.div(training_data.shape[0])

        # Ensure all priors are positive
        priors[priors <= 0] = self._zero_value
        
        # Take the logarithm of all priors
        priors = np.log(priors)

        return priors

    def compute_numerical_likelihoods(self, num_training_data, target):
        """Compute the numerical likelihoods for a given target."""

        # Compute the mean per numerical feature and target value
        means = num_training_data.groupby(target).mean()

        # Compute the standard deviation per numerical feature and target value
        stds = num_training_data.groupby(target).std()

        # Add noise to the means and standard deviations if a privacy parameter was set
        if self._e is not None:
            # Compute the noise for the numeric attributes

            # Compute general statistics
            minimums = num_training_data.groupby(target).min()
            maximums = num_training_data.groupby(target).max()

            # Compute the sensitivities for the means and standard deviations
            sensitivities_means = maximums.sub(minimums).div(num_training_data.shape[0] + 1)
            sensitivities_stds = sensitivities_means.mul(np.sqrt(num_training_data.shape[0]))

            # Compute the scale factors
            sf_means = sensitivities_means.div(self._e)
            sf_stds = sensitivities_stds.div(self._e)

            # Compute the noise
            noise_means = sf_means.apply(self.get_noise)
            noise_stds = sf_stds.apply(self.get_noise)

            # Add the noise
            means = means.add(noise_means)  # Corresponds to LINE 10 of the pseudocode in the original paper
            stds = stds.add(noise_stds)  # Corresponds to LINE 13 of the pseudocode in the original paper

        # Ensure all means are positive
        means[means <= 0] = self._zero_value

        # Ensure all standard deviations are positive
        stds[stds <= 0] = self._zero_value

        return means, stds

    def num_cond_probs(self, num_features_y, target_value):
        """This method computes the conditional probabilities for numerical attributes by using the normal probability density function."""

        # Retrieve the feature names from the new instance
        feat_names = num_features_y.columns.values

        # Only consider the relevant feature names for the specified target value
        means = self._means.loc[target_value][feat_names]
        stds = self._stds.loc[target_value][feat_names]

        # Compute the conditional probabilities (returns numpy arrays)
        cond_probs = norm(loc=means.values, scale=stds.values).pdf(num_features_y.values)
        
        # Convert the conditional probabilities to a dataFrame
        cond_probs = pd.DataFrame(cond_probs, columns=feat_names, index=num_features_y.index.values)
          
        cond_probs[cond_probs <= 0] = self._zero_value
        
        # Take the logarithm of the probabilities in order to prevent potential underflows
        log_cond_probs =  np.log(cond_probs)
        
        return log_cond_probs

    def update_privacy_parameter(self, e, cat_features_train, num_features_train):
        """Update the Privacy Parameter for a better representation of the overall privacy."""

        # Don't change the parameter if it has not been set in the first place
        if e is None:
            return None

        if self._output:
            print("Previous value of e:")
            display(e)
            print()
        
        # If the privacy parameter was set, replace it with the overall privacy
        e = e / (1 + cat_features_train.shape[0] + (2 * num_features_train.shape[0]))
        
        if self._output:
            print("New value of e:")
            display(e)
            print()
        
        return e

    def fit(self, features_train, labels_train):
        """Train the classifier with the given features and labels."""

        # Abort if the training labels DataFrame does not contain exactly one column
        if labels_train.shape[1] != 1:
            raise KeyError("The labels DataFrame does not contain exactly one column")

        # Get the target name from the training labels
        target = labels_train.columns.values[0]

        # Abort if the training data contains the target
        if target in features_train.columns.values:
            raise KeyError("The training data should not contain the target")

        # Get the distinct target values
        target_values = labels_train[target].unique()

        # Split features and training data (features incl. labels) into numerical and categorical variables
        num_features_train, cat_features_train, num_training_data, cat_training_data \
            = self.split_data_on_type(features_train, labels_train)

        if self._enhanced_algorithm:
            # Update the privacy parameter if it was set
            self._e = self.update_privacy_parameter(self._e, cat_features_train, num_features_train)

        # Compute the prior probabilities for the target values
        priors = self.compute_priors(labels_train, target)
        
        if self._output:
            print("Priors:")
            display(priors)
            print()
            
        # Only compute numerical likelihoods if the dataset actually contains any numerical features
        if num_features_train.shape[1] > 0:
        
            # Compute the numerical likelihoods for the training data
            means, stds = self.compute_numerical_likelihoods(num_training_data, target)

            if self._output:
                print("Means:")
                display(means)
                print()

                print("Standard Deviations:")
                display(stds)
                print()
                   
            # Store the variables as class variables
            self._means = means
            self._stds = stds  

        # Store the variables as class variables
        self._target = target
        self._target_values = target_values
        self._num_features_train = num_features_train
        self._cat_features_train = cat_features_train
        self._labels_train = labels_train
        self._priors = priors
           
        return

    def predict(self, features_test):
        """Make predictions for the given features."""

        # Reset the indices
        features_test = features_test.reset_index(drop=True)

        # Split testing features into numerical and categorical features
        num_features_test, cat_features_test = self.split_data_on_type(features_test)

        # Retrieve the test features as an array
        test_features = cat_features_test.columns.values

        # Define a results DataFrame that already contains these priors.
        # This DataFrame will later be multiplied with the conditional probability for each feature.
        results = pd.DataFrame([self._priors], columns=self._target_values, index=features_test.index.values)
        
        if self._output:
            print("Results with priors:")
            display(results.head())
            print()

        # Loop over all target values to compute and store the conditional probabilities
        for target_value in self._target_values:

            # Create a target DataFrame that solely consists of the current target value that is being evaluated
            target_labels = pd.DataFrame(target_value, columns=[self._target], index=features_test.index.values)
            
            
            #############
            # NUMERICAL #
            #############

            
            # Only compute numerical conditional probabilities if the training data contains any numerical features
            if self._num_features_train.shape[1] > 0:

                # Compute the intermediate conditional probabilities for all numerical test features, given the target value
                intermediate_cond_probs = self.num_cond_probs(num_features_test, target_value)

                # Compute the sum of the intermediate conditional probabilities and add this to the results.
                # Normally, one would multiply here, but because logarithms are used, we take the sum since log(a*b) = log(a) + log(b)
                results[target_value] += intermediate_cond_probs.sum(axis='columns')

                if self._output:
                    print("Numerical conditional probabilities for label", target_value, ":")
                    display(intermediate_cond_probs.head())
                    print()

                    print("Original data for label", target_value, ":")
                    display(num_features_test.head())
                    print()
               
            
            ###############
            # CATEGORICAL #
            ###############


            # Only compute categorical conditional probabilities if the training data contains any categorical features
            if self._cat_features_train.shape[1] > 0:
                
                # Retrieve the categorical training data
                cat_training_data = self._cat_features_train.join(self._labels_train)

                # Create a subset for which the target values match
                class_matches = cat_training_data.loc[cat_training_data[self._target] == target_value]

                # Define a DataFrame to store the conditional probabilities per target value in
                intermediate_cond_probs = pd.DataFrame()

                # Loop over all the test features
                for feature in test_features:

                    # Compute the value frequencies for each feature
                    value_frequencies = class_matches[feature].value_counts()

                    if self._output:
                        vf = pd.DataFrame(index=['original', 'noisy'], columns=value_frequencies.index.values)
                        vf.loc['original'] = value_frequencies

                    # Add noise to the counts if a privacy parameter was set
                    if self._e is not None:
                        # Corresponds to LINE 5 of the pseudocode in the original paper
                        value_frequencies = value_frequencies.add(self.laplace(mean=0, scale=(1 / self._e)))

                    # Ensure the (noisy) value frequencies are positive
                    value_frequencies[value_frequencies <= 0] = self._zero_value

                    if self._output:    
                        vf.loc['noisy'] = value_frequencies

                        print("Value frequencies for label", target_value, ":")
                        display(vf)
                        print()

                    # Compute the number of records of class matches
                    n_class_matches = class_matches.shape[0]

                    if n_class_matches <= 0:
                        n_class_matches = self._zero_value

                    # Compute the conditional probabilities for the categorical feature
                    #    by considering each feature value, getting its frequency of occurrence
                    #    and dividing it by the total number of records for which the target value matches
                    cond_probs = value_frequencies[self._cat_features_train[feature]] / n_class_matches

                    # Store the conditional probabilities
                    intermediate_cond_probs[feature] = np.log(cond_probs.values)


                if self._output:
                    print("Categorical conditional probabilities for label", target_value, ":")
                    display(intermediate_cond_probs.head())
                    print()

                    print("Original data for label", target_value, ":")
                    display(cat_features_test.head())
                    print()

                # Compute the sum of the intermediate conditional probabilities and add this to the results.
                # Normally, one would multiply here, but because logarithms are used, we take the sum since log(a*b) = log(a) + log(b)
                results[target_value] += intermediate_cond_probs.sum(axis='columns')

        # Make predictions by takin the column with the highest score as prediction
        predictions = results.idxmax(axis='columns')

        # Return the predictions
        return predictions
    
    def enable_output(self):
        """Enable intermediate output for the classifier."""
        
        self._output = True
        
    def disable_output(self):
        """Disable intermediate output for the classifier."""
        
        self._output = False

In [7]:
def run_classifier(dataset, target, buckets_loc):
    """Run the classifier on a given dataset, for a specific target."""
    
    # Define a list of privacy parameters for which the results should be generated
    privacy_parameters = [None, 1.0, 0.75, 0.5, 0.25, 0.1, 0.05, 0.01, 0.005, 0.001, 0.0000000001]
    privacy_parameters_without_None = privacy_parameters[-(len(privacy_parameters) - 1):]

    # Define which headings to use for the accuracy DataFrames
    accuracy_headings = ['accuracy 1', 'accuracy 2', 'accuracy 3', 'accuracy 4', 'accuracy 5', 'accuracy 6', 'accuracy 7', 'accuracy 8', 'accuracy 9', 'accuracy 10']

    # Define a DataFrame to store the computed accuracies in
    df_accuracies_original = pd.DataFrame([], columns=accuracy_headings, index=privacy_parameters_without_None)
    df_accuracies_enhanced = pd.DataFrame([], columns=accuracy_headings, index=privacy_parameters_without_None)

    # Run the classifier for each epsilon value
    for e in privacy_parameters:
        
        if e is None:
            index = 'None'
        else:
            index = e

        # Generate results for both the original and the enhanced implementation.
        # The original implementation is based on the paper by Vaidya, Shafiq, Basu & Hong (see README).
        # The enhanced implementation includes some improvements from the paper by Zafarani & Clifton (see README).
        df_accuracies_original.loc[index], timings_original = cross_validation(dataset, target, buckets_loc, e, enhanced=False)
        df_accuracies_enhanced.loc[index], timings_enhanced = cross_validation(dataset, target, buckets_loc, e, enhanced=True)

        print("Timings (in seconds) for original algorithm with e =", e, ":")
        display(timings_original)
        print()

        print("Timings (in seconds) for enhanced algorithm with e =", e, ":")
        display(timings_enhanced)
        print()


    print("Original Algorithm:")
    display(improve_accuracy_output(df_accuracies_original))
    print()

    print("Enhanced Algorithm:")
    display(improve_accuracy_output(df_accuracies_enhanced))
    print()
    
    return

# Adult Dataset

In [8]:
%%time

# Define the headings to be used
headings = ['age', 'workclass', 'final-weight', 'education', 'education-num', 'marital-status', 'occupation',
            'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
            'income']

# Read in the official training and testing data separately
data_train = pd.read_csv('datasets/Adult/adult.data', names=headings)
data_test = pd.read_csv('datasets/Adult/adult.test', names=headings, skiprows=1)

# Combine both data sets into one data set
adult_data = data_train.append(data_test)

# Apply some pre-processing
adult_data['age'] = adult_data['age'].astype('int').apply(pd.to_numeric, downcast="unsigned")
adult_data['workclass'] = adult_data['workclass'].astype('category').str.strip()
adult_data['final-weight'] = adult_data['final-weight'].astype('int').apply(pd.to_numeric, downcast="unsigned")
adult_data['education'] = adult_data['education'].astype('category').str.strip()
adult_data['education-num'] = adult_data['education-num'].astype('int').apply(pd.to_numeric, downcast="unsigned")
adult_data['marital-status'] = adult_data['marital-status'].astype('category').str.strip()
adult_data['occupation'] = adult_data['occupation'].astype('category').str.strip()
adult_data['relationship'] = adult_data['relationship'].astype('category').str.strip()
adult_data['race'] = adult_data['race'].astype('category').str.strip()
adult_data['sex'] = adult_data['sex'].astype('category').str.strip()
adult_data['capital-gain'] = adult_data['capital-gain'].astype('int').apply(pd.to_numeric, downcast="unsigned")
adult_data['capital-loss'] = adult_data['capital-loss'].astype('int').apply(pd.to_numeric, downcast="unsigned")
adult_data['hours-per-week'] = adult_data['hours-per-week'].astype('int').apply(pd.to_numeric, downcast="unsigned")
adult_data['native-country'] = adult_data['native-country'].astype('category').str.strip()
adult_data['income'] = adult_data['income'].astype('category').str.strip()

# Remove the dot from the income column
adult_data['income'] = adult_data['income'].replace({'\.': ''}, regex=True)

# Convert every '?' to NaN, so Pandas' built-in functions can be used
adult_data = adult_data.replace({'?': np.NaN})

# Try to fill missing values with the mean (this only works for numerical attributes)
adult_data = adult_data.fillna(adult_data.mean())

# Get the number of records in the data set
n_records_before = adult_data.shape[0]

# If values are still missing (they must be categorical attributes), drop the rows with missing data
adult_data = adult_data.dropna()

# Get the number of records in the data set
n_records_after = adult_data.shape[0]

# Print how many records containing NaN values got dropped
n_records_dropped = n_records_before - n_records_after
print(n_records_dropped, "records were dropped due to missing values.")
print("This is", round(n_records_dropped / n_records_before * 100, 1), "% of the entire data set.")
print("The resulting data set contains", n_records_after, "records.")

# Reset the indices
adult_data = adult_data.reset_index(drop=True)

# Print a newline to separate outputs
print()

3620 records were dropped due to missing values.
This is 7.4 % of the entire data set.
The resulting data set contains 45222 records.

Wall time: 20.5 s


In [9]:
display(adult_data.head())

adult_data.shape

Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


(45222, 15)

In [10]:
%%time

run_classifier(adult_data, 'income', 'datasets/Adult/buckets_adult.csv')

Timings (in seconds) for original algorithm with e = None :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.059963,0.0479681,0.0459874,0.0479755,0.0479705,0.047971,0.0469732,0.0539668,0.0472512,0.0469718,0.0493,0.004105
test,0.266015,0.272833,0.271266,0.269088,0.268094,0.265876,0.266286,0.268834,0.27105,0.277131,0.269647,0.003359
total,0.325978,0.320801,0.317253,0.317064,0.316064,0.313847,0.313259,0.322801,0.318301,0.324103,0.318947,0.004097



Timings (in seconds) for enhanced algorithm with e = None :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.0489681,0.0473366,0.0470426,0.0509682,0.0449715,0.0449739,0.0469708,0.0459719,0.0489712,0.049969,0.047614,0.001948
test,0.280245,0.2701,0.2678,0.279261,0.265165,0.269832,0.265841,0.264836,0.270101,0.271161,0.270434,0.005123
total,0.329213,0.317436,0.314842,0.330229,0.310136,0.314806,0.312812,0.310807,0.319072,0.32113,0.318048,0.006687



Timings (in seconds) for original algorithm with e = 1.0 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.0663238,0.0669599,0.0669582,0.0679569,0.0693088,0.0759525,0.0619624,0.064877,0.0632153,0.0619607,0.066548,0.00393
test,0.280825,0.280834,0.273161,0.297145,0.272833,0.28483,0.263079,0.267084,0.272837,0.267011,0.275964,0.009628
total,0.347149,0.347794,0.340119,0.365102,0.342142,0.360783,0.325042,0.331961,0.336053,0.328972,0.342512,0.012414



Timings (in seconds) for enhanced algorithm with e = 1.0 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.060962,0.0609636,0.0642054,0.0609648,0.0599637,0.0609646,0.0619628,0.0629616,0.0629618,0.0609598,0.061687,0.001235
test,0.264082,0.270101,0.267653,0.270832,0.263837,0.262108,0.274106,0.263729,0.268346,0.261845,0.266664,0.003945
total,0.325044,0.331064,0.331859,0.331797,0.323801,0.323073,0.336069,0.32669,0.331308,0.322805,0.328351,0.004392



Timings (in seconds) for original algorithm with e = 0.75 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.063961,0.0635989,0.0638149,0.0649612,0.0619624,0.0639606,0.0693383,0.0729563,0.0629642,0.0629613,0.065048,0.003243
test,0.2602,0.277137,0.266835,0.263844,0.266113,0.274098,0.286854,0.279831,0.264136,0.276674,0.271572,0.008135
total,0.324161,0.340736,0.33065,0.328806,0.328076,0.338059,0.356192,0.352788,0.3271,0.339635,0.33662,0.010431



Timings (in seconds) for enhanced algorithm with e = 0.75 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.0669596,0.0709567,0.062963,0.0649595,0.0619612,0.0699556,0.0689588,0.0629616,0.0619636,0.0622358,0.065388,0.003353
test,0.285113,0.281833,0.278137,0.276969,0.274101,0.266876,0.284856,0.274641,0.259614,0.266835,0.274898,0.007938
total,0.352072,0.35279,0.3411,0.341928,0.336062,0.336831,0.353815,0.337603,0.321578,0.329071,0.340285,0.009972



Timings (in seconds) for original algorithm with e = 0.5 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.064961,0.063961,0.0609641,0.0612326,0.0609818,0.0619636,0.0599635,0.0609622,0.064961,0.0616443,0.062159,0.001708
test,0.262837,0.269113,0.261393,0.263837,0.262837,0.26314,0.265207,0.261142,0.26084,0.26384,0.263419,0.002295
total,0.327798,0.333074,0.322357,0.32507,0.323819,0.325103,0.325171,0.322104,0.325801,0.325485,0.325578,0.002956



Timings (in seconds) for enhanced algorithm with e = 0.5 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.0679572,0.059963,0.0609629,0.0591798,0.0619619,0.0609612,0.0619619,0.0649624,0.0629902,0.0636239,0.062452,0.002451
test,0.277666,0.260158,0.264873,0.260842,0.262837,0.259842,0.263093,0.298027,0.261839,0.267835,0.267701,0.011278
total,0.345623,0.320121,0.325836,0.320022,0.324799,0.320803,0.325054,0.36299,0.32483,0.331459,0.330154,0.013083



Timings (in seconds) for original algorithm with e = 0.25 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.0639627,0.0603333,0.0649574,0.0614693,0.0601132,0.0599644,0.0619621,0.0612783,0.0614731,0.0609608,0.061647,0.001549
test,0.280831,0.263843,0.258878,0.258234,0.261167,0.260155,0.261111,0.255849,0.261836,0.25984,0.262174,0.00655
total,0.344793,0.324176,0.323835,0.319703,0.32128,0.32012,0.323073,0.317128,0.323309,0.320801,0.323822,0.007293



Timings (in seconds) for enhanced algorithm with e = 0.25 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.0624728,0.0609627,0.0602157,0.0609605,0.0659587,0.0619602,0.061192,0.0609629,0.0619709,0.0599797,0.061664,0.00161
test,0.263154,0.270073,0.261839,0.263628,0.262181,0.324288,0.261837,0.25884,0.257852,0.267087,0.269078,0.018711
total,0.325626,0.331035,0.322055,0.324588,0.32814,0.386248,0.323029,0.319803,0.319823,0.327067,0.330741,0.018813



Timings (in seconds) for original algorithm with e = 0.1 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.062963,0.064784,0.0619614,0.0599618,0.0609627,0.0619607,0.060286,0.0629618,0.0709565,0.0609624,0.062776,0.00305
test,0.268652,0.264836,0.260839,0.262089,0.260163,0.279105,0.258851,0.25884,0.258841,0.264387,0.26366,0.00598
total,0.331615,0.32962,0.3228,0.322051,0.321126,0.341066,0.319137,0.321802,0.329798,0.32535,0.326436,0.006316



Timings (in seconds) for enhanced algorithm with e = 0.1 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.0619624,0.0622585,0.06217,0.0609794,0.0689538,0.0630133,0.0619636,0.0589614,0.0599639,0.0609629,0.062119,0.002546
test,0.26511,0.3248,0.265836,0.267532,0.265595,0.265841,0.264356,0.260846,0.266783,0.262642,0.270934,0.018051
total,0.327072,0.387058,0.328006,0.328512,0.334549,0.328854,0.326319,0.319807,0.326747,0.323605,0.333053,0.018353



Timings (in seconds) for original algorithm with e = 0.05 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.0742025,0.0599606,0.0599554,0.0599632,0.0599632,0.0619609,0.062979,0.0629644,0.0609622,0.0619619,0.062487,0.004076
test,0.259839,0.256844,0.258823,0.26009,0.259079,0.270506,0.262838,0.326795,0.26432,0.260096,0.267923,0.019958
total,0.334042,0.316804,0.318779,0.320054,0.319042,0.332467,0.325817,0.389759,0.325282,0.322057,0.33041,0.020528



Timings (in seconds) for enhanced algorithm with e = 0.05 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.0619981,0.0619597,0.0609603,0.0609627,0.0619621,0.0614688,0.0609889,0.0599627,0.0599902,0.0679591,0.061821,0.002164
test,0.26084,0.262838,0.263837,0.269339,0.267074,0.261134,0.25984,0.260838,0.261818,0.277087,0.264465,0.005086
total,0.322838,0.324798,0.324798,0.330302,0.329036,0.322603,0.320829,0.320801,0.321808,0.345046,0.326286,0.006967



Timings (in seconds) for original algorithm with e = 0.01 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.0629599,0.0609622,0.0609622,0.0619614,0.0612304,0.0629616,0.0619638,0.0589714,0.0609612,0.0609632,0.06139,0.001104
test,0.264839,0.259846,0.262051,0.328814,0.262837,0.265834,0.264833,0.261663,0.262838,0.32098,0.275454,0.024839
total,0.327799,0.320808,0.323013,0.390775,0.324068,0.328795,0.326797,0.320635,0.323799,0.381943,0.336843,0.024969



Timings (in seconds) for enhanced algorithm with e = 0.01 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.0639606,0.0615199,0.0609629,0.0599613,0.061332,0.0619628,0.0605569,0.0619864,0.0610199,0.0623045,0.061557,0.001045
test,0.260841,0.265834,0.261052,0.264839,0.258842,0.265835,0.261833,0.260065,0.263787,0.339791,0.270272,0.023289
total,0.324801,0.327354,0.322015,0.3248,0.320174,0.327798,0.32239,0.322051,0.324807,0.402096,0.331829,0.023534



Timings (in seconds) for original algorithm with e = 0.005 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.0609646,0.0609665,0.0609689,0.0609639,0.0599649,0.0599651,0.0599625,0.0619628,0.0609634,0.0609639,0.060765,0.0006
test,0.270831,0.262837,0.261087,0.258839,0.262837,0.267833,0.278826,0.260073,0.261836,0.261839,0.264684,0.005823
total,0.331796,0.323803,0.322056,0.319803,0.322802,0.327799,0.338789,0.322036,0.3228,0.322803,0.325449,0.0055



Timings (in seconds) for enhanced algorithm with e = 0.005 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.0609627,0.0599663,0.0654678,0.0659599,0.0792439,0.0632613,0.0624712,0.0609989,0.0624678,0.0599706,0.064077,0.005425
test,0.264354,0.263371,0.259268,0.263976,0.296817,0.344764,0.261711,0.259662,0.260402,0.265196,0.273952,0.025828
total,0.325317,0.323337,0.324736,0.329936,0.37606,0.408026,0.324182,0.320661,0.32287,0.325166,0.338029,0.028025



Timings (in seconds) for original algorithm with e = 0.001 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.0599864,0.0624814,0.0619621,0.0599627,0.0609627,0.0649669,0.0620482,0.0605187,0.0609686,0.0609951,0.061485,0.001412
test,0.259774,0.263838,0.263207,0.257841,0.264843,0.263187,0.265003,0.259769,0.259352,0.263835,0.262065,0.002468
total,0.319761,0.326319,0.325169,0.317803,0.325806,0.328154,0.327052,0.320288,0.32032,0.32483,0.32355,0.003449



Timings (in seconds) for enhanced algorithm with e = 0.001 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.0619886,0.0649655,0.0619624,0.0603294,0.0604689,0.0599639,0.0609653,0.0603771,0.0604739,0.0609627,0.061246,0.001396
test,0.262387,0.340519,0.268432,0.261722,0.266848,0.263087,0.261837,0.258865,0.260961,0.264476,0.270913,0.023354
total,0.324375,0.405484,0.330394,0.322052,0.327317,0.323051,0.322803,0.319242,0.321435,0.325438,0.332159,0.024624



Timings (in seconds) for original algorithm with e = 1e-10 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.061048,0.0601485,0.0603411,0.0599625,0.0599966,0.0604801,0.0609677,0.0611939,0.060951,0.0609627,0.060605,0.000447
test,0.265737,0.26684,0.259548,0.261351,0.273867,0.261772,0.276833,0.331313,0.262356,0.26484,0.272446,0.020313
total,0.326785,0.326988,0.319889,0.321313,0.333864,0.322252,0.337801,0.392507,0.323307,0.325802,0.333051,0.020517



Timings (in seconds) for enhanced algorithm with e = 1e-10 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.060683,0.0699563,0.0639691,0.0629585,0.0609615,0.0604689,0.0609646,0.059968,0.0599635,0.0609634,0.062086,0.002891
test,0.262921,0.270838,0.265962,0.262839,0.262842,0.273085,0.261841,0.261837,0.265835,0.267839,0.265584,0.003733
total,0.323604,0.340794,0.329931,0.325798,0.323804,0.333554,0.322806,0.321805,0.325799,0.328803,0.32767,0.005566



Original Algorithm:


Unnamed: 0,accuracy 1,accuracy 2,accuracy 3,accuracy 4,accuracy 5,accuracy 6,accuracy 7,accuracy 8,accuracy 9,accuracy 10,min,max,mean,mean_diff,std
,0.778859,0.781955,0.780849,0.776648,0.774878,0.783724,0.78483,0.778638,0.775763,0.768789,0.768789,0.78483,0.778213,,0.005297
1.0,0.77908,0.780849,0.782839,0.776426,0.776426,0.769129,0.785493,0.764042,0.775984,0.765694,0.764042,0.785493,0.775458,-0.002754,0.208115
0.75,0.778417,0.782618,0.781955,0.774215,0.769129,0.782176,0.784609,0.77908,0.770898,0.750663,0.750663,0.784609,0.774086,-0.001372,0.207546
0.5,0.765148,0.756966,0.755639,0.782176,0.776869,0.663644,0.661433,0.779965,0.774878,0.765915,0.661433,0.782176,0.743854,-0.030232,0.211946
0.25,0.770456,0.784166,0.768023,0.601061,0.601725,0.766254,0.780407,0.780628,0.664087,0.745137,0.601061,0.784166,0.720598,-0.023256,0.211657
0.1,0.773552,0.762716,0.239938,0.754091,0.747899,0.607696,0.557497,0.666962,0.705882,0.755747,0.239938,0.773552,0.632122,-0.088475,0.263589
0.05,0.75387,0.684432,0.430119,0.719814,0.623397,0.370854,0.731314,0.590668,0.70544,0.684571,0.370854,0.75387,0.618267,-0.013856,0.215887
0.01,0.686643,0.755418,0.417072,0.743476,0.68598,0.686201,0.249005,0.483193,0.409332,0.732317,0.249005,0.755418,0.571088,-0.047179,0.245499
0.005,0.720035,0.454003,0.291243,0.367315,0.756524,0.516143,0.510615,0.769571,0.749226,0.471264,0.291243,0.769571,0.555563,-0.015526,0.231309
0.001,0.474569,0.319328,0.609244,0.431889,0.735073,0.628925,0.708757,0.743255,0.639982,0.532935,0.319328,0.743255,0.573878,0.018316,0.207197



Enhanced Algorithm:


Unnamed: 0,accuracy 1,accuracy 2,accuracy 3,accuracy 4,accuracy 5,accuracy 6,accuracy 7,accuracy 8,accuracy 9,accuracy 10,min,max,mean,mean_diff,std
,0.778859,0.781955,0.780849,0.776648,0.774878,0.783724,0.78483,0.778638,0.775763,0.768789,0.768789,0.78483,0.778213,,0.005297
1.0,0.753207,0.241486,0.753428,0.75188,0.749668,0.234852,0.752322,0.23065,0.750774,0.259726,0.23065,0.753428,0.538506,-0.239707,0.319786
0.75,0.247678,0.758514,0.246572,0.24812,0.250332,0.251437,0.247678,0.76935,0.250774,0.736295,0.246572,0.76935,0.418556,-0.11995,0.272157
0.5,0.247678,0.241486,0.753428,0.24812,0.250332,0.251437,0.247678,0.76935,0.250774,0.740274,0.241486,0.76935,0.417616,-0.00094,0.257047
0.25,0.753207,0.758514,0.246572,0.226891,0.250332,0.227996,0.752322,0.76935,0.749226,0.595712,0.226891,0.76935,0.527197,0.109581,0.265123
0.1,0.247678,0.241486,0.246572,0.24812,0.749668,0.227996,0.247678,0.23065,0.250774,0.736295,0.227996,0.749668,0.367049,-0.160148,0.252821
0.05,0.752322,0.241486,0.753428,0.75188,0.775984,0.748341,0.752322,0.23065,0.749226,0.736295,0.23065,0.775984,0.624881,0.257832,0.238077
0.01,0.752322,0.241486,0.753428,0.24812,0.749668,0.748563,0.752322,0.23065,0.250774,0.263705,0.23065,0.753428,0.497926,-0.126954,0.295577
0.005,0.752322,0.241486,0.246572,0.24812,0.250332,0.251437,0.247678,0.23065,0.768686,0.736295,0.23065,0.768686,0.41441,-0.083517,0.267981
0.001,0.752322,0.241486,0.222689,0.750995,0.250332,0.748563,0.752322,0.76935,0.250774,0.263705,0.222689,0.76935,0.499548,0.085138,0.271406



Wall time: 2min 13s


# Mushroom Dataset

In [11]:
%%time

# Define the headings to be used
headings = ['label', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor', 'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color', 'stalk-shape', 'stalk-root', 'stalk-surface-above-ring', 'stalk-surface-below-ring', 'stalk-color-above-ring', 'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number', 'ring-type', 'spore-print-color', 'population', 'habitat']

# Read in the official training and testing data separately
mushroom_data = pd.read_csv('datasets/Mushroom/agaricus-lepiota.data', names=headings)

# Apply some pre-processing
for heading in headings:
    mushroom_data[heading] = mushroom_data[heading].astype('category').str.strip()
    
# Convert every '?' to NaN, so Pandas' built-in functions can be used
mushroom_data = mushroom_data.replace({'?': np.NaN})

# Find out which features contain missing (NaN) values
nan_features = mushroom_data.columns[mushroom_data.isnull().any()]
nan_features_count = mushroom_data[nan_features].isnull().sum()

# According to the following overview, all missing values occur in the 'stalk-root' feature, so let's drop this feature.
# The alternative would be to drop all rows that contain a missing value, but this would be 30.5% of the entire dataset, which is a bit much.

print("Number of records with NaN values, per feature:")
display(nan_features_count)
print()

# Drop the 'stalk-root' feature
mushroom_data = mushroom_data.drop('stalk-root', axis='columns')

# Get the number of records in the data set
n_records_before = mushroom_data.shape[0]

# If values are still missing (they must be categorical attributes), drop the rows with missing data
mushroom_data = mushroom_data.dropna()

# Get the number of records in the data set
n_records_after = mushroom_data.shape[0]

# Print how many records containing NaN values got dropped
n_records_dropped = n_records_before - n_records_after
print(n_records_dropped, "records were dropped due to missing values.")
print("This is", round(n_records_dropped / n_records_before * 100, 1), "% of the entire data set.")
print("The resulting data set contains", n_records_after, "records.")

# Reset the indices
mushroom_data = mushroom_data.reset_index(drop=True)

# Print a newline to separate outputs
print()

Number of records with NaN values, per feature:


stalk-root    2480
dtype: int64


0 records were dropped due to missing values.
This is 0.0 % of the entire data set.
The resulting data set contains 8124 records.

Wall time: 110 ms


In [12]:
display(mushroom_data.head())

mushroom_data.shape

Unnamed: 0,label,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


(8124, 22)

In [13]:
%%time

run_classifier(mushroom_data, 'label', 'datasets/Mushroom/buckets_mushroom.csv')

Timings (in seconds) for original algorithm with e = None :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.010993,0.0109937,0.010993,0.0109932,0.0119925,0.010994,0.0109932,0.0109937,0.0109935,0.0109937,0.011093,0.0003
test,0.201109,0.199877,0.197878,0.202874,0.198877,0.201943,0.199875,0.200094,0.203948,0.200876,0.200735,0.001733
total,0.212102,0.21087,0.208871,0.213868,0.21087,0.212937,0.210868,0.211088,0.214942,0.21187,0.211828,0.00165



Timings (in seconds) for enhanced algorithm with e = None :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.0119948,0.0109932,0.0159903,0.0119874,0.0109937,0.0109935,0.0110531,0.0109928,0.0109928,0.0109932,0.011698,0.001483
test,0.198875,0.200686,0.203539,0.209871,0.202162,0.199877,0.210811,0.206875,0.198295,0.198388,0.202938,0.004469
total,0.21087,0.211679,0.21953,0.221858,0.213156,0.21087,0.221864,0.217867,0.209287,0.209381,0.214636,0.004841



Timings (in seconds) for original algorithm with e = 1.0 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.0109935,0.0109944,0.0109937,0.0109956,0.0109932,0.0119939,0.0109937,0.010993,0.0109932,0.0109947,0.011094,0.0003
test,0.215868,0.218376,0.203871,0.214868,0.205872,0.207759,0.205873,0.212868,0.204873,0.205167,0.209539,0.005106
total,0.226861,0.229371,0.214865,0.225863,0.216866,0.219753,0.216866,0.223861,0.215866,0.216162,0.220633,0.00508



Timings (in seconds) for enhanced algorithm with e = 1.0 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.0119927,0.0109885,0.0109973,0.0119927,0.0109947,0.0109932,0.0109921,0.0109918,0.010994,0.011991,0.011293,0.000458
test,0.208222,0.203874,0.205148,0.201876,0.204873,0.204871,0.205416,0.205872,0.206955,0.203876,0.205098,0.001655
total,0.220214,0.214862,0.216145,0.213868,0.215868,0.215865,0.216408,0.216864,0.217949,0.215867,0.216391,0.001639



Timings (in seconds) for original algorithm with e = 0.75 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.0109944,0.0109932,0.0119927,0.0109913,0.0109928,0.0109954,0.0109918,0.0109913,0.0109918,0.0109928,0.011093,0.0003
test,0.206705,0.204872,0.206873,0.205168,0.21087,0.20687,0.204876,0.205121,0.204875,0.204874,0.20611,0.001793
total,0.2177,0.215865,0.218866,0.21616,0.221863,0.217865,0.215868,0.216112,0.215866,0.215867,0.217203,0.00186



Timings (in seconds) for enhanced algorithm with e = 0.75 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.0109956,0.0109906,0.010993,0.010994,0.0109932,0.010993,0.0109911,0.0109916,0.0109942,0.0119927,0.011093,0.0003
test,0.212085,0.207872,0.217865,0.205871,0.205122,0.220863,0.205874,0.206875,0.202492,0.206875,0.209179,0.005619
total,0.223081,0.218863,0.228858,0.216865,0.216115,0.231856,0.216865,0.217866,0.213486,0.218868,0.220272,0.005586



Timings (in seconds) for original algorithm with e = 0.5 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.0119932,0.0119917,0.0109932,0.0109937,0.0119925,0.0113204,0.0109935,0.0119925,0.0109932,0.0109918,0.011426,0.000472
test,0.206873,0.213119,0.203876,0.202876,0.208872,0.207873,0.24185,0.221864,0.204109,0.216867,0.212818,0.011275
total,0.218867,0.225111,0.21487,0.213869,0.220864,0.219193,0.252843,0.233857,0.215102,0.227859,0.224244,0.011265



Timings (in seconds) for enhanced algorithm with e = 0.5 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.0109942,0.0119932,0.0109947,0.0109982,0.00999355,0.0109918,0.0109932,0.0129924,0.0119915,0.0109937,0.011294,0.00078
test,0.205872,0.203874,0.204874,0.207574,0.203911,0.20715,0.207871,0.204879,0.204874,0.204133,0.205501,0.001449
total,0.216866,0.215868,0.215868,0.218572,0.213904,0.218141,0.218864,0.217871,0.216866,0.215127,0.216795,0.001526



Timings (in seconds) for original algorithm with e = 0.25 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.0109932,0.010993,0.0109911,0.010993,0.010994,0.0109911,0.0120044,0.0119936,0.0109932,0.00999594,0.011094,0.00054
test,0.210869,0.206875,0.206086,0.203875,0.212869,0.209872,0.203152,0.206873,0.205105,0.204873,0.207045,0.00302
total,0.221862,0.217868,0.217077,0.214868,0.223863,0.220863,0.215157,0.218867,0.216098,0.214869,0.218139,0.003004



Timings (in seconds) for enhanced algorithm with e = 0.25 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.0109935,0.0109906,0.0129907,0.0109918,0.0109932,0.0119934,0.0109935,0.0129931,0.0109928,0.0129907,0.011692,0.0009
test,0.209692,0.215867,0.202878,0.206266,0.204874,0.204874,0.255841,0.208874,0.254843,0.232889,0.21969,0.019622
total,0.220685,0.226858,0.215868,0.217257,0.215867,0.216867,0.266834,0.221867,0.265836,0.24588,0.231382,0.019419



Timings (in seconds) for original algorithm with e = 0.1 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.0109942,0.0109916,0.0109913,0.0109911,0.00999546,0.0119922,0.0109932,0.010993,0.0119936,0.0119927,0.011193,0.0006
test,0.209156,0.21187,0.207075,0.203145,0.2101,0.205874,0.204874,0.218111,0.208871,0.216869,0.209594,0.004644
total,0.22015,0.222862,0.218066,0.214136,0.220096,0.217866,0.215867,0.229104,0.220865,0.228861,0.220787,0.004732



Timings (in seconds) for enhanced algorithm with e = 0.1 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.0119932,0.0109921,0.0109923,0.0119925,0.0109937,0.0109947,0.0109961,0.0119917,0.0119922,0.0109923,0.011393,0.000489
test,0.203874,0.204873,0.20987,0.205875,0.204082,0.202874,0.202874,0.205879,0.206166,0.207873,0.205424,0.002103
total,0.215867,0.215865,0.220863,0.217868,0.215075,0.213869,0.21387,0.21787,0.218158,0.218865,0.216817,0.002163



Timings (in seconds) for original algorithm with e = 0.05 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.0119905,0.0109918,0.0109923,0.0129933,0.0109947,0.0109951,0.0109925,0.0110664,0.0119917,0.0119977,0.011501,0.000666
test,0.214867,0.206126,0.204875,0.207872,0.214867,0.203152,0.207409,0.205875,0.2052,0.203869,0.207411,0.003968
total,0.226858,0.217118,0.215867,0.220865,0.225862,0.214147,0.218402,0.216941,0.217192,0.215867,0.218912,0.004084



Timings (in seconds) for enhanced algorithm with e = 0.05 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.0109909,0.0109918,0.0109942,0.0109947,0.0109937,0.0119929,0.0109928,0.010993,0.0109949,0.0119925,0.011193,0.0004
test,0.223862,0.203876,0.205178,0.203876,0.217868,0.204873,0.203142,0.206874,0.211868,0.204105,0.208552,0.006728
total,0.234853,0.214868,0.216172,0.214871,0.228861,0.216866,0.214135,0.217867,0.222863,0.216097,0.219745,0.006618



Timings (in seconds) for original algorithm with e = 0.01 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.010993,0.0109942,0.0119936,0.0109909,0.0109932,0.0109925,0.0119941,0.0109911,0.0109911,0.0109928,0.011193,0.000401
test,0.208873,0.21087,0.206873,0.213222,0.204875,0.204875,0.203874,0.208874,0.204876,0.204875,0.207209,0.002969
total,0.219866,0.221864,0.218866,0.224213,0.215868,0.215867,0.215868,0.219866,0.215867,0.215867,0.218401,0.00287



Timings (in seconds) for enhanced algorithm with e = 0.01 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.0109928,0.010998,0.011992,0.0119944,0.010993,0.0109923,0.0109947,0.0109935,0.0109932,0.0109947,0.011194,0.0004
test,0.204145,0.206867,0.202917,0.207872,0.203135,0.206871,0.205455,0.205872,0.203872,0.205872,0.205288,0.001611
total,0.215138,0.217865,0.214909,0.219866,0.214128,0.217863,0.21645,0.216866,0.214865,0.216867,0.216482,0.001674



Timings (in seconds) for original algorithm with e = 0.005 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.0109935,0.0109923,0.0109918,0.0109932,0.0109923,0.010993,0.010994,0.0109935,0.0119915,0.0109935,0.011093,0.0003
test,0.205873,0.216135,0.204876,0.202874,0.210876,0.214994,0.205873,0.203876,0.205993,0.208874,0.208024,0.004362
total,0.216867,0.227127,0.215868,0.213868,0.221869,0.225987,0.216867,0.214869,0.217984,0.219867,0.219117,0.004325



Timings (in seconds) for enhanced algorithm with e = 0.005 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.0109954,0.0109925,0.0113151,0.0119927,0.010993,0.0119922,0.0109932,0.0109951,0.0109916,0.0109947,0.011226,0.000395
test,0.211869,0.209876,0.214864,0.210874,0.204875,0.211587,0.204874,0.206871,0.206874,0.204126,0.208669,0.003455
total,0.222865,0.220868,0.22618,0.222866,0.215868,0.223579,0.215867,0.217866,0.217865,0.215121,0.219895,0.003676



Timings (in seconds) for original algorithm with e = 0.001 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.0109923,0.0109916,0.011992,0.0119934,0.0119917,0.0109944,0.0109932,0.0109906,0.0109921,0.0109935,0.011292,0.000458
test,0.216379,0.215876,0.211157,0.204873,0.204889,0.205879,0.204131,0.220865,0.204874,0.207148,0.209607,0.005756
total,0.227371,0.226867,0.223149,0.216866,0.21688,0.216874,0.215124,0.231856,0.215866,0.218142,0.2209,0.005635



Timings (in seconds) for enhanced algorithm with e = 0.001 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.0109928,0.00999475,0.0109923,0.0109944,0.0109918,0.0109932,0.0109937,0.0109925,0.0119934,0.0119915,0.011093,0.000538
test,0.205872,0.211869,0.202876,0.207177,0.204878,0.209871,0.204872,0.203875,0.221861,0.216867,0.209002,0.005892
total,0.216865,0.221863,0.213868,0.218171,0.21587,0.220864,0.215866,0.214868,0.233855,0.228859,0.220095,0.006211



Timings (in seconds) for original algorithm with e = 1e-10 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.0109937,0.0109937,0.0109923,0.0109951,0.0129919,0.0109954,0.0119925,0.00999331,0.0109921,0.0119948,0.011293,0.000781
test,0.21218,0.219866,0.204876,0.203874,0.217868,0.205871,0.205875,0.204164,0.205931,0.204873,0.208538,0.005633
total,0.223174,0.230859,0.215869,0.214869,0.23086,0.216866,0.217868,0.214157,0.216923,0.216867,0.219831,0.005974



Timings (in seconds) for enhanced algorithm with e = 1e-10 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.010994,0.0109935,0.0119915,0.0110006,0.0109932,0.010993,0.0109928,0.0109951,0.0109923,0.0109899,0.011094,0.000299
test,0.204873,0.213882,0.212869,0.201868,0.206161,0.205873,0.205875,0.204877,0.209289,0.207874,0.207344,0.003539
total,0.215867,0.224876,0.22486,0.212868,0.217155,0.216866,0.216867,0.215872,0.220282,0.218864,0.218438,0.003702



Original Algorithm:


Unnamed: 0,accuracy 1,accuracy 2,accuracy 3,accuracy 4,accuracy 5,accuracy 6,accuracy 7,accuracy 8,accuracy 9,accuracy 10,min,max,mean,mean_diff,std
,0.512315,0.508621,0.514778,0.538177,0.5,0.508621,0.523399,0.533251,0.512315,0.518382,0.5,0.538177,0.517336,,0.012812
1.0,0.512315,0.508621,0.51601,0.538177,0.5,0.508621,0.523399,0.533251,0.512315,0.518382,0.5,0.538177,0.517439,0.000103,0.138809
0.75,0.512315,0.508621,0.514778,0.538177,0.5,0.508621,0.523399,0.533251,0.512315,0.518382,0.5,0.538177,0.517336,-0.000103,0.138838
0.5,0.512315,0.508621,0.517241,0.538177,0.5,0.508621,0.523399,0.533251,0.512315,0.518382,0.5,0.538177,0.517542,0.000205,0.138809
0.25,0.512315,0.509852,0.514778,0.538177,0.5,0.508621,0.522167,0.533251,0.513547,0.518382,0.5,0.538177,0.517439,-0.000103,0.138853
0.1,0.511084,0.509852,0.514778,0.539409,0.507389,0.514778,0.522167,0.533251,0.508621,0.520833,0.507389,0.539409,0.51908,0.001641,0.138738
0.05,0.520936,0.504926,0.514778,0.538177,0.506158,0.509852,0.525862,0.534483,0.508621,0.523284,0.504926,0.538177,0.519182,0.000102,0.139232
0.01,0.508621,0.511084,0.518473,0.539409,0.503695,0.51601,0.525862,0.524631,0.514778,0.523284,0.503695,0.539409,0.519079,-0.000103,0.139199
0.005,0.5,0.513547,0.514778,0.536946,0.524631,0.507389,0.507389,0.498768,0.504926,0.515931,0.498768,0.536946,0.513335,-0.005744,0.139282
0.001,0.513547,0.492611,0.514778,0.525862,0.514778,0.5,0.525862,0.527094,0.513547,0.522059,0.492611,0.527094,0.514154,0.000819,0.137699



Enhanced Algorithm:


Unnamed: 0,accuracy 1,accuracy 2,accuracy 3,accuracy 4,accuracy 5,accuracy 6,accuracy 7,accuracy 8,accuracy 9,accuracy 10,min,max,mean,mean_diff,std
,0.512315,0.508621,0.514778,0.538177,0.5,0.508621,0.523399,0.533251,0.512315,0.518382,0.5,0.538177,0.517336,,0.012812
1.0,0.501232,0.507389,0.507389,0.460591,0.492611,0.491379,0.525862,0.471675,0.486453,0.476716,0.460591,0.525862,0.492313,-0.025024,0.139786
0.75,0.487685,0.492611,0.492611,0.539409,0.492611,0.51601,0.525862,0.528325,0.486453,0.477941,0.477941,0.539409,0.504739,0.012426,0.133329
0.5,0.512315,0.492611,0.524631,0.536946,0.492611,0.48399,0.525862,0.528325,0.513547,0.477941,0.477941,0.536946,0.508639,0.0039,0.136481
0.25,0.512315,0.507389,0.507389,0.461823,0.492611,0.51601,0.495074,0.471675,0.513547,0.477941,0.461823,0.51601,0.494467,-0.014172,0.137311
0.1,0.487685,0.490148,0.492611,0.460591,0.507389,0.51601,0.474138,0.498768,0.486453,0.477941,0.460591,0.51601,0.489028,-0.005439,0.133274
0.05,0.487685,0.525862,0.497537,0.51601,0.492611,0.48399,0.470443,0.471675,0.486453,0.477941,0.470443,0.525862,0.492209,0.003181,0.132032
0.01,0.487685,0.492611,0.525862,0.502463,0.507389,0.48399,0.520936,0.471675,0.513547,0.5,0.471675,0.525862,0.500308,0.008098,0.132738
0.005,0.487685,0.507389,0.507389,0.522167,0.507389,0.51601,0.474138,0.528325,0.51601,0.509804,0.474138,0.528325,0.506564,0.006256,0.134813
0.001,0.48399,0.507389,0.507389,0.523399,0.492611,0.51601,0.527094,0.528325,0.524631,0.477941,0.477941,0.528325,0.50792,0.001356,0.136626



Wall time: 1min 1s


# Congressional Voting Dataset

In [14]:
%%time

# Define the headings to be used
headings = ['party', 'handicapped-infants', 'water-project-cost-sharing', 'adoption-of-the-budget-resolution', 'physician-fee-freeze', 'el-salvador-aid', 'religious-groups-in-schools', 'anti-satellite-test-ban', 'aid-to-nicaraguan-contras', 'mx-missile', 'immigration', 'synfuels-corporation-cutback', 'education-spending', 'superfund-right-to-sue', 'crime', 'duty-free-exports', 'export-administration-act-south-africa']

# Read in the official training and testing data separately
congressional_votes = pd.read_csv('datasets/Congressional_Voting_Records/house-votes-84.data', names=headings)

# Apply some pre-processing
for heading in headings:
    congressional_votes[heading] = congressional_votes[heading].astype('category').str.strip()
    
# Convert every '?' to NaN, so Pandas' built-in functions can be used
congressional_votes = congressional_votes.replace({'?': np.NaN})

# Find out which features contain missing (NaN) values
nan_features = congressional_votes.columns[congressional_votes.isnull().any()]
nan_features_count = congressional_votes[nan_features].isnull().sum()

print("Number of records with NaN values, per feature:")
display(nan_features_count)
print()

# Get the number of records in the data set
n_records_before = congressional_votes.shape[0]

# Drop the rows with missing values, since they occur in multiple features
congressional_votes = congressional_votes.dropna()

# Get the number of records in the data set
n_records_after = congressional_votes.shape[0]

# Print how many records containing NaN values got dropped
n_records_dropped = n_records_before - n_records_after
print(n_records_dropped, "records were dropped due to missing values.")
print("This is", round(n_records_dropped / n_records_before * 100, 1), "% of the entire data set.")
print("The resulting data set contains", n_records_after, "records.")

# Reset the indices
congressional_votes = congressional_votes.reset_index(drop=True)

# Print a newline to separate outputs
print()

Number of records with NaN values, per feature:


handicapped-infants                        12
water-project-cost-sharing                 48
adoption-of-the-budget-resolution          11
physician-fee-freeze                       11
el-salvador-aid                            15
religious-groups-in-schools                11
anti-satellite-test-ban                    14
aid-to-nicaraguan-contras                  15
mx-missile                                 22
immigration                                 7
synfuels-corporation-cutback               21
education-spending                         31
superfund-right-to-sue                     25
crime                                      17
duty-free-exports                          28
export-administration-act-south-africa    104
dtype: int64


203 records were dropped due to missing values.
This is 46.7 % of the entire data set.
The resulting data set contains 232 records.

Wall time: 34 ms


In [15]:
display(congressional_votes.head())

congressional_votes.shape

Unnamed: 0,party,handicapped-infants,water-project-cost-sharing,adoption-of-the-budget-resolution,physician-fee-freeze,el-salvador-aid,religious-groups-in-schools,anti-satellite-test-ban,aid-to-nicaraguan-contras,mx-missile,immigration,synfuels-corporation-cutback,education-spending,superfund-right-to-sue,crime,duty-free-exports,export-administration-act-south-africa
0,democrat,n,y,y,n,y,y,n,n,n,n,n,n,y,y,y,y
1,republican,n,y,n,y,y,y,n,n,n,n,n,y,y,y,n,y
2,democrat,y,y,y,n,n,n,y,y,y,n,y,n,n,n,y,y
3,democrat,y,y,y,n,n,n,y,y,y,n,n,n,n,n,y,y
4,democrat,y,n,y,n,n,n,y,y,y,y,n,n,n,n,y,y


(232, 17)

In [16]:
%%time

run_classifier(congressional_votes, 'party', 'datasets/Congressional_Voting_Records/buckets_congressional_voting.csv')

Timings (in seconds) for original algorithm with e = None :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.00699425,0.0059967,0.00699615,0.00699782,0.0059936,0.0059948,0.00599837,0.00599718,0.00599742,0.00699449,0.006396,0.00049
test,0.108935,0.111932,0.100175,0.0999386,0.0989392,0.100938,0.0999367,0.0989399,0.103935,0.0999897,0.102366,0.0043
total,0.115929,0.117928,0.107172,0.106936,0.104933,0.106932,0.105935,0.104937,0.109932,0.106984,0.108762,0.004318



Timings (in seconds) for enhanced algorithm with e = None :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.00599623,0.00699759,0.0069952,0.00599813,0.00699425,0.00599766,0.00599766,0.00599432,0.00699401,0.00699568,0.006496,0.000499
test,0.100177,0.102936,0.0999393,0.100935,0.0979402,0.0999367,0.0999379,0.0989418,0.0989397,0.0982685,0.099795,0.001364
total,0.106173,0.109934,0.106935,0.106933,0.104934,0.105934,0.105936,0.104936,0.105934,0.105264,0.106291,0.001386



Timings (in seconds) for original algorithm with e = 1.0 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.00599813,0.00599813,0.00699759,0.00599623,0.0069952,0.00599647,0.00599813,0.00599647,0.00599504,0.00599623,0.006197,0.0004
test,0.104934,0.106933,0.102937,0.104934,0.104933,0.103937,0.103935,0.10319,0.104937,0.125921,0.106659,0.006509
total,0.110932,0.112931,0.109935,0.11093,0.111928,0.109934,0.109933,0.109186,0.110932,0.131917,0.112856,0.006437



Timings (in seconds) for enhanced algorithm with e = 1.0 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.00600147,0.00699854,0.00599623,0.0069952,0.0069952,0.00699615,0.00599551,0.00699592,0.00799322,0.00699568,0.006796,0.000599
test,0.102937,0.102938,0.103936,0.104934,0.121957,0.102231,0.104935,0.104709,0.106935,0.103935,0.105945,0.005485
total,0.108938,0.109936,0.109932,0.11193,0.128952,0.109227,0.110931,0.111705,0.114928,0.110931,0.112741,0.005643



Timings (in seconds) for original algorithm with e = 0.75 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.0069952,0.0059967,0.00699544,0.00599623,0.0069952,0.00699735,0.0059967,0.0059967,0.00799394,0.00699639,0.006696,0.00064
test,0.102938,0.106936,0.103937,0.103221,0.103937,0.105933,0.108932,0.103936,0.105936,0.103935,0.104964,0.001813
total,0.109933,0.112932,0.110933,0.109217,0.110932,0.112931,0.114929,0.109932,0.11393,0.110931,0.11166,0.001805



Timings (in seconds) for enhanced algorithm with e = 0.75 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.0069952,0.00699663,0.00699401,0.00699592,0.00599742,0.00599885,0.00699568,0.00599551,0.00699759,0.00699592,0.006696,0.000458
test,0.103937,0.105414,0.102938,0.104936,0.105933,0.103935,0.102937,0.102937,0.105933,0.103203,0.10421,0.001181
total,0.110933,0.112411,0.109932,0.111932,0.11193,0.109934,0.109933,0.108932,0.112931,0.110199,0.110907,0.001253



Timings (in seconds) for original algorithm with e = 0.5 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.00699711,0.00599957,0.00599551,0.00599551,0.00699544,0.00699735,0.00699425,0.00699759,0.00599766,0.00699425,0.006596,0.000489
test,0.103278,0.105932,0.109933,0.102938,0.102937,0.103936,0.103938,0.10319,0.103936,0.102937,0.104296,0.002065
total,0.110276,0.111932,0.115929,0.108934,0.109932,0.110933,0.110932,0.110187,0.109934,0.109931,0.110892,0.001842



Timings (in seconds) for enhanced algorithm with e = 0.5 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.00699592,0.00699639,0.00599456,0.0059967,0.00599694,0.00699425,0.00699687,0.00599551,0.0059979,0.00599861,0.006396,0.000489
test,0.103936,0.104935,0.103937,0.103936,0.103937,0.102938,0.103275,0.103937,0.105447,0.102936,0.103922,0.000756
total,0.110932,0.111932,0.109932,0.109932,0.109934,0.109933,0.110272,0.109933,0.111445,0.108934,0.110318,0.000831



Timings (in seconds) for original algorithm with e = 0.25 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.00699663,0.00799656,0.00599623,0.00599504,0.00699806,0.00599694,0.00699377,0.00599813,0.0059998,0.0159898,0.007496,0.002905
test,0.124922,0.108939,0.115928,0.107935,0.11393,0.108934,0.105937,0.103934,0.132919,0.115931,0.113931,0.008596
total,0.131918,0.116936,0.121924,0.11393,0.120928,0.114931,0.112931,0.109932,0.138918,0.131921,0.121427,0.009217



Timings (in seconds) for enhanced algorithm with e = 0.25 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.00699639,0.00599599,0.00699544,0.00699544,0.00699449,0.00799513,0.00699568,0.00599456,0.00599623,0.00599766,0.006696,0.00064
test,0.105933,0.105196,0.120924,0.120929,0.106934,0.104935,0.104935,0.103936,0.105625,0.104152,0.10835,0.00634
total,0.112929,0.111192,0.12792,0.127925,0.113928,0.11293,0.111931,0.109931,0.111621,0.110149,0.115046,0.006544



Timings (in seconds) for original algorithm with e = 0.1 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.00699449,0.00699496,0.00699401,0.00699496,0.00699496,0.00599718,0.00699568,0.00600505,0.00699449,0.00699687,0.006796,0.000398
test,0.106936,0.108935,0.102938,0.104937,0.105934,0.107636,0.104935,0.104172,0.10349,0.107934,0.105785,0.001916
total,0.11393,0.11593,0.109932,0.111932,0.112929,0.113633,0.111931,0.110177,0.110484,0.114931,0.112581,0.001945



Timings (in seconds) for enhanced algorithm with e = 0.1 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.00699568,0.00599742,0.00599766,0.00599599,0.00699449,0.00699568,0.0059967,0.00599551,0.0069952,0.00699735,0.006496,0.0005
test,0.102935,0.103936,0.104935,0.104937,0.102938,0.105214,0.105937,0.103938,0.103936,0.102935,0.104164,0.001006
total,0.109931,0.109933,0.110932,0.110933,0.109932,0.11221,0.111934,0.109933,0.110931,0.109932,0.11066,0.00083



Timings (in seconds) for original algorithm with e = 0.05 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.00699496,0.00599575,0.00899482,0.00799417,0.00699568,0.00699711,0.00599694,0.00699711,0.00599504,0.00599623,0.006896,0.000943
test,0.105936,0.113933,0.151273,0.105938,0.104935,0.108932,0.105935,0.102937,0.104935,0.104934,0.110969,0.013737
total,0.112931,0.119929,0.160268,0.113932,0.11193,0.115929,0.111932,0.109934,0.11093,0.11093,0.117864,0.014406



Timings (in seconds) for enhanced algorithm with e = 0.05 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.00699639,0.00730348,0.00599551,0.00599694,0.00699353,0.00699401,0.0069952,0.00599766,0.0069952,0.00699425,0.006726,0.000486
test,0.103936,0.105933,0.104935,0.105935,0.106935,0.103937,0.103936,0.104934,0.105937,0.103251,0.104967,0.001132
total,0.110932,0.113237,0.11093,0.111932,0.113928,0.110931,0.110932,0.110932,0.112932,0.110245,0.111693,0.001182



Timings (in seconds) for original algorithm with e = 0.01 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.00699568,0.00699353,0.00599408,0.00699639,0.00699687,0.0059948,0.00699735,0.00599599,0.00799155,0.00599623,0.006695,0.00064
test,0.103936,0.106936,0.103938,0.102935,0.102936,0.105935,0.103934,0.127157,0.13086,0.116928,0.110549,0.010052
total,0.110931,0.113929,0.109932,0.109931,0.109933,0.11193,0.110931,0.133153,0.138851,0.122924,0.117245,0.010156



Timings (in seconds) for enhanced algorithm with e = 0.01 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.00699615,0.00799251,0.00699568,0.0119927,0.00699711,0.00699496,0.00699306,0.00899363,0.00599551,0.00743961,0.007739,0.001601
test,0.112201,0.112943,0.114929,0.114929,0.104664,0.104479,0.10383,0.104936,0.106447,0.104935,0.108429,0.004454
total,0.119198,0.120936,0.121925,0.126922,0.111661,0.111474,0.110823,0.11393,0.112442,0.112375,0.116169,0.005337



Timings (in seconds) for original algorithm with e = 0.005 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.00599813,0.00600004,0.00599647,0.00699449,0.0069952,0.0069952,0.00650978,0.00699592,0.00704408,0.00600696,0.006554,0.000474
test,0.105937,0.106193,0.105188,0.105935,0.149925,0.117928,0.115038,0.113931,0.104937,0.102926,0.112794,0.013289
total,0.111936,0.112193,0.111185,0.112929,0.15692,0.124923,0.121548,0.120927,0.111981,0.108933,0.119348,0.013516



Timings (in seconds) for enhanced algorithm with e = 0.005 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.00599837,0.00699592,0.00699568,0.00699735,0.00599504,0.00599599,0.00599813,0.00699639,0.00599694,0.00599742,0.006397,0.00049
test,0.104208,0.104936,0.105475,0.103935,0.105458,0.104935,0.105935,0.103935,0.103194,0.10894,0.105095,0.001514
total,0.110207,0.111932,0.112471,0.110932,0.111453,0.110931,0.111933,0.110932,0.109191,0.114938,0.111492,0.001453



Timings (in seconds) for original algorithm with e = 0.001 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.00599456,0.00699496,0.00699711,0.00699568,0.00699568,0.00699663,0.00723934,0.00599527,0.00599623,0.0069952,0.00672,0.00048
test,0.116929,0.111439,0.107255,0.105935,0.107137,0.104934,0.103468,0.10388,0.107932,0.103448,0.107236,0.003986
total,0.122924,0.118433,0.114252,0.112931,0.114133,0.11193,0.110707,0.109875,0.113928,0.110444,0.113956,0.003812



Timings (in seconds) for enhanced algorithm with e = 0.001 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.00699735,0.0059967,0.00699353,0.00599766,0.00699759,0.00699401,0.00599551,0.00699663,0.00699568,0.00599861,0.006596,0.000489
test,0.103822,0.104173,0.104937,0.104825,0.102921,0.103941,0.105944,0.107651,0.106939,0.106936,0.105209,0.001501
total,0.11082,0.110169,0.11193,0.110823,0.109919,0.110935,0.11194,0.114648,0.113935,0.112935,0.111805,0.001512



Timings (in seconds) for original algorithm with e = 1e-10 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.00699496,0.0059917,0.00699735,0.00599623,0.00599456,0.00599623,0.00599647,0.0069952,0.00599408,0.0069952,0.006395,0.00049
test,0.107934,0.108162,0.105269,0.106117,0.10796,0.108933,0.106964,0.106941,0.104943,0.105339,0.106856,0.001321
total,0.114929,0.114154,0.112266,0.112113,0.113955,0.114929,0.112961,0.113936,0.110937,0.112334,0.113251,0.001263



Timings (in seconds) for enhanced algorithm with e = 1e-10 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.00599432,0.00699711,0.00599694,0.0059886,0.00699306,0.00699329,0.00699854,0.00599623,0.00599647,0.00599766,0.006395,0.00049
test,0.104225,0.105945,0.112414,0.103441,0.103634,0.104936,0.104933,0.104106,0.104762,0.103935,0.105233,0.002494
total,0.11022,0.112942,0.118411,0.109429,0.110627,0.11193,0.111931,0.110102,0.110758,0.109932,0.111628,0.002484



Original Algorithm:


Unnamed: 0,accuracy 1,accuracy 2,accuracy 3,accuracy 4,accuracy 5,accuracy 6,accuracy 7,accuracy 8,accuracy 9,accuracy 10,min,max,mean,mean_diff,std
,0.434783,0.304348,0.521739,0.565217,0.565217,0.347826,0.478261,0.478261,0.565217,0.6,0.304348,0.6,0.480435,,0.105496
1.0,0.434783,0.304348,0.521739,0.565217,0.565217,0.304348,0.434783,0.478261,0.565217,0.6,0.304348,0.6,0.473188,-0.007246,0.167046
0.75,0.434783,0.304348,0.521739,0.521739,0.565217,0.347826,0.478261,0.478261,0.565217,0.64,0.304348,0.64,0.483478,0.01029,0.165995
0.5,0.434783,0.304348,0.521739,0.565217,0.565217,0.347826,0.478261,0.521739,0.565217,0.6,0.304348,0.6,0.484058,0.00058,0.164586
0.25,0.434783,0.304348,0.521739,0.521739,0.565217,0.304348,0.478261,0.478261,0.565217,0.6,0.304348,0.6,0.473188,-0.01087,0.166049
0.1,0.434783,0.347826,0.521739,0.478261,0.565217,0.347826,0.478261,0.521739,0.521739,0.6,0.347826,0.6,0.480435,0.007246,0.152835
0.05,0.434783,0.304348,0.565217,0.521739,0.521739,0.347826,0.391304,0.521739,0.478261,0.6,0.304348,0.6,0.465942,-0.014493,0.162277
0.01,0.347826,0.217391,0.521739,0.521739,0.304348,0.347826,0.434783,0.521739,0.478261,0.56,0.217391,0.56,0.41942,-0.046522,0.171274
0.005,0.521739,0.478261,0.391304,0.652174,0.304348,0.304348,0.434783,0.608696,0.565217,0.56,0.304348,0.652174,0.481449,0.062029,0.165728
0.001,0.391304,0.478261,0.391304,0.347826,0.695652,0.608696,0.26087,0.521739,0.521739,0.52,0.26087,0.695652,0.474493,-0.006957,0.187857



Enhanced Algorithm:


Unnamed: 0,accuracy 1,accuracy 2,accuracy 3,accuracy 4,accuracy 5,accuracy 6,accuracy 7,accuracy 8,accuracy 9,accuracy 10,min,max,mean,mean_diff,std
,0.434783,0.304348,0.521739,0.565217,0.565217,0.347826,0.478261,0.478261,0.565217,0.6,0.304348,0.6,0.480435,,0.105496
1.0,0.608696,0.478261,0.608696,0.347826,0.304348,0.391304,0.73913,0.608696,0.478261,0.48,0.304348,0.73913,0.507391,0.026957,0.191424
0.75,0.521739,0.304348,0.608696,0.347826,0.304348,0.608696,0.26087,0.608696,0.478261,0.52,0.26087,0.608696,0.452754,-0.054638,0.191142
0.5,0.391304,0.521739,0.608696,0.347826,0.695652,0.608696,0.26087,0.608696,0.478261,0.48,0.26087,0.695652,0.496522,0.043768,0.186648
0.25,0.608696,0.521739,0.608696,0.652174,0.304348,0.391304,0.73913,0.391304,0.478261,0.56,0.304348,0.73913,0.524928,0.028406,0.19362
0.1,0.391304,0.521739,0.391304,0.347826,0.695652,0.608696,0.26087,0.391304,0.478261,0.48,0.26087,0.695652,0.46029,-0.064638,0.196044
0.05,0.391304,0.521739,0.391304,0.652174,0.304348,0.608696,0.26087,0.608696,0.478261,0.52,0.26087,0.652174,0.47087,0.01058,0.182739
0.01,0.608696,0.521739,0.391304,0.652174,0.304348,0.391304,0.73913,0.608696,0.521739,0.52,0.304348,0.73913,0.525217,0.054348,0.1883
0.005,0.608696,0.478261,0.608696,0.652174,0.304348,0.391304,0.26087,0.608696,0.478261,0.48,0.26087,0.652174,0.482029,-0.043188,0.196084
0.001,0.608696,0.478261,0.608696,0.347826,0.304348,0.391304,0.26087,0.391304,0.521739,0.52,0.26087,0.608696,0.441884,-0.040145,0.177904



Wall time: 27.6 s


# Nursery Dataset

In [17]:
%%time

# Define the headings to be used
headings = ['parents', 'has_nurs', 'form', 'children', 'housing', 'finance', 'social', 'health']

# Read in the official training and testing data separately
nursery_data = pd.read_csv('datasets/Nursery/nursery.data', names=headings)

# Apply some pre-processing
for heading in headings:
    nursery_data[heading] = nursery_data[heading].astype('category').str.strip()
    
# Convert every '?' to NaN, so Pandas' built-in functions can be used
nursery_data = nursery_data.replace({'?': np.NaN})

# Find out which features contain missing (NaN) values
nan_features = nursery_data.columns[nursery_data.isnull().any()]
nan_features_count = nursery_data[nan_features].isnull().sum()

print("Number of records with NaN values, per feature:")
display(nan_features_count)
print()

# Get the number of records in the data set
n_records_before = nursery_data.shape[0]

# Drop the rows with missing values, since they occur in multiple features
nursery_data = nursery_data.dropna()

# Get the number of records in the data set
n_records_after = nursery_data.shape[0]

# Print how many records containing NaN values got dropped
n_records_dropped = n_records_before - n_records_after
print(n_records_dropped, "records were dropped due to missing values.")
print("This is", round(n_records_dropped / n_records_before * 100, 1), "% of the entire data set.")
print("The resulting data set contains", n_records_after, "records.")

# Reset the indices
nursery_data = nursery_data.reset_index(drop=True)

# Print a newline to separate outputs
print()

Number of records with NaN values, per feature:


Series([], dtype: float64)


0 records were dropped due to missing values.
This is 0.0 % of the entire data set.
The resulting data set contains 12960 records.

Wall time: 93.5 ms


In [18]:
display(nursery_data.head())

nursery_data.shape

Unnamed: 0,parents,has_nurs,form,children,housing,finance,social,health
0,proper,complete,1,convenient,convenient,nonprob,recommended,recommend
1,proper,complete,1,convenient,convenient,nonprob,priority,priority
2,proper,complete,1,convenient,convenient,nonprob,not_recom,not_recom
3,proper,complete,1,convenient,convenient,slightly_prob,recommended,recommend
4,proper,complete,1,convenient,convenient,slightly_prob,priority,priority


(12960, 8)

In [19]:
%%time

run_classifier(nursery_data, 'health', 'datasets/Nursery/buckets_nursery.csv')

Timings (in seconds) for original algorithm with e = None :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.0169892,0.00999236,0.00999379,0.00899267,0.00999403,0.0119934,0.00999403,0.0119917,0.0109925,0.0109928,0.011193,0.002134
test,0.205872,0.197211,0.198489,0.194766,0.198222,0.239851,0.201875,0.278066,0.288939,0.238043,0.224133,0.033607
total,0.222862,0.207203,0.208483,0.203759,0.208216,0.251844,0.211869,0.290058,0.299931,0.249036,0.235326,0.034027



Timings (in seconds) for enhanced algorithm with e = None :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.00999379,0.0139918,0.0179899,0.0199881,0.00999379,0.0109937,0.0139923,0.0159898,0.0129926,0.0119934,0.013792,0.003186
test,0.230092,0.251846,0.335793,0.314806,0.237855,0.22309,0.22486,0.300816,0.237181,0.268836,0.262518,0.038727
total,0.240086,0.265838,0.353783,0.334795,0.247849,0.234084,0.238853,0.316806,0.250174,0.280829,0.27631,0.041453



Timings (in seconds) for original algorithm with e = 1.0 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.0129914,0.0122545,0.0119913,0.010994,0.0109925,0.0129907,0.00999522,0.0129919,0.00999308,0.00999594,0.011519,0.001216
test,0.215869,0.208873,0.207874,0.21187,0.24714,0.209872,0.200877,0.197691,0.197878,0.196877,0.209482,0.014051
total,0.22886,0.221127,0.219865,0.222864,0.258133,0.222862,0.210872,0.210683,0.207871,0.206873,0.221001,0.014247



Timings (in seconds) for enhanced algorithm with e = 1.0 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.010993,0.0102963,0.0109923,0.00999331,0.00999355,0.00899482,0.0109923,0.00999331,0.00999165,0.00999451,0.010224,0.000596
test,0.19988,0.208871,0.20987,0.20288,0.20307,0.209451,0.205873,0.197878,0.19588,0.196877,0.203053,0.005057
total,0.210873,0.219167,0.220863,0.212873,0.213064,0.218446,0.216866,0.207871,0.205871,0.206872,0.213277,0.005127



Timings (in seconds) for original algorithm with e = 0.75 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.0159912,0.0109932,0.00999355,0.00999308,0.00999308,0.0109923,0.00999331,0.00999355,0.00999379,0.0109916,0.010893,0.001757
test,0.202876,0.199305,0.198314,0.196879,0.200139,0.204882,0.203875,0.197877,0.199037,0.198014,0.20012,0.002635
total,0.218867,0.210299,0.208308,0.206872,0.210132,0.215874,0.213868,0.207871,0.209031,0.209006,0.211013,0.003702



Timings (in seconds) for enhanced algorithm with e = 0.75 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.00999379,0.00999379,0.0109932,0.00999522,0.00999427,0.00999379,0.00999594,0.00999475,0.00973082,0.00955296,0.010024,0.000354
test,0.201876,0.196217,0.198387,0.202874,0.196879,0.19722,0.196878,0.196878,0.210871,0.200009,0.199809,0.004283
total,0.21187,0.20621,0.20938,0.212869,0.206874,0.207214,0.206874,0.206873,0.220602,0.209562,0.209833,0.004194



Timings (in seconds) for original algorithm with e = 0.5 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.00980592,0.0109925,0.00999355,0.0109923,0.00999403,0.00999403,0.00899363,0.00999331,0.0169897,0.00999355,0.010774,0.002142
test,0.206872,0.20987,0.201877,0.197878,0.202877,0.201661,0.200886,0.199883,0.196445,0.203875,0.202212,0.003784
total,0.216678,0.220862,0.21187,0.20887,0.212871,0.211655,0.209879,0.209876,0.213434,0.213868,0.212986,0.003405



Timings (in seconds) for enhanced algorithm with e = 0.5 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.00999403,0.00999379,0.0109944,0.00899434,0.00899458,0.00999403,0.00999546,0.00999331,0.00999475,0.00899434,0.009794,0.0006
test,0.197877,0.199876,0.196215,0.203874,0.19588,0.200876,0.201874,0.196879,0.197878,0.198094,0.198932,0.002482
total,0.207871,0.20987,0.207209,0.212868,0.204875,0.21087,0.21187,0.206872,0.207873,0.207089,0.208727,0.002401



Timings (in seconds) for original algorithm with e = 0.25 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.0109894,0.00999331,0.00999331,0.00999379,0.00999308,0.00999188,0.00999379,0.00999403,0.00999427,0.00999355,0.010093,0.000299
test,0.198879,0.196879,0.200132,0.205873,0.211871,0.198923,0.196634,0.199877,0.19688,0.198184,0.200413,0.004586
total,0.209868,0.206872,0.210126,0.215867,0.221864,0.208915,0.206627,0.209871,0.206875,0.208177,0.210506,0.004562



Timings (in seconds) for enhanced algorithm with e = 0.25 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.00999331,0.00999379,0.00999236,0.010994,0.00999665,0.00999355,0.00999403,0.00999284,0.00999355,0.00999308,0.010094,0.0003
test,0.200874,0.197877,0.19688,0.203205,0.200874,0.205873,0.198175,0.198878,0.199876,0.197878,0.200039,0.002639
total,0.210868,0.207871,0.206873,0.214199,0.21087,0.215866,0.208169,0.208871,0.20987,0.207871,0.210133,0.002773



Timings (in seconds) for original algorithm with e = 0.1 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.00999403,0.0099926,0.00999355,0.0109923,0.00999427,0.00999355,0.0109932,0.00999403,0.00999427,0.00999546,0.010194,0.0004
test,0.198879,0.201876,0.200876,0.201174,0.202879,0.197878,0.198877,0.19488,0.197751,0.19889,0.199396,0.002232
total,0.208873,0.211869,0.21087,0.212166,0.212874,0.207871,0.20987,0.204874,0.207745,0.208885,0.20959,0.002322



Timings (in seconds) for enhanced algorithm with e = 0.1 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.00999212,0.00999403,0.0109932,0.00999546,0.00999379,0.00999403,0.00999379,0.00999689,0.00999403,0.00999379,0.010094,0.0003
test,0.202131,0.199877,0.19695,0.197877,0.197879,0.199588,0.19688,0.204108,0.236853,0.262344,0.209449,0.02098
total,0.212123,0.209871,0.207943,0.207873,0.207873,0.209582,0.206874,0.214105,0.246847,0.272338,0.219543,0.020923



Timings (in seconds) for original algorithm with e = 0.05 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.0119913,0.0109932,0.010994,0.00999427,0.00999355,0.0189893,0.00999451,0.0149922,0.010993,0.00999355,0.011893,0.002772
test,0.212202,0.21287,0.21087,0.204117,0.238856,0.23613,0.212869,0.20887,0.213869,0.204875,0.215553,0.011429
total,0.224194,0.223863,0.221864,0.214111,0.24885,0.255119,0.222863,0.223862,0.224862,0.214868,0.227446,0.012865



Timings (in seconds) for enhanced algorithm with e = 0.05 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.0109942,0.00999331,0.0109932,0.00999427,0.00999451,0.00999355,0.00999522,0.0103176,0.0099926,0.00999355,0.010226,0.000396
test,0.206191,0.209872,0.204874,0.204093,0.200876,0.199878,0.203874,0.205872,0.197879,0.201876,0.203528,0.003318
total,0.217185,0.219865,0.215867,0.214087,0.210871,0.209871,0.213869,0.216189,0.207871,0.211869,0.213755,0.003481



Timings (in seconds) for original algorithm with e = 0.01 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.00999403,0.0109937,0.00999427,0.0109932,0.00999403,0.00999427,0.00999403,0.00999427,0.00999427,0.00999308,0.010194,0.0004
test,0.207134,0.199877,0.218868,0.207525,0.207872,0.201875,0.200876,0.209141,0.206872,0.207872,0.206791,0.005112
total,0.217129,0.21087,0.228862,0.218518,0.217866,0.21187,0.21087,0.219136,0.216866,0.217865,0.216985,0.005006



Timings (in seconds) for enhanced algorithm with e = 0.01 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.00999379,0.0109935,0.00998449,0.0119922,0.00899529,0.00999403,0.00999546,0.0109906,0.0109947,0.00999379,0.010393,0.0008
test,0.200094,0.206873,0.221864,0.206872,0.216866,0.257841,0.201876,0.202876,0.199877,0.20987,0.212491,0.016588
total,0.210088,0.217866,0.231848,0.218864,0.225862,0.267835,0.211871,0.213867,0.210872,0.219863,0.222884,0.016343



Timings (in seconds) for original algorithm with e = 0.005 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.00999427,0.0099864,0.0109921,0.00999379,0.0119922,0.00999379,0.0109918,0.00999475,0.00999331,0.0109942,0.010493,0.000671
test,0.217148,0.206873,0.20987,0.205096,0.212871,0.199878,0.195881,0.21087,0.19788,0.209826,0.206619,0.006545
total,0.227142,0.21686,0.220862,0.21509,0.224863,0.209872,0.206873,0.220865,0.207873,0.22082,0.217112,0.006703



Timings (in seconds) for enhanced algorithm with e = 0.005 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.0109928,0.00999379,0.00999403,0.00999331,0.0109932,0.00999331,0.0109949,0.00999379,0.00999331,0.0109942,0.010394,0.00049
test,0.195354,0.20262,0.196878,0.198879,0.200876,0.198877,0.198878,0.202211,0.198882,0.198879,0.199233,0.002104
total,0.206347,0.212613,0.206872,0.208872,0.21187,0.20887,0.209873,0.212205,0.208876,0.209873,0.209627,0.002019



Timings (in seconds) for original algorithm with e = 0.001 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.0119939,0.00999355,0.00999355,0.00999284,0.00999475,0.0109925,0.00999355,0.00999403,0.00999498,0.0119913,0.010493,0.000806
test,0.20415,0.197877,0.199878,0.202875,0.199474,0.197878,0.196878,0.196878,0.201202,0.197874,0.199496,0.002402
total,0.216144,0.207871,0.209871,0.212868,0.209469,0.208871,0.206872,0.206872,0.211197,0.209865,0.20999,0.002704



Timings (in seconds) for enhanced algorithm with e = 0.001 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.00999355,0.00999284,0.00999403,0.00999522,0.00999331,0.00999403,0.0101357,0.00999475,0.00999331,0.00999379,0.010008,4.3e-05
test,0.198877,0.198163,0.200772,0.205874,0.197142,0.205364,0.195879,0.200996,0.197266,0.196878,0.199721,0.003333
total,0.208871,0.208156,0.210766,0.215869,0.207135,0.215358,0.206015,0.210991,0.207259,0.206872,0.209729,0.003317



Timings (in seconds) for original algorithm with e = 1e-10 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.00999427,0.00999475,0.00994396,0.00999546,0.010993,0.0103629,0.00999403,0.00999451,0.00999427,0.00999403,0.010126,0.00031
test,0.202875,0.198265,0.198877,0.200875,0.196878,0.202875,0.200019,0.199876,0.20033,0.209871,0.201074,0.003425
total,0.21287,0.20826,0.208821,0.210871,0.207871,0.213238,0.210013,0.209871,0.210324,0.219865,0.2112,0.003339



Timings (in seconds) for enhanced algorithm with e = 1e-10 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.00999355,0.0109925,0.0109935,0.00999379,0.00999308,0.00999403,0.00999475,0.00999379,0.00999331,0.00999427,0.010194,0.0004
test,0.196878,0.199877,0.21087,0.199876,0.197878,0.199666,0.201875,0.199879,0.197879,0.202362,0.200704,0.003758
total,0.206872,0.21087,0.221864,0.20987,0.207871,0.20966,0.21187,0.209873,0.207872,0.212356,0.210898,0.004019



Original Algorithm:


Unnamed: 0,accuracy 1,accuracy 2,accuracy 3,accuracy 4,accuracy 5,accuracy 6,accuracy 7,accuracy 8,accuracy 9,accuracy 10,min,max,mean,mean_diff,std
,0.318673,0.298611,0.3125,0.307099,0.311728,0.324846,0.298611,0.332562,0.328704,0.314815,0.298611,0.332562,0.314943,,0.012248
1.0,0.239198,0.295525,0.309414,0.131173,0.260031,0.229167,0.248457,0.289352,0.290123,0.260802,0.131173,0.309414,0.249486,-0.065458,0.1013
0.75,0.205247,0.300154,0.161265,0.310957,0.165895,0.283179,0.277006,0.031636,0.327932,0.292438,0.031636,0.327932,0.226273,-0.023212,0.119411
0.5,0.212963,0.292438,0.256173,0.307099,0.267747,0.32716,0.145833,0.334877,0.079475,0.214506,0.079475,0.334877,0.237719,0.011445,0.104935
0.25,0.28858,0.089506,0.3125,0.283179,0.273148,0.016975,0.259259,0.310185,0.334877,0.26466,0.016975,0.334877,0.23206,-0.005658,0.126592
0.1,0.320216,0.125,0.281636,0.212191,0.277778,0.321759,0.287037,0.287809,0.315586,0.304012,0.125,0.321759,0.264982,0.032922,0.090729
0.05,0.321759,0.232253,0.306327,0.304784,0.249228,0.175154,0.249228,0.340278,0.290123,0.28858,0.175154,0.340278,0.272762,0.00778,0.088241
0.01,0.32716,0.326389,0.236883,0.021605,0.064043,0.263889,0.307099,0.239969,0.287037,0.290895,0.021605,0.32716,0.226145,-0.046618,0.131757
0.005,0.017747,0.263117,0.290123,0.111111,0.29321,0.324846,0.277778,0.005401,0.27392,0.259259,0.005401,0.324846,0.203897,-0.022248,0.133345
0.001,0.326389,0.141975,0.145833,0.069444,0.155864,0.300154,0.100309,0.108796,0.306327,0.111111,0.069444,0.326389,0.18017,-0.023727,0.109427



Enhanced Algorithm:


Unnamed: 0,accuracy 1,accuracy 2,accuracy 3,accuracy 4,accuracy 5,accuracy 6,accuracy 7,accuracy 8,accuracy 9,accuracy 10,min,max,mean,mean_diff,std
,0.318673,0.298611,0.3125,0.307099,0.311728,0.324846,0.298611,0.332562,0.328704,0.314815,0.298611,0.332562,0.314943,,0.012248
1.0,0.016204,0.002315,0.002315,0.003086,0.16358,0.153549,0.018519,0.241512,0.010031,0.016204,0.002315,0.241512,0.072595,-0.242348,0.123054
0.75,0.026235,0.003086,0.058642,0.152778,0.278549,0.216049,0.00463,0.218364,0.017747,0.006173,0.003086,0.278549,0.105324,0.032729,0.107122
0.5,0.029321,0.002315,0.037037,0.026235,0.003086,0.074846,0.020833,0.020062,0.017747,0.029321,0.002315,0.074846,0.028164,-0.07716,0.036095
0.25,0.005401,0.006173,0.006944,0.198302,0.007716,0.097994,0.002315,0.005401,0.005401,0.176698,0.002315,0.198302,0.059414,0.03125,0.077506
0.1,0.03858,0.003858,0.007716,0.007716,0.003086,0.027778,0.199074,0.146605,0.059414,0.021605,0.003086,0.199074,0.059799,0.000386,0.072074
0.05,0.250772,0.136574,0.171296,0.347222,0.000772,0.017747,0.00463,0.003086,0.023148,0.006173,0.000772,0.347222,0.109118,0.049318,0.128113
0.01,0.122685,0.02392,0.0,0.088735,0.17284,0.00463,0.030093,0.031636,0.000772,0.111111,0.0,0.17284,0.063272,-0.045846,0.068239
0.005,0.023148,0.013117,0.012346,0.102623,0.020833,0.294753,0.233796,0.023148,0.005401,0.037037,0.005401,0.294753,0.088863,0.025592,0.107835
0.001,0.018519,0.297068,0.311728,0.167438,0.006173,0.00463,0.26466,0.122685,0.087963,0.135802,0.00463,0.311728,0.144419,0.055556,0.117469



Wall time: 1min 5s


# Optical Digit Recognition Dataset

In [20]:
%%time

# Define the headings to be used
headings = list(range(0, 64))
headings.append('label')

# Read in the official training and testing data separately
opt_training_data = pd.read_csv('datasets/Optical_Digit_Recognition/optdigits.tra', names=headings)
opt_testing_data = pd.read_csv('datasets/Optical_Digit_Recognition/optdigits.tes', names=headings)

opt_data = opt_training_data.append(opt_testing_data)

# Apply some pre-processing
for heading in headings:
    opt_data[heading] = opt_data[heading].astype('int').apply(pd.to_numeric, downcast="unsigned")
    
# Find out which features contain missing (NaN) values
nan_features = opt_data.columns[opt_data.isnull().any()]
nan_features_count = opt_data[nan_features].isnull().sum()

print("Number of records with NaN values, per feature:")
display(nan_features_count)
print()

# Get the number of records in the data set
n_records_before = opt_data.shape[0]

# Drop the rows with missing values, since they occur in multiple features
opt_data = opt_data.dropna()

# Get the number of records in the data set
n_records_after = opt_data.shape[0]

# Print how many records containing NaN values got dropped
n_records_dropped = n_records_before - n_records_after
print(n_records_dropped, "records were dropped due to missing values.")
print("This is", round(n_records_dropped / n_records_before * 100, 1), "% of the entire data set.")
print("The resulting data set contains", n_records_after, "records.")

# Reset the indices
opt_data = opt_data.reset_index(drop=True)

# Print a newline to separate outputs
print()

Number of records with NaN values, per feature:


Series([], dtype: float64)


0 records were dropped due to missing values.
This is 0.0 % of the entire data set.
The resulting data set contains 5620 records.

Wall time: 1.8 s


In [21]:
display(opt_data.head())

opt_data.shape

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,55,56,57,58,59,60,61,62,63,label
0,0,1,6,15,12,1,0,0,0,7,...,0,0,0,6,14,7,1,0,0,0
1,0,0,10,16,6,0,0,0,0,7,...,0,0,0,10,16,15,3,0,0,0
2,0,0,8,15,16,13,0,0,0,1,...,0,0,0,9,14,0,0,0,0,7
3,0,0,0,3,11,16,0,0,0,0,...,0,0,0,0,1,15,2,0,0,4
4,0,0,5,14,4,0,0,0,0,0,...,0,0,0,4,12,14,7,0,0,6


(5620, 65)

In [22]:
%%time

run_classifier(opt_data, 'label', 'datasets/Optical_Digit_Recognition/buckets_optdigits.csv')

Timings (in seconds) for original algorithm with e = None :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.0419719,0.038976,0.037977,0.0369761,0.0399742,0.038976,0.0389752,0.0429754,0.0389748,0.038976,0.039475,0.001687
test,0.0969455,0.0959411,0.0989711,0.0977829,0.0999396,0.095942,0.0949419,0.0969403,0.0989397,0.0944541,0.09708,0.001726
total,0.138917,0.134917,0.136948,0.134759,0.139914,0.134918,0.133917,0.139916,0.137914,0.13343,0.136555,0.002355



Timings (in seconds) for enhanced algorithm with e = None :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.0369775,0.0389559,0.0379758,0.0369761,0.0379741,0.0399754,0.0399747,0.0459731,0.0399737,0.0399747,0.039473,0.002459
test,0.100937,0.0944471,0.0969405,0.0929444,0.105937,0.103935,0.114931,0.118926,0.119929,0.0959401,0.104487,0.00967
total,0.137914,0.133403,0.134916,0.12992,0.143911,0.143911,0.154905,0.164899,0.159902,0.135915,0.14396,0.011419



Timings (in seconds) for original algorithm with e = 1.0 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.0839477,0.0659585,0.0619619,0.0609624,0.0619614,0.0609627,0.0639606,0.0729558,0.0629637,0.0609627,0.06566,0.00701
test,0.10194,0.0949411,0.0949426,0.0949495,0.0951886,0.0964556,0.0989435,0.0959413,0.0959496,0.0992537,0.096851,0.00227
total,0.185888,0.1609,0.156904,0.155912,0.15715,0.157418,0.162904,0.168897,0.158913,0.160216,0.16251,0.00859



Timings (in seconds) for enhanced algorithm with e = 1.0 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.0639606,0.0639598,0.0639582,0.0609624,0.0632327,0.0639584,0.061923,0.0619609,0.0629606,0.0602565,0.062713,0.001297
test,0.10094,0.100973,0.105937,0.0999622,0.106936,0.0999386,0.101937,0.100939,0.103936,0.10194,0.102344,0.002327
total,0.1649,0.164933,0.169896,0.160925,0.170168,0.163897,0.16386,0.1629,0.166897,0.162196,0.165057,0.002927



Timings (in seconds) for original algorithm with e = 0.75 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.0729547,0.0676708,0.0619617,0.0589643,0.0609627,0.0609632,0.0629609,0.0629611,0.0609632,0.0609629,0.063133,0.003933
test,0.0969419,0.105935,0.0939445,0.0953364,0.0989394,0.0949414,0.0949421,0.0949438,0.0949414,0.0969405,0.096781,0.003349
total,0.169897,0.173606,0.155906,0.154301,0.159902,0.155905,0.157903,0.157905,0.155905,0.157903,0.159913,0.006156



Timings (in seconds) for enhanced algorithm with e = 0.75 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.0619609,0.065959,0.0639765,0.0619631,0.0639834,0.0719566,0.0609629,0.0619621,0.0613086,0.0629606,0.063699,0.003102
test,0.104937,0.0989423,0.100939,0.102309,0.10296,0.104939,0.109955,0.100939,0.102086,0.106936,0.103494,0.003086
total,0.166898,0.164901,0.164915,0.164272,0.166944,0.176896,0.170918,0.162901,0.163395,0.169896,0.167194,0.00409



Timings (in seconds) for original algorithm with e = 0.5 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.0749528,0.0689573,0.0629604,0.0612154,0.0619626,0.0649588,0.0729549,0.0609627,0.0616634,0.0619626,0.065255,0.004916
test,0.0979397,0.0969412,0.0999641,0.0949419,0.0954163,0.0949657,0.108932,0.0949621,0.0989387,0.0984747,0.098148,0.003998
total,0.172893,0.165899,0.162925,0.156157,0.157379,0.159925,0.181887,0.155925,0.160602,0.160437,0.163403,0.007821



Timings (in seconds) for enhanced algorithm with e = 0.5 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.0619617,0.0649664,0.0762677,0.058965,0.0609682,0.0629618,0.0619624,0.0649605,0.0659614,0.0609627,0.063994,0.004576
test,0.0999386,0.100938,0.101938,0.102936,0.10755,0.0999389,0.101448,0.101195,0.102937,0.101964,0.102078,0.002076
total,0.1619,0.165904,0.178205,0.161901,0.168518,0.162901,0.16341,0.166156,0.168898,0.162927,0.166072,0.004723



Timings (in seconds) for original algorithm with e = 0.25 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.0739536,0.0629618,0.0622671,0.0599642,0.0619612,0.0609617,0.0609627,0.0653644,0.0648079,0.0659587,0.063916,0.003851
test,0.109932,0.0969405,0.0969396,0.0959408,0.0959411,0.0999393,0.0999396,0.0969679,0.100938,0.101936,0.099542,0.004026
total,0.183886,0.159902,0.159207,0.155905,0.157902,0.160901,0.160902,0.162332,0.165745,0.167895,0.163458,0.007584



Timings (in seconds) for enhanced algorithm with e = 0.25 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.0619621,0.0623124,0.0619259,0.059963,0.0619619,0.06496,0.0639603,0.0639815,0.0629604,0.0613139,0.06253,0.001392
test,0.101936,0.102933,0.108931,0.101938,0.100939,0.101938,0.105935,0.102937,0.105935,0.103935,0.103736,0.002356
total,0.163898,0.165246,0.170857,0.161901,0.162901,0.166898,0.169896,0.166919,0.168895,0.165249,0.166266,0.002829



Timings (in seconds) for original algorithm with e = 0.1 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.0769498,0.0639601,0.0669565,0.0609615,0.0629609,0.0619633,0.0749555,0.0659585,0.0639606,0.0619643,0.066059,0.005258
test,0.0989408,0.0973232,0.0979421,0.0969419,0.11093,0.0989389,0.098762,0.09794,0.100939,0.098938,0.09976,0.003869
total,0.175891,0.161283,0.164899,0.157903,0.173891,0.160902,0.173717,0.163898,0.1649,0.160902,0.165819,0.006049



Timings (in seconds) for enhanced algorithm with e = 0.1 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.0629604,0.0819347,0.0629625,0.059963,0.0644677,0.0639617,0.0679591,0.0629618,0.0629609,0.0629625,0.065309,0.005849
test,0.105968,0.101937,0.100938,0.102664,0.102937,0.101936,0.105935,0.103935,0.101937,0.102936,0.103112,0.001611
total,0.168928,0.183871,0.163901,0.162627,0.167405,0.165897,0.173894,0.166897,0.164898,0.165898,0.168422,0.005928



Timings (in seconds) for original algorithm with e = 0.05 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.0759528,0.0649593,0.0619619,0.0599642,0.0619719,0.0629609,0.06496,0.0619617,0.0609648,0.0609634,0.063662,0.004379
test,0.0991712,0.0989406,0.102939,0.0989404,0.0989285,0.0981779,0.0989397,0.0979424,0.102937,0.0989387,0.099586,0.001716
total,0.175124,0.1639,0.164901,0.158905,0.1609,0.161139,0.1639,0.159904,0.163902,0.159902,0.163248,0.00443



Timings (in seconds) for enhanced algorithm with e = 0.05 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.0612013,0.0629625,0.0619631,0.0599623,0.0629585,0.074209,0.0629611,0.0629599,0.0639591,0.0624723,0.063561,0.003705
test,0.100973,0.100936,0.101456,0.101945,0.103939,0.105936,0.100938,0.100943,0.10096,0.0999401,0.101797,0.0017
total,0.162174,0.163899,0.163419,0.161907,0.166897,0.180145,0.163899,0.163903,0.164919,0.162412,0.165358,0.005119



Timings (in seconds) for original algorithm with e = 0.01 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.062963,0.0639606,0.0649612,0.0589666,0.0609624,0.0609627,0.0609641,0.0619636,0.061933,0.0609615,0.06186,0.001639
test,0.106932,0.101938,0.102937,0.0999639,0.100938,0.100952,0.104934,0.100936,0.100941,0.103242,0.102371,0.002064
total,0.169895,0.165899,0.167898,0.158931,0.1619,0.161915,0.165898,0.1629,0.162874,0.164204,0.164231,0.003059



Timings (in seconds) for enhanced algorithm with e = 0.01 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.0679569,0.0609627,0.0619619,0.0599635,0.0609641,0.0629597,0.0619614,0.0614727,0.0619595,0.0609615,0.062112,0.002096
test,0.100939,0.100944,0.111937,0.102936,0.100214,0.0999486,0.100974,0.101751,0.103944,0.101936,0.102552,0.003333
total,0.168896,0.161906,0.173898,0.162899,0.161178,0.162908,0.162935,0.163224,0.165903,0.162898,0.164665,0.003726



Timings (in seconds) for original algorithm with e = 0.005 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.0759532,0.0609641,0.0629671,0.0599625,0.0609624,0.0609612,0.0609677,0.0619617,0.0619602,0.0649924,0.063165,0.004466
test,0.101938,0.100938,0.100453,0.100936,0.109933,0.102944,0.100939,0.100944,0.11217,0.104937,0.103613,0.003959
total,0.177891,0.161902,0.16342,0.160898,0.170896,0.163906,0.161906,0.162906,0.17413,0.16993,0.166779,0.005666



Timings (in seconds) for enhanced algorithm with e = 0.005 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.0649595,0.0719533,0.063962,0.0599608,0.0619617,0.0619631,0.0639625,0.0622871,0.0619631,0.0619628,0.063494,0.003122
test,0.105935,0.117928,0.100939,0.148616,0.100941,0.0999386,0.108933,0.0999389,0.103941,0.102974,0.109008,0.014193
total,0.170895,0.189881,0.164901,0.208576,0.162902,0.161902,0.172895,0.162226,0.165905,0.164937,0.172502,0.014404



Timings (in seconds) for original algorithm with e = 0.001 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.0819471,0.0619652,0.0619638,0.0679595,0.0619626,0.0619709,0.0619602,0.0629613,0.0629606,0.0629604,0.064861,0.005952
test,0.104937,0.102937,0.105236,0.102938,0.102938,0.104926,0.10394,0.100302,0.106935,0.101455,0.103654,0.00185
total,0.186884,0.164902,0.167199,0.170897,0.1649,0.166897,0.1659,0.163263,0.169895,0.164416,0.168515,0.006532



Timings (in seconds) for enhanced algorithm with e = 0.001 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.0619617,0.091943,0.08395,0.0739543,0.0719573,0.0809481,0.0799503,0.0789516,0.0641508,0.0613484,0.074912,0.009628
test,0.129919,0.128783,0.119926,0.112631,0.11393,0.13192,0.114932,0.107935,0.107933,0.0999372,0.116785,0.010123
total,0.191881,0.220726,0.203876,0.186585,0.185887,0.212868,0.194882,0.186887,0.172084,0.161286,0.191696,0.016858



Timings (in seconds) for original algorithm with e = 1e-10 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.0819283,0.0629601,0.0619588,0.0599618,0.0619655,0.0609622,0.0619617,0.0634847,0.0629692,0.0609691,0.063912,0.00609
test,0.111932,0.10094,0.100939,0.100564,0.100938,0.113447,0.105941,0.100938,0.099931,0.100449,0.103602,0.004825
total,0.19386,0.1639,0.162898,0.160526,0.162904,0.174409,0.167902,0.164423,0.1629,0.161418,0.167514,0.009562



Timings (in seconds) for enhanced algorithm with e = 1e-10 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.0629616,0.0619621,0.0619619,0.0599673,0.0609646,0.0639594,0.0612693,0.0649595,0.0629616,0.061234,0.06222,0.001424
test,0.10294,0.100938,0.104937,0.103939,0.10123,0.100456,0.101937,0.103937,0.106936,0.100161,0.102741,0.002083
total,0.165902,0.1629,0.166898,0.163907,0.162195,0.164415,0.163207,0.168896,0.169897,0.161395,0.164961,0.002713



Original Algorithm:


Unnamed: 0,accuracy 1,accuracy 2,accuracy 3,accuracy 4,accuracy 5,accuracy 6,accuracy 7,accuracy 8,accuracy 9,accuracy 10,min,max,mean,mean_diff,std
,0.727758,0.718861,0.676157,0.747331,0.729537,0.715302,0.706406,0.724199,0.715302,0.692171,0.676157,0.747331,0.714709,,0.022686
1.0,0.741993,0.733096,0.658363,0.733096,0.725979,0.715302,0.717082,0.685053,0.702847,0.683274,0.658363,0.741993,0.708037,-0.006673,0.193041
0.75,0.711744,0.747331,0.656584,0.759786,0.740214,0.69573,0.676157,0.715302,0.720641,0.640569,0.640569,0.759786,0.705368,-0.002669,0.193451
0.5,0.637011,0.718861,0.715302,0.708185,0.745552,0.738434,0.727758,0.669039,0.642349,0.669039,0.637011,0.745552,0.696174,-0.009193,0.192566
0.25,0.661922,0.567616,0.558719,0.653025,0.533808,0.606762,0.608541,0.580071,0.629893,0.55516,0.533808,0.661922,0.595937,-0.100237,0.191202
0.1,0.482206,0.539146,0.467972,0.533808,0.450178,0.572954,0.524911,0.352313,0.619217,0.25089,0.25089,0.619217,0.471975,-0.123962,0.197512
0.05,0.270463,0.450178,0.355872,0.359431,0.450178,0.459075,0.354093,0.320285,0.274021,0.425267,0.270463,0.459075,0.3707,-0.101275,0.144298
0.01,0.147687,0.199288,0.302491,0.272242,0.234875,0.236655,0.215302,0.314947,0.104982,0.113879,0.104982,0.314947,0.213523,-0.157177,0.123588
0.005,0.259786,0.188612,0.149466,0.145907,0.206406,0.233096,0.213523,0.291815,0.256228,0.243772,0.145907,0.291815,0.218861,0.005338,0.075158
0.001,0.113879,0.11032,0.172598,0.213523,0.215302,0.213523,0.161922,0.241993,0.19573,0.183274,0.11032,0.241993,0.181198,-0.037663,0.07357



Enhanced Algorithm:


Unnamed: 0,accuracy 1,accuracy 2,accuracy 3,accuracy 4,accuracy 5,accuracy 6,accuracy 7,accuracy 8,accuracy 9,accuracy 10,min,max,mean,mean_diff,std
,0.727758,0.718861,0.676157,0.747331,0.729537,0.715302,0.706406,0.724199,0.715302,0.692171,0.676157,0.747331,0.714709,,0.022686
1.0,0.053381,0.071174,0.113879,0.131673,0.201068,0.128114,0.120996,0.24911,0.128114,0.092527,0.053381,0.24911,0.132711,-0.581999,0.200886
0.75,0.154804,0.137011,0.092527,0.097865,0.099644,0.144128,0.05694,0.140569,0.078292,0.078292,0.05694,0.154804,0.107651,-0.025059,0.049094
0.5,0.087189,0.106762,0.186833,0.119217,0.154804,0.042705,0.128114,0.131673,0.174377,0.080071,0.042705,0.186833,0.120107,0.012456,0.054674
0.25,0.092527,0.048043,0.108541,0.081851,0.069395,0.135231,0.199288,0.096085,0.094306,0.144128,0.048043,0.199288,0.109727,-0.01038,0.056844
0.1,0.05694,0.156584,0.177936,0.13879,0.106762,0.129893,0.104982,0.071174,0.126335,0.101423,0.05694,0.177936,0.117141,0.007414,0.048602
0.05,0.19573,0.172598,0.092527,0.1121,0.096085,0.177936,0.035587,0.108541,0.067616,0.131673,0.035587,0.19573,0.118476,0.001335,0.061368
0.01,0.104982,0.122776,0.160142,0.161922,0.065836,0.16548,0.096085,0.117438,0.140569,0.106762,0.065836,0.16548,0.122776,0.0043,0.046181
0.005,0.014235,0.137011,0.129893,0.024911,0.131673,0.090747,0.040925,0.160142,0.024911,0.085409,0.014235,0.160142,0.08452,-0.038256,0.062879
0.001,0.144128,0.108541,0.179715,0.115658,0.101423,0.117438,0.044484,0.065836,0.153025,0.104982,0.044484,0.179715,0.113286,0.028766,0.048009



Wall time: 46 s


# Skin Dataset

In [23]:
%%time

# Define the headings to be used
headings = ['B', 'G', 'R', 'label']

# Read in the official training and testing data separately
skin_data = pd.read_csv('datasets/Skin/Skin_NonSkin.txt', names=headings, delimiter='\t')

# Apply some pre-processing
for heading in headings:
    skin_data[heading] = skin_data[heading].astype('int').apply(pd.to_numeric, downcast="unsigned")
    
# Find out which features contain missing (NaN) values
nan_features = skin_data.columns[skin_data.isnull().any()]
nan_features_count = skin_data[nan_features].isnull().sum()

print("Number of records with NaN values, per feature:")
display(nan_features_count)
print()

# Get the number of records in the data set
n_records_before = skin_data.shape[0]

# Drop the rows with missing values, since they occur in multiple features
skin_data = skin_data.dropna()

# Get the number of records in the data set
n_records_after = skin_data.shape[0]

# Print how many records containing NaN values got dropped
n_records_dropped = n_records_before - n_records_after
print(n_records_dropped, "records were dropped due to missing values.")
print("This is", round(n_records_dropped / n_records_before * 100, 1), "% of the entire data set.")
print("The resulting data set contains", n_records_after, "records.")

# Reset the indices
skin_data = skin_data.reset_index(drop=True)

# Print a newline to separate outputs
print()

Number of records with NaN values, per feature:


Series([], dtype: float64)


0 records were dropped due to missing values.
This is 0.0 % of the entire data set.
The resulting data set contains 245057 records.

Wall time: 4.72 s


In [24]:
display(skin_data.head())

skin_data.shape

Unnamed: 0,B,G,R,label
0,74,85,123,1
1,73,84,122,1
2,72,83,121,1
3,70,81,119,1
4,70,81,119,1


(245057, 4)

In [25]:
%%time

run_classifier(skin_data, 'label', 'datasets/Skin/buckets_skin.csv')

Timings (in seconds) for original algorithm with e = None :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.0939553,0.11791,0.141306,0.102936,0.0989382,0.101938,0.096453,0.0959413,0.124923,0.0959389,0.107024,0.015001
test,0.06496,0.0739541,0.0739532,0.096941,0.057965,0.0569658,0.0609627,0.0583649,0.0699577,0.0599632,0.067399,0.011607
total,0.158915,0.191864,0.21526,0.199877,0.156903,0.158904,0.157416,0.154306,0.194881,0.155902,0.174423,0.022053



Timings (in seconds) for enhanced algorithm with e = None :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.0939415,0.0952511,0.0939448,0.0938175,0.093941,0.0937545,0.0949464,0.0952358,0.0969398,0.0939658,0.094574,0.000972
test,0.0565419,0.0589914,0.061964,0.059963,0.0569658,0.0609682,0.0579643,0.0649834,0.0604777,0.0582898,0.059711,0.002414
total,0.150483,0.154243,0.155909,0.15378,0.150907,0.154723,0.152911,0.160219,0.157418,0.152256,0.154285,0.002828



Timings (in seconds) for original algorithm with e = 1.0 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.130919,0.13188,0.130941,0.135735,0.12792,0.12877,0.125944,0.125923,0.125923,0.127919,0.129187,0.003023
test,0.0619676,0.0569654,0.059963,0.0559664,0.0589638,0.0569649,0.0579832,0.0609624,0.0569668,0.0559659,0.058267,0.002002
total,0.192887,0.188845,0.190904,0.191701,0.186884,0.185735,0.183928,0.186885,0.18289,0.183885,0.187454,0.003332



Timings (in seconds) for enhanced algorithm with e = 1.0 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.126925,0.127921,0.126922,0.127921,0.126965,0.127918,0.126928,0.127683,0.136917,0.128927,0.128503,0.002872
test,0.0659597,0.0609629,0.070956,0.0669639,0.0609617,0.062963,0.0639615,0.0599668,0.0639591,0.0662336,0.064289,0.003178
total,0.192884,0.188884,0.197878,0.194885,0.187927,0.190881,0.190889,0.18765,0.200876,0.19516,0.192791,0.00417



Timings (in seconds) for original algorithm with e = 0.75 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.130761,0.127002,0.128216,0.128183,0.127919,0.12746,0.12693,0.128918,0.126924,0.134924,0.128724,0.002342
test,0.0579641,0.0559685,0.0599627,0.0559661,0.0559659,0.0599618,0.0559654,0.0559673,0.0594673,0.0579646,0.057515,0.001679
total,0.188725,0.182971,0.188179,0.184149,0.183885,0.187422,0.182896,0.184885,0.186391,0.192889,0.186239,0.002986



Timings (in seconds) for enhanced algorithm with e = 0.75 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.127938,0.128917,0.126922,0.127922,0.127434,0.128757,0.128924,0.126921,0.127919,0.130919,0.128257,0.001131
test,0.063961,0.0639627,0.0589633,0.0589638,0.0609703,0.0609632,0.0579648,0.0619626,0.0599637,0.0599663,0.060764,0.001938
total,0.191899,0.19288,0.185886,0.186886,0.188405,0.18972,0.186889,0.188884,0.187883,0.190885,0.189022,0.002186



Timings (in seconds) for original algorithm with e = 0.5 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.128218,0.13092,0.133917,0.1599,0.128921,0.133917,0.1312,0.148909,0.128931,0.151334,0.137617,0.010794
test,0.0629618,0.0704608,0.0809505,0.0589659,0.0589638,0.0629621,0.0589628,0.0599625,0.0564861,0.13092,0.07016,0.021398
total,0.19118,0.20138,0.214868,0.218866,0.187884,0.196879,0.190163,0.208872,0.185417,0.282254,0.207776,0.027134



Timings (in seconds) for enhanced algorithm with e = 0.5 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.132915,0.17589,0.135916,0.169119,0.154907,0.132922,0.130919,0.134604,0.134393,0.131923,0.143351,0.016035
test,0.0619652,0.0639641,0.06496,0.0659611,0.059967,0.0659599,0.0709624,0.0609977,0.0625036,0.0617871,0.063903,0.003059
total,0.19488,0.239855,0.200876,0.23508,0.214875,0.198882,0.201882,0.195602,0.196897,0.193711,0.207254,0.016169



Timings (in seconds) for original algorithm with e = 0.25 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.135285,0.128926,0.129084,0.128602,0.127318,0.130433,0.131256,0.12895,0.13792,0.141463,0.131924,0.004462
test,0.0609632,0.059963,0.0569651,0.0575242,0.0599637,0.0569663,0.0577962,0.0619619,0.0579638,0.0579891,0.058806,0.001676
total,0.196248,0.188889,0.186049,0.186126,0.187282,0.1874,0.189052,0.190912,0.195884,0.199452,0.190729,0.004532



Timings (in seconds) for enhanced algorithm with e = 0.25 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.13278,0.128925,0.127283,0.154909,0.131262,0.126923,0.129953,0.13393,0.140913,0.13892,0.13458,0.008076
test,0.0622234,0.0599654,0.0609624,0.0749538,0.0609639,0.0589638,0.0654767,0.0599644,0.0589638,0.0639608,0.06264,0.004567
total,0.195003,0.188891,0.188245,0.229862,0.192226,0.185886,0.19543,0.193894,0.199877,0.202881,0.197219,0.011946



Timings (in seconds) for original algorithm with e = 0.1 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.135922,0.128547,0.130923,0.133967,0.127923,0.131656,0.127923,0.132921,0.12764,0.140993,0.131842,0.004076
test,0.0569646,0.0638144,0.0589654,0.0570233,0.0609648,0.0559664,0.0604753,0.0569675,0.0605025,0.0599759,0.059162,0.002314
total,0.192887,0.192362,0.189888,0.19099,0.188888,0.187623,0.188398,0.189889,0.188143,0.200969,0.191004,0.003716



Timings (in seconds) for enhanced algorithm with e = 0.1 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.129441,0.134494,0.130053,0.128646,0.133337,0.129922,0.129509,0.129427,0.127146,0.128953,0.130093,0.002079
test,0.060961,0.0645349,0.0577826,0.0629604,0.0663006,0.0609608,0.059963,0.063961,0.0600336,0.0629618,0.062042,0.00242
total,0.190402,0.199029,0.187836,0.191607,0.199638,0.190883,0.189472,0.193388,0.18718,0.191915,0.192135,0.004005



Timings (in seconds) for original algorithm with e = 0.05 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.130477,0.131923,0.147932,0.154508,0.132658,0.128999,0.130022,0.129942,0.12784,0.128427,0.134273,0.008711
test,0.0609643,0.056973,0.0639608,0.0624745,0.0579846,0.0569117,0.0589659,0.0609629,0.0570021,0.0599637,0.059616,0.002346
total,0.191442,0.188896,0.211893,0.216982,0.190643,0.185911,0.188988,0.190905,0.184842,0.18839,0.193889,0.010524



Timings (in seconds) for enhanced algorithm with e = 0.05 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.127944,0.129923,0.127921,0.132927,0.129861,0.128031,0.129923,0.129926,0.131428,0.126922,0.129481,0.001728
test,0.0579641,0.0594707,0.0619626,0.0599632,0.060436,0.0640235,0.0599725,0.0589638,0.0634654,0.061471,0.060769,0.001844
total,0.185909,0.189394,0.189883,0.19289,0.190297,0.192054,0.189896,0.18889,0.194893,0.188393,0.19025,0.002386



Timings (in seconds) for original algorithm with e = 0.01 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.128918,0.127921,0.127928,0.132154,0.136127,0.138411,0.141062,0.130919,0.12895,0.154908,0.13473,0.008023
test,0.0599637,0.0619631,0.0594697,0.0649581,0.0679576,0.058965,0.0649605,0.0609632,0.0629616,0.0654728,0.062764,0.002846
total,0.188882,0.189884,0.187397,0.197112,0.204085,0.197376,0.206023,0.191883,0.191911,0.220381,0.197493,0.009679



Timings (in seconds) for enhanced algorithm with e = 0.01 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.132917,0.127922,0.129311,0.129436,0.12793,0.130522,0.142423,0.134424,0.129935,0.127639,0.131246,0.004268
test,0.08199,0.0584698,0.0599647,0.0639608,0.0609634,0.0599618,0.0649605,0.0619626,0.0609629,0.0619645,0.063516,0.00642
total,0.214907,0.186391,0.189275,0.193397,0.188894,0.190484,0.207383,0.196386,0.190898,0.189604,0.194762,0.008741



Timings (in seconds) for original algorithm with e = 0.005 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.139868,0.12992,0.138417,0.129465,0.128952,0.128147,0.128921,0.128427,0.127223,0.130937,0.131028,0.00418
test,0.0669665,0.05667,0.0639637,0.0599716,0.0604689,0.0629902,0.0589635,0.0589643,0.061482,0.0599632,0.06104,0.002793
total,0.206834,0.18659,0.202381,0.189436,0.18942,0.191138,0.187884,0.187391,0.188704,0.1909,0.192068,0.00649



Timings (in seconds) for enhanced algorithm with e = 0.005 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.129368,0.128953,0.132923,0.132923,0.130426,0.127922,0.131919,0.130452,0.133446,0.128776,0.130711,0.001879
test,0.0634766,0.0619624,0.0669587,0.056967,0.0589643,0.0589638,0.0599632,0.0609651,0.0620234,0.059988,0.061023,0.002651
total,0.192844,0.190915,0.199881,0.18989,0.189391,0.186885,0.191882,0.191417,0.19547,0.188764,0.191734,0.003515



Timings (in seconds) for original algorithm with e = 0.001 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.128921,0.129919,0.128335,0.128938,0.132817,0.135441,0.128424,0.130424,0.129817,0.12943,0.130247,0.002122
test,0.0632994,0.0629616,0.0579662,0.063961,0.0632164,0.0764678,0.069958,0.0609639,0.0672104,0.0589962,0.0645,0.005211
total,0.19222,0.19288,0.186301,0.192899,0.196034,0.211909,0.198382,0.191388,0.197027,0.188426,0.194747,0.006712



Timings (in seconds) for enhanced algorithm with e = 0.001 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.129925,0.127429,0.128921,0.129918,0.129427,0.134813,0.128427,0.128318,0.132918,0.137915,0.130801,0.003183
test,0.0674696,0.0594771,0.063961,0.0599635,0.057965,0.0661974,0.0579643,0.0608525,0.0619621,0.0669596,0.062277,0.003458
total,0.197395,0.186906,0.192882,0.189881,0.187392,0.20101,0.186391,0.189171,0.19488,0.204875,0.193078,0.006038



Timings (in seconds) for original algorithm with e = 1e-10 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.12892,0.131127,0.130918,0.132895,0.129412,0.132915,0.12815,0.130255,0.145217,0.12744,0.131725,0.00482
test,0.0569654,0.0619638,0.0609624,0.0579648,0.0667682,0.0599632,0.0609655,0.0623658,0.0589712,0.058172,0.060506,0.002689
total,0.185886,0.19309,0.19188,0.19086,0.19618,0.192879,0.189115,0.192621,0.204188,0.185612,0.192231,0.005056



Timings (in seconds) for enhanced algorithm with e = 1e-10 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.129918,0.131438,0.12894,0.131919,0.130438,0.128932,0.139061,0.135682,0.12992,0.128926,0.131517,0.003174
test,0.0659626,0.0609629,0.0599637,0.0629623,0.0603907,0.0583336,0.0659723,0.0595779,0.0569677,0.0634675,0.061456,0.002899
total,0.19588,0.192401,0.188904,0.194881,0.190829,0.187265,0.205033,0.19526,0.186888,0.192393,0.192973,0.005042



Original Algorithm:


Unnamed: 0,accuracy 1,accuracy 2,accuracy 3,accuracy 4,accuracy 5,accuracy 6,accuracy 7,accuracy 8,accuracy 9,accuracy 10,min,max,mean,mean_diff,std
,0.92373,0.92373,0.924424,0.923444,0.922628,0.927443,0.921037,0.925893,0.923689,0.923099,0.921037,0.927443,0.923966,,0.002006
1.0,0.925852,0.923526,0.924668,0.923322,0.923444,0.927076,0.921118,0.926872,0.923689,0.92314,0.921118,0.927076,0.924242,0.000275,0.246948
0.75,0.923811,0.924138,0.924668,0.923444,0.923322,0.927362,0.921649,0.9283,0.924097,0.923099,0.921649,0.9283,0.924487,0.000245,0.247023
0.5,0.924179,0.923403,0.924791,0.92422,0.923322,0.927403,0.922465,0.927851,0.924097,0.923466,0.922465,0.927851,0.924626,0.000139,0.247086
0.25,0.927607,0.923567,0.924628,0.926709,0.923199,0.93079,0.921853,0.926505,0.923199,0.922691,0.921853,0.93079,0.925282,0.000656,0.247134
0.1,0.92524,0.92426,0.91818,0.914181,0.927933,0.931687,0.918058,0.924016,0.918221,0.921549,0.914181,0.931687,0.922433,-0.00285,0.247355
0.05,0.929484,0.929647,0.926505,0.91867,0.928423,0.899735,0.924628,0.909284,0.912752,0.929259,0.899735,0.929647,0.919814,-0.002619,0.246762
0.01,0.547398,0.825831,0.326913,0.898184,0.78029,0.544175,0.791308,0.788982,0.789063,0.839385,0.326913,0.898184,0.696386,-0.223428,0.310815
0.005,0.409141,0.792573,0.286023,0.792206,0.79041,0.548255,0.452153,0.829096,0.550786,0.253101,0.253101,0.829096,0.565495,-0.130891,0.28473
0.001,0.795266,0.206325,0.792287,0.452561,0.20959,0.206815,0.208692,0.211018,0.789594,0.798221,0.206325,0.798221,0.47291,-0.092585,0.308332



Enhanced Algorithm:


Unnamed: 0,accuracy 1,accuracy 2,accuracy 3,accuracy 4,accuracy 5,accuracy 6,accuracy 7,accuracy 8,accuracy 9,accuracy 10,min,max,mean,mean_diff,std
,0.92373,0.92373,0.924424,0.923444,0.922628,0.927443,0.921037,0.925893,0.923689,0.923099,0.921037,0.927443,0.923966,,0.002006
1.0,0.795266,0.793675,0.232238,0.216078,0.23685,0.206815,0.234564,0.211018,0.789063,0.798221,0.206815,0.798221,0.459902,-0.464064,0.3675
0.75,0.204734,0.793675,0.223057,0.207794,0.20959,0.787064,0.208692,0.211018,0.789063,0.201779,0.201779,0.793675,0.40266,-0.057242,0.291019
0.5,0.781432,0.793675,0.207713,0.792206,0.784615,0.206815,0.791308,0.788982,0.210937,0.207735,0.206815,0.793675,0.547159,0.144499,0.295693
0.25,0.204734,0.206325,0.223057,0.207794,0.23685,0.793185,0.208692,0.788982,0.789063,0.798221,0.204734,0.798221,0.454988,-0.092171,0.311495
0.1,0.782738,0.793675,0.207713,0.792206,0.20959,0.206815,0.791308,0.788982,0.78029,0.207735,0.206815,0.793675,0.546795,0.091807,0.300999
0.05,0.204734,0.206325,0.792287,0.792206,0.20959,0.219996,0.779922,0.211018,0.210937,0.798221,0.204734,0.798221,0.452349,-0.094446,0.312292
0.01,0.204734,0.793675,0.217874,0.207794,0.222567,0.793185,0.791308,0.788982,0.233626,0.798221,0.204734,0.798221,0.504577,0.052227,0.303247
0.005,0.795266,0.784738,0.223057,0.207794,0.20959,0.206815,0.217996,0.211018,0.789063,0.201779,0.201779,0.795266,0.40368,-0.100897,0.295786
0.001,0.795266,0.771271,0.792287,0.792206,0.20959,0.206815,0.208692,0.775882,0.789063,0.792265,0.206815,0.795266,0.594618,0.190938,0.284044



Wall time: 5min 44s


# SPECT Heart Dataset

In [26]:
%%time

# Define the headings to be used
headings = ['overall_diagnosis', 'F1', 'F2', 'F3', 'F4', 'F5', 'F6', 'F7', 'F8', 'F9', 'F10', 'F11', 'F12', 'F13', 'F14', 'F15', 'F16', 'F17', 'F18', 'F19', 'F20', 'F21', 'F22']

# Read in the official training and testing data separately
spect_training_data = pd.read_csv('datasets/SPECT_Heart/SPECT.train', names=headings)
spect_testing_data = pd.read_csv('datasets/SPECT_Heart/SPECT.test', names=headings)

spect_data = spect_training_data.append(spect_testing_data)

# Apply some pre-processing
for heading in headings:
    spect_data[heading] = spect_data[heading].astype('int').apply(pd.to_numeric, downcast="unsigned")
    
# Find out which features contain missing (NaN) values
nan_features = spect_data.columns[spect_data.isnull().any()]
nan_features_count = spect_data[nan_features].isnull().sum()

print("Number of records with NaN values, per feature:")
display(nan_features_count)
print()

# Get the number of records in the data set
n_records_before = spect_data.shape[0]

# Drop the rows with missing values, since they occur in multiple features
spect_data = spect_data.dropna()

# Get the number of records in the data set
n_records_after = spect_data.shape[0]

# Print how many records containing NaN values got dropped
n_records_dropped = n_records_before - n_records_after
print(n_records_dropped, "records were dropped due to missing values.")
print("This is", round(n_records_dropped / n_records_before * 100, 1), "% of the entire data set.")
print("The resulting data set contains", n_records_after, "records.")

# Reset the indices
spect_data = spect_data.reset_index(drop=True)

# Print a newline to separate outputs
print()

Number of records with NaN values, per feature:


Series([], dtype: float64)


0 records were dropped due to missing values.
This is 0.0 % of the entire data set.
The resulting data set contains 267 records.

Wall time: 66 ms


In [27]:
display(spect_data.head())

spect_data.shape

Unnamed: 0,overall_diagnosis,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F13,F14,F15,F16,F17,F18,F19,F20,F21,F22
0,1,0,0,0,1,0,0,0,1,1,...,1,1,0,0,0,0,0,0,0,0
1,1,0,0,1,1,0,0,0,1,1,...,1,1,0,0,0,0,0,0,0,1
2,1,1,0,1,0,1,0,0,1,0,...,1,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,1
4,1,0,0,0,0,0,0,0,1,0,...,1,0,1,1,0,0,0,0,0,0


(267, 23)

In [28]:
%%time

run_classifier(spect_data, 'overall_diagnosis', 'datasets/SPECT_Heart/buckets_SPEC.csv')

Timings (in seconds) for original algorithm with e = None :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.0379758,0.0129921,0.0129902,0.0139914,0.0129919,0.0129938,0.0129919,0.0129921,0.0119927,0.0129919,0.01549,0.007508
test,0.0179894,0.0129936,0.0119927,0.0119927,0.0119927,0.0124972,0.0129921,0.0119925,0.0129919,0.011992,0.012943,0.001738
total,0.0559652,0.0259857,0.0249829,0.025984,0.0249846,0.025491,0.025984,0.0249846,0.0249846,0.0249839,0.028433,0.009188



Timings (in seconds) for enhanced algorithm with e = None :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.0119925,0.0129921,0.0119927,0.0129943,0.0129924,0.0129919,0.0124993,0.0119927,0.0129919,0.0129921,0.012643,0.00045
test,0.0129921,0.0119927,0.0129919,0.0139894,0.0119932,0.0119927,0.0119927,0.0129919,0.0129921,0.0119925,0.012592,0.000662
total,0.0249846,0.0249848,0.0249846,0.0269837,0.0249856,0.0249846,0.024492,0.0249846,0.025984,0.0249846,0.025235,0.000679



Timings (in seconds) for original algorithm with e = 1.0 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.0359774,0.0223267,0.021986,0.0229881,0.0219872,0.0219867,0.021987,0.0219865,0.0219862,0.0219862,0.02352,0.004164
test,0.0169907,0.0129919,0.0119927,0.0149965,0.0119913,0.0124989,0.0129921,0.0129921,0.0119927,0.0129921,0.013243,0.001504
total,0.052968,0.0353186,0.0339787,0.0379846,0.0339785,0.0344856,0.0349791,0.0349786,0.0339789,0.0349784,0.036763,0.005515



Timings (in seconds) for enhanced algorithm with e = 1.0 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.0213468,0.0219865,0.0222552,0.0219858,0.0289817,0.026984,0.0269816,0.0229909,0.0254898,0.0279832,0.024699,0.002739
test,0.0139916,0.0129921,0.0119939,0.0119925,0.0139923,0.0149906,0.0119927,0.0119929,0.0119927,0.021987,0.013792,0.002924
total,0.0353384,0.0349786,0.0342491,0.0339782,0.042974,0.0419745,0.0389743,0.0349839,0.0374825,0.0499701,0.03849,0.00488



Timings (in seconds) for original algorithm with e = 0.75 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.0219846,0.0234919,0.0219865,0.0229862,0.0229857,0.0239851,0.0299802,0.0209873,0.0219865,0.0379753,0.024835,0.004977
test,0.0119927,0.0125029,0.0119925,0.0135489,0.0169892,0.0129921,0.0119927,0.0129919,0.0129921,0.0179894,0.013598,0.00202
total,0.0339773,0.0359948,0.0339789,0.036535,0.0399749,0.0369773,0.0419729,0.0339792,0.0349786,0.0559647,0.038433,0.006367



Timings (in seconds) for enhanced algorithm with e = 0.75 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.0239851,0.0349782,0.0229862,0.020987,0.0429738,0.0319815,0.0234945,0.0229852,0.0219855,0.0219862,0.026834,0.006953
test,0.0129933,0.014991,0.0129914,0.0129921,0.0149915,0.0159886,0.0129914,0.0129919,0.0129921,0.0129917,0.013692,0.001099
total,0.0369785,0.0499692,0.0359776,0.0339792,0.0579653,0.0479701,0.0364859,0.0359771,0.0349777,0.0349779,0.040526,0.007896



Timings (in seconds) for original algorithm with e = 0.5 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.0349782,0.0229859,0.0369763,0.025984,0.0219865,0.0209868,0.021986,0.0209873,0.0219865,0.0221877,0.025105,0.005615
test,0.0159903,0.0250092,0.0139928,0.0119948,0.0119934,0.0129919,0.0129926,0.0129919,0.0119925,0.0129929,0.014294,0.003747
total,0.0509684,0.0479951,0.0509691,0.0379789,0.0339799,0.0339787,0.0349786,0.0339792,0.0339789,0.0351806,0.039399,0.00706



Timings (in seconds) for enhanced algorithm with e = 0.5 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.0219865,0.0209873,0.0219862,0.0219865,0.0219867,0.0219865,0.0219862,0.020987,0.0229857,0.0219865,0.021887,0.000538
test,0.0119925,0.0129919,0.0149915,0.0119927,0.0119925,0.0119927,0.0119925,0.0129921,0.0119929,0.0129919,0.012592,0.000916
total,0.0339789,0.0339792,0.0369778,0.0339792,0.0339792,0.0339792,0.0339787,0.0339792,0.0349786,0.0349784,0.034479,0.000922



Timings (in seconds) for original algorithm with e = 0.25 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.0329804,0.0269833,0.0419748,0.0325031,0.0339799,0.0319808,0.0219862,0.0229864,0.0219843,0.020987,0.028835,0.006578
test,0.0129919,0.0209873,0.0209873,0.0129921,0.028981,0.0129914,0.0129921,0.0149903,0.0139923,0.0129921,0.01649,0.005159
total,0.0459723,0.0479705,0.0629621,0.0454953,0.0629609,0.0449722,0.0349784,0.0379767,0.0359766,0.0339792,0.045324,0.010034



Timings (in seconds) for enhanced algorithm with e = 0.25 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.0219865,0.0229859,0.0219862,0.0219867,0.0209873,0.021986,0.0219862,0.0219862,0.0219862,0.0219867,0.021986,0.000447
test,0.0119927,0.0129919,0.0119929,0.0119925,0.0139914,0.0119927,0.0119925,0.0129921,0.0129924,0.0119927,0.012492,0.00067
total,0.0339792,0.0359778,0.0339792,0.0339792,0.0349786,0.0339787,0.0339787,0.0349784,0.0349786,0.0339794,0.034479,0.00067



Timings (in seconds) for original algorithm with e = 0.1 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.0409744,0.0279827,0.0279827,0.0229976,0.0219862,0.0249841,0.0229857,0.0224931,0.0209873,0.0239856,0.025736,0.005551
test,0.0189877,0.0199878,0.0149908,0.0119839,0.0129919,0.0129967,0.0129921,0.0129919,0.0119929,0.0129914,0.014291,0.002721
total,0.059962,0.0479705,0.0429735,0.0349815,0.0349782,0.0379808,0.0359778,0.035485,0.0329802,0.0369771,0.040027,0.007876



Timings (in seconds) for enhanced algorithm with e = 0.1 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.0219867,0.0229847,0.0224936,0.0219858,0.0229855,0.0229864,0.0219867,0.0219886,0.0229855,0.0219865,0.022437,0.000471
test,0.0119925,0.0129919,0.0119922,0.0129919,0.0119929,0.0119929,0.0119927,0.0119927,0.0119934,0.0119922,0.012193,0.0004
total,0.0339792,0.0359766,0.0344858,0.0349777,0.0349784,0.0349793,0.0339794,0.0339813,0.0349789,0.0339787,0.03463,0.000633



Timings (in seconds) for original algorithm with e = 0.05 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.034488,0.0309832,0.0319803,0.0249846,0.0229862,0.021987,0.0219862,0.0219865,0.0219865,0.0219862,0.025535,0.004703
test,0.0219882,0.025003,0.0159903,0.0189869,0.0129926,0.0129917,0.0129914,0.0122373,0.0119925,0.0135005,0.015867,0.004351
total,0.0564761,0.0559862,0.0479705,0.0439715,0.0359788,0.0349786,0.0349777,0.0342238,0.0339789,0.0354867,0.041403,0.008621



Timings (in seconds) for enhanced algorithm with e = 0.05 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.0219865,0.0219862,0.0229862,0.0219867,0.0219903,0.0219872,0.0219858,0.0234828,0.0219872,0.0219865,0.022237,0.000511
test,0.0129919,0.0144913,0.0119927,0.0119925,0.011992,0.0119925,0.0129921,0.0139928,0.0119917,0.0129921,0.012742,0.000873
total,0.0349784,0.0364776,0.0349789,0.0339792,0.0339823,0.0339797,0.0349779,0.0374756,0.0339789,0.0349786,0.034979,0.001117



Timings (in seconds) for original algorithm with e = 0.01 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.0339789,0.0249848,0.0279822,0.0245078,0.0229864,0.0219855,0.0214942,0.0214946,0.0229857,0.021987,0.024439,0.003708
test,0.0139916,0.0209868,0.0129926,0.0149913,0.0119934,0.0119929,0.0129919,0.0132449,0.0119927,0.0139911,0.013917,0.002538
total,0.0479705,0.0459716,0.0409749,0.039499,0.0349798,0.0339785,0.0344861,0.0347395,0.0349784,0.0359781,0.038356,0.00484



Timings (in seconds) for enhanced algorithm with e = 0.01 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.0219882,0.0229857,0.0209863,0.0219867,0.0219865,0.0209863,0.0219867,0.0219874,0.020987,0.0219865,0.021787,0.0006
test,0.0129902,0.0129919,0.0129924,0.0119925,0.0119927,0.0129924,0.0129919,0.0129912,0.0129921,0.0119927,0.012692,0.000458
total,0.0349784,0.0359776,0.0339787,0.0339792,0.0339792,0.0339787,0.0349786,0.0349786,0.0339792,0.0339792,0.034479,0.00067



Timings (in seconds) for original algorithm with e = 0.005 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.058965,0.0269835,0.0255075,0.0239875,0.0229876,0.0229843,0.020987,0.0219872,0.0219865,0.0219862,0.026836,0.010847
test,0.0139902,0.0199876,0.0149963,0.0129912,0.011992,0.0119925,0.0129921,0.0119925,0.0119927,0.0129933,0.013592,0.002331
total,0.0729551,0.0469711,0.0405037,0.0369787,0.0349796,0.0349767,0.0339792,0.0339797,0.0339792,0.0349796,0.040428,0.011518



Timings (in seconds) for enhanced algorithm with e = 0.005 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.0219848,0.0219862,0.0219867,0.0219865,0.0229855,0.0209866,0.0219867,0.021986,0.020988,0.0219862,0.021886,0.000538
test,0.0124996,0.0129921,0.0129919,0.0119927,0.0119925,0.0129921,0.0129919,0.0119927,0.0129907,0.0129921,0.012643,0.000449
total,0.0344844,0.0349784,0.0349786,0.0339792,0.0349779,0.0339787,0.0349786,0.0339787,0.0339787,0.0349784,0.034529,0.000471



Timings (in seconds) for original algorithm with e = 0.001 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.0349793,0.0299816,0.0234947,0.0219865,0.0219865,0.0219862,0.021987,0.022248,0.0249867,0.0219882,0.024562,0.004216
test,0.0129907,0.0189884,0.0135009,0.0129919,0.0129921,0.0129919,0.0129921,0.0119936,0.0119927,0.0119927,0.013343,0.001949
total,0.0479701,0.04897,0.0369956,0.0349784,0.0349786,0.0349782,0.0349791,0.0342417,0.0369794,0.0339808,0.037905,0.005371



Timings (in seconds) for enhanced algorithm with e = 0.001 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.0219865,0.0209877,0.0219865,0.0219996,0.0229857,0.021986,0.0219855,0.0219862,0.0209873,0.021986,0.021888,0.000538
test,0.0129926,0.0139918,0.0119925,0.0129921,0.0119927,0.0119932,0.0129988,0.0119927,0.0129921,0.0129921,0.012693,0.00064
total,0.0349791,0.0349796,0.0339789,0.0349917,0.0349784,0.0339792,0.0349844,0.0339789,0.0339794,0.0349782,0.034581,0.000491



Timings (in seconds) for original algorithm with e = 1e-10 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.0369771,0.0299814,0.0239844,0.022985,0.021986,0.020988,0.0219862,0.0219865,0.0219867,0.0219865,0.024485,0.004819
test,0.0129921,0.0199878,0.0145841,0.0129924,0.0129921,0.0139909,0.0129919,0.0122299,0.0129921,0.0129919,0.013875,0.002128
total,0.0499692,0.0499692,0.0385685,0.0359774,0.0349782,0.0349789,0.0349782,0.0342164,0.0349789,0.0349784,0.038359,0.005913



Timings (in seconds) for enhanced algorithm with e = 1e-10 :


Unnamed: 0,fold 1,fold 2,fold 3,fold 4,fold 5,fold 6,fold 7,fold 8,fold 9,fold 10,mean,std
train,0.0219877,0.0219865,0.0219867,0.0214934,0.0219865,0.0219862,0.0219865,0.0219867,0.020987,0.0219865,0.021837,0.000319
test,0.0149894,0.0139918,0.0119925,0.0129919,0.0129921,0.0119927,0.0119927,0.0119925,0.0124986,0.0119927,0.012743,0.00098
total,0.0369771,0.0359783,0.0339792,0.0344853,0.0349786,0.0339789,0.0339792,0.0339792,0.0334857,0.0339792,0.03458,0.001043



Original Algorithm:


Unnamed: 0,accuracy 1,accuracy 2,accuracy 3,accuracy 4,accuracy 5,accuracy 6,accuracy 7,accuracy 8,accuracy 9,accuracy 10,min,max,mean,mean_diff,std
,0.576923,0.653846,0.692308,0.5,0.461538,0.615385,0.615385,0.615385,0.538462,0.636364,0.461538,0.692308,0.588287,,0.07799
1.0,0.538462,0.692308,0.692308,0.653846,0.461538,0.692308,0.653846,0.807692,0.769231,0.575758,0.461538,0.807692,0.650544,0.062257,0.192093
0.75,0.653846,0.730769,0.769231,0.730769,0.538462,0.653846,0.538462,0.807692,0.653846,0.484848,0.484848,0.807692,0.654526,0.003982,0.20554
0.5,0.653846,0.730769,0.653846,0.538462,0.423077,0.653846,0.692308,0.807692,0.346154,0.787879,0.346154,0.807692,0.620144,-0.034382,0.234158
0.25,0.576923,0.5,0.538462,0.730769,0.615385,0.423077,0.576923,0.769231,0.461538,0.666667,0.423077,0.769231,0.587607,-0.032537,0.202095
0.1,0.423077,0.615385,0.615385,0.230769,0.692308,0.807692,0.576923,0.692308,0.307692,0.606061,0.230769,0.807692,0.550505,-0.037102,0.245731
0.05,0.538462,0.538462,0.576923,0.576923,0.423077,0.461538,0.615385,0.230769,0.269231,0.363636,0.230769,0.615385,0.45338,-0.097125,0.200185
0.01,0.269231,0.307692,0.384615,0.230769,0.384615,0.384615,0.5,0.346154,0.230769,0.636364,0.230769,0.636364,0.378497,-0.074883,0.180215
0.005,0.346154,0.346154,0.423077,0.230769,0.346154,0.461538,0.384615,0.346154,0.230769,0.636364,0.230769,0.636364,0.384907,0.00641,0.162694
0.001,0.269231,0.5,0.384615,0.269231,0.307692,0.423077,0.461538,0.230769,0.269231,0.363636,0.230769,0.5,0.350816,-0.034091,0.139096



Enhanced Algorithm:


Unnamed: 0,accuracy 1,accuracy 2,accuracy 3,accuracy 4,accuracy 5,accuracy 6,accuracy 7,accuracy 8,accuracy 9,accuracy 10,min,max,mean,mean_diff,std
,0.576923,0.653846,0.692308,0.5,0.461538,0.615385,0.615385,0.615385,0.538462,0.636364,0.461538,0.692308,0.588287,,0.07799
1.0,0.461538,0.423077,0.423077,0.230769,0.230769,0.461538,0.576923,0.346154,0.461538,0.636364,0.230769,0.636364,0.426573,-0.161713,0.207024
0.75,0.576923,0.5,0.423077,0.615385,0.423077,0.423077,0.423077,0.346154,0.5,0.454545,0.346154,0.615385,0.470571,0.043998,0.142557
0.5,0.230769,0.384615,0.576923,0.230769,0.538462,0.423077,0.423077,0.576923,0.269231,0.363636,0.230769,0.576923,0.402098,-0.068473,0.180658
0.25,0.461538,0.5,0.384615,0.230769,0.5,0.384615,0.307692,0.346154,0.269231,0.363636,0.230769,0.5,0.373252,-0.028846,0.142216
0.1,0.269231,0.307692,0.538462,0.653846,0.423077,0.307692,0.423077,0.384615,0.538462,0.393939,0.269231,0.653846,0.430264,0.057012,0.161287
0.05,0.230769,0.461538,0.5,0.346154,0.384615,0.269231,0.384615,0.192308,0.653846,0.393939,0.192308,0.653846,0.388598,-0.041667,0.185727
0.01,0.5,0.615385,0.461538,0.269231,0.307692,0.423077,0.384615,0.692308,0.730769,0.363636,0.269231,0.730769,0.479021,0.090423,0.191058
0.005,0.269231,0.692308,0.461538,0.192308,0.269231,0.576923,0.384615,0.769231,0.307692,0.575758,0.192308,0.769231,0.455031,-0.02399,0.236976
0.001,0.461538,0.384615,0.5,0.230769,0.384615,0.461538,0.5,0.384615,0.653846,0.393939,0.230769,0.653846,0.436674,-0.018357,0.173338



Wall time: 11.1 s
