# Building a Perceptron and Decision Tree Classifier from Scratch
I will be using the [Heart Disease](https://archive.ics.uci.edu/ml/datasets/Heart+Disease) & [Absenteeism at work](https://archive.ics.uci.edu/ml/datasets/Absenteeism+at+work) datasets from the UCI Machine Learning Repository.

The objective of this notebook is to build the Perceptron and Decision tree from scratch to show how each of the models work to create their predictions. Each model will be tested on each dataset with a binary classification problem.

The structure of this notebook is as follows;

* Load the libraries and datasets
* View the Heart Disease dataset
* Create the Perceptron model
* Run the Perceptron model on the Heart Disease dataset
* Create the Decision Tree
* Run the Decision Tree model on the Heart Disease dataset
* View the Absenteeism at work dataset
* Run the Perceptron model on the Absenteeism at work dataset
* Run the Decision Tree model on the Absenteeism at work dataset

# Load the libraries and datasets

In [None]:
# Install additional packages used within the code
# QuickDA will be used to look at feature importance of the datasets
!pip install quickda

In [None]:
# Import the python packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from quickda.explore_numeric_categoric import *
import warnings

warnings.simplefilter(action='ignore', category=FutureWarning)
%matplotlib inline

In [None]:
# Load the input files into dataframes
# Both paths are relative to the notebook's working directory
heart_path = '../input/heart-disease-uci/heart.csv'
absent_path = '../input/absenteeism-at-work-uci-ml-repositiory/Absenteeism_at_work.csv'
df1 = pd.read_csv(heart_path, header=0) # this is for the initial build of the model
# The absenteeism file is parsed with ';' as the field separator
df2 = pd.read_csv(absent_path, sep=';', header=0) # this is to check that the model runs on an additional dataset

# View the Heart Disease dataset

In [None]:
# Data Exploration of the Heart Disease dataset
# What's the structure and the descriptive statistics for the input file
# Start with the first three rows to see the columns and some example values
df1.head(3)

In [None]:
# Column dtypes and non-null counts of the Heart Disease dataframe
df1.info()

In [None]:
# Descriptive statistics, transposed so that each attribute is shown as a row
df1.describe().T

In [None]:
# Understand the baseline of the target attribute
# normalize=True returns the share of each class rather than the raw counts
df1['target'].value_counts(normalize=True)

In [None]:
# Plot the distribution of the target values of the Heart Disease dataset
fig, ax = plt.subplots(figsize=(6,4))

# Take the bar positions and the bar heights from the same value_counts() result.
# unique() lists the labels in order of first appearance while value_counts() sorts
# by frequency, so pairing the two can attach a count to the wrong label.
target_counts = df1['target'].value_counts().sort_index()

ax.bar(target_counts.index, target_counts.values)
ax.set_title('Heart Disease Target Distrbution')
ax.set_xticks([0,1])
ax.set_xlabel('Target')
ax.set_ylabel('Count');

**Data Visualisation**

In [None]:
# Plot the predictive power score (pps) of the features
# eda_numcat is presumably provided by the quickda star-import at the top of the notebook
eda_numcat(df1, method='pps', x='target')

In [None]:
# Create separate dataframes for discrete and continuous variables
# Both frames keep the target column so the plots can be split by class,
# and .copy() keeps later edits to these frames from touching df1
discrete_df = df1[['sex','cp','fbs','restecg','exang','slope','ca','thal','target']].copy()
continuous_df = df1[['age','trestbps','chol','thalach','oldpeak','target']].copy()

In [None]:
# Change the target variable to a categorical value
# The result is assigned back instead of calling replace(..., inplace=True) on the
# selected column: with pandas copy-on-write the in-place call only changes a
# temporary object and the dataframe would keep its 0/1 values.
target_labels = {0:'No',1:'Yes'}
discrete_df['target'] = discrete_df['target'].replace(target_labels)
continuous_df['target'] = continuous_df['target'].replace(target_labels)

In [None]:
# Create a pairplot for the continuous variables to understand the relationship between the variables
# Points are coloured by target; corner=True draws only the lower triangle of the grid
sns.pairplot(continuous_df, hue='target', diag_kind='hist', corner=True, plot_kws={'alpha': 0.2});

In [None]:
# Using the scatter plots from the pairplot explore some of the more promising pairings
# Create 6 subplots and have each pairing coloured by the target variable
# Each entry is (x column, y column); panels are filled row by row
scatter_pairs = [
    ('chol', 'thalach'),
    ('chol', 'age'),
    ('thalach', 'age'),
    ('oldpeak', 'thalach'),
    ('chol', 'oldpeak'),
    ('trestbps', 'thalach'),
]

fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(18,12))

for panel, (x_col, y_col) in zip(axes.flat, scatter_pairs):
    panel.scatter(x=df1[x_col],y=df1[y_col],c=df1['target'],cmap='coolwarm',alpha=0.2)
    panel.set_xlabel(x_col)
    panel.set_ylabel(y_col)
    # The title is built from the axis columns so it always reads "x vs y"
    panel.set_title(f'{x_col} vs {y_col}')

fig.tight_layout();

In [None]:
# Create histograms for the discrete attributes coloured by the target to identify any key splits
fig, ax = plt.subplots(nrows=2, ncols=4, figsize=(16,8))

# One panel per discrete attribute, filled row by row
discrete_columns = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']

for panel, column in zip(ax.flat, discrete_columns):
    # Overlay the 'Yes' (red) and 'No' (blue) distributions of the attribute
    for label, colour in (('Yes', 'r'), ('No', 'b')):
        panel.hist(discrete_df[discrete_df.target == label][column], color=colour, alpha=0.5, label=label)
    panel.set_title(f'Distribution of {column}')
    panel.set_xlabel(column)
    panel.set_ylabel('count')
    panel.legend()

plt.tight_layout();

**EDA Observations**
There is no clear separation in any of the pairings that I have focused on, so 100% accuracy is not expected using a perceptron. I will pair up some of the attributes and check how they perform with the perceptron.

There are some separations in the discrete attributes so a decision tree may be more accurate.

# Create the Perceptron model
Perceptron model code adapted from Tutorial: Implementing a Linear Model from the course material.

In [None]:
class MyPerceptron:
    """Binary perceptron classifier for labels encoded as -1 / +1.

    Training keeps a "pocket" copy of the weights and bias that achieved the
    best training accuracy, so that a run that stops at ``max_iterations``
    (data that is not linearly separable) still ends with the best parameters seen.

    Attributes set by ``__init__``: ``W`` (weights), ``b`` (bias), ``max_iterations``.
    Attributes set by ``fit``: ``counter`` (number of updates made), ``best_corr``
    (best training accuracy as a fraction) and ``best_params``
    (``{'w': weights, 'bias': bias}`` that produced ``best_corr``).
    """

    def __init__(self, max_iterations=1000, n_dimensions=2, random_state=42):
        """Create the model with random starting weights and bias.

        max_iterations: maximum number of weight updates made by fit (default 1000).
        n_dimensions: number of attributes in X; one weight is created per attribute.
        random_state: seed for the starting weights and bias so results can be replicated.
        """
        # Draw from a dedicated, seeded generator. Creating a RandomState without
        # keeping it does not seed np.random, so the weights would not be reproducible.
        rng = np.random.RandomState(random_state)
        self.max_iterations = max_iterations
        self.W = rng.random_sample(n_dimensions) # One random weight per attribute of X
        self.b = rng.random_sample() # Random starting bias

    def predict(self, X):
        """Return np.sign of the linear score for each row of X.

        The result is 1 or -1, or 0 for a row that lies exactly on the decision boundary.
        """
        return np.sign(self.compute_linear_score_with_(X))

    def fit(self, X, y):
        """Find the weights and bias using X (n_samples x n_dimensions) and y (-1 / +1 labels).

        Raises ValueError if X has no rows.
        """
        # Set X and y as numpy arrays
        X = np.array(X)
        y = np.array(y)

        if len(X) == 0:
            raise ValueError('X must contain at least one sample')

        # Create the defaults
        self.counter = 0 # Counter for weight updates
        self.best_corr = 0 # Best accuracy, updated through the loop
        # Start the pocket with the initial parameters so there is always something
        # to fall back on, even if no iteration classifies a single point correctly
        self.best_params = {'w': self.W.copy(), 'bias': self.b}

        while True:
            is_pred_corr = self.predict(X) == y # Bool array of correct predictions
            error_indexes = np.flatnonzero(np.logical_not(is_pred_corr)) # Indexes of every misclassified row
            accu = np.sum(is_pred_corr) / len(X) # Accuracy as a fraction between 0 and 1

            if accu > self.best_corr:
                # Best accuracy so far: record it with the weights and bias that produced it
                self.best_corr = accu
                self.best_params = {'w': self.W.copy(), 'bias': self.b}

            if len(error_indexes) == 0:
                # No misclassified rows, accuracy is 100%
                break

            if self.counter >= self.max_iterations:
                # Out of updates: fall back to the best weights and bias seen.
                # This check runs after the evaluation above so that the parameters
                # produced by the final update are scored as well.
                self.W = self.best_params['w']
                self.b = self.best_params['bias']
                break

            # Use the first misclassified row to update the weights and bias
            next_i = error_indexes[0]
            self.W = self.W + X[next_i] * y[next_i]
            self.b = self.b + y[next_i]
            self.counter += 1

    def compute_linear_score_with_(self, X):
        """Return the linear score (weights . x + bias) for each row of X."""
        X = np.array(X) # Set X as a numpy array
        return (X * self.W).sum(axis=1) + self.b # Sum of weights multiplied by X, plus the bias

    def predict_with_(self, X):
        """Return 1 where the linear score is positive and 0 otherwise.

        Used for the visualisation of the decision boundary.
        """
        return (self.compute_linear_score_with_(X) > 0).astype(int)

Visualisation of the decision boundary. Adapted from Code Exercise 4 from the course material.

In [None]:
def viz_hypo(pred_func, X_samples, y_samples, correct, incorrect, step=0.02):
    """
    Plot the regions predicted by pred_func together with the samples.

    pred_func: callable that takes an (n, 2) array of points and returns one prediction per point
    X_samples: two-attribute samples; they set the range of the plot and are drawn as points
    y_samples: not used by the plot, kept so existing calls keep working
    correct / incorrect: boolean masks selecting the correctly classified and misclassified rows of X_samples
    step: spacing of the grid the predictions are evaluated on; a larger value keeps the
          grid small when the attributes span a wide range
    """
    # The plotted range is the range of the samples plus a margin of 0.1
    x0_min, x1_min = np.min(X_samples, axis=0)
    x0_max, x1_max = np.max(X_samples, axis=0)

    xx, yy = np.meshgrid(np.arange(x0_min - 0.1, x0_max + 0.1, step),
                        np.arange(x1_min - 0.1, x1_max + 0.1, step))

    # One row per grid point, in the (x0, x1) layout expected by pred_func
    grid_coord = np.column_stack((xx.ravel(), yy.ravel()))
    zz = pred_func(grid_coord).reshape(xx.shape)

    fig, ax = plt.subplots(constrained_layout=True)

    # Filled regions for the predictions and a labelled line where they change
    ax.contourf(xx, yy, zz, cmap='Pastel1')
    boundary = ax.contour(xx, yy, zz, colors=('k', ), linewidths=2)
    ax.clabel(boundary, inline=True, fontsize=10)

    # Use masks to subset X_samples into correctly classified and misclassified points
    correct_points = np.array(X_samples[correct])
    incorrect_points = np.array(X_samples[incorrect])

    # Add the samples to the plot as a scatter
    ax.scatter(x=correct_points[:,0], y=correct_points[:,1], c='g', alpha=0.5, marker='o', edgecolor='k', label='correctly classified')
    ax.scatter(x=incorrect_points[:,0], y=incorrect_points[:,1], c='r', alpha=0.5, marker='x', label='misclassified')

    ax.legend()
    plt.show()

# Run the Perceptron on the Heart Disease dataset

In [None]:
def perceptron_pipeline(X,y):
    '''
    Train MyPerceptron on an 80/20 stratified split of X and y.

    Returns the fitted weights, the fitted bias, the validation accuracy and the
    best training accuracy recorded by the model (both accuracies as fractions).
    '''
    # One weight per column of X
    my_model = MyPerceptron(n_dimensions=X.shape[1])

    # Hold out 20% of the rows for validation, keeping the class balance of y
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

    my_model.fit(X_train, y_train)

    # Share of validation rows whose prediction matches the label
    valid_hits = my_model.predict(X_valid) == y_valid
    accu = np.sum(valid_hits) / len(y_valid)

    return my_model.W, my_model.b, accu, my_model.best_corr

In [None]:
# Create the different iterations of feature pairs as the X variable
# Every X is a two-column copy of df1, matching the two weights MyPerceptron uses by default
X1 = df1[['thalach','chol']].copy() # From the scatter plots
X2 = df1[['thalach','age']].copy() # From the scatter plots
X3 = df1[['thalach','oldpeak']].copy() # From the scatter plots
X4 = df1[['oldpeak','chol']].copy() # From the scatter plots
X5 = df1[['trestbps','thalach']].copy() # From the scatter plots
X6 = df1[['chol','age']].copy() # From the scatter plots
X_top = df1[['thal','cp']].copy() # From the feature importance plot

In [None]:
# Create the y variable from the target column 
# NOTE: this rewrites df1['target'] in place, so every later cell that reads df1 sees -1/1 labels
df1.loc[df1['target']==0,'target'] = -1 # Change the zero values to -1 so we can use misclassified points to adjust the weights and bias
y = df1['target'] # Set the target column as y

In [None]:
# Create a dictionary of the X variables
# The keys are the names used when printing and plotting the results
X_dict = {'X1':X1, 'X2':X2, 'X3':X3, 'X4':X4, 'X5':X5, 'X6':X6, 'X_top':X_top}

In [None]:
# Create empty lists for training and validation accuracy scores to be used for plotting
# X_list holds the matching X names so the three lists stay aligned by position
X_list = []
train_accu = []
valid_accu = []

In [None]:
# Loop through the dictionary running the Perceptron pipeline
# NOTE: the lists are appended to, so re-running this cell without re-running
# the previous cell adds every result a second time
for X_key, X_value in X_dict.items():

    # Run the Perceptron pipeline
    weights, bias, accu, training_accu = perceptron_pipeline(X_value,y)

    # Add the training and validation accuracy to the lists
    X_list.append(X_key)
    train_accu.append(training_accu)
    valid_accu.append(accu)

    # The printed equation assumes exactly two weights, i.e. a two-column X
    print(f'X variable: {X_key} - Training accuracy: {training_accu}, Validation accuracy: {accu}')
    print(f'Perceptron Equation: ({weights[0]} x x1) + ({weights[1]} x x2) + {bias}')
    print('\n')

In [None]:
# Create a dataframe from the training and validation scores
accu_df = pd.DataFrame(list(zip(train_accu, valid_accu)), index=X_list, columns=['Train', 'Validation'])

# Plot the accuracy dataframe
# NOTE: the plotted scores are fractions between 0 and 1 although the axis label says %
accu_df.plot()
plt.title('Training vs Validation scores by X')
plt.xlabel('X')
plt.ylabel('Accuracy %');

X2 and X5 have identical training and validation accuracy, so I will plot the decision boundary for X2.

In [None]:
# Split X and y into training and validation data
X_train, X_valid, y_train, y_valid = train_test_split(X2, y, test_size=0.2, stratify=y, random_state=42)

# Initiate the model
# The default n_dimensions=2 matches the two columns of X2
my_model = MyPerceptron()

# Fit the model to the training data
my_model.fit(X_train, y_train)

# Create masks for correctly and incorrectly classified data points
pred = my_model.predict(X_train)
correct = pred==y_train
incorrect = pred!=y_train

# Visualise the decision boundary
# predict_with_ is passed as the prediction function, so the regions are drawn from 0/1 values
viz_hypo(my_model.predict_with_, X_train, y_train, correct, incorrect)

Test the same X values against the standard scikit-learn Perceptron model

In [None]:
# Import the Perceptron
from sklearn.linear_model import Perceptron

# Initiate the model
# The same estimator object is fitted again for every X pairing in the loop below
clf = Perceptron()

# Loop through the dictionary of X pairings
for X_key, X_value in X_dict.items():

    # Split X and y into training and validation data
    # Same split settings as perceptron_pipeline so the scores are comparable
    X_train, X_valid, y_train, y_valid = train_test_split(X_value, y, test_size=0.2, stratify=y, random_state=42)

    # Fit the training data to the model
    clf.fit(X_train, y_train)

    # Print the accuracy scores
    print(f'X variable: {X_key} - Training score: {clf.score(X_train, y_train)}, validation score: {clf.score(X_valid, y_valid)}')

Run the Perceptrons using all attributes for X

In [None]:
# Create an X_all by dropping target from the dataframe
X_all = df1.drop('target', axis=1)

# Split X_all and y into training and validation data
# 80/20 split, stratified on y, with the same random_state as the earlier splits
X_all_train, X_all_valid, y_all_train, y_all_valid = train_test_split(X_all, y, test_size=0.2, stratify=y, random_state=42)

In [None]:
# Run the data through the scikit-learn Perceptron
# Initiate the model
clf = Perceptron()

# Fit the training data to the model
clf.fit(X_all_train, y_all_train)

# Return the training and validation accuracy
# Displayed as a (training, validation) tuple
clf.score(X_all_train, y_all_train), clf.score(X_all_valid, y_all_valid)

In [None]:
# Initiate the model
# One weight per column of the full feature set
my_model_all = MyPerceptron(n_dimensions=X_all_train.shape[1])

# Fit the training data to the model
my_model_all.fit(X_all_train, y_all_train)

# Predict y using the X validation data
pred_valid = my_model_all.predict(X_all_valid)

# Return the training and validation accuracy
# best_corr is the best training accuracy recorded during fit
my_model_all.best_corr, np.sum(pred_valid==y_all_valid) / len(y_all_valid)

In [None]:
# Return the weights and bias for my Perceptron model
# best_params is the dictionary with keys 'w' and 'bias' recorded during fit
my_model_all.best_params

# Create the Decision Tree
Code adapted from [Let’s Write a Decision Tree Classifier from Scratch - Machine Learning Recipes #8](https://www.youtube.com/watch?v=LDRbO9a6XPU)

In [None]:
def unique_vals(rows, col):
    """Return the set of distinct values found in column `col` of `rows`."""
    return {record[col] for record in rows}

In [None]:
def class_counts(rows):
    """Return a dictionary of label -> number of rows carrying that label.

    The label of a row is read from its last column.
    """
    tally = {}
    for record in rows:
        label = record[-1]
        # Labels not seen before start from zero
        tally[label] = tally.get(label, 0) + 1
    return tally

In [None]:
def is_numeric(value):
    """Test if a value is numeric.

    Accepts Python ints and floats as well as NumPy integer and floating scalars.
    NumPy integers (e.g. np.int64) are not instances of int, so rows taken directly
    from an array would otherwise be treated as categorical values.
    """
    return isinstance(value, (int, float, np.integer, np.floating))

In [None]:
class Question:
    """A Question is used to partition a dataset.

    This class just records a 'column number' (the position of a feature in a row)
    and a 'column value'. The 'match' method compares the feature value in an
    example to the feature value stored in the question.
    """

    def __init__(self, column, value):
        self.column = column  # position of the feature within a row
        self.value = value  # value the feature is compared against

    def match(self, example):
        """Return True when the example satisfies the question.

        Numeric feature values are tested with >=, all other values with ==.
        """
        val = example[self.column]
        if is_numeric(val):
            return val >= self.value
        else:
            return val == self.value

    def __repr__(self):
        """Return the question in a readable format, e.g. 'Is age >= 50?'."""
        condition = ">=" if is_numeric(self.value) else "=="
        # The column name is looked up in the notebook-level `header`. Fall back to the
        # column number so that printing a question never fails just because `header`
        # has not been created yet or does not cover this column.
        try:
            column_name = header[self.column]
        except (NameError, IndexError, KeyError, TypeError):
            column_name = "column %s" % self.column
        return "Is %s %s %s?" % (
            column_name, condition, str(self.value))

In [None]:
def partition(rows, question):
    """Split `rows` into the rows that match `question` and the rows that do not.

    Returns (true_rows, false_rows); the order of the rows is kept within each list.
    """
    buckets = {True: [], False: []}
    for row in rows:
        # The outcome of the question selects the list the row goes to
        buckets[bool(question.match(row))].append(row)
    return buckets[True], buckets[False]

In [None]:
def gini(rows):
    """Return the Gini impurity of `rows`.

    The impurity is 1 minus the sum of the squared share of each label. See:
    https://en.wikipedia.org/wiki/Decision_tree_learning#Gini_impurity
    """
    n_rows = float(len(rows))
    impurity = 1
    # Subtract the squared share of each label in turn
    for count in class_counts(rows).values():
        impurity -= (count / n_rows) ** 2
    return impurity

In [None]:
def info_gain(left, right, current_uncertainty):
    """Return the information gain of splitting a node into `left` and `right`.

    This is the uncertainty of the starting node minus the impurity of the two
    child nodes, each weighted by its share of the rows.
    """
    n_left, n_right = len(left), len(right)
    p = float(n_left) / (n_left + n_right)  # share of rows that went left
    weighted_left = p * gini(left)
    weighted_right = (1 - p) * gini(right)
    return current_uncertainty - weighted_left - weighted_right

In [None]:
def find_best_split(rows, threshold=0):
    """Find the best question to ask by iterating over every feature / value
    and calculating the information gain.

    rows: list of rows with the label in the last column.
    threshold: minimum information gain a split must exceed to be used (pre-pruning).

    Returns (gain, question) for the split with the highest gain, or (0, None)
    when rows is empty or no split has a gain above the threshold.
    """
    best_gain = 0  # keep track of the best information gain
    best_question = None  # keep track of the feature / value that produced it

    if not rows:
        return best_gain, best_question

    current_uncertainty = gini(rows)
    n_features = len(rows[0]) - 1  # number of feature columns (the last column is the label)

    for col in range(n_features):  # for each feature

        values = {row[col] for row in rows}  # unique values in the column

        for val in values:  # for each value

            question = Question(col, val)

            # try splitting the dataset
            true_rows, false_rows = partition(rows, question)

            # Skip this split if it doesn't divide the dataset.
            if not true_rows or not false_rows:
                continue

            # Calculate the information gain from this split
            gain = info_gain(true_rows, false_rows, current_uncertainty)

            # Always keep the split with the highest gain. The threshold is applied
            # once, below: adding it to the running best here would make the result
            # depend on the order the candidates are visited in and could reject a
            # better split that beats the current best by less than the threshold.
            if gain > best_gain:
                best_gain, best_question = gain, question

    # Pre-pruning: a best split that does not beat the threshold is reported as "no split"
    if best_gain <= threshold:
        return 0, None

    return best_gain, best_question

In [None]:
class Leaf:
    """A Leaf node classifies data.

    This holds a dictionary of class label -> number of times it appears
    in the rows from the training data that reach this leaf.
    """

    def __init__(self, rows):
        # class_counts reads the label from the last column of every row
        self.predictions = class_counts(rows)

In [None]:
class Decision_Node:
    """A Decision Node asks a question.

    This holds a reference to the question, and to the two child nodes:
    true_branch for rows that match the question and false_branch for the rest.
    """

    def __init__(self,
                 question,
                 true_branch,
                 false_branch):
        self.question = question
        # Each branch is whatever build_tree returned for that side of the split
        self.true_branch = true_branch
        self.false_branch = false_branch

In [None]:
def build_tree(rows, threshold=0):
    """Builds the tree recursively.

    rows: list of rows with the label in the last column.
    threshold: passed to find_best_split as the pre-pruning threshold; the default of 0
               matches the default of find_best_split.

    Returns a Leaf when no useful split is found, otherwise a Decision_Node
    whose branches are built the same way.
    """

    # Try partitioning the dataset on each of the unique attribute values,
    # calculate the information gain,
    # and return the question that produces the highest gain.
    gain, question = find_best_split(rows, threshold)

    # Base case: no further info gain
    # Since we can ask no further questions, we'll return a leaf.
    # The missing question is checked as well so the stop does not rest on
    # comparing a float with 0 alone.
    if question is None or gain == 0:
        return Leaf(rows)

    # If we reach here, we have found a useful feature / value
    # to partition on.
    true_rows, false_rows = partition(rows, question)

    # Recursively build the true branch.
    true_branch = build_tree(true_rows, threshold)

    # Recursively build the false branch.
    false_branch = build_tree(false_rows, threshold)

    # Return a Question node.
    # This records the best feature / value to ask at this point,
    # as well as the branches to follow depending on the answer.
    return Decision_Node(question, true_branch, false_branch)

In [None]:
def print_tree(node, spacing=""):
    """Print the tree below `node`, indenting each level by two more spaces."""

    if not isinstance(node, Leaf):
        # Decision node: print its question, then each branch under its own heading
        print (spacing + str(node.question))
        for heading, branch in (('--> True:', node.true_branch),
                                ('--> False:', node.false_branch)):
            print (spacing + heading)
            print_tree(branch, spacing + "  ")
        return

    # Leaf: print the label counts it holds
    print (spacing + "Predict", node.predictions)

In [None]:
def classify(row, node):
    """Follow the questions from `node` down to a leaf and return that leaf's predictions."""

    # Walk down until a leaf is reached, taking the true branch
    # whenever the row matches the question of the current node.
    while not isinstance(node, Leaf):
        node = node.true_branch if node.question.match(row) else node.false_branch

    return node.predictions

In [None]:
def print_leaf(counts):
    """Turn the label counts of a leaf into whole-number percentage strings."""
    total = sum(counts.values()) * 1.0
    # int() truncates, so 66.6 is shown as "66%"
    return {lbl: str(int(n / total * 100)) + "%" for lbl, n in counts.items()}

In [None]:
def tree_predict(data, tree):
    '''
    Predict target using the Decision Tree

    data: list of rows with the actual label in the last column
    tree: root node of the tree, as returned by build_tree

    Returns (actual, pred): the actual labels and the predicted labels, in row order
    '''

    # Create empty lists for prediction and actual
    pred = []
    actual = []

    # Loop through the data and append to the actual and prediction list
    for row in data:
        leaf_counts = classify(row, tree)
        # Predict the label with the highest count in the leaf. A pruned tree has
        # leaves holding more than one label, and the first key of the dictionary is
        # only the label that was counted first, not the most frequent one.
        pred.append(max(leaf_counts, key=leaf_counts.get))
        actual.append(row[-1])

    return actual, pred

In [None]:
def tree_accuracy(actual, pred):
    '''
    Return the share of predictions that equal the actual label (a fraction between 0 and 1)
    '''

    # 1 for every position where prediction and actual agree, 0 otherwise
    hits = [1 if predicted == observed else 0 for predicted, observed in zip(pred, actual)]

    return np.sum(hits) / len(hits)

# Run the Decision Tree on the Heart Disease dataset

In [None]:
def prepare_tree_data(df, target):
    '''
    Function to prepare the data for use in the Decision Tree

    df: dataframe holding the features and the target column
    target: name of the target column

    Returns (training_data, validation_data, header): the two splits as lists of rows
    with the target as the last value of every row, and the matching column names
    '''

    # The tree functions read the label from the last column of every row,
    # so move the target column to the end wherever it sits in df
    feature_columns = [column for column in df.columns if column != target]
    X = df[feature_columns + [target]].copy()
    y = df[target].copy()

    # Split the data into 80% training and 20% validation stratifying the data using y
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

    # Create a list of lists of the training and validation_data for use in the Decision Tree
    training_data = X_train.values.tolist()
    validation_data = X_valid.values.tolist()

    # Create headers using the columns, in the same order as the values of each row
    header = X.columns

    return training_data, validation_data, header

In [None]:
# Prepare the data from df1 for use in the Decision Tree
# header is also the notebook-level name that Question.__repr__ reads the column names from
training_data, validation_data, header = prepare_tree_data(df1, 'target')

In [None]:
# Run the Decision Tree for different thresholds to pre-prune tree

# Set the empty lists
# NOTE: train_accu and valid_accu reuse the names of the Perceptron result lists
train_accu = []
valid_accu = []
thold = []

# Loop through the different thresholds
# 500 evenly spaced thresholds from 0 to 0.1; a full tree is built for each one
for threshold in np.linspace(0,0.1,num=500, endpoint=True):
    # Build the Decision Tree
    my_tree = build_tree(training_data,threshold)

    # Compute the Decision Tree Accuracy
    train_actual, train_pred = tree_predict(training_data, my_tree)
    valid_actual, valid_pred = tree_predict(validation_data, my_tree)

    # Add the accuracy to the list
    train_accu.append(tree_accuracy(train_actual, train_pred))
    valid_accu.append(tree_accuracy(valid_actual, valid_pred))
    thold.append(threshold)

In [None]:
# Create a dataframe from the training and validation scores
# The thresholds are used as the index so they become the x axis of the plot
accu_df_heart = pd.DataFrame(list(zip(train_accu, valid_accu)), index=thold, columns=['Training','Validation'])

# Plot the accuracy dataframe
# NOTE: the plotted scores are fractions between 0 and 1 although the axis label says %
accu_df_heart.plot()
plt.title('Training & Validation Accuracy scores by Threshold')
plt.xlabel('Threshold')
plt.ylabel('Accuracy %');

In [None]:
# Print the threshold with the smallest gap between training and validation accuracy,
# considering only the thresholds whose validation accuracy is above 0.75
accu_df_heart['difference'] = abs(accu_df_heart['Training'] - accu_df_heart['Validation'])
accu_df_heart = accu_df_heart[accu_df_heart['Validation']>0.75].copy()
# idxmin() on the column returns the index label (the threshold) directly.
# Selecting a one-column frame and taking [0] from the result relies on positional
# lookup on a labelled Series, which newer pandas versions no longer support.
best_threshold = accu_df_heart['difference'].idxmin()
print(f'Most accurate threshold: {best_threshold}')

In [None]:
# Build the Decision Tree with the best threshold
my_tree = build_tree(training_data,best_threshold)

# Compute the Decision Tree Accuracy
# Displayed as a (training, validation) tuple of fractions
train_actual, train_pred = tree_predict(training_data, my_tree)
valid_actual, valid_pred = tree_predict(validation_data, my_tree)
tree_accuracy(train_actual, train_pred), tree_accuracy(valid_actual, valid_pred)

In [None]:
# Print the Decision Tree
# Questions are printed with the column names held in the notebook-level header
print_tree(my_tree)

Test the dataset against the standard scikit-learn Decision Tree model

In [None]:
# Compare results with the standard library from scikit-learn
from sklearn.tree import DecisionTreeClassifier

# Create the X and y values for splitting into training and validation set
# NOTE: this rebinds the notebook-level X and y
X = df1.drop('target', axis=1).copy()
y = df1['target'].copy()

# Split the data into 80% training and 20% validation stratifying the data using y
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Initiate the Decision Tree
t = DecisionTreeClassifier()

# Fit the model to the data
t.fit(X_train, y_train)

# Calculate the tree accuracy on the training and validation data
t.score(X_train, y_train), t.score(X_valid, y_valid)

In [None]:
# Import tree to print the scikit-learn tree
from sklearn import tree

# Initiate the DT Classifier
# NOTE: clf previously held the scikit-learn Perceptron and is rebound here
clf = tree.DecisionTreeClassifier()

# Fit the tree to the data
clf = clf.fit(X_train, y_train)

# Print the tree
tree.plot_tree(clf) 

# View the Absenteeism at work dataset

In [None]:
# Data Exploration of the Absenteeism dataset
# What's the structure and the descriptive statistics for the input file
# Start with the first three rows to see the columns and some example values
df2.head(3)

In [None]:
# Column dtypes and non-null counts of the Absenteeism dataframe
df2.info()

In [None]:
# Descriptive statistics, transposed so that each attribute is shown as a row
df2.describe().T

In [None]:
# Plot the distribution of the target values of dataset
fig, ax = plt.subplots(figsize=(6,4))

# Number of bins from Sturges' rule: 1 + 3.322 * log10(n).
# The constant 3.322 is 1 / log10(2), so it has to be paired with the base-10
# logarithm; with the natural logarithm the rule gives roughly twice as many bins.
n_bins = round(1 + (3.322 * np.log10(len(df2['Absenteeism time in hours']))))

ax.hist(df2['Absenteeism time in hours'], bins=n_bins)
ax.set_title('Absenteeism at work Target Distrbution')
ax.set_xlabel('Target (hrs)')
ax.set_ylabel('Count');

In [None]:
# Create a histogram of a subset of the Absenteeism at work target variable
# Only rows with fewer than 20 hours are kept, to look at the lower end of the distribution
subset_df2 = df2[df2['Absenteeism time in hours'] < 20]
subset_df2['Absenteeism time in hours'].hist()
plt.title('Absenteeism at work Target < 20 hrs Distrbution')
plt.xlabel('Target (hrs)')
plt.ylabel('Count');

In [None]:
# Change the target variable in the Absenteeism at work dataset from a continuous variable to a binary classification
# With the two bins below, values up to 7.99 hours get label 0 and larger values get label 1
bin_names = [0,1]
ranges = [-np.inf,7.99,np.inf] # using the split found in the histogram
df2['target'] = pd.cut(df2['Absenteeism time in hours'], bins=ranges, labels=bin_names)

In [None]:
# Remove columns not required
# NOTE: the columns are dropped from df2 in place, so re-running this cell (or the
# binning cell before it) on the same df2 fails because the columns are gone
df2.drop(columns=['ID','Absenteeism time in hours'], inplace=True)

In [None]:
# Plot the predictive power score (pps) of the features
# Run before target is cast to int, i.e. while it still has the dtype produced by pd.cut
eda_numcat(df2, method='pps', x='target')

In [None]:
# pd.cut produced the target column from bin labels; store it as plain integers (0 / 1)
df2['target'] = df2['target'].astype('int')

In [None]:
# Understand the baseline of the target attribute of the Absenteeism at work dataset
# normalize=True returns the share of each class rather than the raw counts
df2['target'].value_counts(normalize=True)

In [None]:
# Create separate dataframes for discrete and continuous variables
# NOTE: discrete_df and continuous_df are rebound here and no longer hold the Heart Disease columns
# The column name 'Work load Average/day ' ends with a space
discrete_df = df2[['Reason for absence', 'Month of absence', 'Day of the week', 'Seasons','Work load Average/day ', 'Hit target',
                   'Disciplinary failure', 'Education', 'Son', 'Social drinker','Social smoker', 'Pet', 'target']].copy()
continuous_df = df2[['Transportation expense', 'Distance from Residence to Work','Service time', 'Age',
                     'Weight', 'Height', 'Body mass index', 'target']].copy()

In [None]:
# Create a pairplot for the continuous variables to understand the relationship between the variables
# Points are coloured by target; corner=True draws only the lower triangle of the grid
sns.pairplot(continuous_df, hue='target', diag_kind='hist', corner=True, plot_kws={'alpha': 0.2});

In [None]:
def discrete_hist(row, col, column_name, data=None, axes=None, bins=10):
    '''
    Function to plot histogram as a subplot

    Draws two overlaid histograms of `column_name` on the subplot at
    position [row][col]: one for the rows where target == 1 (red) and one
    for the rows where target == 0 (blue).

    Parameters
    ----------
    row, col : int
        Position of the subplot inside the axes grid.
    column_name : str
        Column to plot; also used in the title and the x-axis label.
    data : DataFrame, optional
        Frame holding `column_name` and a 'target' column. Defaults to the
        notebook-level `discrete_df`.
    axes : 2D grid of Axes, optional
        Grid indexed as axes[row][col]. Defaults to the notebook-level `ax`.
    bins : int, optional
        Number of bins shared by both histograms (10 matches the matplotlib
        default bin count).
    '''
    # Fall back to the notebook-level objects when nothing is passed in
    data = discrete_df if data is None else data
    axes = ax if axes is None else axes
    panel = axes[row][col]

    # Compute the bin edges once from the whole column so both classes are
    # binned identically; otherwise each call to hist() derives its own edges
    # from its own subset and the overlaid bars are not comparable
    bin_edges = np.histogram_bin_edges(data[column_name].dropna(), bins=bins)

    # Add distribution of the attribute where target = 1
    panel.hist(data.loc[data['target'] == 1, column_name], bins=bin_edges, color='r', alpha=0.5, label='1')

    # Add distribution of the attribute where target = 0
    panel.hist(data.loc[data['target'] == 0, column_name], bins=bin_edges, color='b', alpha=0.5, label='0')

    # Set title and axis labels
    panel.set_title(f'Distribution of {column_name}')
    panel.set_xlabel(column_name)
    panel.set_ylabel('count')

    # Add legend
    panel.legend()

In [None]:
# Create histograms for the discrete attributes coloured by the target to identify any key splits
fig, ax = plt.subplots(nrows=3, ncols=4, figsize=(16,8))

# Attributes in plotting order; the 3 x 4 grid is filled row by row
discrete_panels = [
    'Reason for absence', 'Month of absence', 'Day of the week', 'Seasons',
    'Work load Average/day ', 'Hit target', 'Disciplinary failure', 'Education',
    'Son', 'Social drinker', 'Social smoker', 'Pet',
]

# Run the histogram function for each variable
for panel_idx, panel_column in enumerate(discrete_panels):
    panel_row, panel_col = divmod(panel_idx, 4)
    discrete_hist(panel_row, panel_col, panel_column)

plt.tight_layout();

**EDA Observations**
There doesn't appear to be any clear separations on the continuous attributes so the Perceptron may not be too accurate. I will use the attributes identified in the feature importance graph for the pairings here.

Looking at the discrete attributes, though, there seem to be instances where there may be some information gain, so a decision tree may fare well.

# Run the Perceptron on the Absenteeism at work dataset

In [None]:
# Create the different iterations of feature pairs as the X variable using the feature importance plot.
# Every pairing combines 'Reason for absence' with one other attribute.
anchor_feature = 'Reason for absence'
partner_features = [
    'Service time',
    'Transportation expense',
    'Weight',
    'Distance from Residence to Work',
    'Age',
    'Son',
]
X1, X2, X3, X4, X5, X6 = [df2[[anchor_feature, partner]].copy() for partner in partner_features]

# All attributes except the target
X_all = df2.drop(columns='target').copy()

In [None]:
# Create the y variable from the target column
# Recode class 0 as -1 so we can use misclassified points to adjust the weights and bias
df2['target'] = df2['target'].replace(0, -1)

# Set the target column as y
y = df2['target']

In [None]:
# Map a short label to each X variable so the pairings can be looped over by name
X_dict = dict(X1=X1, X2=X2, X3=X3, X4=X4, X5=X5, X6=X6, X_all=X_all)

In [None]:
# Empty containers for the X labels and the training / validation accuracy scores (used for plotting)
X_list, train_accu, valid_accu = [], [], []

In [None]:
# Loop through the dictionary running the Perceptron pipeline

# Start from empty result lists so re-running this cell does not append
# duplicate entries to the lists used for plotting
X_list.clear()
train_accu.clear()
valid_accu.clear()

for X_key, X_value in X_dict.items():

    # Run the Perceptron pipeline
    weights, bias, accu, training_accu = perceptron_pipeline(X_value,y)

    # Add the training and validation accuracy to the lists
    X_list.append(X_key)
    train_accu.append(training_accu)
    valid_accu.append(accu)

    # Build the equation from every learned weight; X_all has more than two
    # features, so printing only weights[0] and weights[1] would be incomplete
    equation_terms = ' + '.join(f'({weight} x x{idx})' for idx, weight in enumerate(weights, start=1))

    print(f'X variable: {X_key} - Training accuracy: {training_accu}, Validation accuracy: {accu}')
    print(f'Perceptron Equation: {equation_terms} + {bias}')
    print('\n')

Test the same X values against the standard scikit-learn Perceptron model

In [None]:
# Initiate the model
clf = Perceptron()

# Loop through the dictionary of X pairings
for X_key, X_value in X_dict.items():

    # Split X and y into training and validation data
    X_train, X_valid, y_train, y_valid = train_test_split(X_value, y, test_size=0.2, stratify=y, random_state=42)

    # Fit the training data to the model
    clf.fit(X_train, y_train)

    # Print the accuracy scores (the validation score previously had no
    # separator between its label and its value)
    print(f'X variable: {X_key} - Training score: {clf.score(X_train, y_train)}, Validation score: {clf.score(X_valid, y_valid)}')

In [None]:
# Collect the training and validation scores in one dataframe, indexed by the X label
accu_df = pd.DataFrame({'Train': train_accu, 'Validation': valid_accu}, index=X_list)

# Plot the accuracy dataframe and label the resulting axes
accu_ax = accu_df.plot()
accu_ax.set_title('Training vs Validation scores by X')
accu_ax.set_xlabel('X')
accu_ax.set_ylabel('Accuracy %');

In [None]:
# Refit the from-scratch Perceptron on the X3 pairing so its decision boundary can be drawn

# 80/20 stratified split of X3 and y
X_train, X_valid, y_train, y_valid = train_test_split(
    X3, y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Train a fresh model on the training portion
my_model = MyPerceptron()
my_model.fit(X_train, y_train)

# Boolean masks marking which training points were classified correctly / incorrectly
pred = my_model.predict(X_train)
correct, incorrect = (pred == y_train), (pred != y_train)

# Visualise the decision boundary
viz_hypo(my_model.predict_with_, X_train, y_train, correct, incorrect)

# Run the Decision Tree on the Absenteeism at work dataset

In [None]:
# Prepare the data from df2 for use in the Decision Tree
# Note: df2['target'] holds -1 / 1 at this point, because the Perceptron section
# above recoded class 0 as -1 in place.
# NOTE(review): the training / validation split is presumably done inside
# prepare_tree_data - confirm against its definition.
training_data, validation_data, header = prepare_tree_data(df2, 'target')

In [None]:
# Run the Decision Tree for different thresholds to pre-prune tree
# NOTE(review): the threshold is handed straight to build_tree; presumably it is
# the minimum gain required to keep splitting - confirm against build_tree.

# 500 evenly spaced candidate thresholds between 0 and 0.1 (inclusive)
thold = list(np.linspace(0, 0.1, num=500, endpoint=True))

# Accuracy scores collected per threshold
train_accu, valid_accu = [], []

for threshold in thold:
    # Build the Decision Tree for this threshold
    my_tree = build_tree(training_data,threshold)

    # Predict on both data sets with the tree just built
    train_actual, train_pred = tree_predict(training_data, my_tree)
    valid_actual, valid_pred = tree_predict(validation_data, my_tree)

    # Record the accuracy for this threshold
    train_accu.append(tree_accuracy(train_actual, train_pred))
    valid_accu.append(tree_accuracy(valid_actual, valid_pred))

In [None]:
# Collect the training and validation scores in one dataframe, indexed by threshold
accu_df_work = pd.DataFrame({'Training': train_accu, 'Validation': valid_accu}, index=thold)

# Plot the accuracy dataframe and label the resulting axes
threshold_ax = accu_df_work.plot()
threshold_ax.set_title('Training & Validation Accuracy scores by Threshold')
threshold_ax.set_xlabel('Threshold')
threshold_ax.set_ylabel('Accuracy %');

In [None]:
# Print the threshold for the most accurate tree:
# among the thresholds whose validation accuracy is above 0.8, pick the one
# with the smallest gap between training and validation accuracy
accu_df_work['difference'] = accu_df_work['Training'] - accu_df_work['Validation']
accu_df_work = accu_df_work[accu_df_work['Validation'] > 0.8].copy()

# Fail with a clear message instead of an opaque idxmin error on an empty frame
if accu_df_work.empty:
    raise ValueError('No threshold produced a validation accuracy above 0.8')

# idxmin on the column returns the index label (the threshold) directly; the
# previous idxmin()[0] relied on positional indexing of a label-indexed Series,
# which is deprecated in recent pandas versions
best_threshold = accu_df_work['difference'].idxmin()
print(f'Most accurate threshold: {best_threshold}')

In [None]:
# Rebuild the Decision Tree using the threshold selected above
my_tree = build_tree(training_data,best_threshold)

# Predict on the training and validation data with that tree
train_actual, train_pred = tree_predict(training_data, my_tree)
valid_actual, valid_pred = tree_predict(validation_data, my_tree)

# Display (training accuracy, validation accuracy)
(
    tree_accuracy(train_actual, train_pred),
    tree_accuracy(valid_actual, valid_pred),
)

In [None]:
# Print the Decision Tree that was built with best_threshold in the previous cell
print_tree(my_tree)

Test the dataset against the standard scikit-learn Decision Tree model

In [None]:
# Create the X and y values for splitting into training and validation set
X = df2.drop('target', axis=1)
y = df2['target'].copy()

# Split the data into 80% training and 20% validation stratifying the data using y
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Initiate the Decision Tree; a fixed random_state makes the fitted tree (and
# therefore the scores below) reproducible from run to run
t = DecisionTreeClassifier(random_state=42)

# Fit the model to the data
t.fit(X_train, y_train)

# Calculate the tree accuracy on the training and validation data
t.score(X_train, y_train), t.score(X_valid, y_valid)

In [None]:
# Import tree to print the scikit-learn tree
from sklearn import tree

# Initiate the DT Classifier; a fixed random_state keeps the plotted tree
# identical between runs
clf = tree.DecisionTreeClassifier(random_state=42)

# Fit the tree to the data
clf = clf.fit(X_train, y_train)

# Print the tree on a larger figure, labelling the nodes with the feature and
# class names so the splits can be read; the trailing ';' hides the returned
# list of annotation objects
fig_tree, ax_tree = plt.subplots(figsize=(20, 10))
tree.plot_tree(
    clf,
    feature_names=list(X_train.columns),
    class_names=[str(label) for label in clf.classes_],
    filled=True,
    ax=ax_tree,
);