Import necessary modules/libraries

In [12]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import datasets

In [13]:
import sys
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns


Define node class, along with necessary attributes and functions

In [14]:
class TreeNode:
  def __init__(self,X,y,thresh,feature):
    self.X= X #storing the input data at a node
    self.y= y #storing the class labels
    self.thresh=thresh #storing the threshold for splitting
    self.feature=feature #storing the feature used for splitting
    self.children= [] #list of children nodes
    self.parent= None #storing the parent node

  def add_child(self, child): #adding a child node
    child.parent = self #setting the node as child's parent
    self.children.append(child) #storing the child node in the list of children nodes

  def get_level(self): #get the depth of a node
    level = 0
    p = self.parent
    while p:
      level += 1
      p = p.parent

    return level

  def print_tree(self): #displaying the tree formed hierarchically
    spaces = ' ' * self.get_level() * 3
    prefix = spaces + "|__" if self.parent else "" #using indentation and |_ as representation of depth & parent-child relation
    print(prefix + "feature_idx: "+ str(self.feature))
    if self.children:
      for child in self.children:
        child.print_tree() #recursively use the method for displaying the subtrees

In [15]:
num_feats=10
X,y= datasets.make_classification(n_samples=100, n_features=num_feats, n_classes=2, random_state=25)
print(X)
print(y)

[[ 1.99166410e+00 -8.17914411e-01  1.91377119e+00 -4.02246794e-01
   1.28710456e-01  2.11706036e+00 -8.07247494e-01 -1.78575757e+00
   7.80458793e-01  5.58835298e-01]
 [-6.69995449e-01  8.72261334e-02 -1.85867652e+00  3.84205743e-03
  -5.62732934e-01 -9.19456029e-01  5.33613207e-01 -2.09659221e+00
  -9.23400490e-01 -4.14295943e-01]
 [-1.23104347e-01 -5.88335905e-01 -1.45345973e-01 -2.97424440e-01
   1.00942355e+00 -8.35556629e-01  9.40831630e-01  4.28702247e-01
   3.53266165e-01  6.97338787e-01]
 [ 1.88522508e+00 -9.52492072e-01  4.53288505e-01  7.88354501e-01
   1.39026747e+00  1.80726594e+00 -5.15481921e-01  8.88751933e-02
  -2.09396399e+00  1.58500468e+00]
 [-3.90027957e-01  1.45939844e+00 -1.11809146e+00  7.33073436e-01
   9.57544610e-01  1.01847175e+00 -1.65369204e+00  1.70624957e+00
  -6.83275787e-01  3.07764491e-01]
 [ 3.35979889e-01 -1.78934809e+00  1.37270028e+00 -1.18095075e+00
   1.26872634e+00 -1.46434184e+00  2.16666661e+00 -2.94444406e-01
   2.76229550e-01 -6.00435574e-01

Creating a binary classification dataset with 100 data-points and 10 features

In [16]:
# num_feats=10
# X,y= datasets.make_classification(n_samples=100, n_features=num_feats, n_classes=2, random_state=25)
# Load the dataset
data = pd.read_csv('.\car_evaluation.csv')

# changing the names of the columns for convenient use
new_column_names = ['Price_Buying' , 'Price_Maintenance' , 'Doors','Persons','Lug_boot', 'Safety', 'Acceptability']
data.columns = new_column_names


# Manual Ordering of the data using Ordinal Encoder

buying_price_mapping = {'vhigh': 3, 'high': 2, 'med': 1, 'low': 0}
maintenance_price_mapping = {'vhigh': 3, 'high': 2, 'med': 1, 'low': 0}
doors_mapping = {'2': 0, '3': 1, '4': 2, '5more': 3}
persons_mapping = {'2': 0, '4': 1, 'more': 2}
lug_boot_mapping = {'small': 0, 'med': 1, 'big': 2}
safety_mapping = {'low': 0, 'med': 1, 'high': 2}
acceptability_mapping = {'unacc': 0, 'acc': 1, 'good': 2, 'vgood': 3}

data['Price_Buying'] = data['Price_Buying'].map(buying_price_mapping)
data['Price_Maintenance'] = data['Price_Maintenance'].map(maintenance_price_mapping)
data['Doors'] = data['Doors'].map(doors_mapping)
data['Persons'] = data['Persons'].map(persons_mapping)
data['Lug_boot'] = data['Lug_boot'].map(lug_boot_mapping)
data['Safety'] = data['Safety'].map(safety_mapping)
data['Acceptability'] = data['Acceptability'].map(acceptability_mapping)



# display initial data insights
print(data)
print(data.shape)
print(data.describe())
print(data.info())                           # to get a view of how many columns and what data types

##################
##################

# SPLITTING THE DATA INTO 60:20:20 RATIO 

# splitting data into features (X) and target (y)
# X = data.drop(columns='Acceptability')
X = data.drop(columns='Acceptability')
y = data['Acceptability']



# 80% for [training and validation], 20% for [testing]
# stratify = y makes sure that the train_valid and test datasets have the same proportion of class labels as the original dataset
X_train_validation, X_test, y_train_validation, y_test = train_test_split(X, y, test_size=0.20, random_state=42, stratify=y)

# 75% of 80% for training which is 60%, 25% of 80% for validation which is 20%
# stratify = y_train_val makes sure that the train and valid datasets have the same proportion of class labels as the original dataset
X_train, X_validation, y_train, y_validation = train_test_split(X_train_validation, y_train_validation, test_size=0.25, random_state=42, stratify=y_train_validation)

# display data insights after splitting
print(X)
print(y)
print(data.info()) 
print("Training dataset:", X_train.shape, y_train.shape)
print("Validation dataset:", X_validation.shape, y_validation.shape)
print("Testing dataset:", X_test.shape, y_test.shape)



      Price_Buying  Price_Maintenance  Doors  Persons  Lug_boot  Safety  \
0                3                  3      0        0         0       1   
1                3                  3      0        0         0       2   
2                3                  3      0        0         1       0   
3                3                  3      0        0         1       1   
4                3                  3      0        0         1       2   
...            ...                ...    ...      ...       ...     ...   
1722             0                  0      3        2         1       1   
1723             0                  0      3        2         1       2   
1724             0                  0      3        2         2       0   
1725             0                  0      3        2         2       1   
1726             0                  0      3        2         2       2   

      Acceptability  
0                 0  
1                 0  
2                 0  
3          

In [21]:
root= TreeNode(X_train,y_train,-1,-1) #root node

Method to build a tree using the above dataset

In [23]:
def build_tree(node,depth):
    sel_feature= np.random.randint(0,X_train.shape[1]) #select a random feature index to perform splitting

    depth-=1 #update the depth
    #print(len(node.X),depth)

    if(depth>0 and len(node.X)>1): #check if the depth criterion is satisfied and node is not a leaf node (>1 samples)
      thresh= (np.min(X[:,sel_feature])+np.max(X[:,sel_feature]))/2
      #print(thresh,len(X[X[:,sel_feature]<thresh]))
      left_child=TreeNode(X[X[:,sel_feature]<thresh],y[X[:,sel_feature]<thresh],thresh,sel_feature)
      right_child=TreeNode(X[X[:,sel_feature]>=thresh],y[X[:,sel_feature]>=thresh],thresh,sel_feature)
      node.add_child(left_child)
      node.add_child(right_child)
      build_tree(left_child,depth)
      build_tree(right_child,depth)

    else:
      return


In [18]:
build_tree(root,5) #build a tree of depth 5 max

InvalidIndexError: (slice(None, None, None), 3)

In [24]:
root.print_tree()

feature_idx: -1
