# Robot Execution Failures (LP4)

## Data Set Preparation 

### Accessing and Splitting Data

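The dataset ships with the project and is read from `dataset/Robot_Execution_Failures_Dataset/lp4.data`, resolved relative to the parent of the notebook's working directory, so please do not move or rename the dataset folder.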

In [1]:
import pandas as pd
import numpy as np
import os
# pandas and numpy are third-party libraries used here for data manipulation and preparation
# os is part of the Python standard library and is used to build the dataset path

In [2]:
# ---- Column layout ---------------------------------------------------------
# Six measurement groups repeated 15 times (Fx_1 ... Tz_15) followed by the class label.
column_groups = ['Fx', 'Fy', 'Fz', 'Tx', 'Ty', 'Tz']
columns = []
for step in range(1, 16):
    for group in column_groups:
        columns.append(f"{group}_{step}")
columns.append('label')

# Parallel list describing each column; the later cells use it to decide how a column is treated.
column_types = ['continuous'] * 90 + ['categorical']

# ---- Dataset location ------------------------------------------------------
# The data file is looked up relative to the parent of the current working directory.
current_dir = os.getcwd()
project_root = os.path.abspath(os.path.join(current_dir, '..'))
data_path = os.path.join(project_root, 'dataset', 'Robot_Execution_Failures_Dataset', 'lp4.data')

# ---- Parsing ---------------------------------------------------------------
# A record is a label line followed by numeric lines. A line counts as numeric as soon as
# one of its whitespace-separated tokens is an (optionally negative) integer.
data = []
current_features = []
current_label = None

with open(data_path, 'r') as f:
    for line in f:
        stripped = line.strip()
        if stripped == '':
            continue  # blank separator line

        tokens = stripped.split()
        is_label_line = not any(token.lstrip('-').isdigit() for token in tokens)

        if is_label_line:
            # A new label closes the previous record; only complete records (90 values) are kept.
            if current_label is not None and len(current_features) == 90:
                data.append(current_features + [current_label])
            current_label = stripped
            current_features = []
        else:
            current_features.extend(int(token) for token in tokens)

    # The final record has no following label line to close it, so flush it here.
    if current_label is not None and len(current_features) == 90:
        data.append(current_features + [current_label])


df = pd.DataFrame(data, columns=columns)

# ---- Encode categorical columns as integer codes (in order of first appearance) ----
categorical_columns = [df.columns[position] for position, kind in enumerate(column_types) if kind == 'categorical']
for col_name in categorical_columns:
    mapping = {value: code for code, value in enumerate(df[col_name].unique())}
    df[col_name] = df[col_name].map(mapping)

# ---- Separate inputs from the target ---------------------------------------
del column_types[df.columns.get_loc('label')]   # column_types now describes the input columns only
input_dataset = df.drop(columns='label')
output_dataset = df['label']

preview_input_head, preview_input_tail = input_dataset.head(5).copy(), input_dataset.tail(5).copy()
preview_output_head, preview_output_tail = output_dataset.head(5).copy(), output_dataset.tail(5).copy()
for preview in (preview_input_head, preview_output_head, preview_input_tail, preview_output_tail):
    print(preview)

   Fx_1  Fy_1  Fz_1  Tx_1  Ty_1  Tz_1  Fx_2  Fy_2  Fz_2  Tx_2  ...  Fz_14  \
0    -2     2    20     5    -6    -1    -2     1    20     5  ...     23   
1    -3     2    22     5    -8     0    -2     2    19     5  ...      6   
2    -2     2    20     5    -6    -1    -2     2    19     5  ...     41   
3    -2     2    20     5    -6    -1    -3     1    18     4  ...     32   
4    -2     2    20     4    -7    -1    -2     1    19     5  ...      4   

   Tx_14  Ty_14  Tz_14  Fx_15  Fy_15  Fz_15  Tx_15  Ty_15  Tz_15  
0      9      2      0     -2      2     29      3     -6      0  
1      5     -8     -1     -3      3     24      4     -8     -1  
2      4     -5     -2     -3      1      3      5     -6      0  
3      4    -10     -1     -2      1     30      5     -5     -1  
4      6     -8      0     -5      4     38     -1    -16     -1  

[5 rows x 90 columns]
0    0
1    0
2    0
3    0
4    0
Name: label, dtype: int64
     Fx_1  Fy_1  Fz_1  Tx_1  Ty_1  Tz_1  Fx_2  Fy_2

### Data Validation

In [3]:
# Validation step: unify missing-value markers, impute missing inputs with 0,
# and keep only the rows whose encoded label is 0 or 1.
# '?' placeholders become NaN so that a single fill step handles every kind of missing input.
input_dataset = input_dataset.replace('?', np.nan)

has_missing_inputs = input_dataset.isna().to_numpy().any()
if has_missing_inputs:
    input_dataset = input_dataset.fillna(0)

# NOTE(review): the label encoding in the loading cell assigns codes 0..n-1, so this filter
# also removes every row of any class encoded as 2 or higher - confirm that is intended.
valid_mask = output_dataset.isin([0, 1])
if (~valid_mask).any():
    input_dataset = input_dataset.loc[valid_mask].reset_index(drop=True)
    output_dataset = output_dataset.loc[valid_mask].reset_index(drop=True)

### Data Preparation

In [4]:
# Hand-rolled z-score standardisation (no scaler library): every continuous column is
# centred on its mean and divided by its standard deviation.
# NOTE(review): the statistics are computed on the full dataset, i.e. before the
# train/test split performed in the next section.
continuous_columns = [
    input_dataset.columns[position]
    for position, kind in enumerate(column_types)
    if kind == 'continuous'
]
for col_name in continuous_columns:
    column = input_dataset[col_name]
    centred = column - column.mean()
    # The tiny epsilon keeps the division defined for a constant column (std == 0).
    input_dataset[col_name] = centred / (column.std() + 1e-18)

# Preview of the prepared inputs and targets that the models below consume.
preview_input = input_dataset.head(10).copy()
preview_output = output_dataset.head(10).copy()
for preview in (preview_input, preview_output):
    print(preview)

       Fx_1      Fy_1      Fz_1      Tx_1      Ty_1      Tz_1      Fx_2  \
0 -0.050943 -0.033875  0.366401  0.231488 -0.015127 -0.034747 -0.042365   
1 -0.079376 -0.033875  0.398204  0.231488 -0.050984  0.029402 -0.042365   
2 -0.050943 -0.033875  0.366401  0.231488 -0.015127 -0.034747 -0.042365   
3 -0.050943 -0.033875  0.366401  0.231488 -0.015127 -0.034747 -0.074900   
4 -0.050943 -0.033875  0.366401  0.212559 -0.033056 -0.034747 -0.042365   
5 -0.050943 -0.076110  0.350499  0.212559 -0.015127  0.029402 -0.042365   
6 -0.022510 -0.076110  0.350499  0.212559 -0.015127  0.029402 -0.042365   
7 -0.050943 -0.076110  0.350499  0.212559 -0.015127  0.029402 -0.042365   
8 -0.022510 -0.033875  0.382302  0.193630 -0.015127  0.093550 -0.009829   
9  0.005924 -0.076110  0.286892  0.193630  0.038658  0.029402 -0.042365   

       Fy_2      Fz_2      Tx_2  ...     Fz_14     Tx_14     Ty_14     Tz_14  \
0 -0.036106  0.350786  0.316542  ...  0.672068  0.335207  0.150229  0.462562   
1  0.026915  0

### Data Splitting

In [5]:
# Shuffle the (input row, label) pairs reproducibly, then hold out the last 20% for testing.
combined = list(zip(input_dataset.values.tolist(), output_dataset))
np.random.seed(53)            # fixed seed so the split is the same on every run
np.random.shuffle(combined)

shuffled_rows, shuffled_labels = zip(*combined)
input_shuffled = np.array(shuffled_rows)
output_shuffled = np.array(shuffled_labels)

split_point = int(0.8 * len(input_shuffled))
train_part, test_part = slice(0, split_point), slice(split_point, None)

input_shuffled_train_dataset = input_shuffled[train_part]
input_shuffled_test_dataset = input_shuffled[test_part]
output_shuffled_train_dataset = output_shuffled[train_part]
output_shuffled_test_dataset = output_shuffled[test_part]

# First ten rows of each partition, for a quick visual check.
preview_input_train = input_shuffled_train_dataset[:10]
preview_output_train = output_shuffled_train_dataset[:10]
preview_input_test = input_shuffled_test_dataset[:10]
preview_output_test = output_shuffled_test_dataset[:10]

previews = (
    pd.DataFrame(preview_input_train).round(2),
    pd.Series(preview_output_train),
    pd.DataFrame(preview_input_test).round(2),
    pd.Series(preview_output_test),
)
for preview in previews:
    print(preview)


     0     1     2     3     4     5     6     7     8     9   ...    80  \
0  0.46  0.56  0.54 -0.36  1.33 -1.70  0.41  0.41  0.23 -0.84  ...  0.14   
1 -0.82 -0.37  0.11  0.29 -0.48 -0.10 -0.76  0.22 -0.01 -0.09  ...  0.31   
2  0.49 -1.64 -0.41  0.06  0.49 -4.20 -0.30  0.28  0.38  0.52  ...  0.35   
3  0.26 -0.54 -0.13  0.46  0.22 -0.23  0.19 -0.10  0.13  0.15  ...  0.19   
4  0.03 -0.12  0.22  0.12  0.25  0.09  0.02 -0.10  0.02  0.01  ... -0.07   
5  1.14 -0.67  0.48  0.63  1.08  0.09  1.06 -0.35  0.50  0.32  ...  0.59   
6 -0.05 -0.03  0.37  0.23 -0.02 -0.03 -0.07 -0.04  0.32  0.28  ...  0.83   
7 -0.39  1.11 -0.27 -0.79  0.09 -0.55 -0.82 -1.74  0.16  1.74  ...  0.19   
8  0.09 -0.12 -0.09  0.10  0.13  0.09  0.15 -0.04 -0.26  0.01  ... -2.04   
9 -0.11 -0.08  0.16  0.23  0.06  0.16 -0.11 -0.10  0.17  0.32  ...  0.31   

     81    82    83    84    85    86    87    88    89  
0 -0.76 -0.75 -0.65  0.18  0.26  0.32 -0.51 -0.62 -0.07  
1 -0.36  0.20 -0.65 -0.00 -0.06  0.16 -0.15  0.

## ML Models Implementation and Evaluation

### Linear Regression

In [6]:
class LinearRegression:
    """Linear model ``X.w + b`` fitted with batch gradient descent on the squared error.

    Parameters
    ----------
    learning_rate : float
        Step size applied to each gradient update.
    iter : int
        Number of gradient-descent iterations (the name shadows the builtin but is kept
        because it is part of the constructor's keyword interface).
    features : array-like of shape (n_samples, n_features)
        Training inputs; lists, numpy arrays and pandas objects are accepted.
    target_attribute : array-like of shape (n_samples,)
        Training targets.
    """

    def __init__(self, learning_rate=0.001, iter=1000, features=None, target_attribute=None):
        self.learning_rate = learning_rate
        self.iter = iter
        self.features = features
        self.target_attribute = target_attribute
        self.weights = None   # set by train()
        self.bias = None      # set by train()

    def train(self):
        """Fit ``weights`` and ``bias`` on the stored training data.

        Raises
        ------
        ValueError
            If the training data is missing or its shapes are inconsistent.
        """
        if self.features is None or self.target_attribute is None:
            raise ValueError("features and target_attribute must be provided before calling train()")

        # Coerce once so lists / pandas objects behave like plain positional arrays.
        features = np.asarray(self.features, dtype=float)
        targets = np.asarray(self.target_attribute, dtype=float)
        if features.ndim != 2:
            raise ValueError("features must be a 2-D array of shape (n_samples, n_features)")
        if targets.shape != (features.shape[0],):
            raise ValueError("target_attribute must be 1-D with one entry per feature row")

        self.weights = np.zeros(features.shape[1])
        self.bias = 0.0

        for _ in range(self.iter):
            predicted_output = np.dot(features, self.weights) + self.bias   # X.w + b
            error = predicted_output - targets
            dw = np.mean(features * error[:, np.newaxis], axis=0)           # gradient w.r.t. the weights
            db = np.mean(error)                                             # gradient w.r.t. the bias
            self.weights -= self.learning_rate * dw
            self.bias -= self.learning_rate * db

    def predict(self, input_test):
        """Return the raw (continuous) model output for every row of ``input_test``."""
        return np.dot(np.asarray(input_test, dtype=float), self.weights) + self.bias

    def calculate_mse(self, input_test, actual_output):
        """Return the mean squared error between predictions and ``actual_output``."""
        predicted_output = self.predict(input_test)
        return np.mean((predicted_output - np.asarray(actual_output, dtype=float)) ** 2)

    def calculate_efficiency(self, input_test, output_test):
        """Return the fraction of rows whose rounded prediction equals the label in ``output_test``."""
        output_pred_rounded = np.round(self.predict(input_test)).astype(int)
        return np.mean(output_pred_rounded == np.asarray(output_test))


In [7]:
# Fit the linear model on the training partition, then score it on the held-out rows.
model = LinearRegression(
    features=input_shuffled_train_dataset,
    target_attribute=output_shuffled_train_dataset,
)
model.train()

linear_regression_model_mse = model.calculate_mse(
    input_shuffled_test_dataset, output_shuffled_test_dataset
)
linear_regression_model_accuracy = model.calculate_efficiency(
    input_shuffled_test_dataset, output_shuffled_test_dataset
)

print(f"Linear Regression Classification MSE: {linear_regression_model_mse :.2f}")
print(f"Rounded Linear Regression Classification Accuracy: {linear_regression_model_accuracy * 100:.2f}%")

Linear Regression Classification MSE: 0.70
Rounded Linear Regression Classification Accuracy: 30.00%


### Logistic Regression

In [8]:
# Logistic regression is used here because the target is a class label rather than a continuous value.
class LogisticRegression:
    """Binary logistic regression fitted with batch gradient descent.

    Parameters
    ----------
    learning_rate : float
        Step size applied to each gradient update.
    iter : int
        Number of gradient-descent iterations (name kept for interface compatibility).
    features : array-like of shape (n_samples, n_features)
        Training inputs; lists, numpy arrays and pandas objects are accepted.
    target_attribute : array-like of shape (n_samples,)
        Training labels; the 0.5 decision threshold in ``predict`` yields labels 0 and 1.
    """

    def __init__(self, learning_rate=0.01, iter=2000, features = None, target_attribute=None):
        self.learning_rate = learning_rate
        self.iter = iter
        self.features = features
        self.target_attribute = target_attribute
        self.weights = None   # set by train()
        self.bias = None      # set by train()

    def sigmoid_function(self, x):
        """Return ``1 / (1 + exp(-x))`` element-wise without overflowing for large ``|x|``.

        Only ``exp(-|x|)`` is evaluated, which always lies in (0, 1]; the two algebraically
        equivalent forms of the sigmoid are then selected by the sign of ``x``.
        """
        x = np.asarray(x, dtype=float)
        decay = np.exp(-np.abs(x))
        result = np.where(x >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
        return result[()]   # unwraps a 0-d array to a scalar, leaves n-d arrays untouched

    def train(self):
        """Fit ``weights`` and ``bias`` on the stored training data.

        Raises
        ------
        ValueError
            If the training data is missing or its shapes are inconsistent.
        """
        if self.features is None or self.target_attribute is None:
            raise ValueError("features and target_attribute must be provided before calling train()")

        # Coerce once so lists / pandas objects behave like plain positional arrays.
        features = np.asarray(self.features, dtype=float)
        targets = np.asarray(self.target_attribute, dtype=float)
        if features.ndim != 2:
            raise ValueError("features must be a 2-D array of shape (n_samples, n_features)")
        if targets.shape != (features.shape[0],):
            raise ValueError("target_attribute must be 1-D with one entry per feature row")

        self.weights = np.zeros(features.shape[1])
        self.bias = 0.0

        for _ in range(self.iter):
            z = np.dot(features, self.weights) + self.bias            # X.w + b
            y_hat = self.sigmoid_function(z)                          # predicted probability of class 1

            error = y_hat - targets
            dw = np.mean(features * error[:, np.newaxis], axis=0)     # gradient w.r.t. the weights
            db = np.mean(error)                                       # gradient w.r.t. the bias

            self.weights -= self.learning_rate * dw
            self.bias -= self.learning_rate * db

    def predict(self, input_test):
        """Return the predicted class (0 or 1) for every row of ``input_test``."""
        z = np.dot(np.asarray(input_test, dtype=float), self.weights) + self.bias
        probs = self.sigmoid_function(z)
        return (probs >= 0.5).astype(int)

    def calculate_efficiency(self, input_test, output_test):
        """Return the fraction of rows in ``input_test`` whose prediction equals ``output_test``."""
        output_pred = self.predict(input_test)
        return np.mean(output_pred == np.asarray(output_test))


In [9]:
# Fit the logistic model on the training partition, then measure accuracy on the held-out rows.
model = LogisticRegression(
    features=input_shuffled_train_dataset,
    target_attribute=output_shuffled_train_dataset,
)
model.train()

logistic_regression_model_accuracy = model.calculate_efficiency(
    input_shuffled_test_dataset,
    output_shuffled_test_dataset,
)
print(f"Logistic Regression Accuracy: {logistic_regression_model_accuracy * 100:.2f}%")

Logistic Regression Accuracy: 75.00%


### Naive Bayes

In [10]:
import numpy as np
# Initialize dictionaries to store class probabilities as well as the variance relative to the mean
class BayessianClassifier:
    """Gaussian naive Bayes classifier.

    Each feature is modelled per class by an independent normal distribution whose mean and
    variance are estimated from the training rows of that class.

    Parameters
    ----------
    features : array-like of shape (n_samples, n_features)
        Training inputs; lists, numpy arrays and pandas objects are accepted.
    target_attribute : array-like of shape (n_samples,)
        Training labels.
    """

    def __init__(self, features, target_attribute):
        # Coerce up front so boolean-mask row selection works for lists and pandas objects too.
        self.features = np.asarray(features, dtype=float)
        self.target_attribute = np.asarray(target_attribute)
        self.available_classes = np.unique(self.target_attribute)    # distinct class labels

        self.mean = {}                   # class -> per-feature mean
        self.var = {}                    # class -> per-feature variance
        self.class_probabilities = {}    # class -> prior (share of training rows)

    def get_probabilistic_values(self):
        """Estimate the prior, per-feature mean and per-feature variance of every class."""
        total_samples = len(self.target_attribute)
        for cls in self.available_classes:
            class_rows = self.features[self.target_attribute == cls]
            self.class_probabilities[cls] = len(class_rows) / total_samples
            self.mean[cls] = np.mean(class_rows, axis=0)
            self.var[cls] = np.var(class_rows, axis=0)

    def predict(self, data_entry):
        """Return the class with the highest log-posterior for one sample.

        ``get_probabilistic_values`` must have been called first.
        """
        sample = np.asarray(data_entry, dtype=float)
        log_posterior_array = []
        for cls in self.available_classes:
            variance = self.var[cls]
            squared_deviation = (sample - self.mean[cls]) ** 2
            # Per-feature log of the Gaussian density, summed over all features.
            # The 1e-6 terms keep both denominators non-zero for zero-variance features;
            # note they are added to 2*var and to 2*pi*var rather than to var itself.
            log_likelihood = np.sum(
                -squared_deviation / (2 * variance + 1e-6)
                + np.log(1 / np.sqrt(2 * np.pi * variance + 1e-6))
            )
            # log posterior = log likelihood + log prior
            log_posterior_array.append(log_likelihood + np.log(self.class_probabilities[cls]))
        return self.available_classes[np.argmax(log_posterior_array)]

    def calculate_efficiency(self, input_test, output_test):
        """Return the fraction of rows in ``input_test`` whose prediction equals ``output_test``."""
        output_pred = np.array([self.predict(sample) for sample in input_test])
        return np.mean(output_pred == np.asarray(output_test))

In [11]:
# Estimate the Gaussian naive Bayes parameters from the training partition,
# then measure accuracy on the held-out rows.
model = BayessianClassifier(
    features=input_shuffled_train_dataset,
    target_attribute=output_shuffled_train_dataset,
)
model.get_probabilistic_values()

bayessian_classifier_model_accuracy = model.calculate_efficiency(
    input_shuffled_test_dataset,
    output_shuffled_test_dataset,
)
print(f"Bayessian Classifier Accuracy: {bayessian_classifier_model_accuracy * 100:.2f}%")

Bayessian Classifier Accuracy: 100.00%


### KNearest Neighbors

In [12]:
def replace_if_closer(neighbors, candidate):
    """Overwrite the farthest entry of ``neighbors`` with ``candidate`` if the candidate is closer.

    neighbors: list of (label, distance) entries, modified in place
    candidate: a new (label, distance) entry

    The farthest entry is the first one holding the largest distance. The swap only happens
    when the candidate's distance is strictly smaller. Nothing is returned.
    """
    farthest_index = -1
    farthest_distance = -float('inf')

    # Locate the current farthest neighbour (first occurrence wins on ties).
    for position, entry in enumerate(neighbors):
        if entry[1] > farthest_distance:
            farthest_index, farthest_distance = position, entry[1]

    if not candidate[1] < farthest_distance:
        return
    neighbors[farthest_index] = candidate
    
class KNNClassifier:
    """k-nearest-neighbours classifier using Euclidean distance and majority voting.

    Parameters
    ----------
    k : int
        Number of neighbours that vote on each prediction.
    features : array-like of shape (n_samples, n_features)
        Training inputs; lists, numpy arrays and pandas objects are accepted.
    target_attribute : array-like of shape (n_samples,)
        Training labels.
    """

    def __init__(self, k=3, features=None, target_attribute=None):
        self.k = k
        # Coerce to arrays so rows and labels are always addressed by position.
        self.features = None if features is None else np.asarray(features, dtype=float)
        self.target_attribute = None if target_attribute is None else np.asarray(target_attribute)
        self.available_classes = np.unique(self.target_attribute)    # distinct class labels

    def get_closest_neighbours(self, data_entry):
        """Return a list of ``k`` ``(label, distance)`` entries for the rows nearest to ``data_entry``.

        When fewer than ``k`` training rows exist, the unused slots remain as
        ``(None, inf)`` placeholders.
        """
        data_entry = np.asarray(data_entry, dtype=float)
        # Start from placeholders at infinite distance so any real row replaces them.
        closest_neighbours = [(None, float('inf')) for _ in range(self.k)]

        for row, label in zip(self.features, self.target_attribute):
            distance = np.sqrt(np.sum((row - data_entry) ** 2))   # Euclidean distance
            replace_if_closer(closest_neighbours, (label, distance))
        return closest_neighbours

    def predict(self, closest_neighbours):
        """Return the majority label among ``closest_neighbours``.

        Placeholder entries (label ``None``) do not vote. Ties go to the class that comes
        first in ``available_classes``.
        """
        label_counts = {label: 0 for label in self.available_classes}
        for label, _ in closest_neighbours:
            if label is None:
                continue   # unfilled slot: k exceeded the number of training rows
            label_counts[label] += 1

        max_count = -1
        predicted_label = None
        for label, count in label_counts.items():
            if count > max_count:
                max_count = count
                predicted_label = label

        return predicted_label

    def calculate_efficiency(self, input_test, output_test):
        """Return the fraction of rows in ``input_test`` whose prediction equals ``output_test``."""
        output_pred = np.array(
            [self.predict(self.get_closest_neighbours(sample)) for sample in input_test]
        )
        return np.mean(output_pred == np.asarray(output_test))


In [13]:
# Try k = 1..7 and remember the first k that reaches the highest test accuracy.
best_k = None
KNN_classifier_model_accuracy = 0.0

for i in range(1, 8):
    model = KNNClassifier(
        features=input_shuffled_train_dataset,
        target_attribute=output_shuffled_train_dataset,
        k=i,
    )
    model_accuracy = model.calculate_efficiency(
        input_shuffled_test_dataset, output_shuffled_test_dataset
    )
    print(f"KNN Classifier Accuracy with k={i}: {model_accuracy * 100:.2f}%")

    # Strict comparison: a later k only wins if it is genuinely better.
    if not model_accuracy > KNN_classifier_model_accuracy:
        continue
    KNN_classifier_model_accuracy, best_k = model_accuracy, i

print(f"\nBest k: {best_k} with Accuracy: {KNN_classifier_model_accuracy * 100:.2f}%")

KNN Classifier Accuracy with k=1: 85.00%
KNN Classifier Accuracy with k=2: 80.00%
KNN Classifier Accuracy with k=3: 80.00%
KNN Classifier Accuracy with k=4: 80.00%
KNN Classifier Accuracy with k=5: 80.00%
KNN Classifier Accuracy with k=6: 75.00%
KNN Classifier Accuracy with k=7: 80.00%

Best k: 1 with Accuracy: 85.00%


### Decision tree

In [14]:
class DecisionTree:
    """Decision tree classifier that chooses splits by entropy-based information gain.

    Columns marked ``'categorical'`` in ``column_types`` get one branch per distinct value;
    every other column is treated as numerical and split with a ``<= threshold`` test.

    The fitted tree is stored in ``self.tree`` as nested dicts. A leaf is ``{'label': value}``.
    An internal node has ``'feature_index'`` and ``'type'`` plus either ``'branches'``
    (categorical) or ``'threshold'``, ``'left'`` and ``'right'`` (numerical).

    Parameters
    ----------
    features : numpy array of shape (n_samples, n_features)
    target_attribute : numpy array of non-negative integer labels, shape (n_samples,)
    column_types : list of str, one entry per feature column
    max_depth : int or None
        Maximum number of splits on any root-to-leaf path; ``None`` means unlimited.
    """

    def __init__(self, features, target_attribute, column_types, max_depth):
        self.depth = None            # depth of the fitted tree, set by train()
        self.features = features
        self.target_attribute = target_attribute
        self.column_types = column_types
        self.max_depth = max_depth
        self.tree = None             # nested-dict tree, set by train()

        if features is not None and column_types is not None:
            if len(column_types) != features.shape[1]:
                raise ValueError("Length of column_types must match number of feature columns")

    def _majority_label(self, target):
        """Return the most frequent label in ``target`` (lowest label wins ties)."""
        return np.bincount(target).argmax()

    def calculate_entropy(self, output_column):
        """Return the Shannon entropy (base 2) of the labels in ``output_column``."""
        _, counts = np.unique(output_column, return_counts=True)
        probabilities = counts / len(output_column)
        # The small constant only guards log2 against a zero argument.
        entropy = -np.sum([p * np.log2(p + 1e-9) for p in probabilities])
        return entropy

    def categorical_split_entropy(self, feature_column, output_column):
        """Return the weighted entropy after splitting on every distinct value of ``feature_column``."""
        total_entropy = 0
        for val in np.unique(feature_column):
            y_sub = output_column[feature_column == val]     # labels of the rows holding this value
            weight = len(y_sub) / len(output_column)         # share of rows falling into this branch
            total_entropy += weight * self.calculate_entropy(y_sub)
        return total_entropy

    def numerical_split_entropy(self, feature_column, output_column):
        """Find the best ``<= threshold`` split of a numerical column.

        Candidate thresholds are the midpoints between consecutive sorted values.

        Returns
        -------
        (best_entropy, best_threshold, info_gain)
            When no candidate separates the rows, these are ``(inf, None, -inf)``.
        """
        sorted_indices = np.argsort(feature_column)
        feature_column = feature_column[sorted_indices]
        output_column = output_column[sorted_indices]

        best_entropy = float('inf')
        best_threshold = None
        base_entropy = self.calculate_entropy(output_column)   # entropy before splitting

        for i in range(len(feature_column) - 1):
            threshold = (feature_column[i] + feature_column[i + 1]) / 2

            left_output = output_column[feature_column <= threshold]
            right_output = output_column[feature_column > threshold]

            # Duplicate values at the boundary can leave one side empty: not a real split.
            if len(left_output) == 0 or len(right_output) == 0:
                continue

            left_entropy = self.calculate_entropy(left_output)
            right_entropy = self.calculate_entropy(right_output)
            weighted_entropy = (
                len(left_output) * left_entropy + len(right_output) * right_entropy
            ) / len(output_column)

            # Strict comparison keeps the first threshold among equally good ones.
            if weighted_entropy < best_entropy:
                best_entropy = weighted_entropy
                best_threshold = threshold

        info_gain = base_entropy - best_entropy
        return best_entropy, best_threshold, info_gain

    def find_best_split(self, features=None, target=None, depth=0):
        """Decide how to handle the given subset of rows.

        Returns
        -------
        dict
            A leaf ``{'label': value}`` when the subset is pure, has at most 5 rows, has
            reached ``max_depth``, or no column offers a usable split.
        tuple
            Otherwise ``(best_column, best_type, best_threshold, best_gain)``.
        """
        if features is None:
            features = self.features
        if target is None:
            target = self.target_attribute

        if len(np.unique(target)) == 1:           # pure subset
            return {'label': target[0]}

        if len(target) <= 5:                      # too few rows to keep splitting
            return {'label': self._majority_label(target)}

        if self.max_depth is not None and depth >= self.max_depth:   # depth budget used up
            return {'label': self._majority_label(target)}

        base_entropy = self.calculate_entropy(target)
        best_gain = -float('inf')
        best_column = None
        best_type = None
        best_threshold = None

        for col_index in range(features.shape[1]):
            feature_col = features[:, col_index]
            col_type = self.column_types[col_index]

            if col_type == 'categorical':
                # A single-valued column would yield one branch containing every row,
                # i.e. no progress and unbounded recursion - skip it.
                if len(np.unique(feature_col)) < 2:
                    continue
                entropy = self.categorical_split_entropy(feature_col, target)
                threshold = None
                info_gain = base_entropy - entropy
            else:
                entropy, threshold, info_gain = self.numerical_split_entropy(feature_col, target)

            if info_gain > best_gain:
                best_gain = info_gain
                best_column = col_index
                best_type = col_type
                best_threshold = threshold

        # No column can separate the rows (e.g. identical feature values with mixed labels).
        if best_column is None:
            return {'label': self._majority_label(target)}

        return best_column, best_type, best_threshold, best_gain

    def build_tree(self, features=None, target=None, depth=0):
        """Recursively build and return the nested-dict tree for the given subset of rows."""
        if features is None:
            features = self.features
        if target is None:
            target = self.target_attribute

        split = self.find_best_split(features, target, depth)
        if isinstance(split, dict) and 'label' in split:   # find_best_split returned a leaf
            return split

        best_col, col_type, threshold, _ = split
        feature_col = features[:, best_col]

        if col_type == 'categorical':
            branches = {}
            for val in np.unique(feature_col):
                mask = feature_col == val
                branches[val] = self.build_tree(features[mask], target[mask], depth + 1)
            return {
                'feature_index': best_col,
                'type': col_type,
                'branches': branches
            }

        left_mask = feature_col <= threshold
        right_mask = feature_col > threshold
        return {
            'feature_index': best_col,
            'type': col_type,
            'threshold': threshold,
            'left': self.build_tree(features[left_mask], target[left_mask], depth + 1),
            'right': self.build_tree(features[right_mask], target[right_mask], depth + 1)
        }

    def train(self):
        """Build the tree from the stored training data and record its depth."""
        self.tree = self.build_tree()
        self.depth = self.get_tree_depth(self.tree)

    def get_tree_depth(self, node):
        """Return the number of splits on the longest path from ``node`` to a leaf."""
        if 'label' in node:
            return 0
        if node['type'] == 'categorical':
            return 1 + max(self.get_tree_depth(branch) for branch in node['branches'].values())
        return 1 + max(self.get_tree_depth(node['left']), self.get_tree_depth(node['right']))

    def predict_one(self, input_row, tree):
        """Walk ``tree`` for a single ``input_row`` and return the label of the leaf reached."""
        if 'label' in tree:
            return tree['label']

        if tree['type'] == 'categorical':
            val = input_row[tree['feature_index']]
            if val in tree['branches']:
                return self.predict_one(input_row, tree['branches'][val])
            # Value never seen during training: fall back to the first branch, descending
            # into it because that branch may itself be a subtree rather than a leaf.
            first_branch = next(iter(tree['branches'].values()))
            return self.predict_one(input_row, first_branch)

        if input_row[tree['feature_index']] <= tree['threshold']:
            return self.predict_one(input_row, tree['left'])
        return self.predict_one(input_row, tree['right'])

    def predict(self, input_test):
        """Return an array with the predicted label of every row in ``input_test``."""
        return np.array([self.predict_one(row, self.tree) for row in input_test])

    def calculate_efficiency(self, test_input, test_output):
        """Print and return the fraction of rows in ``test_input`` predicted correctly."""
        predictions = self.predict(test_input)
        accuracy = np.mean(predictions == test_output)
        print(f"Decision Tree Classification Accuracy: {accuracy * 100:.2f}%")
        return accuracy


In [15]:
# Grow trees with an increasing depth limit until the limit stops being reached,
# remembering the first limit that gives the highest test accuracy.
decision_tree_model_accuracy, best_depth = 0.0, 1
max_possible_depth = None
depth = 1

while max_possible_depth is None:
    model = DecisionTree(
        features=input_shuffled_train_dataset,
        target_attribute=output_shuffled_train_dataset,
        column_types=column_types,
        max_depth=depth,
    )
    model.train()
    predictions = model.predict(input_shuffled_test_dataset)
    accuracy = np.mean(predictions == output_shuffled_test_dataset)
    print(f"Max Depth Tried: {depth}, Actual Depth: {model.depth}, Accuracy: {accuracy * 100:.2f}%")

    if accuracy > decision_tree_model_accuracy:
        decision_tree_model_accuracy, best_depth = accuracy, depth

    if model.depth < depth:
        # The tree stayed shallower than the limit, so it is fully grown: stop here.
        max_possible_depth = model.depth
    else:
        depth += 1

print(f"\nMax Possible Depth is: {max_possible_depth}")
print(f"\nBest Depth: {best_depth} with Accuracy: {decision_tree_model_accuracy * 100:.2f}%")

Max Depth Tried: 1, Actual Depth: 1, Accuracy: 70.00%
Max Depth Tried: 2, Actual Depth: 2, Accuracy: 80.00%
Max Depth Tried: 3, Actual Depth: 3, Accuracy: 85.00%
Max Depth Tried: 4, Actual Depth: 3, Accuracy: 85.00%

Max Possible Depth is: 3

Best Depth: 3 with Accuracy: 85.00%


In [16]:

# Side-by-side summary of the scores collected in the sections above.
summary_lines = [
    f"Linear Regression Classification MSE: {linear_regression_model_mse :.2f}",
    f"Rounded Linear Regression Classification Accuracy: {linear_regression_model_accuracy * 100:.2f}%",
    f"Logistic Regression Accuracy: {logistic_regression_model_accuracy * 100:.2f}%",
    f"Decision Tree Classification Accuracy: {decision_tree_model_accuracy * 100:.2f}%",
    f"KNN Classification Accuracy: {KNN_classifier_model_accuracy * 100:.2f}%",
    f"Bayessian Classifier Accuracy: {bayessian_classifier_model_accuracy * 100:.2f}%",
]
print("\n".join(summary_lines))

Linear Regression Classification MSE: 0.70
Rounded Linear Regression Classification Accuracy: 30.00%
Logistic Regression Accuracy: 75.00%
Decision Tree Classification Accuracy: 85.00%
KNN Classification Accuracy: 85.00%
Bayessian Classifier Accuracy: 100.00%
