In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


In [6]:
# Specify the correct path to your dataset
file_path = '/content/colon-dataset-processed.csv'


In [7]:
# Load your data
data = pd.read_csv(file_path)

# Split the data into features and labels
X = data.drop('Class', axis=1)
y = data['Class']

# Encode the labels if they are categorical (assuming 'healthy' is 0 and 'diagnosed' is 1)
y = y.map({'healthy': 0, 'diagnosed': 1})

Creating a Decision Tree Model

In [8]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create the decision tree classifier and fit it to the training data
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

# Make predictions on the testing data
y_pred = clf.predict(X_test)

# Evaluate the classifier
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(f'Confusion Matrix:\n{confusion_matrix(y_test, y_pred)}')
print(f'Classification Report:\n{classification_report(y_test, y_pred)}')

Accuracy: 0.782608695652174
Confusion Matrix:
[[9 2]
 [3 9]]
Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.82      0.78        11
           1       0.82      0.75      0.78        12

    accuracy                           0.78        23
   macro avg       0.78      0.78      0.78        23
weighted avg       0.79      0.78      0.78        23



Trying to Construct a DT from scratch insteadf of a liberary.
resulted in much higher accuracy.

In [5]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix
class Node:
    def __init__(self, gini, num_samples, num_samples_per_class, predicted_class):
        self.gini = gini
        self.num_samples = num_samples
        self.num_samples_per_class = num_samples_per_class
        self.predicted_class = predicted_class
        self.feature_index = 0
        self.threshold = 0
        self.left = None
        self.right = None

def gini_impurity(y):
    m = len(y)
    return 1.0 - sum((np.sum(y == c) / m) ** 2 for c in range(2))

def grow_tree(X, y, depth=0, max_depth=None):
    num_samples_per_class = [np.sum(y == i) for i in range(2)]
    predicted_class = np.argmax(num_samples_per_class)
    node = Node(
        gini=gini_impurity(y),
        num_samples=len(y),
        num_samples_per_class=num_samples_per_class,
        predicted_class=predicted_class,
    )

    if depth < max_depth:
        idx, thr = best_split(X, y)
        if idx is not None:
            indices_left = X[:, idx] < thr
            X_left, y_left = X[indices_left], y[indices_left]
            X_right, y_right = X[~indices_left], y[~indices_left]
            node.feature_index = idx
            node.threshold = thr
            node.left = grow_tree(X_left, y_left, depth + 1, max_depth)
            node.right = grow_tree(X_right, y_right, depth + 1, max_depth)
    return node

def best_split(X, y):
    m, n = X.shape
    if m <= 1:
        return None, None

    num_parent = [np.sum(y == c) for c in range(2)]
    best_gini = 1.0 - sum((num / m) ** 2 for num in num_parent)
    best_idx, best_thr = None, None

    for idx in range(n):
        thresholds, classes = zip(*sorted(zip(X[:, idx], y)))
        num_left = [0] * 2
        num_right = num_parent.copy()
        for i in range(1, m):
            c = classes[i - 1]
            num_left[c] += 1
            num_right[c] -= 1
            gini_left = 1.0 - sum(
                (num_left[x] / i) ** 2 for x in range(2)
            )
            gini_right = 1.0 - sum(
                (num_right[x] / (m - i)) ** 2 for x in range(2)
            )
            gini = (i * gini_left + (m - i) * gini_right) / m
            if thresholds[i] == thresholds[i - 1]:
                continue
            if gini < best_gini:
                best_gini = gini
                best_idx = idx
                best_thr = (thresholds[i] + thresholds[i - 1]) / 2
    return best_idx, best_thr

def predict_sample(node, X):
    if node.left is None and node.right is None:
        return node.predicted_class
    if X[node.feature_index] < node.threshold:
        return predict_sample(node.left, X)
    else:
        return predict_sample(node.right, X)

def predict_tree(node, X):
    return [predict_sample(node, x) for x in X]

# Load your data
data = pd.read_csv('colon-dataset-processed.csv')
X = data.drop('Class', axis=1).values
y = data['Class'].map({'healthy': 0, 'diagnosed': 1}).values

# Train the decision tree
tree = grow_tree(X, y, max_depth=3)

# Predict samples
predictions = predict_tree(tree, X)



# Generate the confusion matrix
conf_matrix = confusion_matrix(y, predictions)

# Print the confusion matrix
conf_matrix = confusion_matrix(y, predictions)
print('Confusion matrix:')
print(conf_matrix)

# Generate and print the classification report
class_report = classification_report(y, predictions)
print('\nClassification report:')
print(class_report)


Confusion matrix:
[[47  3]
 [ 9 56]]

Classification report:
              precision    recall  f1-score   support

           0       0.84      0.94      0.89        50
           1       0.95      0.86      0.90        65

    accuracy                           0.90       115
   macro avg       0.89      0.90      0.90       115
weighted avg       0.90      0.90      0.90       115



Here I did some modification to the DT to Handle cases where no valid split can be found by turning the node into a leaf.
Stop growing the tree if all samples at a node have the same class label. the changes made the code more robust, however the accuracy decreased to 81%, still higher then using the liberary 

In [13]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from imblearn.over_sampling import SMOTE  # Make sure you've installed imblearn

# ... [No change to the Node class, gini_impurity, predict_sample functions] ...

# Define your CustomDecisionTree class
class CustomDecisionTree(BaseEstimator, ClassifierMixin):
    def __init__(self, max_depth=None):
        self.max_depth = max_depth
        self.tree_ = None
    
    def fit(self, X, y):
        # Ensure X and y are numpy arrays to satisfy the indexing used in grow_tree
        X, y = np.array(X), np.array(y)
        self.tree_ = grow_tree(X, y, max_depth=self.max_depth)
        return self
    
    def predict(self, X):
        # Ensure X is a numpy array
        X = np.array(X)
        return predict_tree(self.tree_, X)

# ... [No changes to the grow_tree, best_split functions] ...

# Load your data
data = pd.read_csv('colon-dataset-processed.csv')
X = data.drop('Class', axis=1)
y = data['Class'].map({'healthy': 0, 'diagnosed': 1})

# Balance the dataset using SMOTE
smote = SMOTE()
X_balanced, y_balanced = smote.fit_resample(X, y)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_balanced, y_balanced, test_size=0.2, random_state=42)

# Convert the pandas dataframe to numpy array for compatibility with our custom tree
X_train = X_train.values
X_test = X_test.values

# Find the best max_depth using cross-validation
best_accuracy = 0
best_depth = 0
kf = StratifiedKFold(n_splits=5)
for depth in range(1, 10):
    tree = CustomDecisionTree(max_depth=depth)
    scores = cross_val_score(tree, X_train, y_train, cv=kf)
    if np.mean(scores) > best_accuracy:
        best_accuracy = np.mean(scores)
        best_depth = depth

# Train the decision tree with the best max_depth
tree = CustomDecisionTree(max_depth=best_depth)
tree.fit(X_train, y_train)

# Predict samples on the test set
predictions = tree.predict(X_test)

# Generate and evaluate the confusion matrix
conf_matrix = confusion_matrix(y_test, predictions)
print('Confusion matrix:')
print(conf_matrix)

# Generate and evaluate the classification report
class_report = classification_report(y_test, predictions)
print('\nClassification report:')
print(class_report)



Confusion matrix:
[[ 9  5]
 [ 0 12]]

Classification report:
              precision    recall  f1-score   support

           0       1.00      0.64      0.78        14
           1       0.71      1.00      0.83        12

    accuracy                           0.81        26
   macro avg       0.85      0.82      0.81        26
weighted avg       0.86      0.81      0.80        26

