The selected Dataset was stored in Google Drive, So firstly Colab had to be Given Access to the files in Drive

In [6]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive



 STEP 1: Data Loading


In [None]:
import pandas as pd
import numpy as np

# Load the dataset using the provided path
data_path = "/content/drive/MyDrive/Breast_Cancer.csv"
df = pd.read_csv(data_path)

# Display the first few rows of the dataset
df.head()


Step 1.1 Data Exploration

In [None]:
# Display basic information about the dataset
df.info()

# Check for missing values
missing_values = df.isnull().sum()
print(missing_values)



 STEP 2: Data Preprocessing

In [87]:
# Convert the 'diagnosis' column to binary representation
df['diagnosis'] = df['diagnosis'].map({'M': 1, 'B': 0})
y = df['diagnosis'].values
df = df.drop(columns=['id', 'diagnosis', 'Unnamed: 32'])

# Split the data into training and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df.values, y, test_size=0.2, random_state=42)


STEP 3: Helper Functions


In [88]:

def entropy(y):
    """Compute the entropy of the given labels."""
    _, counts = np.unique(y, return_counts=True)
    probabilities = counts / len(y)
    return -np.sum(probabilities * np.log2(probabilities))

def info_gain(y_left, y_right, parent_entropy):
    """Calculate information gain from a potential split."""
    n = len(y_left) + len(y_right)
    child_entropy = (len(y_left) / n) * entropy(y_left) + (len(y_right) / n) * entropy(y_right)
    return parent_entropy - child_entropy



STEP 4: Building the Tree


In [89]:

def build_tree(X, y, depth=0, max_depth=5):
    """
    Recursively build the decision tree.
    - Stop the recursion if all samples have the same label or if depth equals max_depth.
    - For each feature, check every unique value as a potential split.
    - Apply the best split and continue recursively.
    """
    n_samples, n_features = X.shape
    unique_classes = np.unique(y)

    # Stopping criteria
    if len(unique_classes) == 1 or depth == max_depth:
        return int(unique_classes[0])

    # Initialize best split parameters
    best_gain = 0
    best_threshold = None
    best_index = None
    parent_entropy = entropy(y)

    # Search for the best split
    for index in range(n_features):
        thresholds = np.unique(X[:, index])
        for threshold in thresholds:
            y_left = y[X[:, index] < threshold]
            y_right = y[X[:, index] >= threshold]

            # Skip this split if it doesn't divide the dataset.
            if len(y_left) == 0 or len(y_right) == 0:
                continue

            # Calculate gain and update best split if needed
            gain = info_gain(y_left, y_right, parent_entropy)
            if gain > best_gain:
                best_gain = gain
                best_threshold = threshold
                best_index = index

    # If no gain, return majority class
    if best_gain == 0:
        return int(np.bincount(y).argmax())

    # Recursive calls for left and right subtrees
    left_mask = X[:, best_index] < best_threshold
    right_mask = X[:, best_index] >= best_threshold

    left_tree = build_tree(X[left_mask], y[left_mask], depth + 1, max_depth)
    right_tree = build_tree(X[right_mask], y[right_mask], depth + 1, max_depth)

    return (best_index, best_threshold, left_tree, right_tree)


STEP 5: Prediction Function


In [90]:
def predict(sample, tree):
    """Recursively predict the class of a sample using the decision tree."""
    # If we reach a leaf node (integer), return the class
    if isinstance(tree, int):
        return tree

    # Extract tree parameters
    index, threshold, left_tree, right_tree = tree
    if sample[index] < threshold:
        return predict(sample, left_tree)
    return predict(sample, right_tree)

STEP 6: Model Evaluation


In [91]:

# Construct the decision tree
tree_root = build_tree(X_train, y_train, depth=0, max_depth=5)

# Predict labels for test set and calculate accuracy
predicted_labels = [predict(sample, tree_root) for sample in X_test]
accuracy = np.mean(predicted_labels == y_test)
print(f"Decision Tree Accuracy: {accuracy * 100:.2f}%")

Decision Tree Accuracy: 95.61%
