In [17]:
import pandas as pd
import numpy as np
from collections import Counter
import math
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

In [18]:
def entropy(labels):
    counts = Counter(labels)
    total = len(labels)
    ent = 0.0
    for v in counts.values():
        p = v/total
        ent -= p * math.log2(p)
    return ent

In [19]:
def information_gain(X_col, y):
    base_entropy = entropy(y)
    values = set(X_col)
    total = len(y)
    weighted_entropy = 0
    for v in values:
        subset_y = [y[i] for i in range(total) if X_col[i] == v]
        weighted_entropy += (len(subset_y)/total) * entropy(subset_y)
    return base_entropy - weighted_entropy

In [20]:
def gain_ratio(X_col, y):
    ig = information_gain(X_col, y)
    values = set(X_col)
    total = len(y)
    split_info = -sum((list(X_col).count(v)/total) * math.log2(list(X_col).count(v)/total) for v in values)
    return ig / split_info if split_info != 0 else 0

In [21]:
def id3(X, y, attributes):
    if len(set(y)) == 1:
        return list(set(y))[0]  # pure class
    if not attributes:
        return Counter(y).most_common(1)[0][0]  # majority class
    
    # choose attribute with max info gain
    gains = [information_gain([row[attr] for row in X], y) for attr in attributes]
    best_attr = attributes[np.argmax(gains)]
    
    tree = {best_attr: {}}
    values = set(row[best_attr] for row in X)
    
    for v in values:
        sub_X = [row for i,row in enumerate(X) if row[best_attr] == v]
        sub_y = [y[i] for i,row in enumerate(X) if row[best_attr] == v]
        if not sub_X:
            tree[best_attr][v] = Counter(y).most_common(1)[0][0]
        else:
            new_attrs = [a for a in attributes if a != best_attr]
            tree[best_attr][v] = id3(sub_X, sub_y, new_attrs)
    return tree

# =====================
# C4.5 Algorithm
# =====================

def c45(X, y, attributes):
    if len(set(y)) == 1:
        return list(set(y))[0]
    if not attributes:
        return Counter(y).most_common(1)[0][0]
    
    # choose attribute with max gain ratio
    ratios = [gain_ratio([row[attr] for row in X], y) for attr in attributes]
    best_attr = attributes[np.argmax(ratios)]
    
    tree = {best_attr: {}}
    values = set(row[best_attr] for row in X)
    
    for v in values:
        sub_X = [row for i,row in enumerate(X) if row[best_attr] == v]
        sub_y = [y[i] for i,row in enumerate(X) if row[best_attr] == v]
        if not sub_X:
            tree[best_attr][v] = Counter(y).most_common(1)[0][0]
        else:
            new_attrs = [a for a in attributes if a != best_attr]
            tree[best_attr][v] = c45(sub_X, sub_y, new_attrs)
    return tree


In [22]:


def predict(tree, sample, default=None):
    """Predict the label for one sample (dict of features)."""
    if not isinstance(tree, dict):
        return tree  # leaf node

    # get the attribute at this node
    attr = next(iter(tree))
    branches = tree[attr]

    # if sample[attr] not in branches, return majority class of this node
    value = sample.get(attr)
    if value not in branches:
        # majority class fallback
        leaves = []
        def collect_labels(subtree):
            if isinstance(subtree, dict):
                for v in subtree.values():
                    collect_labels(v)
            else:
                leaves.append(subtree)
        collect_labels(branches)
        return Counter(leaves).most_common(1)[0][0]

    return predict(branches[value], sample, default)


In [23]:
df = pd.read_csv('drug_200.csv')
display(df.head())

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,drugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,drugY


In [24]:
features = df.drop(["Drug"], axis=1).columns.tolist()
X = df.drop(["Drug"], axis=1).to_dict("records")   # list of dicts
y = df["Drug"].tolist()

In [25]:
X

[{'Age': 23,
  'Sex': 'F',
  'BP': 'HIGH',
  'Cholesterol': 'HIGH',
  'Na_to_K': 25.355},
 {'Age': 47,
  'Sex': 'M',
  'BP': 'LOW',
  'Cholesterol': 'HIGH',
  'Na_to_K': 13.093},
 {'Age': 47,
  'Sex': 'M',
  'BP': 'LOW',
  'Cholesterol': 'HIGH',
  'Na_to_K': 10.114},
 {'Age': 28,
  'Sex': 'F',
  'BP': 'NORMAL',
  'Cholesterol': 'HIGH',
  'Na_to_K': 7.798},
 {'Age': 61,
  'Sex': 'F',
  'BP': 'LOW',
  'Cholesterol': 'HIGH',
  'Na_to_K': 18.043},
 {'Age': 22,
  'Sex': 'F',
  'BP': 'NORMAL',
  'Cholesterol': 'HIGH',
  'Na_to_K': 8.607},
 {'Age': 49,
  'Sex': 'F',
  'BP': 'NORMAL',
  'Cholesterol': 'HIGH',
  'Na_to_K': 16.275},
 {'Age': 41,
  'Sex': 'M',
  'BP': 'LOW',
  'Cholesterol': 'HIGH',
  'Na_to_K': 11.037},
 {'Age': 60,
  'Sex': 'M',
  'BP': 'NORMAL',
  'Cholesterol': 'HIGH',
  'Na_to_K': 15.171},
 {'Age': 43,
  'Sex': 'M',
  'BP': 'LOW',
  'Cholesterol': 'NORMAL',
  'Na_to_K': 19.368},
 {'Age': 47,
  'Sex': 'F',
  'BP': 'LOW',
  'Cholesterol': 'HIGH',
  'Na_to_K': 11.767},
 {'Age':

In [26]:
y

['drugY',
 'drugC',
 'drugC',
 'drugX',
 'drugY',
 'drugX',
 'drugY',
 'drugC',
 'drugY',
 'drugY',
 'drugC',
 'drugY',
 'drugY',
 'drugY',
 'drugX',
 'drugY',
 'drugX',
 'drugA',
 'drugC',
 'drugY',
 'drugY',
 'drugY',
 'drugY',
 'drugY',
 'drugY',
 'drugY',
 'drugY',
 'drugX',
 'drugY',
 'drugY',
 'drugX',
 'drugB',
 'drugX',
 'drugY',
 'drugX',
 'drugX',
 'drugA',
 'drugX',
 'drugX',
 'drugX',
 'drugY',
 'drugB',
 'drugY',
 'drugX',
 'drugX',
 'drugX',
 'drugA',
 'drugC',
 'drugY',
 'drugY',
 'drugY',
 'drugX',
 'drugY',
 'drugY',
 'drugB',
 'drugC',
 'drugB',
 'drugY',
 'drugX',
 'drugY',
 'drugY',
 'drugA',
 'drugY',
 'drugX',
 'drugB',
 'drugY',
 'drugA',
 'drugX',
 'drugY',
 'drugY',
 'drugB',
 'drugY',
 'drugX',
 'drugY',
 'drugY',
 'drugY',
 'drugA',
 'drugY',
 'drugA',
 'drugX',
 'drugB',
 'drugX',
 'drugC',
 'drugA',
 'drugC',
 'drugB',
 'drugX',
 'drugY',
 'drugY',
 'drugY',
 'drugY',
 'drugY',
 'drugY',
 'drugY',
 'drugY',
 'drugX',
 'drugY',
 'drugY',
 'drugY',
 'drugY',


In [27]:
# =====================
# Cross Validation
# =====================

def evaluate(X, y, algorithm_func):
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    metrics = {"accuracy": [], "f1": [], "precision": [], "recall": []}
    
    for train_idx, test_idx in kf.split(X):
        X_train, X_test = [X[i] for i in train_idx], [X[i] for i in test_idx]
        y_train, y_test = [y[i] for i in train_idx], [y[i] for i in test_idx]
        
        attributes = list(X[0].keys())
        tree = algorithm_func(X_train, y_train, attributes)
        
        y_pred = [predict(tree, sample) for sample in X_test]
        
        metrics["accuracy"].append((accuracy_score(y_test, y_pred)))
        metrics["f1"].append(f1_score(y_test, y_pred, average="macro", zero_division=0))
        metrics["precision"].append(precision_score(y_test, y_pred, average="macro", zero_division=0))
        metrics["recall"].append(recall_score(y_test, y_pred, average="macro", zero_division=0))
    
    return {k: round(np.mean(v),4) for k,v in metrics.items()}




In [28]:
id3_tree = id3(X,y, features)
print("=== ID3 Tree ===")
print(id3_tree)

=== ID3 Tree ===
{'Na_to_K': {6.683: 'drugX', 7.798: 'drugX', 8.607: 'drugX', 7.298: 'drugC', 10.114: 'drugC', 11.037: 'drugC', 11.767: 'drugC', 13.972: 'drugA', 13.093: 'drugC', 12.703: 'drugX', 15.171: 'drugY', 16.275: 'drugY', 15.376: 'drugY', 18.043: 'drugY', 19.368: 'drugY', 19.199: 'drugY', 20.942: 'drugY', 15.516: 'drugY', 19.128: 'drugY', 25.355: 'drugY', 25.974: 'drugY', 25.917: 'drugY', 22.697: 'drugY', 27.183: 'drugY', 30.568: 'drugY', 30.366: 'drugY', 31.876: 'drugY', 33.486: 'drugY', 31.686: 'drugY', 27.826: 'drugY', 29.875: 'drugY', 29.45: 'drugY', 29.271: 'drugY', 38.247: 'drugY', 35.639: 'drugY', 8.75: 'drugX', 8.107: 'drugX', 37.188: 'drugY', 9.381: 'drugX', 9.567: 'drugB', 9.445: 'drugA', 9.677: 'drugB', 9.945: 'drugB', 10.189: 'drugB', 10.832: 'drugX', 10.067: 'drugC', 11.198: 'drugA', 11.326: 'drugA', 11.424: 'drugX', 11.939: 'drugX', 11.009: 'drugB', 12.854: 'drugA', 12.006: {'Age': {16: 'drugC', 37: 'drugX'}}, 11.262: 'drugA', 11.686: 'drugX', 11.567: 'drugC', 13.

In [29]:
c45_tree = c45(X,y, features)
print("=== ID3 Tree ===")
print(c45_tree)

=== ID3 Tree ===
{'BP': {'NORMAL': {'Na_to_K': {6.683: 'drugX', 7.798: 'drugX', 8.607: 'drugX', 9.381: 'drugX', 8.75: 'drugX', 7.285: 'drugX', 12.703: 'drugX', 9.709: 'drugX', 14.133: 'drugX', 15.171: 'drugY', 16.275: 'drugY', 9.084: 'drugX', 15.79: 'drugY', 19.221: 'drugY', 12.26: 'drugX', 12.295: 'drugX', 19.675: 'drugY', 22.905: 'drugY', 24.658: 'drugY', 25.917: 'drugY', 17.211: 'drugY', 27.05: 'drugY', 27.064: 'drugY', 22.456: 'drugY', 25.786: 'drugY', 31.686: 'drugY', 25.893: 'drugY', 7.477: 'drugX', 8.107: 'drugX', 8.966: 'drugX', 9.281: 'drugX', 9.514: 'drugX', 9.443: 'drugX', 9.894: 'drugX', 10.898: 'drugX', 10.091: 'drugX', 10.832: 'drugX', 10.605: 'drugX', 10.103: 'drugX', 11.953: 'drugX', 10.065: 'drugX', 10.443: 'drugX', 12.859: 'drugX', 12.879: 'drugX', 13.597: 'drugX', 13.884: 'drugX', 14.216: 'drugX', 14.02: 'drugX', 15.969: 'drugY', 15.891: 'drugY', 16.594: 'drugY', 16.85: 'drugY', 16.753: 'drugY', 17.225: 'drugY', 19.011: 'drugY', 7.761: 'drugX', 7.845: 'drugX', 20.489

In [30]:

print("ID3 CV Results:", evaluate(X, y, id3))
print("C4.5 CV Results:", evaluate(X, y, c45))
print("-------------------------")

ID3 CV Results: {'accuracy': np.float64(0.455), 'f1': np.float64(0.1328), 'precision': np.float64(0.0976), 'recall': np.float64(0.21)}
C4.5 CV Results: {'accuracy': np.float64(0.52), 'f1': np.float64(0.3503), 'precision': np.float64(0.3035), 'recall': np.float64(0.4705)}
-------------------------


In [31]:
for x in X:
    print(predict(id3_tree,x))

metrics = {"accuracy": [], "f1": [], "precision": [], "recall": []}
        
y_pred = [predict(id3_tree, sample) for sample in X]
        
metrics["accuracy"].append((accuracy_score(y, y_pred)))
metrics["f1"].append(f1_score(y, y_pred, average="macro", zero_division=0))
metrics["precision"].append(precision_score(y, y_pred, average="macro", zero_division=0))
metrics["recall"].append(recall_score(y, y_pred, average="macro", zero_division=0))

print(metrics)

drugY
drugC
drugC
drugX
drugY
drugX
drugY
drugC
drugY
drugY
drugC
drugY
drugY
drugY
drugX
drugY
drugX
drugA
drugC
drugY
drugY
drugY
drugY
drugY
drugY
drugY
drugY
drugX
drugY
drugY
drugX
drugB
drugX
drugY
drugX
drugX
drugA
drugX
drugX
drugX
drugY
drugB
drugY
drugX
drugX
drugX
drugA
drugC
drugY
drugY
drugY
drugX
drugY
drugY
drugB
drugC
drugB
drugY
drugX
drugY
drugY
drugA
drugY
drugX
drugB
drugY
drugA
drugX
drugY
drugY
drugB
drugY
drugX
drugY
drugY
drugY
drugA
drugY
drugA
drugX
drugB
drugX
drugC
drugA
drugC
drugB
drugX
drugY
drugY
drugY
drugY
drugY
drugY
drugY
drugY
drugX
drugY
drugY
drugY
drugY
drugA
drugA
drugC
drugX
drugY
drugX
drugX
drugY
drugB
drugY
drugA
drugX
drugX
drugX
drugX
drugY
drugX
drugX
drugA
drugY
drugY
drugY
drugY
drugY
drugB
drugY
drugY
drugX
drugY
drugX
drugY
drugY
drugX
drugY
drugY
drugX
drugB
drugA
drugB
drugX
drugA
drugY
drugB
drugY
drugA
drugX
drugX
drugA
drugX
drugC
drugA
drugB
drugX
drugX
drugY
drugC
drugA
drugY
drugC
drugX
drugX
drugB
drugX
drugY
drugY
drugY
drug

In [32]:
for x in X:
    print(predict(c45_tree,x))


metrics = {"accuracy": [], "f1": [], "precision": [], "recall": []}
        
y_pred = [predict(c45_tree, sample) for sample in X]
        
metrics["accuracy"].append((accuracy_score(y, y_pred)))
metrics["f1"].append(f1_score(y, y_pred, average="macro", zero_division=0))
metrics["precision"].append(precision_score(y, y_pred, average="macro", zero_division=0))
metrics["recall"].append(recall_score(y, y_pred, average="macro", zero_division=0))

print(metrics)

drugY
drugC
drugC
drugX
drugY
drugX
drugY
drugC
drugY
drugY
drugC
drugY
drugY
drugY
drugX
drugY
drugX
drugA
drugC
drugY
drugY
drugY
drugY
drugY
drugY
drugY
drugY
drugX
drugY
drugY
drugX
drugB
drugX
drugY
drugX
drugX
drugA
drugX
drugX
drugX
drugY
drugB
drugY
drugX
drugX
drugX
drugA
drugC
drugY
drugY
drugY
drugX
drugY
drugY
drugB
drugC
drugB
drugY
drugX
drugY
drugY
drugA
drugY
drugX
drugB
drugY
drugA
drugX
drugY
drugY
drugB
drugY
drugX
drugY
drugY
drugY
drugA
drugY
drugA
drugX
drugB
drugX
drugC
drugA
drugC
drugB
drugX
drugY
drugY
drugY
drugY
drugY
drugY
drugY
drugY
drugX
drugY
drugY
drugY
drugY
drugA
drugA
drugC
drugX
drugY
drugX
drugX
drugY
drugB
drugY
drugA
drugX
drugX
drugX
drugX
drugY
drugX
drugX
drugA
drugY
drugY
drugY
drugY
drugY
drugB
drugY
drugY
drugX
drugY
drugX
drugY
drugY
drugX
drugY
drugY
drugX
drugB
drugA
drugB
drugX
drugA
drugY
drugB
drugY
drugA
drugX
drugX
drugA
drugX
drugC
drugA
drugB
drugX
drugX
drugY
drugC
drugA
drugY
drugC
drugX
drugX
drugB
drugX
drugY
drugY
drugY
drug