In [2]:
import pandas as pd
import numpy as np
import math

In [3]:
df = pd.read_csv('iris.csv')
df

Unnamed: 0,ID,sepallength,sepalwidth,petallength,petalwidth,class
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...,...
145,146,6.7,3.0,5.2,2.3,Iris-virginica
146,147,6.3,2.5,5.0,1.9,Iris-virginica
147,148,6.5,3.0,5.2,2.0,Iris-virginica
148,149,6.2,3.4,5.4,2.3,Iris-virginica


In [5]:
df.shape

(150, 6)

Add descriptive features

In [4]:
df['sepal_length_bin'] = pd.cut(df['sepallength'], bins=3, labels=["Low", "Medium", "High"])
df['sepal_width_bin'] = pd.cut(df['sepalwidth'], bins=3, labels=["Low", "Medium", "High"])
df['petal_length_bin'] = pd.cut(df['petallength'], bins=3, labels=["Low", "Medium", "High"])
df['petal_width_bin'] = pd.cut(df['petalwidth'], bins=3, labels=["Low", "Medium", "High"])


In [4]:
df.head()

Unnamed: 0,ID,sepallength,sepalwidth,petallength,petalwidth,class,sepal_length_bin,sepal_width_bin,petal_length_bin,petal_width_bin
0,1,5.1,3.5,1.4,0.2,Iris-setosa,Low,Medium,Low,Low
1,2,4.9,3.0,1.4,0.2,Iris-setosa,Low,Medium,Low,Low
2,3,4.7,3.2,1.3,0.2,Iris-setosa,Low,Medium,Low,Low
3,4,4.6,3.1,1.5,0.2,Iris-setosa,Low,Medium,Low,Low
4,5,5.0,3.6,1.4,0.2,Iris-setosa,Low,Medium,Low,Low


2 - Split the features and the target

In [5]:
X = df.drop(columns=["ID","class", "sepallength", "sepalwidth", "petallength", "petalwidth"])
y = df['class']

In [8]:
y

0         Iris-setosa
1         Iris-setosa
2         Iris-setosa
3         Iris-setosa
4         Iris-setosa
            ...      
145    Iris-virginica
146    Iris-virginica
147    Iris-virginica
148    Iris-virginica
149    Iris-virginica
Name: class, Length: 150, dtype: object

In [7]:
X

Unnamed: 0,sepal_length_bin,sepal_width_bin,petal_length_bin,petal_width_bin
0,Low,Medium,Low,Low
1,Low,Medium,Low,Low
2,Low,Medium,Low,Low
3,Low,Medium,Low,Low
4,Low,Medium,Low,Low
...,...,...,...,...
145,Medium,Medium,High,High
146,Medium,Low,High,High
147,Medium,Medium,High,High
148,Medium,Medium,High,High


3 - Split the data in train(0.8) and test(0.2)

In [10]:
# fix a seed so that results are reproduceble
seed = 42

# shuffle all rows
df_shuffled = df.sample(frac=1, random_state=seed).reset_index(drop=True)

#Choose the size of train and test dataframes
test_frac = 0.2
cut = int(len(df_shuffled) * (1 - test_frac))

#Separate the daa into train and test
df_train = df_shuffled.iloc[:cut]   # first 80%
df_test  = df_shuffled.iloc[cut:]   # remaining 20%

#Separate features and target
feature_cols = ['sepal_length_bin',
                'sepal_width_bin',
                'petal_length_bin',
                'petal_width_bin']

X_train = df_train[feature_cols]
y_train = df_train['class']

X_test  = df_test[feature_cols]
y_test  = df_test['class']



In [12]:
def calculate_entropy(labels: pd.Series) -> float: #Calculate entropy of a feature
    counts = labels.value_counts() #Counts the number of unique values
    probs = counts/len(labels)

    entropy = - sum([x * math.log2(x) for x in probs])

    return entropy

def information_gain(data: pd.DataFrame, feature: str, target_column: str) -> float:
    # Total entropy before the split
    total_entropy = calculate_entropy(data[target_column])

    # Values and counts for the feature
    values = data[feature].unique()
    weighted_entropy = 0

    for value in values:
        subset = data[data[feature] == value]
        weight = len(subset) / len(data)
        subset_entropy = calculate_entropy(subset[target_column])
        weighted_entropy += weight * subset_entropy

    gain = total_entropy - weighted_entropy
    return gain

def id3(data: pd.DataFrame, target_column: str, features: list) -> dict:
    # If all labels are the same, return that label
    if len(data[target_column].unique()) == 1:
        return data[target_column].iloc[0]

    # If no features left, return the most common label
    if len(features) == 0:
        return data[target_column].mode()[0]

    # Find the best feature to split
    gains = {feature: information_gain(data, feature, target_column) for feature in features}
    best_feature = max(gains, key=gains.get)

    tree = {best_feature: {}}

    # For each value of the best feature, create a subtree
    for value in data[best_feature].unique():
        subset = data[data[best_feature] == value]

        if subset.empty:
            # If no data, return the majority label
            tree[best_feature][value] = data[target_column].mode()[0]
        else:
            # Recursive call with the remaining features
            new_features = [f for f in features if f != best_feature]
            subtree = id3(subset, target_column, new_features)
            tree[best_feature][value] = subtree

    return tree


tree = id3(df, 'class', feature_cols)

In [13]:
tree

{'petal_width_bin': {'Low': 'Iris-setosa',
  'Medium': {'petal_length_bin': {'Medium': {'sepal_length_bin': {'High': 'Iris-versicolor',
      'Medium': 'Iris-versicolor',
      'Low': {'sepal_width_bin': {'Low': 'Iris-versicolor',
        'Medium': 'Iris-versicolor'}}}},
    'High': {'sepal_length_bin': {'Medium': {'sepal_width_bin': {'Medium': 'Iris-versicolor',
        'Low': 'Iris-virginica'}},
      'High': 'Iris-virginica'}}}},
  'High': {'petal_length_bin': {'Medium': {'sepal_width_bin': {'Medium': {'sepal_length_bin': {'Medium': 'Iris-virginica'}},
      'Low': 'Iris-virginica'}},
    'High': 'Iris-virginica'}}}}

Test

In [15]:
def classify(example, tree):
    if not isinstance(tree, dict):
        return tree  # If it's a label, return it

    # Otherwise, get the feature at this node
    feature = next(iter(tree))
    feature_value = example[feature]

    # Move to the subtree for this value
    subtree = tree[feature].get(feature_value)

    if subtree is None:
        return None  # Value not seen during training

    return classify(example, subtree)


In [18]:
df_test.loc[:, 'prediction'] = X_test.apply(lambda row: classify(row, tree), axis=1)
accuracy = (df_test['prediction'] == y_test).mean()
print(f"Test accuracy: {accuracy:.2%}")

Test accuracy: 96.67%
