In [None]:
#ID3

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder 

In [None]:
# Read the toy ID3 dataset from disk and echo the whole frame.
data = pd.read_csv('id3.csv')
print("Sample Dataset - \n", data, "\n")

Sample Dataset - 
       a1    a2      a3 classification
0   True   Hot    High             No
1   True   Hot    High             No
2  False   Hot    High            Yes
3  False  Cool  Normal            Yes
4  False  Cool  Normal            Yes
5   True  Cool    High             No
6   True   Hot    High             No
7   True   Hot  Normal            Yes
8  False  Cool  Normal            Yes
9  False  Cool    High            Yes 



In [None]:
# Encode each categorical attribute with its OWN encoder. Re-fitting a single
# encoder for every column would leave it holding only the last column's
# mapping and leave the other encoder objects unfitted.
le_a1 = LabelEncoder()
data['a1_n'] = le_a1.fit_transform(data['a1'])

le_a2 = LabelEncoder()
data['a2_n'] = le_a2.fit_transform(data['a2'])

le_a3 = LabelEncoder()
data['a3_n'] = le_a3.fit_transform(data['a3'])

print("Given Data after Encoding - \n",data,"\n")

Given Data after Encoding - 
       a1    a2      a3 classification  a1_n  a2_n  a3_n
0   True   Hot    High             No     1     1     0
1   True   Hot    High             No     1     1     0
2  False   Hot    High            Yes     0     1     0
3  False  Cool  Normal            Yes     0     0     1
4  False  Cool  Normal            Yes     0     0     1
5   True  Cool    High             No     1     0     0
6   True   Hot    High             No     1     1     0
7   True   Hot  Normal            Yes     1     1     1
8  False  Cool  Normal            Yes     0     0     1
9  False  Cool    High            Yes     0     0     0 



In [None]:
# Feature matrix: the three label-encoded attribute columns.
X = data.loc[:, ['a1_n', 'a2_n', 'a3_n']]
print("X - Values\n", X, "\n")

# Target vector: the class column, left as its original string labels.
y = data.loc[:, 'classification']
print("Y - Values\n", y, "\n")

X - Values
    a1_n  a2_n  a3_n
0     1     1     0
1     1     1     0
2     0     1     0
3     0     0     1
4     0     0     1
5     1     0     0
6     1     1     0
7     1     1     1
8     0     0     1
9     0     0     0 

Y - Values
 0     No
1     No
2    Yes
3    Yes
4    Yes
5     No
6     No
7    Yes
8    Yes
9    Yes
Name: classification, dtype: object 



In [None]:
# Seed both the 70/30 split and the tree so that re-running the notebook
# reproduces the same predictions and accuracy instead of a new random draw.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# criterion='entropy' selects splits by information gain, as ID3 does.
model = DecisionTreeClassifier(criterion='entropy', random_state=42)
model.fit(X_train, y_train)

print("Values predicted from test dataset - ",model.predict(X_test))
print("Original Values of test dataset - ",y_test.values)
print("Accuracy of Model",model.score(X_test,y_test))

Values predicted from test dataset -  ['Yes' 'No' 'Yes']
Original Values of test dataset -  ['Yes' 'No' 'Yes']
Accuracy of Model 1.0


In [None]:
import pandas as pd
import numpy as np
import math

def entropy(data, target_attribute):
    """Return the Shannon entropy (in bits) of one column of ``data``.

    Args:
        data: DataFrame holding the examples.
        target_attribute: Name of the column whose label distribution is measured.

    Returns:
        The sum of ``-p * log2(p)`` over the distinct labels in the column,
        or ``0`` when the column has no labels at all.
    """
    column = data[target_attribute]
    result = 0
    for label in column.unique():
        # Share of rows carrying this label.
        share = int((column == label).sum()) / len(data)
        result -= share * math.log2(share)
    return result

def information_gain(data, attribute, target_attribute):
    """Return how much splitting on ``attribute`` reduces the target's entropy.

    Args:
        data: DataFrame holding the examples.
        attribute: Name of the column to split on.
        target_attribute: Name of the label column.

    Returns:
        Entropy of the whole frame minus the size-weighted entropy of each
        partition produced by the distinct values of ``attribute``.
    """
    column = data[attribute]
    remaining = entropy(data, target_attribute)
    for value in column.unique():
        partition = data[column == value]
        # Weight each partition by the fraction of rows that fall into it.
        weight = len(partition) / len(data)
        remaining -= weight * entropy(partition, target_attribute)
    return remaining

def id3(data, attributes, target_attribute):
    """Build an ID3 decision tree as nested dictionaries.

    Args:
        data: DataFrame of training examples.
        attributes: Names of the columns still available for splitting.
        target_attribute: Name of the label column.

    Returns:
        Either a bare label (leaf) or a dict shaped
        ``{attribute: {value: subtree_or_label, ...}}``.
    """
    labels = data[target_attribute]
    distinct_labels = labels.unique()

    # Leaf: every example already agrees on the label.
    if len(distinct_labels) == 1:
        return distinct_labels[0]

    # Leaf: nothing left to split on, so fall back to the most frequent label.
    if len(attributes) == 0:
        return labels.value_counts().index[0]

    # Split on the attribute with the highest information gain.
    best_attribute = max(
        attributes,
        key=lambda candidate: information_gain(data, candidate, target_attribute),
    )
    remaining_attributes = [name for name in attributes if name != best_attribute]

    branches = {}
    for value in data[best_attribute].unique():
        subset = data[data[best_attribute] == value]
        if len(subset) == 0:
            # No example matches this value: use the parent's most frequent label.
            branches[value] = labels.value_counts().index[0]
        else:
            # Grow the subtree from the matching examples and the unused attributes.
            branches[value] = id3(subset, remaining_attributes, target_attribute)
    return {best_attribute: branches}

def _leaf_labels(tree):
    """Yield every leaf label below ``tree`` (a nested-dict tree or a bare label)."""
    if isinstance(tree, dict):
        for branches in tree.values():
            for subtree in branches.values():
                yield from _leaf_labels(subtree)
    else:
        yield tree


def predict(row, tree):
    """Classify ``row`` by walking a nested-dict decision tree.

    Args:
        row: Mapping (e.g. a dict or pandas Series) from attribute name to value.
        tree: Either a bare label or a dict shaped
            ``{attribute: {value: subtree_or_label, ...}}``.

    Returns:
        The label of the leaf that is reached. If the row carries a value
        that has no branch at some node, the most frequent leaf label found
        anywhere below that node is returned instead.
    """
    # Traverse the decision tree until a leaf node is reached
    while isinstance(tree, dict):
        attribute = next(iter(tree))
        branches = tree[attribute]
        value = row[attribute]
        if value not in branches:
            # Unseen value: vote over the leaves under this node. Branches may
            # themselves be subtrees (dicts), which cannot be used as dict
            # keys, so the vote counts their leaf labels rather than the
            # branch objects themselves.
            label_counts = {}
            for subtree in branches.values():
                for label in _leaf_labels(subtree):
                    label_counts[label] = label_counts.get(label, 0) + 1
            return max(label_counts, key=label_counts.get)
        tree = branches[value]
    return tree

# Load the tennis dataset
data = pd.read_csv('tennis.csv')

# Define the target attribute
target_attribute = 'play'

# Define the attributes: every column except the target
attributes = list(data.columns)
attributes.remove(target_attribute)

# Split the data into training and testing sets.
# The split is positional: the first 80% of rows train, the rest test; rows are not shuffled.
split_index = int(0.8 * len(data))
train_data = data.iloc[:split_index]
test_data = data.iloc[split_index:]

# Train the decision tree
tree = id3(train_data, attributes, target_attribute)

# Test the decision tree
correct_predictions = 0
for index, row in test_data.iterrows():
    if predict(row, tree) == row[target_attribute]:
        correct_predictions += 1

# Fraction of test rows whose predicted label matches the recorded one
accuracy = correct_predictions / len(test_data)
print(f"Accuracy: {accuracy}")

Accuracy: 0.6666666666666666


CART algorithm without a library implementation

In [None]:
import pandas as pd
import numpy as np

# Define the Node class to represent a decision tree node
class Node:
    """One node of a binary decision tree.

    All fields default to ``None``; a node is treated as a leaf by the
    traversal code when ``label`` is set.
    """

    def __init__(self, feature=None, threshold=None, left=None, right=None, label=None):
        # Split rule: index of the feature tested and the threshold it is compared to.
        self.feature, self.threshold = feature, threshold
        # Child subtrees on either side of the split.
        self.left, self.right = left, right
        # Label carried by a leaf node.
        self.label = label

# Define the decision tree function
def _majority_label(y):
    """Return the most frequent value in ``y`` (the smallest value wins ties)."""
    values, counts = np.unique(y, return_counts=True)
    return values[counts.argmax()]


def decision_tree(X, y):
    """Recursively grow a binary classification tree using weighted Gini impurity.

    Args:
        X: 2-D array-like of shape (n_samples, n_features).
        y: 1-D array-like of labels, one per row of ``X``.

    Returns:
        The root ``Node``. Internal nodes send rows with
        ``X[:, feature] < threshold`` to ``left`` and all others to ``right``;
        leaves carry a ``label``.

    Raises:
        ValueError: If ``X`` has no rows.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    n, m = X.shape

    if n == 0:
        raise ValueError("decision_tree() needs at least one training sample")

    # Base case: all labels are the same
    if len(np.unique(y)) == 1:
        return Node(label=y[0])

    # Base case: no more features to split on
    if m == 0:
        return Node(label=_majority_label(y))

    # Find the best feature to split on
    best_feature, best_threshold, min_gini = None, None, 1.0
    for i in range(m):
        column = X[:, i]
        for threshold in np.unique(column):
            left_indices = column < threshold
            left_y = y[left_indices]
            right_y = y[~left_indices]
            # Only splits that put at least one row on each side are candidates.
            if len(left_y) > 0 and len(right_y) > 0:
                gini = (len(left_y) / n) * gini_index(left_y) + (len(right_y) / n) * gini_index(right_y)
                if gini < min_gini:
                    best_feature, best_threshold, min_gini = i, threshold, gini

    # Base case: no threshold separates the rows (e.g. identical feature
    # vectors with conflicting labels). Without this guard the code below
    # would index with best_feature=None and crash.
    if best_feature is None:
        return Node(label=_majority_label(y))

    # Create the node and its subtrees
    left_indices = X[:, best_feature] < best_threshold
    left = decision_tree(X[left_indices], y[left_indices])
    right = decision_tree(X[~left_indices], y[~left_indices])
    return Node(feature=best_feature, threshold=best_threshold, left=left, right=right)

# Define the Gini index function
def gini_index(y):
    """Return the Gini impurity of the labels in ``y``.

    Computed as one minus the sum of squared class frequencies.
    """
    class_counts = np.unique(y, return_counts=True)[1]
    fractions = class_counts / len(y)
    return 1 - np.square(fractions).sum()

# Test the decision tree on the iris dataset
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# Hold out 20% of iris for evaluation (fixed seed), then grow the tree on the rest.
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    iris.data,
    iris.target,
    test_size=0.2,
    random_state=42,
)

tree = decision_tree(X_train, y_train)

# Define a function to predict the label of a single instance using the decision tree
def predict(instance, tree):
    """Return the label the decision tree assigns to ``instance``.

    Starting at ``tree``, descend to ``left`` when the instance's value for
    the node's feature is below the node's threshold and to ``right``
    otherwise, stopping at the first node whose ``label`` is not ``None``.
    """
    node = tree
    while node.label is None:
        goes_left = instance[node.feature] < node.threshold
        node = node.left if goes_left else node.right
    return node.label

# Test the accuracy of the decision tree on the test set
predicted_labels = []
for instance in X_test:
    predicted_labels.append(predict(instance, tree))
y_pred = np.array(predicted_labels)

# Share of test instances whose predicted label equals the true label.
accuracy = (y_pred == y_test).mean()
print(f"Accuracy: {accuracy}")


Accuracy: 1.0
