In [1]:
import pandas as pd
import numpy as np
import math
from collections import Counter

**Question 1**

In [3]:
# Golf dataset: one row per day -> [Outlook, Temp, Humidity, Windy, Play Golf]
data1 = [
    ["Rainy", "Hot", "High", "False", "No"],
    ["Rainy", "Hot", "High", "True", "No"],
    ["Overcast", "Hot", "High", "False", "Yes"],
    ["Sunny", "Mild", "High", "False", "Yes"],
    ["Sunny", "Cool", "Normal", "False", "Yes"],
    ["Sunny", "Cool", "Normal", "True", "No"],
    ["Overcast", "Cool", "Normal", "True", "Yes"],
    ["Rainy", "Mild", "High", "False", "No"],
    ["Rainy", "Cool", "Normal", "False", "Yes"],
    ["Sunny", "Mild", "Normal", "False", "Yes"],
    ["Rainy", "Mild", "Normal", "True", "Yes"],
    ["Overcast", "Mild", "High", "True", "Yes"],
    ["Overcast", "Hot", "Normal", "False", "Yes"],
    ["Sunny", "Mild", "High", "True", "No"],
]

# Build the frame and encode the target in a single chain:
# "Yes" becomes 1 and "No" becomes 0; the four feature columns stay as strings.
df1 = pd.DataFrame(
    data1,
    columns=["Outlook", "Temp", "Humidity", "Windy", "Play Golf"],
).assign(**{"Play Golf": lambda frame: frame["Play Golf"].map({"Yes": 1, "No": 0})})

In [4]:
# Calculate the entropy of a dataset
def entropy(y):
    """Return the Shannon entropy, in bits, of the class labels in ``y``.

    Parameters
    ----------
    y : array-like
        Class labels. Any values that ``np.unique`` can sort are accepted
        (non-negative or negative integers, strings, ...), not only the
        non-negative integers that ``np.bincount`` is limited to.

    Returns
    -------
    float
        ``-sum(p * log2(p))`` over the classes present in ``y``;
        ``0.0`` when ``y`` is empty.
    """
    labels = np.asarray(y)
    if labels.size == 0:
        # No samples: nothing to be uncertain about, and nothing to divide by.
        return 0.0

    # np.unique reports only the classes that actually occur, so every
    # probability is strictly positive and log2 is always defined.
    _, counts = np.unique(labels, return_counts=True)
    probabilities = counts / labels.size
    return -np.sum([p * np.log2(p) for p in probabilities])

# Calculate the information gain of a dataset after splitting on a feature
def information_gain(X, y, feature):
    """Return the reduction in entropy of ``y`` obtained by splitting on a column.

    Parameters
    ----------
    X : 2-D numpy array of feature values (rows are samples).
    y : numpy array of class labels aligned with the rows of ``X``.
    feature : column index of ``X`` to split on.

    Returns
    -------
    Entropy of ``y`` minus the size-weighted average entropy of the subsets
    produced by grouping the rows on each distinct value of the column.
    """
    parent_entropy = entropy(y)

    column = X[:, feature]
    n_rows = len(X)
    levels, level_sizes = np.unique(column, return_counts=True)

    # Accumulate each subset's entropy, weighted by its share of the rows
    remainder = 0
    for level, level_size in zip(levels, level_sizes):
        rows = np.where(column == level)[0]
        child_entropy = entropy(y[rows])
        remainder += (level_size / n_rows) * child_entropy

    return parent_entropy - remainder

In [5]:
# Find the best feature to split on
def best_feature_to_split(X, y):
    """Return the index of the column of ``X`` with the highest information gain.

    Columns holding a single distinct value are skipped: splitting on them
    yields one child identical to the parent, which would make a recursive
    tree builder loop forever on rows that share all feature values but
    carry different labels.

    Parameters
    ----------
    X : 2-D numpy array of feature values (rows are samples).
    y : numpy array of class labels aligned with the rows of ``X``.

    Returns
    -------
    int or None
        Index of the best column, or ``None`` when ``X`` has no column that
        can partition the rows (no columns at all, or all columns constant).
    """
    num_features = X.shape[1]
    best_feature = None
    # Gains are never meaningfully negative, so -1 lets any usable column win,
    # including one whose gain is zero or a hair below it from rounding.
    best_info_gain = -1

    for feature in range(num_features):
        if len(np.unique(X[:, feature])) < 2:
            # Constant column: cannot separate any rows.
            continue
        info_gain = information_gain(X, y, feature)
        if info_gain > best_info_gain:
            best_info_gain = info_gain
            best_feature = feature

    return best_feature

# Create a leaf node with the most common class
def create_leaf(y):
    """Return the label that occurs most often in ``y``.

    When several labels share the highest count, the one encountered first
    in ``y`` is returned. An empty ``y`` raises ``IndexError``.
    """
    ranked = Counter(y).most_common(1)
    label, _occurrences = ranked[0]
    return label


In [6]:
# Build the decision tree recursively
def build_tree(X, y, depth=0, max_depth=None):
    """Recursively build a decision tree from categorical features.

    Parameters
    ----------
    X : 2-D numpy array of feature values (rows are samples).
    y : numpy array of class labels aligned with the rows of ``X``.
    depth : depth of the node being built (0 for the root).
    max_depth : optional depth at which to stop splitting; ``None`` means
        no limit.

    Returns
    -------
    A leaf or an internal node:
      * a class label when the node is a leaf,
      * ``None`` when there are no samples to learn from,
      * otherwise ``{'feature': column_index, 'children': {value: subtree}}``.
    """
    num_samples, _ = X.shape

    # Stopping conditions
    if num_samples == 0:
        # Must be tested first: with no labels there is no majority class,
        # so there is nothing a leaf could predict.
        return None
    if len(np.unique(y)) == 1:
        return y[0]
    if max_depth is not None and depth >= max_depth:
        return create_leaf(y)

    # Find the best feature to split on
    best_feature = best_feature_to_split(X, y)
    if best_feature is None:
        return create_leaf(y)

    column = X[:, best_feature]
    values = np.unique(column)
    if len(values) < 2:
        # The chosen column does not separate the rows (e.g. identical rows
        # with conflicting labels). Recursing would rebuild this same node
        # forever, so settle for the majority class.
        return create_leaf(y)

    # Create the tree structure
    tree = {}
    tree['feature'] = best_feature
    tree['children'] = {}

    # Split the dataset and recursively build one subtree per feature value
    for value in values:
        subset_indices = np.where(column == value)[0]
        tree['children'][value] = build_tree(X[subset_indices], y[subset_indices], depth + 1, max_depth)

    return tree

In [35]:
# Predict the class for a single sample
def predict_sample(tree, sample):
    """Walk the tree from the root and return the prediction for one sample.

    Parameters
    ----------
    tree : a node as produced by ``build_tree`` - either a dict with
        ``'feature'`` and ``'children'`` keys, or a leaf value.
    sample : sequence of feature values, indexed by the node's ``'feature'``.

    Returns
    -------
    The leaf value reached, or ``None`` when the sample holds a feature
    value for which the current node has no branch (no fallback class is
    stored in the tree).
    """
    node = tree
    while isinstance(node, dict):
        feature = node['feature']
        value = sample[feature]
        branches = node['children']

        # Unseen feature value: there is no branch to follow
        if value not in branches:
            return None

        node = branches[value]

    # Anything that is not a dict is a leaf
    return node

# Predict the class for multiple samples
def predict(tree, X):
    """Return a list with one ``predict_sample`` result per row of ``X``."""
    results = []
    for row in X:
        results.append(predict_sample(tree, row))
    return results

In [8]:
# Separate the feature matrix from the target vector
X1 = df1.drop(columns="Play Golf").to_numpy()
y1 = df1["Play Golf"].to_numpy()

# Fit the decision tree on the whole golf dataset
tree1 = build_tree(X1, y1)

In [11]:
def print_tree(tree, feature_names=None, depth=0):
    """Print an indented, human-readable rendering of a decision tree.

    Parameters
    ----------
    tree : a dict node with ``'feature'`` and ``'children'`` keys, or a
        leaf value.
    feature_names : optional sequence mapping a column index to a display
        name; when omitted or empty, columns are shown as ``Feature <index>``.
    depth : current indentation level (two spaces per level).
    """
    pad = "  " * depth

    if isinstance(tree, dict):
        column = tree['feature']
        if feature_names:
            feature_name = feature_names[column]
        else:
            feature_name = f"Feature {column}"

        # Header line for this decision node
        print(pad + f"Feature: {feature_name}")

        # Each branch goes one level deeper and its subtree two levels deeper
        branch_pad = "  " * (depth + 1)
        for value, child in tree['children'].items():
            print(branch_pad + f"If {feature_name} == {value}:")
            print_tree(child, feature_names, depth + 2)
    else:
        # Leaf: show the predicted class
        print(pad + "Predict:", tree)

In [12]:
# Display names for the columns of X1, in column order
feature_names = [
    "Outlook",
    "Temp",
    "Humidity",
    "Windy",
]

# Render the golf tree using the readable column names
print_tree(tree1, feature_names=feature_names)

Feature: Outlook
  If Outlook == Overcast:
    Predict: 1
  If Outlook == Rainy:
    Feature: Humidity
      If Humidity == High:
        Predict: 0
      If Humidity == Normal:
        Predict: 1
  If Outlook == Sunny:
    Feature: Windy
      If Windy == False:
        Predict: 1
      If Windy == True:
        Predict: 0


In [21]:
# Unseen days to classify: [Outlook, Temp, Humidity, Windy]
test_data = [
    ["Rainy", "Cool", "High", "True"],
    ["Sunny", "Mild", "Normal", "False"],
]

# 1 means "play golf", 0 means "do not play"
predictions = predict(tree1, test_data)
print("Predictions:", predictions)


Predictions: [0, 1]


**Question 2**

In [14]:
# Customer dataset: [Industry, JobType, Income, PreviousCustomer, Class]
# PreviousCustomer is spelled "No"/"Yes"; the class label is "NO"/"YES".
data2 = [
    # First row's PreviousCustomer was "NO", which formed a spurious third category
    ["Aerospace", "Engineering", "High", "No", "NO"],
    ["Aerospace", "Engineering", "High", "Yes", "NO"],
    ["Auto", "Engineering", "High", "No", "YES"],
    ["Electronics", "Marketing", "High", "No", "YES"],
    ["Urban", "Marketing", "Low", "No", "YES"],
    ["Urban", "Marketing", "Low", "Yes", "NO"],
    ["Auto", "Marketing", "Low", "Yes", "YES"],
    ["Aerospace", "Sales", "High", "No", "NO"],
    ["Aerospace", "Marketing", "Low", "No", "YES"],
    ["Electronics", "Sales", "Low", "No", "NO"],
    ["Aerospace", "Sales", "Low", "Yes", "YES"],
    ["Electronics", "Sales", "High", "Yes", "NO"],
    ["Auto", "Engineering", "Low", "No", "YES"],
    ["Electronics", "Sales", "High", "Yes", "NO"],
]

# Convert to DataFrame and encode the target: "YES" becomes 1, "NO" becomes 0
df2 = pd.DataFrame(data2, columns=["Industry", "JobType", "Income", "PreviousCustomer", "Class"])
df2["Class"] = df2["Class"].map({"YES": 1, "NO": 0})



In [15]:
# Separate the feature matrix from the target vector
X2 = df2.drop(columns="Class").to_numpy()
y2 = df2["Class"].to_numpy()

# Fit the decision tree on the whole customer dataset
tree2 = build_tree(X2, y2)

In [16]:
# Display names for the columns of X2, in column order
feature_names = [
    "Industry",
    "JobType",
    "Income",
    "PreviousCustomer",
]

# Render the customer tree using the readable column names
print_tree(tree2, feature_names=feature_names)

Feature: Industry
  If Industry == Aerospace:
    Feature: Income
      If Income == High:
        Predict: 0
      If Income == Low:
        Predict: 1
  If Industry == Auto:
    Predict: 1
  If Industry == Electronics:
    Feature: JobType
      If JobType == Marketing:
        Predict: 1
      If JobType == Sales:
        Predict: 0
  If Industry == Urban:
    Feature: PreviousCustomer
      If PreviousCustomer == No:
        Predict: 1
      If PreviousCustomer == Yes:
        Predict: 0


In [36]:
# Unseen customers to classify: [Industry, JobType, Income, PreviousCustomer]
test_data = [
    ["Urban", "Sales", "Low", "Yes"],
    ["Electronics", "Engineering", "High", "No"],
]

# A prediction of None means the tree has no branch for one of the sample's values
predictions = predict(tree2, test_data)
print("Predictions:", predictions)

Predictions: [0, None]
