In [26]:
import pandas as pd
from sklearn.datasets import load_iris
import numpy as np
from graphviz import Digraph
# Load the Iris dataset
iris = load_iris()
# Create a DataFrame
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df['target'] = iris.target_names[iris.target] # This line creates the column!

# A1. 


In [16]:
def calculate_entropy(target_col):
    """
    Calculates the entropy for a given target column (pandas Series).
    
    Parameters:
    - target_col (pd.Series): The data column for which to calculate entropy.
    
    Returns:
    - float: The entropy value.
    """
    # Find unique values and their counts
    elements, counts = np.unique(target_col, return_counts=True)
    
    # Calculate probabilities
    probabilities = counts / len(target_col)
    
    # Calculate entropy using the formula
    entropy = -np.sum(probabilities * np.log2(probabilities))
    
    return entropy

# --- Example Usage ---
# Calculate the entropy of the whole dataset's target variable
total_entropy = calculate_entropy(df['target'])
print(f"\nEntropy of the Iris dataset target: {total_entropy:.4f}")


Entropy of the Iris dataset target: 1.5850


# A2


In [17]:
def func2(dataset):
    df = pd.read_csv(dataset)
    count = df['Species'].value_counts()
    probs = count/len(df)
    op = np.sum(1-probs**2)
    return op

f = func2("Iris.csv")
print(f)

2.6666666666666665


# A3

In [22]:
def find_best_split(df, target_col_name):
    """
    Finds the best feature to split on by maximizing Information Gain.

    Parameters:
    - df (pd.DataFrame): The dataset (or subset) to split.
    - target_col_name (str): The name of the target variable column.
    
    Returns:
    - str: The name of the feature that provides the highest information gain.
    """
    # Calculate the entropy of the current dataset (before splitting)
    total_entropy = calculate_entropy(df[target_col_name])
    
    # Get feature names (all columns except the target)
    features = [col for col in df.columns if col != target_col_name]
    
    max_info_gain = -1
    best_feature = None
    
    # Iterate through each feature to calculate its information gain
    for feature in features:
        weighted_entropy = 0
        # For each unique value in the feature, calculate its weighted entropy
        for value in df[feature].unique():
            subset = df[df[feature] == value]
            subset_proportion = len(subset) / len(df)
            subset_entropy = calculate_entropy(subset[target_col_name])
            weighted_entropy += subset_proportion * subset_entropy
            
        # Calculate the information gain for the current feature
        info_gain = total_entropy - weighted_entropy
        
        # Update the best feature if this one is better
        if info_gain > max_info_gain:
            max_info_gain = info_gain
            best_feature = feature
            
    return best_feature

# --- Example Usage ---
# We use the binned dataframe to find the root node
root_node_feature = find_best_split(binned_df, 'target')
print(f"\nBest feature for the root node is: '{root_node_feature}'")


Best feature for the root node is: 'petal width (cm)'


# A4


In [21]:
def custom_binning(feature_col, num_bins=4, method='equal_width'):
    """
    Manually implements binning for a continuous feature.
    
    Parameters:
    - feature_col (pd.Series): The continuous data to bin.
    - num_bins (int): The number of bins to create (default is 4).
    - method (str): 'equal_width' or 'equal_frequency' (default is 'equal_width').
    
    Returns:
    - pd.Series: The binned data as a series of strings.
    """
    if method == 'equal_width':
        min_val = feature_col.min()
        max_val = feature_col.max()
        bin_width = (max_val - min_val) / num_bins
        # Define the edges of the bins
        bins = np.arange(min_val, max_val, bin_width)
        bins = np.append(bins, max_val * (1 + 1e-6)) # Add a small buffer to include the max value
        
        # Assign each data point to a bin
        binned_data = np.digitize(feature_col, bins=bins, right=False)
        # Convert from 1-based to 0-based index
        return pd.Series(binned_data - 1, index=feature_col.index).astype(str)
        
    elif method == 'equal_frequency':
        # Find bin edges using quantiles
        bin_edges = pd.qcut(feature_col, q=num_bins, retbins=True, duplicates='drop')[1]
        
        # Assign each data point to a bin
        binned_data = np.digitize(feature_col, bins=bin_edges, right=False)
        binned_data[feature_col == feature_col.min()] = 1 # Ensure min value is in the first bin
        # Convert from 1-based to 0-based index
        return pd.Series(binned_data - 1, index=feature_col.index).astype(str)
        
    else:
        raise ValueError("Method must be 'equal_width' or 'equal_frequency'")

# --- Example Usage ---
# Let's create a binned version of our dataframe for the next steps
binned_df = pd.DataFrame()
for col in iris.feature_names:
    binned_df[col] = custom_binning(df[col], num_bins=4, method='equal_width')
binned_df['target'] = df['target']

print("\nBinned Dataset (first 5 rows):")
print(binned_df.head())


Binned Dataset (first 5 rows):
  sepal length (cm) sepal width (cm) petal length (cm) petal width (cm)  \
0                 0                2                 0                0   
1                 0                1                 0                0   
2                 0                2                 0                0   
3                 0                1                 0                0   
4                 0                2                 0                0   

   target  
0  setosa  
1  setosa  
2  setosa  
3  setosa  
4  setosa  


# A5


In [23]:
def build_decision_tree(df, target_col_name, features):
    """
    Recursively builds a decision tree from a dataframe.

    Parameters:
    - df (pd.DataFrame): The training data.
    - target_col_name (str): The name of the target column.
    - features (list): A list of feature names to consider for splitting.
    
    Returns:
    - dict or value: A nested dictionary representing the tree.
    """
    target_values = df[target_col_name]
    
    # --- Base Cases (Stopping Conditions) ---
    # 1. If all target values are the same, return that value (pure node).
    if len(target_values.unique()) == 1:
        return target_values.unique()[0]
        
    # 2. If there are no more features left to split on, return the majority class.
    if len(features) == 0:
        return target_values.mode()[0]
        
    # --- Recursive Step ---
    # Find the best feature to split on for the current dataset
    best_feature = find_best_split(df[[*features, target_col_name]], target_col_name)
    
    # If no feature provides any information gain, stop and return the majority class.
    if best_feature is None:
        return target_values.mode()[0]
        
    # Create the tree structure as a nested dictionary
    tree = {best_feature: {}}
    
    # Get remaining features for the next recursive call
    remaining_features = [f for f in features if f != best_feature]
    
    # For each unique value of the best feature, create a new branch
    for value in df[best_feature].unique():
        subset = df[df[best_feature] == value]
        # Recursively build the subtree for this branch
        subtree = build_decision_tree(subset, target_col_name, remaining_features)
        tree[best_feature][value] = subtree
        
    return tree

# --- Example Usage ---
features = list(binned_df.columns)
features.remove('target')
iris_tree = build_decision_tree(binned_df, 'target', features)

# Print the generated tree (it's a nested dictionary)
import json
print("\nGenerated Decision Tree (dictionary format):")
print(json.dumps(iris_tree, indent=4))


Generated Decision Tree (dictionary format):
{
    "petal width (cm)": {
        "0": "setosa",
        "2": {
            "petal length (cm)": {
                "2": {
                    "sepal length (cm)": {
                        "2": {
                            "sepal width (cm)": {
                                "2": "versicolor",
                                "1": "versicolor",
                                "0": "versicolor"
                            }
                        },
                        "1": {
                            "sepal width (cm)": {
                                "0": "versicolor",
                                "1": "versicolor",
                                "2": "versicolor"
                            }
                        },
                        "0": "virginica"
                    }
                },
                "1": "versicolor",
                "3": "virginica"
            }
        },
        "1": "versicolor",
     

# A6

In [29]:
def print_tree(tree, indent=""):
    """
    Prints a text-based representation of a decision tree.

    Parameters:
    - tree (dict or value): The nested dictionary representing the tree.
    - indent (str): The string used for indenting child nodes.
    """
    # Base case: If the node is not a dictionary, it's a leaf node (a prediction).
    if not isinstance(tree, dict):
        print(f" -> Class: {tree}")
        return

    # It's a decision node. Get the feature name and the branches.
    feature_name, branches = list(tree.items())[0]

    # Recursively print each branch
    for value, subtree in branches.items():
        print(f"{indent}{feature_name} is '{value}'", end="")
        print_tree(subtree, indent + "|   ")

# --- Example Usage ---
# Assuming you have already built your 'iris_tree' dictionary from the previous step.
# If not, you need to run the code that generates it first.
#
# iris_tree = build_decision_tree(binned_df, 'target', features)

print("Decision Tree Structure:")
print_tree(iris_tree)

Decision Tree Structure:
petal width (cm) is '0' -> Class: setosa
petal width (cm) is '2'|   petal length (cm) is '2'|   |   sepal length (cm) is '2'|   |   |   sepal width (cm) is '2' -> Class: versicolor
|   |   |   sepal width (cm) is '1' -> Class: versicolor
|   |   |   sepal width (cm) is '0' -> Class: versicolor
|   |   sepal length (cm) is '1'|   |   |   sepal width (cm) is '0' -> Class: versicolor
|   |   |   sepal width (cm) is '1' -> Class: versicolor
|   |   |   sepal width (cm) is '2' -> Class: versicolor
|   |   sepal length (cm) is '0' -> Class: virginica
|   petal length (cm) is '1' -> Class: versicolor
|   petal length (cm) is '3' -> Class: virginica
petal width (cm) is '1' -> Class: versicolor
petal width (cm) is '3' -> Class: virginica


# A7

In [31]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris

# --- Step 1: Corrected prediction function ---
def predict_single(query, tree, default=None):
    """
    Predicts the class for a single instance (a dictionary).
    This version is robust against unseen data combinations.
    """
    # Start at the root of the current subtree
    feature = list(tree.keys())[0]
    value = query.get(feature)

    # If the feature value doesn't exist as a branch, return the default
    if value not in tree[feature]:
        return default

    # Traverse to the next node in the tree
    result = tree[feature][value]

    # If we've reached another decision node, recurse
    if isinstance(result, dict):
        return predict_single(query, result, default)
    # If we've reached a leaf, it's our prediction
    else:
        return result

# --- Step 2: Updated function to plot the boundary ---
def visualize_decision_boundary(df, target_col, num_bins=4):
    """
    Builds a tree on the first two features and visualizes its boundary.
    """
    f1, f2 = df.columns[0], df.columns[1]
    X_plot = df[[f1, f2]]
    
    y_map = {name: i for i, name in enumerate(df[target_col].unique())}
    y_plot = df[target_col].map(y_map)
    
    # --- Build a 2D tree on binned data ---
    binned_f1 = custom_binning(X_plot[f1], num_bins=num_bins)
    binned_f2 = custom_binning(X_plot[f2], num_bins=num_bins)
    train_df = pd.DataFrame({f1: binned_f1, f2: binned_f2, 'target': df[target_col]})
    tree_2d = build_decision_tree(train_df, 'target', [f1, f2])

    # ADDED: Determine the majority class to use as a default for unknown points
    default_prediction = df[target_col].mode()[0]

    # --- Create a mesh grid for the plot background ---
    x_min, x_max = X_plot[f1].min() - 0.5, X_plot[f1].max() + 0.5
    y_min, y_max = X_plot[f2].min() - 0.5, X_plot[f2].max() + 0.5
    xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.02), np.arange(y_min, y_max, 0.02))

    # --- Predict class for each point on the grid ---
    grid_f1 = custom_binning(pd.Series(xx.ravel(), name=f1), num_bins=num_bins)
    grid_f2 = custom_binning(pd.Series(yy.ravel(), name=f2), num_bins=num_bins)
    grid_df = pd.DataFrame({f1: grid_f1, f2: grid_f2})

    # CHANGED: Pass the default_prediction to the robust predict_single function
    predictions = grid_df.apply(
        lambda row: predict_single(row.to_dict(), tree_2d, default=default_prediction), 
        axis=1
    )
    Z = predictions.map(y_map).values.reshape(xx.shape)

    # --- Plot the results ---
    plt.figure(figsize=(10, 7))
    plt.contourf(xx, yy, Z, alpha=0.4, cmap=plt.cm.RdYlBu)
    plt.scatter(X_plot[f1], X_plot[f2], c=y_plot, s=20, edgecolor='k', cmap=plt.cm.RdYlBu)
    plt.title(f"Decision Boundary for '{f1}' vs '{f2}'")
    plt.xlabel(f1)
    plt.ylabel(f2)
    plt.show()

# --- Example Usage ---
# Ensure you have run all previous code blocks to define df, custom_binning, etc.
# print("\nGenerating decision boundary plot for the first two features...")
# visualize_decision_boundary(df, 'target')