In [1]:
# Import libraries
import numpy as np
import pandas as pd
import math

In [2]:
# Student records: one dictionary per student, all values kept as strings
FIELD_NAMES = (
    'Student_Id', 'Name', 'Gender', 'Session', 'Department',
    'Hall_Name', 'Percentage_of_Attendance', 'Is_Alloted',
)

# Each tuple lists the values in the same order as FIELD_NAMES
student_rows = [
    ('A', 'Oni', 'F', '2018-2019', 'CSE', 'Jahanara Imam', '90%', 'Yes'),
    ('B', 'Mangsura', 'F', '2018-2019', 'Math', 'Khaleda Zia', '100%', 'Yes'),
    ('C', 'Keya', 'F', '2016-2017', 'Chemistry', 'Sheikh Hasina', '80%', 'No'),
    ('D', 'Tanzila', 'F', '2018-2019', 'CSE', 'Pritilata', '30%', 'Yes'),
    ('E', 'Rupa', 'F', '2018-2019', 'Math', 'Khaleda Zia', '85%', 'Yes'),
    ('F', 'Sayma', 'F', '2018-2019', 'CSE', 'Khaleda Zia', '40%', 'No'),
    ('G', 'Shifati', 'M', '2016-2017', 'Chemistry', 'MH', '70%', 'No'),
    ('H', 'Rabbi', 'M', '2019-2020', 'CSE', 'BBH', '50%', 'Yes'),
    ('I', 'Rahim', 'M', '2018-2019', 'Math', 'MH', '60%', 'Yes'),
    ('J', 'Karim', 'M', '2018-2019', 'Math', 'MH', '80%', 'Yes'),
    ('K', 'Simita', 'F', '2018-2019', 'CSE', 'Jahanara Imam', '90%', 'No'),
    ('L', 'Oni', 'F', '2016-2017', 'Chemistry', 'Sheikh Hasina', '30%', 'No'),
    ('M', 'Rupa', 'F', '2019-2020', 'CSE', 'Jahanara Imam', '20%', 'No'),
    ('N', 'Keya', 'F', '2018-2019', 'CSE', 'Khaleda Zia', '95%', 'Yes'),
]

# Pair every row with the field names to get the list of dictionaries
data = [dict(zip(FIELD_NAMES, row)) for row in student_rows]

In [3]:
# Tabulate the records: one row per dictionary, one column per key
df = pd.DataFrame(data=data)

In [4]:
# Column to predict; every other column is a candidate split feature
target_variable = "Is_Alloted"
# NOTE(review): Student_Id is unique for every row in the data above, so it
# separates the classes perfectly on its own - consider excluding it.
features = [column for column in df.columns if column != target_variable]

In [5]:
# Calculate entropy
def entropy(data, target_variable):
    """
    Computes the Shannon entropy (in bits) of the class distribution in
    the target column.

    Parameters:
        data: DataFrame holding the rows to evaluate.
        target_variable: name of the column containing the class labels.

    Returns:
        The entropy as a float; 0.0 when `data` has no rows.
    """
    total_rows = len(data)
    if total_rows == 0:
        # No rows means no uncertainty; also avoids dividing by zero
        return 0.0

    probabilities = data[target_variable].value_counts() / total_rows
    # A categorical target reports unobserved categories with a count of 0;
    # 0 * log2(0) evaluates to NaN, so those entries are dropped first.
    probabilities = probabilities[probabilities > 0]
    return float(-(probabilities * np.log2(probabilities)).sum())

# Calculate information gain for a feature
def information_gain(data, feature, target_variable):
    """
    Measures how much the target's entropy drops when the rows are
    partitioned by the distinct values of `feature`.

    The result is the entropy of the whole dataset minus the sum of each
    partition's entropy weighted by the share of rows in that partition.
    """
    column = data[feature]
    total_rows = len(data)
    # One partition per distinct feature value, in order of first appearance
    partitions = (data[column == level] for level in column.unique())
    conditional_entropy = sum(
        (len(subset) / total_rows) * entropy(subset, target_variable)
        for subset in partitions
    )
    return entropy(data, target_variable) - conditional_entropy

In [11]:
def is_pure(data, target_variable):
    """
    Tells whether every row in `data` carries the same class label.

    Returns True when the target column holds exactly one distinct value,
    False otherwise (including when there are no rows).
    """
    return len(data[target_variable].unique()) == 1


In [14]:
def select_best_feature(data, features, target_variable):
    """
    Picks the feature whose split yields the largest information gain.

    Features are scored in the order given; on a tie the earliest one is
    kept. Returns None when no feature scores above negative infinity
    (for example when `features` is empty).
    """
    winner, winning_gain = None, float('-inf')

    for candidate in features:
        candidate_gain = information_gain(data, candidate, target_variable)
        # Strict comparison keeps the first feature among equal gains
        if candidate_gain > winning_gain:
            winner, winning_gain = candidate, candidate_gain

    return winner


In [17]:
class DecisionTreeNode:
    """
    A single node of the decision tree.

    Every node starts out blank: `feature`, `value` and `label` are None
    and `children` is an empty dictionary owned by that node alone.
    """
    def __init__(self):
        # Assigned left to right, so the attribute order stays
        # feature, value, children, label
        self.feature, self.value, self.children, self.label = None, None, {}, None


In [20]:
def get_unique_values(data, feature):
    """
    Lists the distinct values found in the `feature` column of `data`,
    in order of first appearance.
    """
    column = data[feature]
    return column.unique()


In [23]:
def filter_data(data, feature, value):
    """
    Keeps only the rows of `data` whose `feature` column equals `value`.
    """
    matches = data[feature] == value
    return data.loc[matches]


In [27]:
# Build a decision tree node
class TreeNode:
    """
    Node of the decision tree.

    Attributes:
        value: the feature this node splits on (internal node) or the
            predicted class (leaf); None for a blank node.
        children: dictionary mapping feature values to subtrees; empty
            for a leaf.
    """
    def __init__(self, value=None, children=None):
        self.value = value
        # Give every node its own dictionary when none is supplied, so that
        # `node.children[key] = subtree` works on a freshly created node
        # without sharing state between nodes.
        self.children = children if children is not None else {}

# Build the decision tree
def build_decision_tree(data, features, target_variable):
    """
    Recursively builds a decision tree by repeatedly splitting on the
    feature with the highest information gain.

    Parameters:
        data: DataFrame with the rows that reach this node.
        features: list of column names still available for splitting;
            the list itself is not modified.
        target_variable: name of the column holding the class labels.

    Returns:
        A TreeNode. For an internal node, `value` is the feature that was
        split on and `children` maps each value of that feature to its
        subtree. For a leaf, `value` is the predicted class and
        `children` is empty. For empty `data`, a blank TreeNode.
    """
    if len(data) == 0:
        # Nothing to learn from: return a blank node
        return TreeNode()

    labels = data[target_variable]

    # Leaf: all rows share one class, so predict it
    if len(labels.unique()) == 1:
        return TreeNode(labels.iloc[0])
    # Leaf: nothing left to split on, so predict the most frequent class
    if not features:
        return TreeNode(labels.mode().iloc[0])

    best_feature = select_best_feature(data, features, target_variable)
    if best_feature is None:
        # No feature could be scored; fall back to the most frequent class
        return TreeNode(labels.mode().iloc[0])

    # A feature is used at most once per path. Without this, rows that no
    # feature can separate would be split forever.
    remaining_features = [name for name in features if name != best_feature]

    # Pass the dictionary explicitly so the node is always ready for item
    # assignment, whatever TreeNode's default for `children` is.
    root = TreeNode(best_feature, {})
    for value in get_unique_values(data, best_feature):
        subset = filter_data(data, best_feature, value)
        if len(subset) == 0:
            # A missing value never compares equal to itself, so it selects
            # no rows; skip it instead of growing an empty branch.
            continue
        # The branch value is the dictionary key; the subtree keeps its own
        # `value` (its split feature or its class label).
        root.children[value] = build_decision_tree(
            subset, remaining_features, target_variable
        )

    return root

In [29]:
# Print the decision tree structure (simplified)
def print_tree(node, depth=0):
    """
    Prints the tree rooted at `node` as an indented outline.

    An internal node is printed as "Feature: <name>", followed by one
    "Value: <value>" line per branch with the branch's subtree nested
    below it. A node without children is a leaf and is printed as
    "Label: <class>". A node whose `value` is None prints nothing.

    Parameters:
        node: object with `value` and `children` attributes.
        depth: number of spaces to indent this node by.
    """
    if node.value is None:
        return

    indent = ' ' * depth
    if not node.children:
        # Leaves hold a class prediction, not a feature name
        print(f"{indent}Label: {node.value}")
        return

    print(f"{indent}Feature: {node.value}")
    for value, subtree in node.children.items():
        print(f"{' ' * (depth + 2)}Value: {value}")
        print_tree(subtree, depth + 4)

In [32]:
# Build the tree in this cell so it runs on a fresh kernel, then print it
tree = build_decision_tree(df, features, target_variable)
print_tree(tree)

In [33]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier, plot_tree
import matplotlib.pyplot as plt


NameError: name 'decision_tree_model' is not defined

<Figure size 2000x1000 with 0 Axes>