In [None]:
# Creating decision tree model from scratch

In [None]:
import numpy as np
from collections import Counter
# counter is used to count the frequency of each class
from math import log2

class DecisionTree:
    def __init__(self):
        self.tree = None

    def entropy(self, y):
        """Calculate the entropy of a dataset."""
        class_counts = Counter(y)
        total_instances = len(y)
        entropy = 0
        for count in class_counts.values():
            prob = count / total_instances
            entropy -= prob * log2(prob)
        return entropy

    def information_gain(self, X, y, feature_index):
        """Calculate the information gain of a split on a feature."""
        total_entropy = self.entropy(y)
        values = [x[feature_index] for x in X]
        unique_values = set(values)
        weighted_entropy = 0
        for value in unique_values:
            subset_y = [y[i] for i in range(len(y)) if X[i][feature_index] == value]
            prob = len(subset_y) / len(y)
            weighted_entropy += prob * self.entropy(subset_y)
        return total_entropy - weighted_entropy

    def best_split(self, X, y):
        """Find the best feature to split on based on information gain."""
        num_features = len(X[0])
        best_feature = None
        max_info_gain = -1
        for i in range(num_features):
            info_gain = self.information_gain(X, y, i)
            if info_gain > max_info_gain:
                max_info_gain = info_gain
                best_feature = i
        return best_feature

    def build_tree(self, X, y):
        """Recursively build the decision tree."""
        unique_classes = set(y)
        if len(unique_classes) == 1:
            return unique_classes.pop()

        if len(X[0]) == 0:
            return Counter(y).most_common(1)[0][0]

        best_feature = self.best_split(X, y)
        tree = {best_feature: {}}

        values = set([x[best_feature] for x in X])
        for value in values:
            subset_X = [x[:best_feature] + x[best_feature+1:] for x in X if x[best_feature] == value]
            subset_y = [y[i] for i in range(len(y)) if X[i][best_feature] == value]
            subtree = self.build_tree(subset_X, subset_y)
            tree[best_feature][value] = subtree

        return tree

    def fit(self, X, y):
        self.tree = self.build_tree(X, y)

    def predict_sample(self, tree, sample):
        if isinstance(tree, dict):
            feature = list(tree.keys())[0]
            value = sample[feature]
            subtree = tree[feature].get(value)
            if subtree is None:
                return None
            return self.predict_sample(subtree, sample)
        else:
            return tree

    def predict(self, X):
        return [self.predict_sample(self.tree, sample) for sample in X]


# apply the model
model = DecisionTree()
model.fit(X, y)
predictions = model.predict(X)
print(predictions)


['No', 'No', 'Yes', 'Yes', 'Yes']


In [None]:

# Example Usage
if __name__ == "__main__":
    # Sample dataset
    X = [
        ['Sunny', 'Hot', 'High', 'Weak'],
        ['Sunny', 'Hot', 'High', 'Strong'],
        ['Overcast', 'Hot', 'High', 'Weak'],
        ['Rain', 'Mild', 'High', 'Weak'],
        ['Rain', 'Cool', 'Normal', 'Weak']
    ]
    y = ['No', 'No', 'Yes', 'Yes', 'Yes']

    # Convert categorical data to integers for simplicity
    feature_mapping = {'Outlook': {'Sunny': 0, 'Overcast': 1, 'Rain': 2},
                       'Temperature': {'Hot': 0, 'Mild': 1, 'Cool': 2},
                       'Humidity': {'High': 0, 'Normal': 1},
                       'Wind': {'Weak': 0, 'Strong': 1}}

    X_encoded = [[feature_mapping[feature][value] for feature, value in zip(['Outlook', 'Temperature', 'Humidity', 'Wind'], row)] for row in X]

    # Create and train the decision tree
    tree = DecisionTree()
    tree.fit(X_encoded, y)

    # Predict on new data
    test_data = [
        ['Sunny', 'Mild', 'High', 'Strong'],
        ['Rain', 'Cool', 'Normal', 'Weak']
    ]
    test_data_encoded = [[feature_mapping[feature][value] for feature, value in zip(['Outlook', 'Temperature', 'Humidity', 'Wind'], row)] for row in test_data]

    predictions = tree.predict(test_data_encoded)
    for sample, prediction in zip(test_data, predictions):
        print(f"Sample: {sample} => Prediction: {prediction}")


In [None]:
import pandas as pd
X = [
        ['Sunny', 'Hot', 'High', 'Weak'],
        ['Sunny', 'Hot', 'High', 'Strong'],
        ['Overcast', 'Hot', 'High', 'Weak'],
        ['Rain', 'Mild', 'High', 'Weak'],
        ['Rain', 'Cool', 'Normal', 'Weak']
    ]

y = ['No', 'No', 'Yes', 'Yes', 'Yes']

# Convert categorical data to integers for simplicity
feature_mapping = {'Outlook': {'Sunny': 0, 'Overcast': 1, 'Rain': 2},
                       'Temperature': {'Hot': 0, 'Mild': 1, 'Cool': 2},
                       'Humidity': {'High': 0, 'Normal': 1},
                       'Wind': {'Weak': 0, 'Strong': 1}}


# make dataframe
df = pd.DataFrame(X, columns=['Outlook', 'Temperature', 'Humidity', 'Wind'])
df


from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df = df.apply(le.fit_transform)


# apply the sklearn model
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier()
model.fit(df, y)
predictions = model.predict(df)
print(predictions)

['No' 'No' 'Yes' 'Yes' 'Yes']


In [16]:
# How decision tree works
# 1. Initialize the decision tree
# 2. Calculate the information gain for each feature
# 3. Select the feature with the highest information gain
# 4. Split the dataset based on the selected feature
# 5. Recursively build the decision tree
# 6. stopping criteria

In [17]:
from collections import Counter
from math import log2

def entropy(y):
    """Calculate the entropy of a dataset."""
    class_counts = Counter(y)
    total_instances = len(y)
    entropy = 0
    for count in class_counts.values():
        prob = count / total_instances
        entropy -= prob * log2(prob)
    return entropy


# apply the entropy function on an dummy data

X = { # use tuple instead of list for the set
    ('sunny', 'hot', 'high', 'weak'),
    ('sunny', 'hot', 'high', 'strong'),
    ('overcast', 'hot', 'high', 'weak'),
    ('rain', 'mild', 'high', 'weak'),
    ('rain', 'cool', 'normal', 'weak')
}

y = ( # use tuple instead of list
     'no', 'no', 'yes', 'yes', 'yes'
)


entropy(y)

# 0.97 is entropy

0.9709505944546686

In [18]:
Data = {
    'X' : ['A' , 'B' , 'C' , 'D' , 'E', 'F' , 'G' , 'H' , 'I ', 'J'],
    # only 2 no all yes
    'y' : ['Yes', 'No', 'Yes', 'Yes', 'No', 'Yes','Yes','No','Yes','No']
}

import pandas as pd
df = pd.DataFrame(Data)
df

X = df['X']
y = df['y']

prob_yes = len([i for i in y if i == 'Yes']) / len(y)
prob_no = len([i for i in y if i == 'No']) / len(y)
print(prob_yes)
print(prob_no)

entropy(y)

0.6
0.4


0.9709505944546686

In [21]:
# calculate infromation gain


# suppose atrribute A has two attribute A1 and A2


# A1 has 4 instances(3 "yes" , 1 "no") and A2 has 4 instances(3 "yes" , 3 "no")


# calculate A1 and A2 entropy
# lets create A1 and A2 data

A1 = {
    'X' : ['A' , 'B' , 'C' , 'D'],
    'y' : ['Yes', 'No', 'Yes', 'Yes']
}

A2 = {
    'X' : ['E' , 'F' , 'G' , 'H','I','J'],
    'y' : ['Yes', 'No', 'Yes', 'No','Yes','No']
}

df1 = pd.DataFrame(A1)
df2 = pd.DataFrame(A2)
y1 = df1['y']
y2 = df2['y']
print(entropy(y1))
print(entropy(y2))
# calculate  Weighted average entropy after split
Hsplit = prob_no * entropy(y1) + prob_yes * entropy(y2)
print(Hsplit)
# Information gain
IG = entropy(y) - Hsplit
print(IG)

0.8112781244591328
1.0
0.9245112497836532
0.0464393446710154


In [46]:
class DT:
  def __init__(self):
    self.tree = None

  def entropy(y):
    """Calculate the entropy of a dataset."""
    class_counts = Counter(y)
    total_instances = len(y)
    entropy = 0
    for count in class_counts.values():
        prob = count / total_instances
        entropy -= prob * log2(prob)
    return entropy


  def split_data(X, y, feature, value):
    """Split the dataset based on the selected feature and value."""
    X_subset = []
    y_subset = []
    for i in range(len(X)):
        if X[i][feature] == value:
            X_subset.append(X[i])
            y_subset.append(y[i])
    return X_subset, y_subset

  # calculate entropy after split for each subset
  def entropy_after_split(X_subset, y_subset):
    # calculate for both
    for subset in X_subset:
        Entropy_X_sub = (entropy(subset))
    return Entropy_X_sub

    for subset in y_subset:
        Entropy_y_sub = (entropy(subset))
    return Entropy_y_sub


    # calculate weighted average
  def weighted_average(prob_yes , prob_no , y):
    Hsplit = prob_no * entropy(y1) + prob_yes * entropy(y2)
    return Hsplit


  def information_gain(y , Hsplit):
    IG = entropy(y) - Hsplit
    return IG


Data = {
    'X' : ['A' , 'B' , 'C' , 'D' , 'E', 'F' , 'G' , 'H' , 'I ', 'J'],
    # only 2 no all yes
    'y' : ['Yes', 'No', 'Yes', 'Yes', 'No', 'Yes','Yes','No','Yes','No']
}
import pandas as pd
df = pd.DataFrame(Data)
X = df['X']
y = df['y']
prob_yes = len([i for i in y if i == 'Yes']) / len(y)
prob_no = len([i for i in y if i == 'No']) / len(y)



Des = DT()
Des.entropy(y)




TypeError: DT.entropy() takes 1 positional argument but 2 were given

In [39]:
import pandas as pd
from collections import Counter
from math import log2
import graphviz

class DT:
    def __init__(self):
        self.tree = None

    def entropy(self, y):
        """Calculate the entropy of a dataset."""
        class_counts = Counter(y)
        total_instances = len(y)
        entropy = 0
        for count in class_counts.values():
            prob = count / total_instances
            entropy -= prob * log2(prob)
        return entropy

    def split_data(self, X, y, feature):
        """Split the dataset based on the unique values of the selected feature."""
        unique_values = set(X[feature])
        subsets = {}
        for value in unique_values:
            X_subset = []
            y_subset = []
            for i in range(len(X)):
                if X[feature].iloc[i] == value:
                    X_subset.append(X.iloc[i])
                    y_subset.append(y.iloc[i])
            subsets[value] = (X_subset, y_subset)
        return subsets

    def entropy_after_split(self, y_subsets):
        """Calculate the entropy after the split for each subset."""
        total_instances = sum([len(y_subset) for y_subset in y_subsets.values()])
        weighted_entropy = 0
        for y_subset in y_subsets.values():
            prob = len(y_subset) / total_instances
            weighted_entropy += prob * self.entropy(y_subset)
        return weighted_entropy

    def information_gain(self, y, y_subsets):
        """Calculate the information gain."""
        Hsplit = self.entropy_after_split(y_subsets)
        IG = self.entropy(y) - Hsplit
        return IG

    def best_split(self, X, y):
        """Find the best feature to split on."""
        best_ig = -1
        best_feature = None
        for feature in X.columns:
            subsets = self.split_data(X, y, feature)
            y_subsets = {k: v[1] for k, v in subsets.items()}
            ig = self.information_gain(y, y_subsets)
            if ig > best_ig:
                best_ig = ig
                best_feature = feature
        return best_feature, best_ig

    def visualize_tree(self):
        def add_nodes_edges(tree, graph, parent=None, label=None):
            if isinstance(tree, dict):
                for key, value in tree.items():
                    if parent is not None:
                        graph.node(key)
                        graph.edge(parent, key, label=label)
                    add_nodes_edges(value, graph, parent=key, label=str(key))
            else:
                graph.node(str(tree), shape='ellipse')
                if parent is not None:
                    graph.edge(parent, str(tree), label=label)

        dot = graphviz.Digraph()
        add_nodes_edges(self.tree, dot)
        return dot

# Example usage with your dataset
Data = {
    'X': ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J'],
    'y': ['Yes', 'No', 'Yes', 'Yes', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No']
}
df = pd.DataFrame(Data)

X = df[['X']]  # Note that X needs to be a DataFrame, not a Series
y = df['y']

dt = DT()

# Calculate the entropy of the dataset
print("Entropy of dataset:", dt.entropy(y))

# Find the best feature to split on
best_feature, best_ig = dt.best_split(X, y)
print("Best feature to split on:", best_feature)
print("Information Gain:", best_ig)

# Split the data based on the best feature
subsets = dt.split_data(X, y, best_feature)
for value, (X_subset, y_subset) in subsets.items():
    print(f"Value: {value}")
    print(f"X subset: {X_subset}")
    print(f"y subset: {y_subset}")





Entropy of dataset: 0.9709505944546686
Best feature to split on: X
Information Gain: 0.9709505944546686
Value: B
X subset: [X    B
Name: 1, dtype: object]
y subset: ['No']
Value: H
X subset: [X    H
Name: 7, dtype: object]
y subset: ['No']
Value: C
X subset: [X    C
Name: 2, dtype: object]
y subset: ['Yes']
Value: F
X subset: [X    F
Name: 5, dtype: object]
y subset: ['Yes']
Value: D
X subset: [X    D
Name: 3, dtype: object]
y subset: ['Yes']
Value: A
X subset: [X    A
Name: 0, dtype: object]
y subset: ['Yes']
Value: E
X subset: [X    E
Name: 4, dtype: object]
y subset: ['No']
Value: I
X subset: [X    I
Name: 8, dtype: object]
y subset: ['Yes']
Value: J
X subset: [X    J
Name: 9, dtype: object]
y subset: ['No']
Value: G
X subset: [X    G
Name: 6, dtype: object]
y subset: ['Yes']


In [42]:
# sklearn implementation

from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import accuracy_score


Data = {
    'X': ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J'],
    'y': ['Yes', 'No', 'Yes', 'Yes', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No']
}
df = pd.DataFrame(Data)


# encode the data
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df = df.apply(le.fit_transform)

X = df[['X']]
y = df['y']


DecT = DecisionTreeClassifier()
DecT.fit(X, y)
predictions = DecT.predict(X)
print(predictions)

[1 0 1 1 0 1 1 0 1 0]
