In [1]:
import pandas as pd
import numpy as np
eps = np.finfo(float).eps
from numpy import log2 as log

In [2]:
# Toy training set: 14 rows, each listed in the order
# [Outlook, Temperature, Humidity, Wind, play].
data = [
    ["Sunny", "Hot", "High", "Weak", "No"],
    ["Sunny", "Hot", "High", "Strong", "No"],
    ["Overcast", "Hot", "High", "Weak", "Yes"],
    ["Rain", "Mild", "High", "Weak", "Yes"],
    ["Rain", "Cool", "Normal", "Weak", "Yes"],
    ["Rain", "Cool", "Normal", "Strong", "No"],
    ["Overcast", "Cool", "Normal", "Strong", "Yes"],
    ["Sunny", "Mild", "High", "Weak", "No"],
    ["Sunny", "Cool", "Normal", "Weak", "Yes"],
    ["Rain", "Mild", "Normal", "Weak", "Yes"],
    ["Sunny", "Mild", "Normal", "Strong", "Yes"],
    ["Overcast", "Mild", "High", "Strong", "Yes"],
    ["Overcast", "Hot", "Normal", "Weak", "Yes"],
    ["Rain", "Mild", "High", "Strong", "No"]
]

# Column names for `data`. The tree functions defined later in this notebook
# read the LAST column (here "play") as the class label via df.keys()[-1].
columns = ["Outlook", "Temperature", "Humidity", "Wind", "play"]

In [3]:
# Build the training DataFrame from the rows and column names defined in the
# previous cell, then print it in full.
df = pd.DataFrame(data, columns=columns)
print(df)

     Outlook Temperature Humidity    Wind play
0      Sunny         Hot     High    Weak   No
1      Sunny         Hot     High  Strong   No
2   Overcast         Hot     High    Weak  Yes
3       Rain        Mild     High    Weak  Yes
4       Rain        Cool   Normal    Weak  Yes
5       Rain        Cool   Normal  Strong   No
6   Overcast        Cool   Normal  Strong  Yes
7      Sunny        Mild     High    Weak   No
8      Sunny        Cool   Normal    Weak  Yes
9       Rain        Mild   Normal    Weak  Yes
10     Sunny        Mild   Normal  Strong  Yes
11  Overcast        Mild     High  Strong  Yes
12  Overcast         Hot   Normal    Weak  Yes
13      Rain        Mild     High  Strong   No


In [4]:
def find_entropy(df):
    """Return the Shannon entropy (base 2) of the class column of ``df``.

    The class column is taken to be the last column of the frame. Each
    distinct label contributes ``-p * log2(p)``, where ``p`` is the share of
    rows carrying that label. A frame without rows yields ``0``.
    """
    Class = df.keys()[-1]
    labels = df[Class]
    row_count = len(labels)
    label_counts = labels.value_counts()

    entropy = 0
    # Walk the labels in order of first appearance.
    for label in labels.unique():
        share = label_counts[label] / row_count
        entropy += -share * np.log2(share)
    return entropy

def find_entropy_attribute(df, attribute):
    """Return the class entropy that remains after splitting ``df`` on ``attribute``.

    For every distinct non-missing value of ``attribute`` the rows carrying
    that value form one partition. The result is the sum over partitions of
    ``(partition size / total rows) * entropy(class labels in the partition)``,
    using base-2 logarithms. The class column is the last column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Training rows; the last column holds the class label.
    attribute : str
        Name of the column to split on.

    Returns
    -------
    float
        Weighted conditional entropy, ``>= 0``. Exactly ``0.0`` when every
        partition is pure, and ``0.0`` for a frame without rows.

    Notes
    -----
    Only labels that actually occur in a partition are counted, so no
    ``log2(0)`` is ever evaluated and no epsilon correction is required.
    Rows whose ``attribute`` value is missing belong to no partition and
    contribute nothing to the sum.
    """
    Class = df.keys()[-1]
    total = len(df)
    if total == 0:
        return 0.0

    labels = df[Class]
    column = df[attribute]
    weighted_entropy = 0.0
    for variable in column.dropna().unique():
        # Positional (NumPy) mask: no index alignment is involved, so frames
        # with duplicate index labels are handled as well.
        subset = labels[(column == variable).to_numpy()]
        probabilities = subset.value_counts().to_numpy() / len(subset)
        partition_entropy = -(probabilities * np.log2(probabilities)).sum()
        weighted_entropy += (len(subset) / total) * partition_entropy
    return weighted_entropy



def find_split_info(df, attribute):
    """Return the split information (intrinsic value) of ``attribute`` in ``df``.

    This is the base-2 entropy of the partition sizes produced by splitting on
    ``attribute``: the sum over distinct non-missing values of
    ``-(n_v / n) * log2(n_v / n)``, where ``n_v`` is the number of rows with
    value ``v`` and ``n`` is the total number of rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Training rows.
    attribute : str
        Name of the column to evaluate.

    Returns
    -------
    float or int
        Split information; ``0`` when the column holds a single distinct value
        or the frame has no rows.

    Notes
    -----
    Missing values are not treated as a partition. They still count towards
    ``n``, but they never produce a ``0 * log2(0)`` term, which would turn the
    whole result into NaN.
    """
    column = df[attribute]
    total = len(column)
    split_info = 0
    # Iterate in order of first appearance; every value seen here occurs at
    # least once, so `fraction` is strictly positive.
    for variable in column.dropna().unique():
        fraction = (column == variable).sum() / total
        split_info += -fraction * np.log2(fraction)
    return split_info



def find_parent(df):
    """Return the name of the attribute with the highest gain ratio in ``df``.

    Every column except the last (the class label) is a candidate. For each
    candidate the gain ratio is
    ``(find_entropy(df) - find_entropy_attribute(df, key)) / find_split_info(df, key)``.
    A candidate whose split information is not strictly positive gets a ratio
    of ``0``. Ties go to the earliest column, because ``np.argmax`` returns
    the first maximum.

    Parameters
    ----------
    df : pandas.DataFrame
        Training rows; the last column holds the class label.

    Returns
    -------
    The column label of the selected attribute.

    Raises
    ------
    ValueError
        Propagated from ``np.argmax`` when ``df`` has no attribute columns.
    """
    attributes = df.keys()[:-1]
    # The class entropy is the same for every candidate, so compute it once.
    base_entropy = find_entropy(df)

    information_gain_ratio = []
    for key in attributes:
        information_gain = base_entropy - find_entropy_attribute(df, key)
        split_info = find_split_info(df, key)
        # `> 0` rejects both a zero split info (single-valued column) and a
        # NaN one; a NaN ratio would otherwise be selected by np.argmax.
        if split_info > 0:
            information_gain_ratio.append(information_gain / split_info)
        else:
            information_gain_ratio.append(0)

    return attributes[np.argmax(information_gain_ratio)]

def get_subtable(df, node, value):
    """Return the rows of ``df`` whose ``node`` column equals ``value``.

    The result is a new frame whose index is renumbered from 0; ``df`` itself
    is not modified.
    """
    matches = df[node] == value
    selected = df.loc[matches]
    return selected.reset_index(drop=True)

def buildTree(df, tree=None):
    """Recursively build a decision tree from ``df`` as nested dictionaries.

    The split attribute is chosen by ``find_parent``. The returned structure
    is ``{attribute: {value: leaf_or_subtree, ...}}``, where a leaf is a class
    label and a subtree has the same shape as the whole tree.

    Parameters
    ----------
    df : pandas.DataFrame
        Training rows; the last column holds the class label.
    tree : dict, optional
        Dictionary to add the branches to. A new one is created when omitted.

    Returns
    -------
    dict
        The tree rooted at the selected attribute.

    Notes
    -----
    A branch becomes a leaf when all of its rows share one class. It also
    becomes a leaf when it contains every row of ``df`` while the classes
    still differ: in that case no remaining attribute can separate the rows,
    so the most frequent class is used (ties go to the first label in
    ``np.unique`` order) instead of recursing on identical data forever.
    """
    Class = df.keys()[-1]
    node = find_parent(df)
    attValue = np.unique(df[node])
    if tree is None:
        tree = {}
    # Make sure the branch dictionary exists even for a caller-supplied `tree`.
    tree.setdefault(node, {})
    for value in attValue:
        subtable = get_subtable(df, node, value)
        clValue, counts = np.unique(subtable[Class], return_counts=True)
        if len(counts) == 1:
            # Pure partition: store the single class label as a leaf.
            tree[node][value] = clValue[0]
        elif len(subtable) == len(df):
            # The split made no progress (the chosen attribute has a single
            # value); fall back to the majority class to guarantee termination.
            tree[node][value] = clValue[np.argmax(counts)]
        else:
            tree[node][value] = buildTree(subtable)
    return tree



def predict(test, tree, default=None):
    """Classify one sample by walking a nested-dict decision tree.

    Parameters
    ----------
    test : mapping
        Maps attribute names to values (a dict or a DataFrame row). It must
        contain every attribute that appears on the path being followed.
    tree : dict
        Tree of the form ``{attribute: {value: leaf_or_subtree}}``.
    default : optional
        Value returned when the sample holds an attribute value that has no
        branch in the tree, at any depth.

    Returns
    -------
    The class label stored at the reached leaf, or ``default``.
    """
    attribute = next(iter(tree))
    branches = tree[attribute]
    observed = test[attribute]
    if observed not in branches:
        return default

    result = branches[observed]
    if isinstance(result, dict):
        # Forward `default` so an unseen value deeper in the tree also yields
        # the caller's fallback rather than None.
        return predict(test, result, default)
    return result



In [5]:
# Fit the gain-ratio tree on the DataFrame built in the earlier cell.
tree = buildTree(df)

In [6]:
import pprint
# Pretty-print the nested dictionary returned by buildTree.
pprint.pprint(tree)

{'Outlook': {'Overcast': 'Yes',
             'Rain': {'Wind': {'Strong': 'No', 'Weak': 'Yes'}},
             'Sunny': {'Humidity': {'High': 'No', 'Normal': 'Yes'}}}}


In [7]:
# A single query sample given as an attribute -> value mapping. predict() looks
# up only the attributes that lie on the path it follows through the tree.
test = {'Outlook': 'Sunny', 'Temperature': 'Hot', 'Humidity': 'High', 'Wind': 'Weak'}
prediction = predict(test, tree)
print(prediction)


No


In [8]:
# Repeated import: pandas is already imported in the first cell.
import pandas as pd

# Second toy dataset: attribute columns a1-a4 plus the label column 'Class'.
# 'Class' is the last key, so it becomes the last DataFrame column, which the
# tree functions read as the class label.
# NOTE: this rebinds `data` (previously a list of rows) to a dict of columns.
data = {
    'a1': ['x', 'x', 'x', 'y', 'y', 'x', 'x', 'y', 'x', 'x', 'y', 'x'],
    'a2': ['u', 'u', 'u', 'u', 'v', 'v', 'u', 'v', 'u', 'w', 'w', 'w'],
    'a3': ['n', 'p', 'n', 'n', 'n', 'n', 'p', 'm', 'n', 'p', 'n', 'n'],
    'a4': ['e', 'f', 'g', 'e', 'f', 'e', 'e', 'f', 'f', 'f', 'f', 'g'],
    'Class': ['+', '+', '+', '+', '-', '+', '-', '+', '+', '+', '-', '+']
}

# Create the DataFrame; this rebinds `df`, replacing the first dataset.
df = pd.DataFrame(data)

# Print the full DataFrame
print(df)


   a1 a2 a3 a4 Class
0   x  u  n  e     +
1   x  u  p  f     +
2   x  u  n  g     +
3   y  u  n  e     +
4   y  v  n  f     -
5   x  v  n  e     +
6   x  u  p  e     -
7   y  v  m  f     +
8   x  u  n  f     +
9   x  w  p  f     +
10  y  w  n  f     -
11  x  w  n  g     +


In [9]:
# Rebuild the tree on the a1-a4 dataset; this rebinds `tree` from the previous example.
tree = buildTree(df)

In [10]:
# Repeated import: pprint is already imported in an earlier cell.
import pprint
pprint.pprint(tree)

{'a1': {'x': {'a3': {'n': '+', 'p': {'a4': {'e': '-', 'f': '+'}}}},
        'y': {'a3': {'m': '+', 'n': {'a4': {'e': '+', 'f': '-'}}}}}}


In [11]:
# Repeated import: pandas is already imported in the first cell.
import pandas as pd

# Third dataset: 14 rows with a 'Day' identifier (D1..D14, unique per row),
# integer-valued 'Temperature (oF)' and 'Humidity (%)' columns, string-valued
# 'Outlook' and 'Wind' columns, and the label 'PlayTennis' as the last key
# (so it becomes the last DataFrame column).
# NOTE: this rebinds `data` once more.
data = {
    'Day': [f'D{i}' for i in range(1, 15)],
    'Outlook': ['Sunny', 'Sunny', 'Overcast', 'Rain', 'Rain', 'Rain', 'Overcast', 'Sunny', 'Sunny', 'Rain', 'Sunny', 'Overcast', 'Overcast', 'Rain'],
    'Temperature (oF)': [85, 80, 83, 70, 68, 65, 64, 72, 69, 75, 75, 72, 81, 71],
    'Humidity (%)': [85, 90, 86, 96, 80, 70, 65, 95, 70, 80, 70, 90, 75, 91],
    'Wind': ['Weak', 'Strong', 'Weak', 'Weak', 'Weak', 'Strong', 'Strong', 'Weak', 'Weak', 'Weak', 'Strong', 'Strong', 'Weak', 'Strong'],
    'PlayTennis': ['No', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No']
}

# Create the DataFrame; this rebinds `df`, replacing the previous dataset.
df = pd.DataFrame(data)

# Print the full DataFrame
print(df)


    Day   Outlook  Temperature (oF)  Humidity (%)    Wind PlayTennis
0    D1     Sunny                85            85    Weak         No
1    D2     Sunny                80            90  Strong         No
2    D3  Overcast                83            86    Weak        Yes
3    D4      Rain                70            96    Weak        Yes
4    D5      Rain                68            80    Weak        Yes
5    D6      Rain                65            70  Strong         No
6    D7  Overcast                64            65  Strong        Yes
7    D8     Sunny                72            95    Weak         No
8    D9     Sunny                69            70    Weak        Yes
9   D10      Rain                75            80    Weak        Yes
10  D11     Sunny                75            70  Strong        Yes
11  D12  Overcast                72            90  Strong        Yes
12  D13  Overcast                81            75    Weak        Yes
13  D14      Rain                7

In [12]:
# NOTE(review): `df` still contains the 'Day' column, whose value is unique on
# every row. buildTree treats it like any other attribute, and the tree printed
# in the next cell splits on 'Day' alone, giving one leaf per training row.
# Consider dropping identifier columns before fitting.
tree = buildTree(df)

In [13]:
# Relies on `pprint` having been imported in an earlier cell.
pprint.pprint(tree)

{'Day': {'D1': 'No',
         'D10': 'Yes',
         'D11': 'Yes',
         'D12': 'Yes',
         'D13': 'Yes',
         'D14': 'No',
         'D2': 'No',
         'D3': 'Yes',
         'D4': 'Yes',
         'D5': 'Yes',
         'D6': 'No',
         'D7': 'Yes',
         'D8': 'No',
         'D9': 'Yes'}}


In [14]:
from sklearn.metrics import classification_report, confusion_matrix

# NOTE(review): no separate test set is used here. Predictions are made on `df`,
# the same DataFrame the tree was built from, so the scores below describe the
# training rows only.
# Predict one label per row; each row is passed to predict() as a Series.
predictions = df.apply(lambda x: predict(x, tree), axis=1)

# True labels come from the same DataFrame
true_labels = df['PlayTennis']

# Print classification report
print("Classification Report:")
print(classification_report(true_labels, predictions))

# Print confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(true_labels, predictions))


Classification Report:
              precision    recall  f1-score   support

          No       1.00      1.00      1.00         5
         Yes       1.00      1.00      1.00         9

    accuracy                           1.00        14
   macro avg       1.00      1.00      1.00        14
weighted avg       1.00      1.00      1.00        14


Confusion Matrix:
[[5 0]
 [0 9]]


In [15]:
# import numpy as np
# from numpy import log2 as log

# class DecisionTreeC45:
#     def __init__(self):
#         self.tree = None
#         self.eps = np.finfo(float).eps

#     def find_entropy(self, df):
#         Class = df.keys()[-1]
#         entropy = 0
#         values = df[Class].unique()
#         for value in values:
#             fraction = df[Class].value_counts()[value] / len(df[Class])
#             entropy += -fraction * np.log2(fraction)
#         return entropy

#     def find_entropy_attribute(self, df, attribute):
#         Class = df.keys()[-1]
#         target_variables = df[Class].unique()
#         variables = df[attribute].unique()
#         entropy2 = 0
#         for variable in variables:
#             entropy = 0
#             for target_variable in target_variables:
#                 num = len(df[attribute][df[attribute] == variable][df[Class] == target_variable])
#                 den = len(df[attribute][df[attribute] == variable])
#                 fraction = num / (den + self.eps)
#                 entropy += -fraction * log(fraction + self.eps)
#             fraction2 = den / len(df)
#             entropy2 += -fraction2 * entropy
#         return abs(entropy2)

#     def find_split_info(self, df, attribute):
#         Class = df.keys()[-1]
#         variables = df[attribute].unique()
#         split_info = 0
#         for variable in variables:
#             num = len(df[attribute][df[attribute] == variable])
#             den = len(df[attribute])
#             fraction = num / den
#             split_info += -fraction * log(fraction)
#         return split_info

#     def find_parent(self, df):
#         information_gain_ratio = []
#         for key in df.keys()[:-1]:
#             information_gain = self.find_entropy(df) - self.find_entropy_attribute(df, key)
#             split_info = self.find_split_info(df, key)
#             if split_info == 0:
#                 information_gain_ratio.append(0)
#             else:
#                 information_gain_ratio.append(information_gain / split_info)
#         return df.keys()[:-1][np.argmax(information_gain_ratio)]

#     def get_subtable(self, df, node, value):
#         return df[df[node] == value].reset_index(drop=True)

#     def build_tree(self, df, tree=None):
#         Class = df.keys()[-1]
#         node = self.find_parent(df)
#         attValue = np.unique(df[node])
#         if tree is None:
#             tree = {}
#             tree[node] = {}
#         for value in attValue:
#             subtable = self.get_subtable(df, node, value)
#             clValue, counts = np.unique(subtable[Class], return_counts=True)
#             if len(counts) == 1:
#                 tree[node][value] = clValue[0]
#             else:
#                 tree[node][value] = self.build_tree(subtable)
#         return tree

#     def fit(self, df):
#         self.tree = self.build_tree(df)

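#     # NOTE(review): in this disabled draft the `tree` argument is never read;
#     # every call restarts from self.tree, so the recursive
#     # self.predict(test, result) call repeats the same lookup instead of
#     # descending into `result`. Fix this before re-enabling the class.
#     def predict(self, test, tree,default=None):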
#         if self.tree is None:
#             raise RuntimeError("The model has not been trained yet. Use the fit method to train the model.")
#         attribute = next(iter(self.tree))
#         if test[attribute] in self.tree[attribute].keys():
#             result = self.tree[attribute][test[attribute]]
#             if isinstance(result, dict):
#                 return self.predict(test, result)
#             else:
#                 return result
#         else:
#             return default


In [16]:
# t= DecisionTreeC45()
# t.fit(df)

In [17]:
# t.predict(test, tree)