In [None]:
import numpy as np
import pandas as pd
import math

In [3]:
# Training examples written one record per line; the fields of each record
# follow the order given in `column_names`.
column_names = ['age', 'income', 'student', 'credit_rating', 'buys_computer']
records = [
    ('<=30', 'high', 'no', 'fair', 'no'),
    ('<=30', 'high', 'no', 'excellent', 'no'),
    ('31 to 40', 'high', 'no', 'fair', 'yes'),
    ('>40', 'medium', 'no', 'fair', 'yes'),
    ('>40', 'low', 'yes', 'fair', 'yes'),
    ('>40', 'low', 'yes', 'excellent', 'no'),
    ('31 to 40', 'low', 'yes', 'excellent', 'yes'),
    ('<=30', 'medium', 'no', 'fair', 'no'),
    ('<=30', 'low', 'yes', 'fair', 'yes'),
    ('>40', 'medium', 'yes', 'fair', 'yes'),
    ('<=30', 'medium', 'yes', 'excellent', 'yes'),
    ('31 to 40', 'medium', 'no', 'excellent', 'yes'),
    ('31 to 40', 'high', 'yes', 'fair', 'yes'),
    ('>40', 'medium', 'no', 'excellent', 'no'),
]

# Pivot the records into a column-name -> list-of-values dictionary.
data = {
    name: [record[position] for record in records]
    for position, name in enumerate(column_names)
}

df = pd.DataFrame(data)

# Persist the table as CSV (the row index is not written).
df.to_csv('data_decision_tree.csv', index=False)

In [4]:
# Load the dataset back from the CSV file and display it.
csv_path = 'data_decision_tree.csv'
df = pd.read_csv(csv_path)
df

Unnamed: 0,age,income,student,credit_rating,buys_computer
0,<=30,high,no,fair,no
1,<=30,high,no,excellent,no
2,31 to 40,high,no,fair,yes
3,>40,medium,no,fair,yes
4,>40,low,yes,fair,yes
5,>40,low,yes,excellent,no
6,31 to 40,low,yes,excellent,yes
7,<=30,medium,no,fair,no
8,<=30,low,yes,fair,yes
9,>40,medium,yes,fair,yes


In [8]:
def convert_test_data_point(data_points):
    """Encode rows of categorical feature values as integer codes.

    Every row must list its values in the order
    ``age, income, student, credit_rating``. Each value is looked up in a
    fixed per-column mapping. Because the values are paired with the four
    column names via ``zip``, any entries beyond the fourth are ignored.

    Parameters
    ----------
    data_points : iterable of sequences of str
        The raw rows to encode (a 2-D array of strings or a list of lists).

    Returns
    -------
    numpy.ndarray
        One row of integer codes per input row.

    Raises
    ------
    KeyError
        If a value is not a known category of its column.

    Example
    -------
    >>> convert_test_data_point([['<=30', 'medium', 'no', 'fair'],
    ...                          ['>40', 'medium', 'yes', 'excellent']])
    array([[0, 1, 0, 0],
           [2, 1, 1, 1]])
    """
    column_mappings = {
        'age': {'<=30': 0, '31 to 40': 1, '>40': 2},
        'income': {'low': 0, 'medium': 1, 'high': 2},
        'student': {'no': 0, 'yes': 1},
        'credit_rating': {'fair': 0, 'excellent': 1}
    }
    feature_order = ['age', 'income', 'student', 'credit_rating']

    converted_data_points = [
        [column_mappings[column][value] for column, value in zip(feature_order, data_point)]
        for data_point in data_points
    ]

    # The original body had a demo call placed after this return; it was
    # unreachable (and self-recursive), so it now lives in the docstring example.
    return np.array(converted_data_points)


In [9]:
# Category levels of every column, listed in the order of their integer codes
# (the first level is encoded as 0, the next as 1, and so on).
ordered_levels = {
    'age': ['<=30', '31 to 40', '>40'],
    'income': ['low', 'medium', 'high'],
    'student': ['no', 'yes'],
    'credit_rating': ['fair', 'excellent'],
    'buys_computer': ['no', 'yes'],
}

# Per-column lookup: category label -> integer code.
column_mappings = {
    column: {level: code for code, level in enumerate(levels)}
    for column, levels in ordered_levels.items()
}

# Replace the labels with their codes directly in the existing DataFrame.
df.replace(column_mappings, inplace=True)

In [10]:
# Display the DataFrame after the category labels were replaced by integer codes.
df

Unnamed: 0,age,income,student,credit_rating,buys_computer
0,0,2,0,0,0
1,0,2,0,1,0
2,1,2,0,0,1
3,2,1,0,0,1
4,2,0,1,0,1
5,2,0,1,1,0
6,1,0,1,1,1
7,0,1,0,0,0
8,0,0,1,0,1
9,2,1,1,0,1


In [11]:
class DecisionTree:
    """Binary decision tree classifier that splits on information gain.

    The fitted tree is a nested dictionary returned by :meth:`fit`:

    * leaf node: ``{'class': label}``
    * internal node: ``{'feature_index': int, 'threshold': value,
      'left': subtree, 'right': subtree}``; rows whose feature value is
      ``<= threshold`` go to ``'left'``, all others to ``'right'``.

    The tree is not stored on the instance; pass the dictionary returned by
    :meth:`fit` to :meth:`predict`.
    """

    def __init__(self, max_depth=None):
        """Create a tree builder.

        Parameters
        ----------
        max_depth : int or None
            Depth at which recursion stops and a leaf is created. With
            ``None`` the depth never limits the recursion.
        """
        self.max_depth = max_depth  # Maximum depth used as a stopping criterion

    def _majority_leaf(self, y):
        """Return a leaf node holding the most frequent label in ``y``."""
        unique_labels, counts = np.unique(y, return_counts=True)
        return {'class': unique_labels[np.argmax(counts)]}

    def fit(self, X, y, depth=0):
        """Recursively build the tree and return it as a nested dictionary.

        Parameters
        ----------
        X : numpy.ndarray
            2-D feature matrix (rows are samples); it is indexed as
            ``X[:, feature]`` and with boolean row masks.
        y : numpy.ndarray
            Labels, one per row of ``X``; indexed with boolean masks.
        depth : int
            Depth of the node being built; callers normally leave it at 0.

        Returns
        -------
        dict
            The (sub)tree rooted at this node.
        """
        # Stop when the maximum depth is reached or all labels are the same.
        if depth == self.max_depth or len(set(y)) == 1:
            return self._majority_leaf(y)

        best_feature, best_threshold = self.find_best_split(X, y)

        # No split yields a positive information gain: make this node a leaf.
        if best_feature is None:
            return self._majority_leaf(y)

        # Rows at or below the threshold go left, the rest go right.
        left_indices = X[:, best_feature] <= best_threshold
        right_indices = ~left_indices

        # Recurse into both partitions.
        left_subtree = self.fit(X[left_indices], y[left_indices], depth + 1)
        right_subtree = self.fit(X[right_indices], y[right_indices], depth + 1)

        return {
            'feature_index': best_feature,
            'threshold': best_threshold,
            'left': left_subtree,
            'right': right_subtree
        }

    def find_best_split(self, X, y):
        """Search every feature/threshold pair for the highest information gain.

        Candidate thresholds are the distinct values of each feature. Splits
        that leave one side empty are skipped.

        Returns
        -------
        tuple
            ``(feature_index, threshold)`` of the best split, or
            ``(None, None)`` when no split has an information gain above 0.
        """
        num_features = X.shape[1]
        best_feature = None
        best_threshold = None
        best_info_gain = 0

        for feature_index in range(num_features):
            thresholds = np.unique(X[:, feature_index])

            for threshold in thresholds:
                left_indices = X[:, feature_index] <= threshold
                right_indices = ~left_indices

                if len(y[left_indices]) == 0 or len(y[right_indices]) == 0:
                    continue

                info_gain = self.calculate_info_gain(y, y[left_indices], y[right_indices])

                # Strict comparison: a zero-gain split is never selected.
                if info_gain > best_info_gain:
                    best_info_gain = info_gain
                    best_feature = feature_index
                    best_threshold = threshold

        return best_feature, best_threshold

    def calculate_info_gain(self, parent, left_child, right_child):
        """Return the parent entropy minus the size-weighted child entropies."""
        parent_entropy = self.entropy(parent)
        left_child_entropy = (len(left_child) / len(parent)) * self.entropy(left_child)
        right_child_entropy = (len(right_child) / len(parent)) * self.entropy(right_child)

        info_gain = parent_entropy - (left_child_entropy + right_child_entropy)
        return info_gain

    def entropy(self, labels):
        """Return the Shannon entropy (base 2) of a collection of labels.

        ``Entropy(S) = -sum(p_c * log2(p_c))`` over every distinct label ``c``.
        An empty collection has entropy 0.0.
        """
        total_instances = len(labels)
        if total_instances == 0:
            return 0.0

        # Count the occurrences of each unique label.
        label_counts = {}
        for label in labels:
            label_counts[label] = label_counts.get(label, 0) + 1

        entropy_value = 0.0
        for count in label_counts.values():
            probability = count / total_instances
            entropy_value -= probability * math.log2(probability)

        # Return only after every class has contributed its term (the
        # previous version returned from inside the loop, after the first).
        return entropy_value

    def predict_instance(self, instance, tree):
        """Walk ``tree`` for a single sample and return the label of its leaf."""
        if 'class' in tree:
            return tree['class']

        if instance[tree['feature_index']] <= tree['threshold']:
            return self.predict_instance(instance, tree['left'])
        else:
            return self.predict_instance(instance, tree['right'])

    def predict(self, X, tree):
        """Return a list with the predicted label of every row of ``X``.

        ``tree`` is the nested dictionary produced by :meth:`fit`.
        """
        return [self.predict_instance(instance, tree) for instance in X]

In [12]:
# Separate the encoded feature columns from the target column.
feature_columns = ['age', 'income', 'student', 'credit_rating']
X = np.array(df[feature_columns])
y = np.array(df['buys_computer'])

# Preview the first ten rows of the features and of the labels.
print(X[:10], y[:10], sep='\n')

[[0 2 0 0]
 [0 2 0 1]
 [1 2 0 0]
 [2 1 0 0]
 [2 0 1 0]
 [2 0 1 1]
 [1 0 1 1]
 [0 1 0 0]
 [0 0 1 0]
 [2 1 1 0]]
[0 0 1 1 1 0 1 0 1 1]


In [13]:
# Raw query rows in the order: age, income, student, credit_rating.
raw_queries = [
    ['<=30', 'medium', 'no', 'fair'],
    ['<=30', 'low', 'yes', 'fair'],
]

# `tree` is the classifier object; `model` is the nested dict returned by fit().
tree = DecisionTree(max_depth=6)
model = tree.fit(X, y)

# Encode the queries with the same integer codes used for the training data.
X_test = convert_test_data_point(np.array(raw_queries))
print(X_test)

predictions = tree.predict(X_test, model)
print("Predictions:", predictions)

[[0 1 0 0]
 [0 0 1 0]]
Predictions: [0, 1]
