In [51]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
%matplotlib inline

read the CSV file

In [52]:
df=pd.read_csv("play_tennis.csv")
print(df.head())

  day   outlook  temp humidity    wind play
0  D1     Sunny   Hot     High    Weak   No
1  D2     Sunny   Hot     High  Strong   No
2  D3  Overcast   Hot     High    Weak  Yes
3  D4      Rain  Mild     High    Weak  Yes
4  D5      Rain  Cool   Normal    Weak  Yes


split the data

In [53]:
X=df.drop(columns='play')
y=df['play']

In [64]:
X = df.drop('play', axis=1)  # Assuming 'PlayTennis' is the target column
y = df['play']

# Convert categorical variables to numerical values (if needed)
X = pd.get_dummies(X)
y = y.map({'Yes': 1, 'No': 0})  # Map the target to 1 and 0

In [65]:
def entropy(y):
    # Get the unique classes and their counts
    class_labels, counts = np.unique(y, return_counts=True)
    probabilities = counts / len(y)
    
    # Calculate the entropy
    ent = -np.sum([p * np.log2(p) for p in probabilities])
    return ent


In [66]:
def gini_index(y):
    class_labels, counts = np.unique(y, return_counts=True)
    probabilities = counts / len(y)
    
    gini = 1 - np.sum([p ** 2 for p in probabilities])
    return gini

In [67]:
def information_gain(parent_set, left_child, right_child, criterion='entropy'):
    # Compute the entropy/gini of the parent node
    if criterion == 'entropy':
        parent_entropy = entropy(parent_set)
    elif criterion == 'gini':
        parent_entropy = gini_index(parent_set)
    
    # Calculate the weighted average entropy/gini of the children
    weight_left = len(left_child) / len(parent_set)
    weight_right = len(right_child) / len(parent_set)
    
    if criterion == 'entropy':
        gain = parent_entropy - (weight_left * entropy(left_child) + weight_right * entropy(right_child))
    elif criterion == 'gini':
        gain = parent_entropy - (weight_left * gini_index(left_child) + weight_right * gini_index(right_child))
    
    return gain

In [72]:
def best_split(X, y, criterion='entropy'):
    best_feature = None
    best_threshold = None
    best_gain = -1
    n_features = X.shape[1]
    
    # Iterate over all features
    for feature in range(n_features):
        # Fixing the feature_values access
        feature_values = X.iloc[:, feature].values
        possible_thresholds = np.unique(feature_values)
        
        # Try every possible threshold
        for threshold in possible_thresholds:
            left_indices = np.where(feature_values <= threshold)[0]
            right_indices = np.where(feature_values > threshold)[0]
            
            if len(left_indices) == 0 or len(right_indices) == 0:
                continue
            
            left_child, right_child = y[left_indices], y[right_indices]
            
            # Calculate the gain for this split
            gain = information_gain(y, left_child, right_child, criterion=criterion)
            
            if gain > best_gain:
                best_gain = gain
                best_feature = feature
                best_threshold = threshold
    
    return best_feature, best_threshold, best_gain


In [73]:
class DecisionTreeNode:
    def __init__(self, feature=None, threshold=None, left=None, right=None, *, value=None):
        self.feature = feature  # Index of feature to split on
        self.threshold = threshold  # Value of threshold to split
        self.left = left  # Left subtree
        self.right = right  # Right subtree
        self.value = value  # Value if this node is a leaf

    def is_leaf_node(self):
        return self.value is not None

In [74]:
class DecisionTreeClassifier:
    def __init__(self, max_depth=10, criterion='entropy'):
        self.max_depth = max_depth
        self.criterion = criterion
        self.root = None
    
    def build_tree(self, X, y, depth=0):
        n_samples, n_features = X.shape
        n_labels = len(np.unique(y))

        # If only one class left or max depth reached, create leaf node
        if n_labels == 1 or depth >= self.max_depth:
            leaf_value = self.most_common_label(y)
            return DecisionTreeNode(value=leaf_value)
        
        # Find the best split
        feature, threshold, gain = best_split(X, y, criterion=self.criterion)
        if gain == -1:
            leaf_value = self.most_common_label(y)
            return DecisionTreeNode(value=leaf_value)
        
        # Create the child nodes by splitting the data
        left_indices = np.where(X[:, feature] <= threshold)[0]
        right_indices = np.where(X[:, feature] > threshold)[0]
        
        left_subtree = self.build_tree(X[left_indices, :], y[left_indices], depth + 1)
        right_subtree = self.build_tree(X[right_indices, :], y[right_indices], depth + 1)
        
        return DecisionTreeNode(feature=feature, threshold=threshold, left=left_subtree, right=right_subtree)
    
    def most_common_label(self, y):
        counts = np.bincount(y)
        return np.argmax(counts)
    
    def fit(self, X, y):
        self.root = self.build_tree(X, y)
    
    def predict_one(self, x, node):
        # If it's a leaf node, return the value
        if node.is_leaf_node():
            return node.value
        
        # Split based on the feature and threshold
        if x[node.feature] <= node.threshold:
            return self.predict_one(x, node.left)
        else:
            return self.predict_one(x, node.right)
    
    def predict(self, X):
        return [self.predict_one(x, self.root) for x in X]


In [75]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the dataset (replace with actual loading code)
# X, y = load_your_data()

# Split the data (90% training, 10% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Train the Decision Tree with both criteria
tree_entropy = DecisionTreeClassifier(max_depth=10, criterion='entropy')
tree_entropy.fit(X_train, y_train)

tree_gini = DecisionTreeClassifier(max_depth=10, criterion='gini')
tree_gini.fit(X_train, y_train)

# Predict
y_pred_entropy = tree_entropy.predict(X_test)
y_pred_gini = tree_gini.predict(X_test)

# Evaluate
print(f"Accuracy with Entropy: {accuracy_score(y_test, y_pred_entropy)}")
print(f"Accuracy with Gini Index: {accuracy_score(y_test, y_pred_gini)}")


KeyError: '[9, 11] not in index'