##  Iterative Dichotomiser 3 (ID3) Algorithms

In [2]:
import numpy as np
import pandas as pd
from collections import Counter
from math import log2

# Function to calculate entropy
def entropy(y):
    counts = Counter(y)
    probabilities = [count / len(y) for count in counts.values()]
    return -sum(p * log2(p) for p in probabilities)

# Function to calculate the information gain
def information_gain(X, y, attribute):
    # Calculate the entropy of the full dataset
    entropy_full = entropy(y)
    
    # Get the values and counts of the attribute
    values, counts = np.unique(X[attribute], return_counts=True)
    
    # Calculate the weighted entropy for the subsets
    entropy_subset = 0
    for value, count in zip(values, counts):
        subset_y = y[X[attribute] == value]
        entropy_subset += (count / len(y)) * entropy(subset_y)
    
    # Calculate the information gain
    return entropy_full - entropy_subset

# Function to create the decision tree
def id3(X, y, attributes, depth=0, max_depth=None):
    # If all labels are the same, return the label
    if len(set(y)) == 1:
        return y[0]
    
    # If there are no more attributes or max depth is reached, return the most common label
    if not attributes or (max_depth is not None and depth == max_depth):
        return Counter(y).most_common(1)[0][0]
    
    # Find the best attribute to split on
    gains = {attribute: information_gain(X, y, attribute) for attribute in attributes}
    best_attribute = max(gains, key=gains.get)
    
    # Create the tree/sub-tree
    tree = {best_attribute: {}}
    
    # Get the values of the best attribute
    values = np.unique(X[best_attribute])
    
    # Recur for each subset
    for value in values:
        subset_X = X[X[best_attribute] == value]
        subset_y = y[X[best_attribute] == value]
        subtree = id3(subset_X, subset_y, [attr for attr in attributes if attr != best_attribute], depth + 1, max_depth)
        tree[best_attribute][value] = subtree
    
    return tree

# Function to make predictions with the decision tree
def predict(tree, sample):
    if not isinstance(tree, dict):
        return tree
    attribute = next(iter(tree))
    value = sample[attribute]
    subtree = tree[attribute].get(value, None)
    if subtree is None:
        return None
    return predict(subtree, sample)

# Example usage
if __name__ == "__main__":
    # Example dataset
    data = {
        'Outlook': ['Sunny', 'Sunny', 'Overcast', 'Rain', 'Rain', 'Rain', 'Overcast', 'Sunny', 'Sunny', 'Rain', 'Sunny', 'Overcast', 'Overcast', 'Rain'],
        'Temperature': ['Hot', 'Hot', 'Hot', 'Mild', 'Cool', 'Cool', 'Cool', 'Mild', 'Cool', 'Mild', 'Mild', 'Mild', 'Hot', 'Mild'],
        'Humidity': ['High', 'High', 'High', 'High', 'Normal', 'Normal', 'Normal', 'High', 'Normal', 'Normal', 'Normal', 'High', 'Normal', 'High'],
        'Wind': ['Weak', 'Strong', 'Weak', 'Weak', 'Weak', 'Strong', 'Strong', 'Weak', 'Weak', 'Weak', 'Strong', 'Strong', 'Weak', 'Strong'],
        'PlayTennis': ['No', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No']
    }

    # Convert to DataFrame
    df = pd.DataFrame(data)

    # Features and labels
    X = df.drop(columns='PlayTennis')
    y = df['PlayTennis']

    # Build the tree
    tree = id3(X, y, X.columns, max_depth=3)
    print("Decision Tree:", tree)

    # Predict a sample
    sample = {'Outlook': 'Sunny', 'Temperature': 'Cool', 'Humidity': 'High', 'Wind': 'Strong'}
    prediction = predict(tree, sample)
    print("Prediction for sample:", prediction)


ValueError: The truth value of a Index is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

#### C5. Algorithms

In [None]:
import numpy as np
import pandas as pd
from collections import Counter
from math import log2

# Function to calculate entropy
def entropy(y):
    counts = Counter(y)
    probabilities = [count / len(y) for count in counts.values()]
    return -sum(p * log2(p) for p in probabilities)

# Function to calculate the information gain
def information_gain(X, y, attribute):
    # Calculate the entropy of the full dataset
    entropy_full = entropy(y)
    
    # Get the values and counts of the attribute
    values, counts = np.unique(X[attribute], return_counts=True)
    
    # Calculate the weighted entropy for the subsets
    entropy_subset = 0
    for value, count in zip(values, counts):
        subset_y = y[X[attribute] == value]
        entropy_subset += (count / len(y)) * entropy(subset_y)
    
    # Calculate the information gain
    return entropy_full - entropy_subset

# Function to calculate the intrinsic value of an attribute
def intrinsic_value(X, attribute):
    values, counts = np.unique(X[attribute], return_counts=True)
    total = sum(counts)
    return -sum((count / total) * log2(count / total) for count in counts)

# Function to calculate the gain ratio
def gain_ratio(X, y, attribute):
    gain = information_gain(X, y, attribute)
    intrinsic = intrinsic_value(X, attribute)
    return gain / intrinsic if intrinsic != 0 else 0

# Function to create the decision tree
def c50(X, y, attributes, depth=0, max_depth=None):
    # If all labels are the same, return the label
    if len(set(y)) == 1:
        return y[0]
    
    # If there are no more attributes or max depth is reached, return the most common label
    if not attributes or (max_depth is not None and depth == max_depth):
        return Counter(y).most_common(1)[0][0]
    
    # Find the best attribute to split on
    gain_ratios = {attribute: gain_ratio(X, y, attribute) for attribute in attributes}
    best_attribute = max(gain_ratios, key=gain_ratios.get)
    
    # Create the tree/sub-tree
    tree = {best_attribute: {}}
    
    # Get the values of the best attribute
    values = np.unique(X[best_attribute])
    
    # Recur for each subset
    for value in values:
        subset_X = X[X[best_attribute] == value]
        subset_y = y[X[best_attribute] == value]
        subtree = c50(subset_X, subset_y, [attr for attr in attributes if attr != best_attribute], depth + 1, max_depth)
        tree[best_attribute][value] = subtree
    
    return tree

# Function to make predictions with the decision tree
def predict(tree, sample):
    if not isinstance(tree, dict):
        return tree
    attribute = next(iter(tree))
    value = sample[attribute]
    subtree = tree[attribute].get(value, None)
    if subtree is None:
        return None
    return predict(subtree, sample)

# Example usage
if __name__ == "__main__":
    # Example dataset
    data = {
        'Outlook': ['Sunny', 'Sunny', 'Overcast', 'Rain', 'Rain', 'Rain', 'Overcast', 'Sunny', 'Sunny', 'Rain', 'Sunny', 'Overcast', 'Overcast', 'Rain'],
        'Temperature': ['Hot', 'Hot', 'Hot', 'Mild', 'Cool', 'Cool', 'Cool', 'Mild', 'Cool', 'Mild', 'Mild', 'Mild', 'Hot', 'Mild'],
        'Humidity': ['High', 'High', 'High', 'High', 'Normal', 'Normal', 'Normal', 'High', 'Normal', 'Normal', 'Normal', 'High', 'Normal', 'High'],
        'Wind': ['Weak', 'Strong', 'Weak', 'Weak', 'Weak', 'Strong', 'Strong', 'Weak', 'Weak', 'Weak', 'Strong', 'Strong', 'Weak', 'Strong'],
        'PlayTennis': ['No', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No']
    }

    # Convert to DataFrame
    df = pd.DataFrame(data)

    # Features and labels
    X = df.drop(columns='PlayTennis')
    y = df['PlayTennis']

    # Build the tree
    tree = c50(X, y, X.columns, max_depth=3)
    print("Decision Tree:", tree)

    # Predict a sample
    sample = {'Outlook': 'Sunny', 'Temperature': 'Cool', 'Humidity': 'High', 'Wind': 'Strong'}
    prediction = predict(tree, sample)
    print("Prediction for sample:", prediction)


#### Classification and Regression Trees Algorithms

##### Helper Functions

In [None]:
import numpy as np
import pandas as pd
from collections import Counter

# Function to calculate Gini impurity for classification
def gini_impurity(y):
    counts = Counter(y)
    probabilities = [count / len(y) for count in counts.values()]
    return 1 - sum(p ** 2 for p in probabilities)

# Function to calculate Mean Squared Error for regression
def mean_squared_error(y):
    mean = np.mean(y)
    return np.mean((y - mean) ** 2)

# Function to calculate the impurity for a given split
def calculate_impurity(X, y, split_attribute, split_value, task='classification'):
    left_mask = X[split_attribute] <= split_value
    right_mask = X[split_attribute] > split_value
    
    if task == 'classification':
        left_impurity = gini_impurity(y[left_mask])
        right_impurity = gini_impurity(y[right_mask])
    elif task == 'regression':
        left_impurity = mean_squared_error(y[left_mask])
        right_impurity = mean_squared_error(y[right_mask])
    
    left_weight = len(y[left_mask]) / len(y)
    right_weight = len(y[right_mask]) / len(y)
    
    return left_weight * left_impurity + right_weight * right_impurity

# Function to find the best split for the dataset
def find_best_split(X, y, attributes, task='classification'):
    best_impurity = float('inf')
    best_split = None
    
    for attribute in attributes:
        values = np.unique(X[attribute])
        for value in values:
            impurity = calculate_impurity(X, y, attribute, value, task)
            if impurity < best_impurity:
                best_impurity = impurity
                best_split = (attribute, value)
    
    return best_split


#### Decision Tree Implementation

In [None]:
class DecisionTree:
    def __init__(self, task='classification', max_depth=None, min_samples_split=2):
        self.task = task
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.tree = None
    
    def fit(self, X, y):
        self.tree = self._build_tree(X, y, depth=0)
    
    def _build_tree(self, X, y, depth):
        if len(set(y)) == 1:
            return y.iloc[0] if self.task == 'classification' else np.mean(y)
        if len(y) < self.min_samples_split or (self.max_depth is not None and depth >= self.max_depth):
            return Counter(y).most_common(1)[0][0] if self.task == 'classification' else np.mean(y)
        
        best_split = find_best_split(X, y, X.columns, self.task)
        if best_split is None:
            return Counter(y).most_common(1)[0][0] if self.task == 'classification' else np.mean(y)
        
        attribute, value = best_split
        tree = {attribute: {}}
        
        left_mask = X[attribute] <= value
        right_mask = X[attribute] > value
        
        left_subtree = self._build_tree(X[left_mask], y[left_mask], depth + 1)
        right_subtree = self._build_tree(X[right_mask], y[right_mask], depth + 1)
        
        tree[attribute]['<= {}'.format(value)] = left_subtree
        tree[attribute]['> {}'.format(value)] = right_subtree
        
        return tree
    
    def predict(self, X):
        return X.apply(self._predict_sample, axis=1, args=(self.tree,))
    
    def _predict_sample(self, sample, tree):
        if not isinstance(tree, dict):
            return tree
        attribute = next(iter(tree))
        value = float(next(iter(tree[attribute].keys())).split()[1])
        if sample[attribute] <= value:
            return self._predict_sample(sample, tree[attribute]['<= {}'.format(value)])
        else:
            return self._predict_sample(sample, tree[attribute]['> {}'.format(value)])

# Example usage
if __name__ == "__main__":
    # Example dataset for classification
    data = {
        'Outlook': ['Sunny', 'Sunny', 'Overcast', 'Rain', 'Rain', 'Rain', 'Overcast', 'Sunny', 'Sunny', 'Rain', 'Sunny', 'Overcast', 'Overcast', 'Rain'],
        'Temperature': ['Hot', 'Hot', 'Hot', 'Mild', 'Cool', 'Cool', 'Cool', 'Mild', 'Cool', 'Mild', 'Mild', 'Mild', 'Hot', 'Mild'],
        'Humidity': ['High', 'High', 'High', 'High', 'Normal', 'Normal', 'Normal', 'High', 'Normal', 'Normal', 'Normal', 'High', 'Normal', 'High'],
        'Wind': ['Weak', 'Strong', 'Weak', 'Weak', 'Weak', 'Strong', 'Strong', 'Weak', 'Weak', 'Weak', 'Strong', 'Strong', 'Weak', 'Strong'],
        'PlayTennis': ['No', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No']
    }

    # Convert to DataFrame
    df = pd.DataFrame(data)

    # Features and labels for classification
    X_classification = df.drop(columns='PlayTennis')
    y_classification = df['PlayTennis']

    # Build and fit the classification tree
    tree_classification = DecisionTree(task='classification', max_depth=3)
    tree_classification.fit(X_classification, y_classification)
    print("Classification Tree:", tree_classification.tree)

    # Predict a sample for classification
    sample_classification = {'Outlook': 'Sunny', 'Temperature': 'Cool', 'Humidity': 'High', 'Wind': 'Strong'}
    prediction_classification = tree_classification.predict(pd.DataFrame([sample_classification]))
    print("Prediction for classification sample:", prediction_classification.iloc[0])

    # Example dataset for regression
    data_regression = {
        'Feature1': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
        'Feature2': [10, 9, 8, 7, 6, 5, 4, 3, 2, 1],
        'Target': [2.3, 2.1, 3.6, 3.8, 5.0, 5.2, 7.8, 7.6, 8.5, 8.7]
    }

    # Convert to DataFrame
    df_regression = pd.DataFrame(data_regression)

    # Features and labels for regression
    X_regression = df_regression.drop(columns='Target')
    y_regression = df_regression['Target']

    # Build and fit the regression tree
    tree_regression = DecisionTree(task='regression', max_depth=3)
    tree_regression.fit(X_regression, y_regression)
    print("Regression Tree:", tree_regression.tree)

    # Predict a sample for regression
    sample_regression = {'Feature1': 5, 'Feature2': 6}
    prediction_regression = tree_regression.predict(pd.DataFrame([sample_regression]))
    print("Prediction for regression sample:", prediction_regression.iloc[0])
