In [5]:
import pandas as pd

# Function to calculate prior probabilities
def calculate_prior_probabilities(y):
    return y.value_counts(normalize=True)

# Function to calculate likelihoods with Laplace smoothing
def calculate_likelihoods_with_smoothing(X, y):
    likelihoods = {}
    for column in X.columns:
        likelihoods[column] = {}
        for class_ in y.unique():
            # Calculate normalized counts with smoothing
            class_data = X[y == class_][column]
            counts = class_data.value_counts()
            total_count = len(class_data) + len(X[column].unique())  # total count with smoothing
            likelihoods[column][class_] = (counts + 1) / total_count  # add-1 smoothing
    return likelihoods

# Naive Bayes classifier function
def naive_bayes_classifier(X_test, priors, likelihoods):
    predictions = []
    for _, data_point in X_test.iterrows():
        class_probabilities = {}
        for class_ in priors.index:
            class_probabilities[class_] = priors[class_]
            for feature in X_test.columns:
                # Use .get to safely retrieve probability and get a default of 1/total to handle unseen values
                # TODO: Retrieve feature_probs for the current class and feature
                feature_probs = likelihoods[feature][class_]
                # TODO: Safely retrieve the likelihood from feature_probs using .get(data_point[feature], default value)
                class_probabilities[class_] *= feature_probs.get(data_point[feature], 1 / (len(feature_probs) + 1))
                # TODO: Update the class_probabilities for the class using the retrieved likelihood

        # TODO: Append to predictions the class with the maximum posterior probability for this data point
    predictions.append(max(class_probabilities, key=class_probabilities.get))

    return predictions

df = pd.DataFrame({
    'Temperature': ['Hot', 'Cold', 'Cold', 'Hot', 'Cold'],
    'Outlook': ['Sunny', 'Rainy', 'Rainy', 'Sunny', 'Sunny'],
    'Play': ['Yes', 'Yes', 'No', 'Yes', 'No']
})

# Calculate the prior probabilities
priors = calculate_prior_probabilities(df['Play'])

# Calculate the likelihoods with Laplace smoothing
likelihoods = calculate_likelihoods_with_smoothing(df[['Temperature', 'Outlook']], df['Play'])

# Predicting whether we can play or not on a new day
new_day = pd.DataFrame([{'Temperature': 'Hot', 'Outlook': 'Sunny'}])
predictions = naive_bayes_classifier(new_day, priors, likelihoods)
print("Can we play on a new day? ", predictions[0])

Can we play on a new day?  Yes


In [6]:
## Decision Tree

In [14]:
def test_split(index, value, dataset):
    left, right = list(), list()
    for row in dataset:
        if row[index] < value:
            left.append(row)
        else:
            right.append(row)
    return left, right

def gini_index(groups, classes):
    total_instances = sum([len(group) for group in groups])
    gini = 0.0
    for group in groups:
        size = float(len(group))
        if size == 0:
            continue
        score = sum([(group.count(class_val) / size) ** 2 for class_val in classes])
        gini += (1 - score) * (size / total_instances)
    return gini

def get_split(dataset):
    class_values = list(set(row[-1] for row in dataset))
    b_index, b_value, b_score, b_groups = 999, 999, 999, None
    for index in range(len(dataset[0])-1):
        for row in dataset:
            # TODO: call the test_split function to get groups
            group = test_split(index , row[index], dataset)
            # TODO: calculate the gini index using the gini_index function
            gini = gini_index(group , class_values)
            if gini < b_score:
                # TODO: update best index, best value, best score and best groups 
                b_index , b_value , b_score , b_groups = index , row[index] , gini , group 
    return {'index': b_index, 'value': b_value, 'groups': b_groups}

# Dataset: Age, Preferred Genre, Likelihood to Watch
dataset = [[16, 'Comedy', 'Yes'], [21, 'Action', 'No'], [25, 'Comedy', 'Yes']]
split = get_split(dataset)
print('Best Split: Column Index:', split['index'], 'Value:', split['value'])

Best Split: Column Index: 0 Value: 16


In [15]:
def create_terminal(group):
    outcomes = [row[-1] for row in group]
    return max(set(outcomes), key=outcomes.count)

In [16]:
def build_tree(train, max_depth, min_size):
    root = get_best_split(train)
    recurse_split(root, max_depth, min_size, 1)
    return root

In [18]:
# Function for getting the majority class for creating a terminal node
def create_terminal(group):
    outcomes = [row[-1] for row in group]
    return max(set(outcomes), key=outcomes.count)

# Select the best split point for a dataset
def get_best_split(dataset):
    class_values = list(set(row[-1] for row in dataset))
    b_index, b_value, b_score, b_groups = float('inf'), float('inf'), float('inf'), None
    for index in range(len(dataset[0])-1):
        for row in dataset:
            groups = test_split(index, row[index], dataset)
            gini = gini_index(groups, class_values)
            if gini < b_score:
                b_index, b_value, b_score, b_groups = index, row[index], gini, groups
    return {'index':b_index, 'value':b_value, 'groups':b_groups}

# Split a dataset based on an attribute and attribute value
def test_split(index, value, dataset):
    left, right = list(), list()
    for row in dataset:
        if row[index] < value:
            left.append(row)
        else:
            right.append(row)
    return left, right

# Calculate the Gini index for a split dataset
def gini_index(groups, classes):
    n_instances = float(sum([len(group) for group in groups]))
    gini = 0.0
    for group in groups:
        size = float(len(group))
        if size == 0:
            continue
        score = 0.0
        for class_val in classes:
            p = [row[-1] for row in group].count(class_val) / size
            score += p * p
        gini += (1.0 - score) * (size / n_instances)
    return gini

# Recursive function to create child splits or make terminal
def split(node, max_depth, min_size, depth):
    left, right = node['groups']
    del(node['groups'])
    if not left or not right or depth >= max_depth:
        node['left'] = node['right'] = create_terminal(left + right)
        return
    if len(left) <= min_size:
        node['left'] = create_terminal(left)
    else:
        node['left'] = get_best_split(left)
        split(node['left'], max_depth, min_size, depth+1)
    if len(right) <= min_size:
        node['right'] = create_terminal(right)
    else:
        node['right'] = get_best_split(right)
        split(node['right'], max_depth, min_size, depth+1)

# Build a decision tree
def build_tree(train, max_depth, min_size):
    root = get_best_split(train)
    split(root, max_depth, min_size, 1)
    return root

# Function to print a decision tree recursively
def print_tree(node, depth=0):
    if isinstance(node, dict):
        print('%s[X%d < %.3f]' % ((depth*' ', (node['index']+1), node['value'])))
        print_tree(node['left'], depth+1)
        print_tree(node['right'], depth+1)
    else:
        print('%s[%s]' % ((depth*' ', node))) 

def predict(node, row):
    if row[node['index']] < node['value']:
        if isinstance(node['left'], dict):
            return predict(node['left'], row)
        else:
            return node['left']
    else:
        if isinstance(node['right'], dict):
            return predict(node['right'], row)
        else:
            return node['right']

# Example usage:
for row in dataset:
    prediction = predict(tree, row)
    print('Expected=%d, Got=%d' % (row[-1], prediction))

# Test building a tree with recursive functions
dataset = [[2.771244718,1.784783929,0],
           [1.728571309,1.169761413,0],
           [3.678319846,2.81281357,0],
           [3.961043357,2.61995032,0],
           [2.999208922,2.209014212,0],
           [7.497545867,3.162953546,1],
           [9.00220326,3.339047188,1],
           [7.444542326,0.476683375,1],
           [10.12493903,3.234550982,1],
           [6.642287351,3.319983761,1]]
max_depth = 3
min_size = 1
tree = build_tree(dataset, max_depth, min_size)
print_tree(tree)

Expected=0, Got=0
Expected=0, Got=0
Expected=0, Got=0
Expected=0, Got=0
Expected=0, Got=0
Expected=1, Got=1
Expected=1, Got=1
Expected=1, Got=1
Expected=1, Got=1
Expected=1, Got=1
[X1 < 6.642]
 [X1 < 2.771]
  [0]
  [X1 < 2.771]
   [0]
   [0]
 [X1 < 7.498]
  [X1 < 7.445]
   [1]
   [1]
  [X1 < 7.498]
   [1]
   [1]


In [21]:
def entropy(groups, classes):
    total_instances = sum(len(group) for group in groups)
    entropy_value = 0.0
    for group in groups:
        size = len(group)
        if size == 0:
            continue
        score = 0.0
        for class_val in classes:
            p = [row[-1] for row in group].count(class_val) / size
            if p > 0:
                score -= p * math.log2(p)
        entropy_value += score * (size / total_instances)
    return entropy_value

def gini_index(groups, classes):
    total_instances = sum(len(group) for group in groups)
    gini = 0.0
    for group in groups:
        size = len(group)
        if size == 0:
            continue
        score = 0.0
        for class_val in classes:
            p = [row[-1] for row in group].count(class_val) / size
            score += p * p
        gini += (1.0 - score) * (size / total_instances)
    return gini 

class DecisionTree:
    def __init__(self):
        self.root = None

    def fit(self, dataset):
        # Implement tree building logic here
        pass

    def predict(self, row):
        return self._predict(self.root, row)

    def _predict(self, node, row):
        if node.label is not None:
            return node.label
        if row[node.index] < node.value:
            return self._predict(node.left, row)
        else:
            return self._predict(node.right, row) 

class InnerNode:
    def __init__(self, index=None, value=None, left=None, right=None, label=None):
        self.index = index  # Feature index for splitting
        self.value = value  # Feature value for splitting
        self.left = left    # Left child node
        self.right = right  # Right child node
        self.label = label  # Class label for leaf nodes