In [24]:
import numpy as np

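# Bin-edge convention used below (example with num_bins = 3):
# if bins = [v1, v2, v3, v4], then
#   v1 < 1st interval (bin) <= v2
#   v2 < 2nd interval (bin) <= v3
#   v3 < 3rd interval (bin) <= v4
# i.e. each interval is open on the left and closed on the right.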

# Equal width discretization function
# hint: may use numpy's linspace
def equal_width(data, num_bins):
    """Return the edges of ``num_bins`` equal-width bins spanning ``data``.

    Args:
        data: Array-like of numeric values; its minimum and maximum become
            the first and last edge.
        num_bins: Number of intervals to create (must be >= 1).

    Returns:
        A 1-D NumPy array of ``num_bins + 1`` evenly spaced edges running
        from ``min(data)`` to ``max(data)``, both inclusive.

    Raises:
        ValueError: If ``num_bins`` is smaller than 1 (``np.min``/``np.max``
            also raise ``ValueError`` for empty ``data``).
    """
    if num_bins < 1:
        raise ValueError("num_bins must be a positive integer")

    # linspace derives the bin width itself, so there is no need to compute
    # it separately: n bins need n + 1 edges.
    return np.linspace(np.min(data), np.max(data), num_bins + 1)

# Demo of equal-width discretization on a small continuous sample.
data = np.array([1,2,3,4,5,6,7,8,9,10])

# Number of equal-width intervals to split the data range into
num_bins = 4

# Compute the num_bins + 1 bin edges with the equal-width function

bins=equal_width(data, num_bins)

# Assign every value to a bin. With right=True a value x gets index i when
# bins[i-1] < x <= bins[i] (left-open, right-closed intervals).
# NOTE(review): the minimum of data equals bins[0], so it is not inside any
# left-open interval and receives index 0, while all other values receive
# 1..num_bins. This matches the interval convention stated at the top of the
# file; confirm that a separate label for the minimum is intended.

disc_data = np.digitize(data, bins, right=True)

# Display the original data, the bin edges and the per-value bin indices

print("Original data:", data)
print("Bins:", bins)
print("Discretized data:", disc_data)

# 2) Entropy-based discretization: the following program finds the best split point in a numeric data array.

# calculate_entropy
# (input) y: array of target values
# (output) entropy: entropy value of y
# For simplicity, assume binary class labels (0/1) only,
# e.g., [0,0,1,0,0,1,1,1]
# Visual separator between the outputs of the two parts
print('----------------------------------------------------------------------------------------------------------------')

def calculate_entropy(y, smoothing=1):
    """Return the (optionally smoothed) binary entropy of ``y`` in bits.

    Args:
        y: 1-D array-like of binary class labels (0 or 1); the number of
            positives is taken as ``np.sum(y)``.
        smoothing: Pseudo-count added to both the positive and the negative
            class count before the probabilities are computed (additive
            smoothing). Use 0 for the plain, unsmoothed entropy.

    Returns:
        The entropy ``-(p * log2(p) + q * log2(q))`` of the smoothed class
        probabilities. A class with probability 0 contributes 0 (the usual
        ``0 * log 0 = 0`` convention), and an empty ``y`` with
        ``smoothing=0`` yields 0.0 instead of ``nan``.
    """
    n_samples = len(y)
    n_positive = np.sum(y)

    total_count = n_samples + 2 * smoothing
    if total_count == 0:
        # Nothing observed and no pseudo-counts: no uncertainty to measure.
        return 0.0

    counts = np.array(
        [n_positive + smoothing, n_samples - n_positive + smoothing],
        dtype=float,
    )
    probabilities = counts / total_count

    # Drop zero probabilities: log2(0) is -inf and 0 * -inf would be nan.
    probabilities = probabilities[probabilities > 0]
    entropy = -np.sum(probabilities * np.log2(probabilities))

    # Adding 0.0 turns a possible negative zero (pure class) into +0.0.
    return entropy + 0.0

# Demo of calculate_entropy on a balanced binary sample (four 0s, four 1s).
y = np.array([0,0,1,0,0,1,1,1])

# Calculate entropy (uses the function's default smoothing)
entropy = calculate_entropy(y)

# Print the result
print("Entropy:", entropy)

# Visual separator before the next part's output
print('----------------------------------------------------------------------------------------------------------------')

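# information_gain
# (input) y: array of target values
#         y_left: target values that fall in the left interval
#         y_right: target values that fall in the right interval
# e.g., y=[1,2,3,4,5], y_left=[1,2], y_right=[3,4,5]
# (output) info_gain: impurity of y minus the size-weighted average impurity of y_left and y_right.
#          Note: the implementation below measures impurity with variance, not entropy.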

def calculate_variance(y):
    """Return the population variance of ``y`` (NumPy's default ``ddof=0``)."""
    return np.asarray(y).var()

def information_gain(y, y_left, y_right):
    """Return the variance reduction obtained by splitting ``y`` in two.

    Despite its name, this function scores a split with variance rather
    than entropy: the result is ``var(y)`` minus the variances of the two
    children, each weighted by its share of the parent's samples.

    Args:
        y: Target values before the split.
        y_left: Target values assigned to the left side of the split.
        y_right: Target values assigned to the right side of the split.

    Returns:
        ``var(y) - |y_left|/|y| * var(y_left) - |y_right|/|y| * var(y_right)``.
    """
    # Impurity of the parent and of each child
    parent_score, left_score, right_score = (
        calculate_variance(part) for part in (y, y_left, y_right)
    )

    # Each child counts in proportion to the number of samples it holds
    n_parent = len(y)
    left_weight = len(y_left) / n_parent
    right_weight = len(y_right) / n_parent

    return parent_score - left_weight * left_score - right_weight * right_score

# Demo of information_gain on a small numeric target split into two parts.
y = np.array([1,2,3,4,5])
y_left = np.array([1,2])
y_right = np.array([3,4,5])

# Variance of the unsplit targets (computed here but not printed below)
variance = calculate_variance(y)
# Gain of splitting y into y_left / y_right
info_gain = information_gain(y, y_left, y_right)

print("Information Gain:", info_gain)

# Visual separator before the next part's output
print('---------------------------------------------------------------------')

def best_split(X, y):
    """Find the threshold on ``X`` whose split of ``y`` scores highest.

    Every distinct value ``v`` of ``X`` is tried as a threshold: samples with
    ``X <= v`` go left, samples with ``X > v`` go right, and the split is
    scored with ``information_gain``. Thresholds that leave one side empty
    are skipped.

    Args:
        X: 1-D array-like of numeric feature values.
        y: 1-D array-like of target values, aligned with ``X``.

    Returns:
        A tuple ``(best_split_point, best_info_gain)``. If no threshold
        produces two non-empty sides (e.g. all values of ``X`` are equal),
        the result is ``(None, -1)``.
    """
    # Accept plain lists as well: the boolean masks below need NumPy arrays.
    X = np.asarray(X)
    y = np.asarray(y)

    best_info_gain = -1
    best_split_point = None

    for value in np.unique(X):
        y_left = y[X <= value]
        y_right = y[X > value]

        # A one-sided partition is not a split (e.g. value is the maximum).
        if len(y_left) == 0 or len(y_right) == 0:
            continue

        info_gain = information_gain(y, y_left, y_right)

        # Strict comparison keeps the smallest threshold among ties.
        if info_gain > best_info_gain:
            best_info_gain = info_gain
            best_split_point = value

    return best_split_point, best_info_gain


# Demo of best_split: the feature and the target hold the same values here.
X = np.array([1, 2, 3, 4, 5])
y = np.array([1, 2, 3, 4, 5])

# best_split returns (threshold, gain); only the threshold is printed below
split_point, info_gain = best_split(X, y)

print("Best Split Point:", split_point)



Original data: [ 1  2  3  4  5  6  7  8  9 10]
Bins: [ 1.    3.25  5.5   7.75 10.  ]
Discretized data: [0 1 1 2 2 3 3 4 4 4]
----------------------------------------------------------------------------------------------------------------
Entropy: 1.0
----------------------------------------------------------------------------------------------------------------
Information Gain: 1.5
---------------------------------------------------------------------
Best Split Point: 2
