In [6]:
#10. 1)
#Equal Width Discretization
import numpy as np

# Equal width discretization function
def equal_width(data, num_bins):
    """Return the edges of ``num_bins`` equal-width intervals spanning ``data``.

    The edges run from the smallest to the largest value found in ``data``,
    so the returned array holds ``num_bins + 1`` evenly spaced numbers.
    """
    lowest, highest = np.min(data), np.max(data)
    edge_count = num_bins + 1  # n intervals are delimited by n + 1 edges
    return np.linspace(lowest, highest, edge_count)

# Continuous data
data = np.array([1.5, 2.2, 2.8, 3.0, 3.7, 4.1, 5.5, 5.8])

# Define the number of bins
num_bins = 3

# Compute intervals using equal width function
bins = equal_width(data, num_bins)

# Using bins, discretize the data by assigning it to bins labelled 1..num_bins.
# With right=True, np.digitize uses the intervals (bins[i-1], bins[i]], so the
# minimum value (which equals bins[0]) would receive label 0 and form a spurious
# extra bin. Clipping folds it into the first real bin without changing any
# other label.
disc_data = np.clip(np.digitize(data, bins, right=True), 1, num_bins)

# Display the results
print("Original Data:", data)
print("Bins:", bins)
print("Discretized Data:", disc_data)


ModuleNotFoundError: No module named 'numpy'

In [None]:
#10. 2)
# Entropy-based Discretization 
#Calculate entropy of target values
def calculate_entropy(y):
    """Return the Shannon entropy, in bits, of the labels in ``y``.

    Args:
        y: array-like of class labels.

    Returns:
        The entropy ``-sum(p * log2(p))`` over the observed label frequencies.
        A set containing a single distinct label yields exactly 0.0.
    """
    _, counts = np.unique(y, return_counts=True)
    probabilities = counts / counts.sum()
    # np.unique only reports labels that actually occur, so every probability
    # is strictly positive and log2 is safe without an epsilon. (Adding one
    # skews the result: a pure set came out slightly negative.)
    entropy = -np.sum(probabilities * np.log2(probabilities))
    # Adding 0.0 turns the -0.0 produced for a pure set into a plain 0.0.
    return entropy + 0.0

# Information gain calculation
def information_gain(y, y_left, y_right):
    """Return the entropy reduction obtained by splitting ``y`` in two.

    The gain is the entropy of ``y`` minus the size-weighted average of the
    entropies of ``y_left`` and ``y_right``; the left weight is
    ``len(y_left) / len(y)`` and the right weight is its complement.
    """
    left_weight = len(y_left) / len(y)
    right_weight = 1 - left_weight
    children_entropy = (
        left_weight * calculate_entropy(y_left)
        + right_weight * calculate_entropy(y_right)
    )
    return calculate_entropy(y) - children_entropy

# Find the best split point based on information gain
def best_split(X, y):
    """Find the threshold on ``X`` that maximises information gain on ``y``.

    Every distinct value ``v`` of ``X`` is tried as a split point: samples with
    ``X <= v`` go left, samples with ``X > v`` go right. Splits that leave one
    side empty are skipped.

    Args:
        X: 1-D array-like of feature values.
        y: array-like of labels, aligned element-wise with ``X``.

    Returns:
        The value of ``X`` giving the highest information gain (the smallest
        such value on ties), or ``None`` when no split leaves both sides
        non-empty.
    """
    # Coerce so that plain Python lists work too: boolean-mask indexing below
    # requires numpy arrays.
    X = np.asarray(X)
    y = np.asarray(y)

    best_info_gain = -1
    best_split_point = None

    for value in np.unique(X):
        y_left = y[X <= value]
        y_right = y[X > value]

        # A one-sided partition is not a split (always the case for the
        # largest value of X).
        if len(y_left) == 0 or len(y_right) == 0:
            continue

        info_gain = information_gain(y, y_left, y_right)

        # Strict comparison keeps the first (smallest) threshold on ties.
        if info_gain > best_info_gain:
            best_info_gain = info_gain
            best_split_point = value

    return best_split_point

# Toy dataset for exercising best_split: one feature value and one binary label per sample
data = np.arange(1, 10)  # feature values 1..9
target = np.array([0, 0, 1, 1, 1, 0, 0, 1, 1])

# Locate the threshold with the highest information gain and report it
split_point = best_split(data, target)
print("Best Split Point:", split_point)


In [None]:
# Transaction database for frequent-itemset mining: each inner list is one
# transaction, given as the item names it contains.
trans_db = [
    ['milk', 'bread', 'biscuit'], 
    ['bread', 'milk', 'biscuit', 'cornflakes'],
    ['bread', 'tea', 'bournvita'], 
    ['jam', 'maggi', 'bread', 'milk'], 
    ['maggi', 'tea', 'biscuit'], 
    ['bread', 'tea', 'bournvita'], 
    ['maggi', 'tea', 'cornflakes'], 
    ['maggi', 'bread', 'tea', 'biscuit'],
    ['jam', 'maggi', 'bread', 'tea'], 
    ['bread', 'milk'], 
    ['coffee', 'cock', 'biscuit', 'cornflakes'], 
    ['coffee', 'cock', 'biscuit', 'cornflakes'], 
    ['coffee', 'sugar', 'bournvita'], 
    ['bread', 'coffee', 'cock'],   
    ['bread', 'sugar', 'biscuit'], 
    ['coffee', 'sugar', 'cornflakes'], 
    ['bread', 'sugar', 'bournvita'],
    ['bread', 'coffee', 'sugar'], 
    ['bread', 'coffee', 'sugar'], 
    ['tea', 'milk', 'coffee', 'cornflakes']
]

# Minimum support as an absolute transaction count: an itemset is treated as
# frequent when at least this many transactions contain it.
min_support = 3

# Compute support value of an itemset
def compute_support(itemset, transactions=None):
    """Count the transactions that contain every item of ``itemset``.

    Args:
        itemset: iterable of hashable items.
        transactions: optional iterable of transactions (each an iterable of
            items). Defaults to the module-level ``trans_db``.

    Returns:
        The number of transactions that are supersets of ``itemset`` (an
        absolute count, not a fraction).
    """
    if transactions is None:
        transactions = trans_db
    wanted = set(itemset)  # built once rather than once per transaction
    return sum(1 for trans in transactions if wanted.issubset(trans))

# Generate k+1 frequent itemsets from k frequent itemsets
def generate_k_1_itemsets(k_itemsets, infreq_itemsets=None):
    """Build the frequent (k+1)-itemsets from a list of frequent k-itemsets.

    Every pair of k-itemsets is unioned; unions with exactly k+1 items are
    candidates. A candidate is kept when ``compute_support`` reports at least
    ``min_support`` transactions for it.

    Args:
        k_itemsets: list of k-itemsets, all of the same length k.
        infreq_itemsets: optional list that receives each rejected candidate.
            When omitted, rejected candidates are simply discarded.

    Returns:
        A list of sorted (k+1)-itemsets without duplicates, in order of first
        discovery. Empty when ``k_itemsets`` is empty.
    """
    if not k_itemsets:
        return []

    target_size = len(k_itemsets[0]) + 1
    k_1_itemsets = []
    # The same candidate arises from several pairs ({a,b}|{a,c} and
    # {a,b}|{b,c} both give {a,b,c}); remember what was already evaluated so it
    # is neither counted nor reported twice.
    seen = set()

    for i, first in enumerate(k_itemsets):
        for second in k_itemsets[i + 1:]:
            new_itemset = sorted(set(first) | set(second))
            if len(new_itemset) != target_size:
                continue

            key = tuple(new_itemset)
            if key in seen:
                continue
            seen.add(key)

            if compute_support(new_itemset) >= min_support:
                k_1_itemsets.append(new_itemset)
            elif infreq_itemsets is not None:
                # Only record rejects when the caller supplied a list; the
                # previous code appended to a name that was never defined and
                # raised NameError on the first infrequent candidate.
                infreq_itemsets.append(new_itemset)

    return k_1_itemsets

# Original 1-items (without considering min_support values).
# Sorted so that the item order, and with it every itemset list printed below,
# is identical on every run: iterating a bare set of strings follows hash
# order, which Python randomises per process.
all_list_items = sorted(set(item for trans in trans_db for item in trans))

# Frequent 1-itemsets
itemsets_1 = [[item] for item in all_list_items if compute_support([item]) >= min_support]

# Displaying frequent 1-itemsets
print("Frequent 1-itemsets:", itemsets_1)

# Finding frequent 2-itemsets
itemsets_2 = generate_k_1_itemsets(itemsets_1)
print("Frequent 2-itemsets:", itemsets_2)

# Finding frequent 3-itemsets
itemsets_3 = generate_k_1_itemsets(itemsets_2)
print("Frequent 3-itemsets:", itemsets_3)


In [None]:
from sklearn.datasets import load_iris
from sklearn.preprocessing import KBinsDiscretizer

# Load Iris dataset
dataset = load_iris()
X = dataset.data
response = dataset.target
feature_names = dataset.feature_names

# Equal Width Discretization
equal_width_discretizer = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='uniform')
X_binned_equal_width = equal_width_discretizer.fit_transform(X)

# Equal Frequency Discretization
equal_frequency_discretizer = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='quantile')
X_binned_equal_frequency = equal_frequency_discretizer.fit_transform(X)

# With encode='ordinal' the transformer already returns a dense ndarray of bin
# indices; only the default 'onehot' encoding yields a sparse matrix. Calling
# .toarray() on the ndarray raised AttributeError, so the arrays are printed
# directly.
print("Discretized Features (Equal Width):")
print(X_binned_equal_width)

print("Discretized Features (Equal Frequency):")
print(X_binned_equal_frequency)
