In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm, zscore, median_abs_deviation

In [None]:
def generate_dataset(mean_male_height, mean_female_height, std_deviation, size):
    """Draw synthetic male and female height samples from normal distributions.

    Both groups share one standard deviation and one sample count. The male
    sample is drawn first and the female sample second, both from NumPy's
    global random state.

    Args:
        mean_male_height: Mean of the male height distribution.
        mean_female_height: Mean of the female height distribution.
        std_deviation: Standard deviation used for both distributions.
        size: Number of samples to draw per group.

    Returns:
        Tuple ``(male_heights, female_heights)`` of NumPy arrays.
    """
    samples = []
    # Order matters for reproducibility under a fixed seed: male, then female.
    for group_mean in (mean_male_height, mean_female_height):
        samples.append(np.random.normal(loc=group_mean, scale=std_deviation, size=size))

    males, females = samples
    return males, females

In [None]:
# Likelihood-based classification
def likelihood(male_heights, female_heights, std_deviation):
    """Accuracy of a maximum-likelihood classifier on the given samples.

    Each class is modelled as a normal distribution centred on that class's
    sample mean, with the supplied ``std_deviation`` as its spread. A sample
    counts as misclassified when the density under the *other* class is
    strictly greater than the density under its own class.

    Args:
        male_heights: Heights of the male samples.
        female_heights: Heights of the female samples.
        std_deviation: Standard deviation used for both class densities.

    Returns:
        Fraction of all samples that are classified correctly.
    """
    mu_male = np.mean(male_heights)
    mu_female = np.mean(female_heights)

    def _count_errors(samples, own_mu, other_mu):
        # Number of samples whose rival-class density beats their own class.
        own_density = norm.pdf(samples, own_mu, std_deviation)
        rival_density = norm.pdf(samples, other_mu, std_deviation)
        return np.sum(rival_density > own_density)

    wrong_males = _count_errors(male_heights, mu_male, mu_female)
    wrong_females = _count_errors(female_heights, mu_female, mu_male)

    n_total = len(male_heights) + len(female_heights)
    return 1 - ((wrong_males + wrong_females) / n_total)

In [None]:
# Optimal threshold classification
def optimal_threshold(male_heights, female_heights):
    """Find the height threshold that minimises misclassifications.

    Decision rule: a height strictly greater than the threshold is labelled
    male, otherwise female. Under this rule a male is misclassified when
    ``height <= threshold`` and a female when ``height > threshold``.

    The error count only changes at observed heights, so evaluating every
    distinct observed height plus one value below all of them covers every
    possible outcome and the search is exact. (A grid limited to
    ``[male.min(), female.max()]`` always misclassifies the shortest male and
    never tries thresholds below it.)

    Args:
        male_heights: Heights of the male samples (array-like, non-empty).
        female_heights: Heights of the female samples (array-like, non-empty).

    Returns:
        Tuple ``(best_threshold, accuracy)``. ``best_threshold`` is the lowest
        candidate that achieves the minimum error count; ``accuracy`` is the
        fraction of all samples classified correctly at that threshold.

    Raises:
        ValueError: If either sample is empty.
    """
    sorted_males = np.sort(np.asarray(male_heights, dtype=float).ravel())
    sorted_females = np.sort(np.asarray(female_heights, dtype=float).ravel())
    if sorted_males.size == 0 or sorted_females.size == 0:
        raise ValueError("male_heights and female_heights must both be non-empty")

    observed = np.unique(np.concatenate((sorted_males, sorted_females)))
    # Sentinel just below the smallest height: the "label everyone male" case.
    below_all = np.nextafter(observed[0], -np.inf)
    candidates = np.concatenate(([below_all], observed))

    # side='right' counts the values <= candidate in each sorted sample.
    misclassified_males = np.searchsorted(sorted_males, candidates, side='right')
    misclassified_females = sorted_females.size - np.searchsorted(
        sorted_females, candidates, side='right'
    )
    total_misclassified = misclassified_males + misclassified_females

    # argmin returns the first (lowest) candidate among ties.
    best = int(np.argmin(total_misclassified))
    total_samples = sorted_males.size + sorted_females.size
    accuracy = 1 - (int(total_misclassified[best]) / total_samples)

    return float(candidates[best]), accuracy

In [None]:
# Quantization-based classification
def _count_per_bin(heights, bin_edges, n_bins):
    """Count how many heights fall into each of the ``n_bins`` intervals."""
    bin_index = np.searchsorted(bin_edges, heights, side='right') - 1
    # A height exactly on the final edge would index one past the last bin;
    # fold it into the last bin instead of silently dropping the sample.
    bin_index = np.clip(bin_index, 0, n_bins - 1)
    return np.bincount(bin_index, minlength=n_bins)


def quantization(male_heights, female_heights, interval):
    """Accuracy of a majority-vote classifier over fixed-width height bins.

    Heights are bucketed into bins of width ``interval`` starting at the
    smallest observed height. Each bin holding two or more samples predicts
    its majority class (ties go to male), and the minority samples in that bin
    count as misclassified. Bins holding exactly one sample are tallied
    separately and never contribute a misclassification.

    Args:
        male_heights: Heights of the male samples (array-like, non-empty).
        female_heights: Heights of the female samples (array-like, non-empty).
        interval: Positive bin width.

    Returns:
        Tuple ``(accuracy_including_all, accuracy_excluding_single_bins)``.
        The first divides the misclassified count by the total sample count.
        The second divides by the total minus the number of single-sample
        bins, and is ``nan`` when that denominator is zero.

    Raises:
        ValueError: If ``interval`` is not positive or either sample is empty.
    """
    if interval <= 0:
        raise ValueError("interval must be positive")

    male_heights = np.asarray(male_heights, dtype=float).ravel()
    female_heights = np.asarray(female_heights, dtype=float).ravel()
    if male_heights.size == 0 or female_heights.size == 0:
        raise ValueError("male_heights and female_heights must both be non-empty")

    min_height = min(male_heights.min(), female_heights.min())
    max_height = max(male_heights.max(), female_heights.max())
    bins = np.arange(min_height, max_height + interval, interval)
    # When all heights are identical np.arange yields a single edge; still
    # keep one bin so the samples are counted rather than ignored.
    n_bins = max(len(bins) - 1, 1)

    male_counts = _count_per_bin(male_heights, bins, n_bins)
    female_counts = _count_per_bin(female_heights, bins, n_bins)

    bin_totals = male_counts + female_counts
    contested = bin_totals > 1
    female_majority = female_counts > male_counts

    misclassified_males = int(male_counts[contested & female_majority].sum())
    misclassified_females = int(female_counts[contested & ~female_majority].sum())
    single_sample_bins = int(np.count_nonzero(bin_totals == 1))

    total_samples = male_heights.size + female_heights.size
    total_misclassified = misclassified_males + misclassified_females

    accuracy_including_all = 1 - (total_misclassified / total_samples)

    remaining = total_samples - single_sample_bins
    if remaining:
        accuracy_excluding_single_bins = 1 - (total_misclassified / remaining)
    else:
        # Every sample sits alone in its bin: the ratio is undefined.
        accuracy_excluding_single_bins = float('nan')

    return accuracy_including_all, accuracy_excluding_single_bins

In [None]:
# Compute Accuracies
def compute_accuracies(male_heights, female_heights, std_deviation_values, intervals, size=1000):
    """Evaluate the three classifiers and report accuracies as percentages.

    Args:
        male_heights: Heights of the male samples.
        female_heights: Heights of the female samples.
        std_deviation_values: Standard deviations to evaluate; one result
            entry is produced per value.
        intervals: Bin widths passed to ``quantization``.
        size: Unused; retained so existing callers keep working.

    Returns:
        Dict with keys ``"likelihood"``, ``"threshold"`` and
        ``"quantization"``. Each maps to a list with one entry per standard
        deviation. The quantization entries are themselves lists with one
        accuracy per interval (the first value returned by ``quantization``).
    """
    accuracies = {
        "likelihood": [],
        "threshold": [],
        "quantization": []
    }

    std_deviation_values = list(std_deviation_values)
    if not std_deviation_values:
        return accuracies

    # Threshold and quantization accuracies do not depend on the standard
    # deviation, so compute them once instead of once per loop iteration.
    _, threshold_accuracy = optimal_threshold(male_heights, female_heights)
    threshold_pct = threshold_accuracy * 100
    quantization_pcts = [
        quantization(male_heights, female_heights, interval)[0] * 100
        for interval in intervals
    ]

    for std_deviation in std_deviation_values:
        likelihood_accuracy = likelihood(male_heights, female_heights, std_deviation) * 100
        accuracies["likelihood"].append(likelihood_accuracy)
        accuracies["threshold"].append(threshold_pct)
        # Fresh list per entry so callers can mutate one without affecting others.
        accuracies["quantization"].append(list(quantization_pcts))

    return accuracies

In [None]:
# Plotting Height Distribution
def plot_height(female_heights):
    """Show a histogram and a box-and-whisker plot of a height sample.

    The two panels are stacked vertically in one 10x7 figure: a 30-bin
    histogram on top and a seaborn box plot underneath.

    Args:
        female_heights: Heights to visualise.
    """
    fig, (hist_ax, box_ax) = plt.subplots(2, 1, figsize=(10, 7))

    # Top panel: distribution of the sample.
    hist_ax.hist(female_heights, bins=30, alpha=0.7, color='blue', label='Female Heights')
    hist_ax.set_title("Histogram of Heights")
    hist_ax.legend()

    # Bottom panel: box plot, where extreme values show up as fliers.
    sns.boxplot(x=female_heights, color='blue', ax=box_ax)
    box_ax.set_title("Box and Whisker Plot - Female Heights")

    fig.tight_layout()
    plt.show()

In [None]:
# Experiment configuration.
RANDOM_SEED = 42  # fixed so Restart Kernel -> Run All reproduces the same numbers
mean_male_height = 166
mean_female_height = 152
size = 1000

std_deviation_values = [2.5]
intervals = [4]

# generate_dataset draws from NumPy's global random state, so seed it right
# before sampling; re-running this cell then always yields the same dataset.
np.random.seed(RANDOM_SEED)
male_heights, female_heights = generate_dataset(mean_male_height, mean_female_height, std_deviation_values[0], size)

In [None]:
# Build a contaminated copy of the female sample: np.sort returns a new array,
# so female_heights itself is left untouched. The 50 largest values are then
# shifted upward by 10 to act as outliers.
new_female_heights = np.sort(female_heights)
top_fifty = slice(-50, None)
new_female_heights[top_fifty] = new_female_heights[top_fifty] + 10

# Summary statistics before and after the contamination.
mean_before, std_deviation_before = np.mean(female_heights), np.std(female_heights)
mean_after, std_deviation_after = np.mean(new_female_heights), np.std(new_female_heights)

print(f"Mean Before : {mean_before} Standard Deviation Before: {std_deviation_before}")
print(f"Mean After : {mean_after} Standard Deviation After: {std_deviation_after}")

In [None]:
# Baseline: classifier accuracies on the original (uncontaminated) female sample.
accuracies = compute_accuracies(
    male_heights, female_heights, std_deviation_values, intervals, size
)
print("Accuracies Before Introduction of Outliers")
for heading, key in (
    ("Likelihood Accuracies:", "likelihood"),
    ("Threshold Accuracies:", "threshold"),
    ("Quantization Accuracies:", "quantization"),
):
    print(heading, accuracies[key])
plot_height(female_heights)

In [None]:
# Calculating accuracies after introduction of outliers
accuracies = compute_accuracies(male_heights, new_female_heights, std_deviation_values, intervals, size)
# This cell evaluates new_female_heights (the outlier-contaminated sample),
# so the header must say "After", not "Before".
print("Accuracies After Introduction of Outliers")
print("Likelihood Accuracies:", accuracies["likelihood"])
print("Threshold Accuracies:", accuracies["threshold"])
print("Quantization Accuracies:", accuracies["quantization"])
plot_height(new_female_heights)

In [None]:
# Z-score filter: standardise the contaminated sample and flag every height
# whose absolute z-score exceeds 2 as an outlier.
z_score_females = zscore(new_female_heights)
outlier_mask_using_zscore = np.greater(np.abs(z_score_females), 2)

num_outlier_zscore = outlier_mask_using_zscore.sum()
print(f"Number of Outliers using Z-Score: {num_outlier_zscore}")

# Keep only the heights that were not flagged.
female_heights_using_zscore_cutoff = new_female_heights[
    np.logical_not(outlier_mask_using_zscore)
]

plot_height(female_heights_using_zscore_cutoff)

In [None]:
# IQR filter: flag heights lying more than 1.5 * IQR outside the quartiles.
Q1, Q3 = np.percentile(new_female_heights, [25, 75])
IQR = Q3 - Q1
fence_width = 1.5 * IQR
lower_bound = Q1 - fence_width
upper_bound = Q3 + fence_width

too_low = new_female_heights < lower_bound
too_high = new_female_heights > upper_bound
outlier_mask_using_iqr = np.logical_or(too_low, too_high)

num_outlier_iqr = outlier_mask_using_iqr.sum()
print(f"Number of Outliers using IQR: {num_outlier_iqr}")

# Keep only the heights inside the fences.
female_heights_using_iqr_cutoff = new_female_heights[
    np.logical_not(outlier_mask_using_iqr)
]

plot_height(female_heights_using_iqr_cutoff)

In [None]:
# Median absolute deviation (MAD) of the contaminated sample: the median of
# each height's absolute distance from the sample median.
median_female = np.median(new_female_heights)
absolute_deviation = np.absolute(np.subtract(new_female_heights, median_female))

MAD = np.median(absolute_deviation)

In [None]:
# MAD filter with multiplier 1.5: a height is an outlier when its absolute
# deviation from the median exceeds 1.5 * MAD.
mad_cutoff = 1.5 * MAD
outlier_mask_using_mad = np.greater(absolute_deviation, mad_cutoff)

num_outlier_mad = outlier_mask_using_mad.sum()
print(f"Number of Outliers using MAD with 1.5 multiplier: {num_outlier_mad}")

# Keep the heights whose deviation is within the cutoff.
within_cutoff = np.less_equal(absolute_deviation, mad_cutoff)
female_heights_using_mad_cutoff = new_female_heights[within_cutoff]

plot_height(female_heights_using_mad_cutoff)

In [None]:
# MAD filter with multiplier 3: a height is an outlier when its absolute
# deviation from the median exceeds 3 * MAD. This overwrites the mask, count
# and filtered sample produced with any other multiplier.
mad_cutoff = 3 * MAD
outlier_mask_using_mad = np.greater(absolute_deviation, mad_cutoff)

num_outlier_mad = outlier_mask_using_mad.sum()
print(f"Number of Outliers using MAD with 3 multiplier: {num_outlier_mad}")

# Keep the heights whose deviation is within the cutoff.
within_cutoff = np.less_equal(absolute_deviation, mad_cutoff)
female_heights_using_mad_cutoff = new_female_heights[within_cutoff]

plot_height(female_heights_using_mad_cutoff)

In [None]:
# Classifier accuracies on the female sample filtered by the z-score cutoff.
accuracies_after = compute_accuracies(
    male_heights, female_heights_using_zscore_cutoff, std_deviation_values, intervals, size
)
print("Accuracies After Removal of Outliers using Z-Score Cutoff")
for heading, key in (
    ("Likelihood Accuracies:", "likelihood"),
    ("Threshold Accuracies:", "threshold"),
    ("Quantization Accuracies:", "quantization"),
):
    print(heading, accuracies_after[key])

In [None]:
# Classifier accuracies on the female sample filtered by the IQR cutoff.
accuracies_after = compute_accuracies(
    male_heights, female_heights_using_iqr_cutoff, std_deviation_values, intervals, size
)
print("Accuracies After Removal of Outliers using IQR Cutoff")
for heading, key in (
    ("Likelihood Accuracies:", "likelihood"),
    ("Threshold Accuracies:", "threshold"),
    ("Quantization Accuracies:", "quantization"),
):
    print(heading, accuracies_after[key])

In [None]:
# Calculating accuracies after removal of outliers ( using values obtained from MAD cutoff )
# Evaluate the MAD-filtered sample, not the original female_heights (which
# would only reproduce the pre-outlier baseline). female_heights_using_mad_cutoff
# holds the result of whichever MAD multiplier cell ran last.
accuracies_after = compute_accuracies(male_heights, female_heights_using_mad_cutoff, std_deviation_values, intervals, size)
print("Accuracies After Removal of Outliers using MAD Cutoff")
print("Likelihood Accuracies:", accuracies_after["likelihood"])
print("Threshold Accuracies:", accuracies_after["threshold"])
print("Quantization Accuracies:", accuracies_after["quantization"])

In [None]:
# Data Trimming
def trim_data(heights, k):
    """Keep only the values between the k-th and (100 - k)-th percentiles.

    Args:
        heights: NumPy array of heights.
        k: Percentage to cut from each tail.

    Returns:
        Array of the heights lying within both percentile bounds, inclusive.
    """
    low_cut, high_cut = np.percentile(heights, [k, 100 - k])
    inside = np.logical_and(heights >= low_cut, heights <= high_cut)
    return heights[inside]

In [None]:
# Sweep the trimming percentage from 1% to 15% and record, for each setting,
# the accuracy of every classifier on the trimmed contaminated female sample.
trim_percentages = np.arange(1, 16, 1)
accuracies = {"likelihood": [], "threshold": [], "quantization": []}

for k in trim_percentages:
    trimmed_female_heights = trim_data(new_female_heights, k)
    accuracy_values = compute_accuracies(
        male_heights, trimmed_female_heights, std_deviation_values, intervals, size
    )

    # Take the entry for the first standard deviation (and, for quantization,
    # the first interval within it).
    for method in ("likelihood", "threshold"):
        accuracies[method].append(accuracy_values[method][0])
    accuracies["quantization"].append(accuracy_values["quantization"][0][0])

fig, ax = plt.subplots(figsize=(10, 5))
for method, marker, label in (
    ("likelihood", 'o', "Likelihood"),
    ("threshold", 's', "Threshold"),
    ("quantization", '^', "Quantization"),
):
    ax.plot(trim_percentages, accuracies[method], marker=marker, label=label)
ax.set_xlabel("Trimming Percentage (k%)")
ax.set_ylabel("Classification Accuracy (%)")
ax.set_title("Impact of Data Trimming on Classification Accuracy")
ax.legend()
plt.show()