In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from scipy.stats import zscore, norm

In [None]:
# Population means used as `loc` when the synthetic heights are sampled.
# Both names are re-bound later in the notebook to estimates computed from
# the training split.
mean_male_height, mean_female_height = 166, 152

In [None]:
# Seed NumPy's global RNG so the synthetic samples are reproducible.
np.random.seed(42)

# Draw order matters: both samples consume the same global RNG stream,
# so males must be drawn before females to reproduce the same values.
# Arguments are (loc, scale, size).
male_heights = np.random.normal(mean_male_height, 5.5, 1000)
female_heights = np.random.normal(mean_female_height, 4.5, 1000)

In [None]:
# Class labels: 0 = male, 1 = female.
# Lengths are taken from the height arrays instead of repeating the sample
# size, so labels and heights cannot drift apart if the sample size changes.
male_labels = np.zeros(len(male_heights))
female_labels = np.ones(len(female_heights))

In [None]:
# Stack males first, then females, in both arrays so that heights[i] and
# labels[i] always describe the same sample.
heights = np.concatenate([male_heights, female_heights], axis=0)
labels = np.concatenate([male_labels, female_labels], axis=0)

In [None]:
# Hold out 20% of the samples for testing. The split is stratified on the
# labels and uses a fixed random_state so it is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    heights,
    labels,
    test_size=0.2,
    stratify=labels,
    random_state=42,
)

In [None]:
def predict_likelihood(X, mean_male, mean_female, std):
    """Classify heights as male (0) or female (1) by comparing Gaussian likelihoods.

    Each value in ``X`` is scored under two normal distributions that share
    the standard deviation ``std`` and are centred on ``mean_male`` and
    ``mean_female`` respectively; the class with the larger likelihood wins.

    Log-densities are compared rather than raw densities. Far in the tails
    both ``norm.pdf`` values underflow to 0.0, so a strict ``>`` on them
    always yields class 0 regardless of which mean is closer.
    ``norm.logpdf`` stays finite there and gives the same ordering elsewhere.

    Parameters
    ----------
    X : array_like
        Heights to classify.
    mean_male, mean_female : float
        Means of the male and female height distributions.
    std : float
        Standard deviation shared by both distributions.

    Returns
    -------
    Integer array with 1 where the female likelihood is strictly greater
    than the male likelihood and 0 otherwise (exact ties map to 0).
    """
    male_log_likelihood = norm.logpdf(X, mean_male, std)
    female_log_likelihood = norm.logpdf(X, mean_female, std)
    return (female_log_likelihood > male_log_likelihood).astype(int)

In [None]:
# Estimate the per-class means from the training split (label 0 = male,
# label 1 = female), overwriting the constants used to generate the data.
mean_male_height, mean_female_height = (
    np.mean(X_train[Y_train == class_label]) for class_label in (0, 1)
)

# One standard deviation over the whole training split (both classes
# pooled); the same value is later passed for both class distributions.
std_deviation = np.std(X_train)

In [None]:
# Classify the training and test heights with the parameters estimated
# from the training split.
Y_train_pred, Y_test_pred = (
    predict_likelihood(samples, mean_male_height, mean_female_height, std_deviation)
    for samples in (X_train, X_test)
)

In [None]:
# Accuracy = fraction of predictions that equal the true label.
train_accuracy, test_accuracy = (
    np.mean(predicted == actual)
    for predicted, actual in ((Y_train_pred, Y_train), (Y_test_pred, Y_test))
)
print(f'Initial Train Accuracy: {train_accuracy:.4f}, Test Accuracy: {test_accuracy:.4f}')

In [None]:
# Positions of the female samples (label 1) inside the training split.
(female_train_indices,) = np.where(Y_train == 1)

# argsort is ascending, so the last 50 positions are the 50 tallest females.
top_50_female_indices = female_train_indices[
    np.argsort(X_train[female_train_indices])[-50:]
]

# Inject outliers: shift those 50 heights up by 10.
# NOTE(review): this modifies X_train in place, so the cell is not
# idempotent -- re-running it without re-running the split adds another 10.
X_train[top_50_female_indices] = X_train[top_50_female_indices] + 10

In [None]:
# Re-estimate the class means (label 0 = male, label 1 = female) from the
# training split as it currently stands, i.e. including any in-place edits
# made to X_train by earlier cells.
mean_male_height, mean_female_height = (
    np.mean(X_train[Y_train == class_label]) for class_label in (0, 1)
)

# Standard deviation over the whole training split, both classes pooled.
std_deviation = np.std(X_train)

In [None]:
# Re-classify both splits with the re-estimated parameters.
Y_train_pred, Y_test_pred = (
    predict_likelihood(samples, mean_male_height, mean_female_height, std_deviation)
    for samples in (X_train, X_test)
)

In [None]:
# Accuracy after the height increase: fraction of predictions that equal
# the true label on each split.
new_train_accuracy, new_test_accuracy = (
    np.mean(predicted == actual)
    for predicted, actual in ((Y_train_pred, Y_train), (Y_test_pred, Y_test))
)
print(f'After Height Increase - Train Accuracy: {new_train_accuracy:.4f}, Test Accuracy: {new_test_accuracy:.4f}')

In [None]:
# Z-scores are computed over the female training heights only.
female_train_scores = zscore(X_train[female_train_indices])

# Keep the females whose z-score lies strictly inside (-3, 3).
non_outlier_indices = female_train_indices[
    (female_train_scores > -3) & (female_train_scores < 3)
]

# Filtered training set: retained females first, then every male (the male
# samples are not filtered). The same ordering is used for values and labels.
X_train_filtered, y_train_filtered = (
    np.concatenate([values[non_outlier_indices], values[Y_train == 0]])
    for values in (X_train, Y_train)
)

In [None]:
# Re-estimate the class means (label 0 = male, label 1 = female) from the
# outlier-filtered training set.
mean_male_height, mean_female_height = (
    np.mean(X_train_filtered[y_train_filtered == class_label])
    for class_label in (0, 1)
)

# Standard deviation over the whole filtered training set, both classes pooled.
std_deviation = np.std(X_train_filtered)

In [None]:
# Classify the filtered training set and the (unchanged) test set with the
# parameters estimated from the filtered data.
Y_train_pred, Y_test_pred = (
    predict_likelihood(samples, mean_male_height, mean_female_height, std_deviation)
    for samples in (X_train_filtered, X_test)
)

In [None]:
# Accuracy after outlier removal. Training accuracy is measured against the
# filtered labels; test accuracy against the untouched test labels.
filtered_train_accuracy, filtered_test_accuracy = (
    np.mean(predicted == actual)
    for predicted, actual in ((Y_train_pred, y_train_filtered), (Y_test_pred, Y_test))
)
print(f'After Outlier Removal - Train Accuracy: {filtered_train_accuracy:.4f}, Test Accuracy: {filtered_test_accuracy:.4f}')

In [None]:
# Trimming
# For each k in 1..15, keep only the female training heights that lie within
# the [k, 100 - k] percentile band, refit the class means and pooled standard
# deviation on the trimmed set, and record train/test accuracy.
trim_results = {}   # k -> (train accuracy, test accuracy)
train_accs = []     # train accuracy per k, in loop order
test_accs = []      # test accuracy per k, in loop order

# Loop-invariant slices, hoisted so they are not re-extracted on every pass.
female_train_heights = X_train[female_train_indices]
male_train_mask = Y_train == 0
male_train_heights = X_train[male_train_mask]
male_train_labels = Y_train[male_train_mask]

for k in range(1, 16):
    # Both cut-offs come from the female training heights only.
    lower_percentile, upper_percentile = np.percentile(female_train_heights, [k, 100 - k])
    in_band = (female_train_heights >= lower_percentile) & (female_train_heights <= upper_percentile)
    trimmed_indices = female_train_indices[in_band]

    # Trimmed females first, then every male (males are never trimmed).
    X_train_trimmed = np.concatenate((X_train[trimmed_indices], male_train_heights))
    y_train_trimmed = np.concatenate((Y_train[trimmed_indices], male_train_labels))

    mean_male_height = np.mean(X_train_trimmed[y_train_trimmed == 0])
    mean_female_height = np.mean(X_train_trimmed[y_train_trimmed == 1])
    std_deviation = np.std(X_train_trimmed)

    Y_train_pred = predict_likelihood(X_train_trimmed, mean_male_height, mean_female_height, std_deviation)
    Y_test_pred = predict_likelihood(X_test, mean_male_height, mean_female_height, std_deviation)

    trimmed_train_accuracy = np.mean(Y_train_pred == y_train_trimmed)
    trimmed_test_accuracy = np.mean(Y_test_pred == Y_test)

    trim_results[k] = (trimmed_train_accuracy, trimmed_test_accuracy)
    train_accs.append(trimmed_train_accuracy)
    test_accs.append(trimmed_test_accuracy)

    print(f'Trimming {k}% - Train Accuracy: {trimmed_train_accuracy:.4f}, Test Accuracy: {trimmed_test_accuracy:.4f}')

In [None]:
# Plotting accuracy vs trimming
# x-axis values match the k values iterated by the trimming loop (1..15).
trim_percentages = range(1, 16)

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(trim_percentages, train_accs, marker='o', label='Train Accuracy')
ax.plot(trim_percentages, test_accs, marker='s', label='Test Accuracy')
ax.set_xlabel('Trimming Percentage (from each tail of female heights)')
ax.set_ylabel('Accuracy')
ax.set_title('Accuracy vs Trimming Percentage')
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()