In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import powerlaw, geom, zscore, rankdata
from sklearn.preprocessing import quantile_transform
import seaborn as sns

In [None]:
# Draw the three synthetic variables used throughout the notebook.
# A single seeded generator feeds every draw so that Restart & Run All
# reproduces exactly the same samples (and therefore the same figures).
SEED = 42
N_SAMPLES = 10000
rng = np.random.default_rng(SEED)

# B: Gaussian with mean 5 and standard deviation 2.
B = rng.normal(loc=5, scale=2, size=N_SAMPLES)
# I: scipy's power-law distribution with shape a=0.3 (values lie in [0, 1]).
I = powerlaw.rvs(a=0.3, size=N_SAMPLES, random_state=rng)
# H: geometric distribution with success probability 0.01 (integers >= 1).
H = geom.rvs(p=0.01, size=N_SAMPLES, random_state=rng)

In [None]:
# Box plots of the three raw variables on one shared y-axis, before any
# normalisation is applied.
raw_fig, raw_ax = plt.subplots(figsize=(10, 6))
raw_ax.boxplot(
    [B, I, H],
    labels=['B (Gaussian)', 'I (Power Law)', 'H (Geometric)'],
)
raw_ax.set_title('Comparison of Variables')
raw_ax.set_ylabel('Values')
raw_ax.grid(axis='y')
plt.show()

In [None]:
def compare_histograms(original, normalized, name):
    """Show histograms of a variable before and after normalisation, side by side.

    Parameters
    ----------
    original
        Data drawn in the left panel (orange); passed straight to ``sns.histplot``.
    normalized
        Data drawn in the right panel (blue); passed straight to ``sns.histplot``.
    name : str
        Prefix for the panel titles ("<name> Original" / "<name> Normalized").

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    fig, (left_ax, right_ax) = plt.subplots(1, 2, figsize=(14, 8))

    # (axis, data, bar colour, panel title) for each of the two panels.
    panels = (
        (left_ax, original, 'orange', f'{name} Original'),
        (right_ax, normalized, 'blue', f'{name} Normalized'),
    )
    for axis, values, colour, heading in panels:
        sns.histplot(values, bins='auto', color=colour, kde=False, ax=axis)
        axis.set_title(heading)
        # Set after histplot so these labels replace whatever seaborn assigned.
        axis.set_xlabel('Values')
        axis.set_ylabel('Frequency')

    fig.tight_layout()
    plt.show()

In [None]:
def compare_boxplots(*args, labels):
    """Show horizontal box plots of several variables on one shared axis.

    Parameters
    ----------
    *args
        One data sequence per box; the tuple is handed to ``boxplot`` as-is.
    labels
        Keyword-only. Tick labels forwarded to ``boxplot``, one per entry in ``args``.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    _fig, ax = plt.subplots(figsize=(14, 8))
    # vert=False lays the boxes out horizontally, so values run along x.
    ax.boxplot(args, labels=labels, vert=False)
    ax.set_title('Comparison of Normalized Variables')
    ax.set_xlabel('Values')
    ax.grid(axis='x')
    plt.show()

In [None]:
# Max scaling: divide every variable by its own largest value.
B_max, I_max, H_max = (values / np.max(values) for values in (B, I, H))

# One before/after histogram figure per variable, then a joint box plot.
for _raw, _scaled, _title in (
    (B, B_max, 'B (Max Normalized)'),
    (I, I_max, 'I (Max Normalized)'),
    (H, H_max, 'H (Max Normalized)'),
):
    compare_histograms(_raw, _scaled, _title)
compare_boxplots(B_max, I_max, H_max, labels=['B_max', 'I_max', 'H_max'])

In [None]:
# Sum scaling: divide every variable by the total of its own values,
# so each scaled variable sums to 1.
B_sum, I_sum, H_sum = (values / np.sum(values) for values in (B, I, H))

# One before/after histogram figure per variable, then a joint box plot.
for _raw, _scaled, _title in (
    (B, B_sum, 'B (Sum Normalized)'),
    (I, I_sum, 'I (Sum Normalized)'),
    (H, H_sum, 'H (Sum Normalized)'),
):
    compare_histograms(_raw, _scaled, _title)
compare_boxplots(B_sum, I_sum, H_sum, labels=['B_sum', 'I_sum', 'H_sum'])

In [None]:
# Standardise every variable with scipy.stats.zscore.
B_zscore, I_zscore, H_zscore = map(zscore, (B, I, H))

# One before/after histogram figure per variable, then a joint box plot.
for _raw, _scaled, _title in (
    (B, B_zscore, 'B (Z-score Normalized)'),
    (I, I_zscore, 'I (Z-score Normalized)'),
    (H, H_zscore, 'H (Z-score Normalized)'),
):
    compare_histograms(_raw, _scaled, _title)
compare_boxplots(
    B_zscore,
    I_zscore,
    H_zscore,
    labels=['B_zscore', 'I_zscore', 'H_zscore'],
)

In [None]:
# Convert into percentile
# All three variables use the same tie rule so their results are comparable.
# With method='max' a value's rank is the count of observations less than or
# equal to it, so rank / n is the empirical CDF. Mixing rules (or using
# 'dense', whose ranks stop at the number of distinct values) would put tied
# data such as H on a different scale from the other variables.
B_percentile = rankdata(B, method='max') / len(B)
I_percentile = rankdata(I, method='max') / len(I)
H_percentile = rankdata(H, method='max') / len(H)

compare_histograms(B, B_percentile, 'B (Percentile Normalized)')
compare_histograms(I, I_percentile, 'I (Percentile Normalized)')
compare_histograms(H, H_percentile, 'H (Percentile Normalized)')
compare_boxplots(B_percentile, I_percentile, H_percentile, labels=['B_percentile', 'I_percentile', 'H_percentile'])

In [None]:
# Rescale every variable so that its median becomes m1, where m1 is the
# mean of the three original medians.
medians = [np.median(values) for values in (B, I, H)]
m1 = np.mean(medians)
B_median_adjusted, I_median_adjusted, H_median_adjusted = (
    values * (m1 / median) for values, median in zip((B, I, H), medians)
)

# One before/after histogram figure per variable, then a joint box plot.
for _raw, _scaled, _title in (
    (B, B_median_adjusted, 'B (Median Adjusted)'),
    (I, I_median_adjusted, 'I (Median Adjusted)'),
    (H, H_median_adjusted, 'H (Median Adjusted)'),
):
    compare_histograms(_raw, _scaled, _title)
compare_boxplots(
    B_median_adjusted,
    I_median_adjusted,
    H_median_adjusted,
    labels=['B_median_adjusted', 'I_median_adjusted', 'H_median_adjusted'],
)

In [None]:
# Quantile transform: put B, I and H side by side as the columns of one
# matrix and map the columns (axis=0) onto a normal output distribution.
stacked = np.column_stack((B, I, H))
quantile_normalized = quantile_transform(
    stacked,
    axis=0,
    copy=True,
    output_distribution='normal',
)
# Column order matches the stacking order above.
B_quantile = quantile_normalized[:, 0]
I_quantile = quantile_normalized[:, 1]
H_quantile = quantile_normalized[:, 2]

# One before/after histogram figure per variable, then a joint box plot.
for _raw, _scaled, _title in (
    (B, B_quantile, 'B (Quantile Normalized)'),
    (I, I_quantile, 'I (Quantile Normalized)'),
    (H, H_quantile, 'H (Quantile Normalized)'),
):
    compare_histograms(_raw, _scaled, _title)
compare_boxplots(
    B_quantile,
    I_quantile,
    H_quantile,
    labels=['B_quantile', 'I_quantile', 'H_quantile'],
)