## Mann U Test (aka Wilcoxon Rank Sum test)
### for seed rank

In [13]:
import ast
import numpy as np
from scipy.stats import wasserstein_distance
import os
import statistics
from sklearn.preprocessing import MinMaxScaler

In [20]:
def load_or_average(folder_path, base_name):
    """Load data from base_name.txt if it exists, otherwise average base_name0.txt ... base_name19.txt."""
    file_path = os.path.join(folder_path, f"{base_name}.txt")
    
    if os.path.exists(file_path):
        # Load single file
        with open(file_path, 'r') as f:
            data = ast.literal_eval(f.read())
        data.sort(reverse=True)
        return data
    else:
        # Load and average multiple files
        all_lists = []
        for i in range(20):
            alt_path = os.path.join(folder_path, f"{base_name}{i}.txt")
            if os.path.exists(alt_path):
                with open(alt_path, 'r') as f:
                    lst = ast.literal_eval(f.read())
                lst.sort(reverse=True)
                all_lists.append(lst)
        
        if not all_lists:
            raise FileNotFoundError(f"No files found for {base_name} in {folder_path}")
        
        # Average element-wise
        list_length = len(all_lists[0])
        for lst in all_lists:
            if len(lst) != list_length:
                raise ValueError(f"Length mismatch in {base_name} files inside {folder_path}")
        
        mean_list = [statistics.mean(values) for values in zip(*all_lists)]
        print(mean_list)
        return mean_list

def get_rank_data(genome_filename, canon_site, replacement):
    base_folder = f"Final Seed Count/{genome_filename}/Canon Site {canon_site}/Replacement_{replacement}"
    
    # Retrieve real data
    real_data = load_or_average(base_folder, "real")  # This will just load real.txt normally
    
    # Retrieve optimal data (with fallback to averaging)
    optimal_data = load_or_average(base_folder, "optimal")
    
    # Retrieve random data (with fallback to averaging)
    random_data = load_or_average(base_folder, "random")
    
    return real_data, optimal_data, random_data

In [15]:
def wasserstein(x_real, x_rand, x_opt):# Your real samples:
   # Convert to arrays and reshape
    x_real = np.array(x_real).reshape(-1, 1)
    x_rand = np.array(x_rand).reshape(-1, 1)
    x_opt = np.array(x_opt).reshape(-1, 1)

    # Fit scaler only on x_opt
    scaler = MinMaxScaler()
    scaler.fit(x_opt)

    # Transform all using the same scaler
    x_opt = scaler.transform(x_opt).flatten()
    x_real = scaler.transform(x_real).flatten()
    x_rand = scaler.transform(x_rand).flatten()

    dist_rand = wasserstein_distance(x_real, x_rand)
    dist_opt = wasserstein_distance(x_real, x_opt)

    #combine real+rand samples
    real_rand_combined = np.concatenate([x_real, x_rand])
    real_opt_combined = np.concatenate([x_real, x_opt])
    n_real = len(x_real)
    n_permutations = 10000 
    perm_rand = []
    perm_opt = []
    for _ in range(n_permutations):
        np.random.shuffle(real_rand_combined)
        np.random.shuffle(real_opt_combined)
        rand_sample_real = real_rand_combined[:n_real]
        rand_sample_rand = real_rand_combined[n_real:]

        opt_sample_real = real_opt_combined[:n_real]
        opt_sample_opt = real_opt_combined[n_real:]

        d_rand = wasserstein_distance(rand_sample_real, rand_sample_rand)
        d_opt = wasserstein_distance(opt_sample_real, opt_sample_opt)
        perm_rand.append(d_rand)
        perm_opt.append(d_opt)

    # Two-tailed p-value: proportion of permuted distances >= observed
    p_value_rand = np.mean(np.array(perm_rand) >= dist_rand)
    p_value_opt = np.mean(np.array(perm_opt) >= dist_opt)

    print(f"Random Wasserstein distance: {dist_rand:.4f}, P-value: {p_value_rand:.4f}")
    print(f"Optimal Wasserstein distance: {dist_opt:.4f}, P-value: {p_value_opt:.4f}")

# Calculations for Mature Data

In [4]:
x_real, x_opt, x_rand = get_rank_data("Human genes (GRCh38.p13)",'B', True)
wasserstein(x_real, x_rand, x_opt)

Random Wasserstein distance: 0.0632, P-value: 0.0000
Optimal Wasserstein distance: 0.1053, P-value: 0.0000


In [22]:
x_real, x_opt, x_rand = get_rank_data("Drosophila melanogaster (Fruit fly) genes (BDGP6.46)",'B', True)
wasserstein(x_real, x_rand, x_opt)

Random Wasserstein distance: 0.0174, P-value: 0.0999
Optimal Wasserstein distance: 0.1954, P-value: 0.0000


In [6]:
x_real, x_opt, x_rand = get_rank_data("Caenorhabditis elegans (PRJNA13758) genes (WBcel235)",'B', True)
wasserstein(x_real, x_rand, x_opt)

Random Wasserstein distance: 0.0212, P-value: 0.0099
Optimal Wasserstein distance: 0.2469, P-value: 0.0000


# Calculations for Star Data

In [16]:
x_real, x_opt, x_rand = get_rank_data("Human genes (GRCh38.p13)_star",'C', True)
wasserstein(x_real, x_rand, x_opt)

Random Wasserstein distance: 0.0339, P-value: 0.0000
Optimal Wasserstein distance: 0.1283, P-value: 0.0000


In [21]:
x_real, x_opt, x_rand = get_rank_data("Drosophila melanogaster (Fruit fly) genes (BDGP6.46)_star",'C', True)
wasserstein(x_real, x_rand, x_opt)

[3379, 3284, 3244, 3029, 3027, 2970, 2592.6, 2360.2, 2256.85, 2181.2, 2151.1, 2102.55, 2053.95, 2007.35, 1952.1, 1915.25, 1905.55, 1894.25, 1836.45, 1782.15, 1756.95, 1744.8, 1738.35, 1731.2, 1721.55, 1710.05, 1694.55, 1680.5, 1665.55, 1651.9, 1637.25, 1622.2, 1602.05, 1578.55, 1561.7, 1541.5, 1523.7, 1509.05, 1493.3, 1473.55, 1460.7, 1446.8, 1434.5, 1411.2, 1401.6, 1392.5, 1382.05, 1376.2, 1372.6, 1359.3, 1350.75, 1337.55, 1316.3, 1304.4, 1287.2, 1278, 1267, 1255.85, 1242.85, 1229.6, 1215.55, 1200.45, 1189.75, 1177.7, 1164.6, 1142.65, 1124.2, 1113.5, 1098.1, 1080.35, 1036.3, 1018.5, 1002.8, 995.5, 978.5, 972.35, 935.85, 911.65, 881.7, 853.65, 837.8, 824.7, 806.3, 791.45, 778, 765.65, 753.4, 742.4, 729.4, 720.85, 714.7, 706.8, 696, 686.3, 672.95, 659.3, 643.8, 633.6, 621.8, 607.95, 596.2, 584.4, 572.1, 561.85, 551.4, 542.2, 531.75, 523.65, 507.45, 491.7, 479.25, 466.85, 456.6, 448.5, 431.85, 419.85, 396.95, 382.1, 360.6, 331.1, 294.15, 266.15, 206.75]
[1514.1, 1156.05, 976.2, 838.95, 7

# Calculations for Human on Fly

In [23]:
x_real, x_opt, x_rand = get_rank_data("Human vs Fly",'C', True)
wasserstein(x_real, x_rand, x_opt)

Random Wasserstein distance: 0.0057, P-value: 0.6999
Optimal Wasserstein distance: 0.0703, P-value: 0.0000
