In [1]:
import sys
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import itertools
from scipy.stats import wasserstein_distance
# Make the project's local modules importable (a later cell imports `plot`).
# NOTE: the relative path is resolved against the kernel's current working
# directory, so the notebook must be run from its own folder.
sys.path.append(os.path.abspath("../../src"))  # Adds 'src' to the module search path

In [2]:
from plot import plot_wasserstein_comparison

In [3]:
# Input Parquet files, given as paths relative to the working directory.
# DATASET_1 is loaded as df1 and labelled "MassiveKB" in the final plot;
# DATASET_2 is loaded as df2 and labelled "Proteome".
DATASET_1 = "../../raw_data/massive_complete.parquet"
DATASET_2 = "../../raw_data/proteome.parquet"

In [4]:
def compute_mean_wasserstein_distance(df,min_files,col='filename'):
    """Average pairwise Wasserstein distance between iRT distributions, per sequence.

    Rows are first grouped by the 'sequence' column. Inside each sequence the
    'iRT' values are collected into one list per distinct value of `col`, and
    the Wasserstein distance is computed for every unordered pair of those
    lists. The arithmetic mean of these pairwise distances is stored for the
    sequence.

    Parameters
    ----------
    df : DataFrame with at least the columns 'sequence', 'iRT' and `col`.
    min_files : minimum number of iRT values a `col` group must hold to take
        part in the comparison. Despite its name, this thresholds the number
        of values per group, not the number of files; single-value groups are
        only excluded when it is 2 or more.
    col : name of the column whose values define the distributions compared
        within a sequence (defaults to 'filename').

    Returns
    -------
    dict mapping sequence -> mean pairwise distance. Sequences with fewer
    than two qualifying groups yield no pair and are left out of the result.
    """
    result = {}

    for seq, seq_rows in df.groupby('sequence'):
        # One list of iRT values per distinct `col` value within this sequence.
        values_by_group = seq_rows.groupby(col)['iRT'].apply(list).to_dict()

        # Keep only the groups that contain enough values.
        kept = {}
        for key, values in values_by_group.items():
            if len(values) >= min_files:
                kept[key] = values

        # Distance for every unordered pair of remaining groups.
        pairwise = []
        for first, second in itertools.combinations(list(kept), 2):
            pairwise.append(wasserstein_distance(kept[first], kept[second]))

        # No pair means nothing to average for this sequence.
        if len(pairwise) == 0:
            continue

        result[seq] = sum(pairwise) / len(pairwise)

    return result


In [5]:
# Read both Parquet datasets into memory.
# The cells below use the 'sequence' and 'iRT' columns of each frame, plus
# 'filename' (df1) or 'pool' (df2) as the grouping column.
df1 = pd.read_parquet(DATASET_1)
df2 = pd.read_parquet(DATASET_2)

In [None]:
# Per-sequence mean pairwise Wasserstein distance of the iRT values.
# df1 is compared across 'filename' groups, df2 across 'pool' groups.
# NOTE: a threshold of 1 keeps groups that hold a single iRT value.
mean_wasserstein1 = compute_mean_wasserstein_distance(df1, min_files=1, col='filename')
mean_wasserstein2 = compute_mean_wasserstein_distance(df2, min_files=1, col='pool')

In [None]:
# Plot the comparison of the two {sequence: mean distance} mappings.
# The first mapping is labelled "MassiveKB" and the second "Proteome".
# NOTE(review): the helper lives in the project's `plot` module; what exactly
# it draws is not visible from this notebook.
plot_wasserstein_comparison(
    mean_wasserstein1, 
    mean_wasserstein2,
    label1="MassiveKB", 
    label2="Proteome"
)