### Note: All synIDS are specific to the 2025 DREAM Olfactory Mixtures Prediction Challenge Final Round

In [None]:
import numpy as np
import pandas as pd
from scipy.stats import pearsonr
from sklearn.metrics.pairwise import cosine_similarity
import synapseclient

SUBMISSION_VIEWS = {
    "Task 1": "syn68879001",
    "Task 2": "syn68878940"
}

# Mapping for goldstandard and submission views by task
GOLDSTANDARD_SYNIDS = {1: "syn68736530", 2: "syn68736533"}
SUBMISSION_VIEWS = {"Task 1": "syn68879001", "Task 2": "syn68878940"}
INDEX_COL = "stimulus"

# Synapse login
syn = synapseclient.Synapse()
syn.login()

## Functions to Extract ground truth, submissions, and their file data from Synapse

In [74]:
def load_goldstandard(syn, task):
    gold_synid = GOLDSTANDARD_SYNIDS[task]
    file_entity = syn.get(gold_synid, downloadFile=True)
    file_path = file_entity.path
    gold_df = pd.read_csv(file_path)
    return gold_df
    
def load_team_predictions(syn, submissions_df):
    team_dfs = []
    # No handling of the submission that's not recorded in the submission table 
    # for task 2 required given the late submission is not in the top 2 or 3
    for _, row in submissions_df.iterrows():
        team_id = row['submitterid']
        sub_id = row['id']
        file_path = syn.getSubmission(sub_id)['filePath']
        df = pd.read_csv(file_path)
        df = df.sort_values(INDEX_COL).reset_index(drop=True)
        feature_cols = [col for col in df.columns if col != INDEX_COL]
        rename_dict = {col: f"team_{team_id}_{col}" for col in feature_cols}
        df = df.rename(columns=rename_dict)
        df = df[[INDEX_COL] + list(rename_dict.values())]
        team_dfs.append(df)
    merged_df = team_dfs[0]
    for df in team_dfs[1:]:
        merged_df = pd.merge(merged_df, df, on=INDEX_COL, how='outer')
    return merged_df

def select_final_round_submissions(
    syn: synapseclient.Synapse, subview_id: str, evaluation_id: str
) -> pd.DataFrame:
    """
    Get final round submissions from synapse tables that include all submissions for both rounds.
    Outputs a final averaged rank and sorted leaderboard.
    Only submissions from the specified evaluation are considered.
    For each submitter, but one team, only the submission closest to August 8, 2025 is kept.
    If any of the IDs [9756929, 9756930, 9756939, 9756938, 9756943, 9756942] are present,
    keep only 9756929 if present, otherwise keep only 9756930 if present, and remove the rest.
    Also manually add the permitted late submission for Task 2.
    """

    query = (
        f"SELECT id, pearson_correlation, cosine, createdOn, submitterid FROM {subview_id} "
        f"WHERE score_status = 'SCORED' "
    )  # query_df should have columns: 'id' and 'submitterid'
    submissions = syn.tableQuery(query).asDataFrame()

    # Special handling for the specified IDs
    replace_ids = [9756929, 9756930, 9756939, 9756938, 9756943, 9756942]
    present_ids = [rid for rid in replace_ids if rid in submissions['id'].values]
    if present_ids:
        if 9756929 in present_ids:
            submissions = submissions[~submissions['id'].isin(replace_ids) | (submissions['id'] == 9756929)]
        elif 9756930 in present_ids:
            submissions = submissions[~submissions['id'].isin(replace_ids) | (submissions['id'] == 9756930)]
        else:
            submissions = submissions[~submissions['id'].isin(replace_ids)]

    # Only add late submission for Task 2
    if "Task 2" in evaluation_id or subview_id == SUBMISSION_VIEWS["Task 2"]:
        entity = syn.get("syn68843729", downloadFile=False)
        late_row = {
            "submitterid": entity.createdBy,
            "id": "syn68843729",
            "pearson_correlation": 0.6668633229642209,
            "cosine": 0.22813314634185586,
            "createdOn": 1754692655346,
            "createdOn_diff": 4943654
        }
        submissions = pd.concat([submissions, pd.DataFrame([late_row])], ignore_index=True)

    # Find the submission closest to August 8, 2025 for each submitter
    target_date = int(pd.Timestamp("2025-08-08T23:59:59Z").timestamp() * 1000)
    submissions['createdOn_diff'] = np.abs(submissions['createdOn'] - target_date)
    submissions = submissions.sort_values(['submitterid', 'createdOn_diff'])
    submissions = submissions.groupby('submitterid', as_index=False).first()

    submissions['pearson_rank'] = submissions['pearson_correlation'].rank(
        ascending=False, method="min", na_option='bottom')
    submissions['cosine_rank'] = submissions['cosine'].rank(
        ascending=True, method="min", na_option='bottom')
    submissions['final_rank'] = (
        submissions['pearson_rank'] + submissions['cosine_rank']) / 2

    # Select the top 3 ranked submissions for future bootstrapping
    top_submissions = submissions.nsmallest(3, 'final_rank')

    return submissions, top_submissions

def split_merged_to_team_dfs(merged_df, gold_df, index_col="stimulus"):
    """
    Convert merged_df from load_team_predictions into
    a dict of team_name -> dataframe with [index_col + gold attribute columns].
    Ensures predictors align with gold_df columns.
    """
    # attribute columns are everything except index_col
    attr_cols = [c for c in gold_df.columns if c != index_col]
    
    team_dfs = {}
    for col in merged_df.columns:
        if col == index_col:
            continue
        
        # Parse out team_id from column name "team_<id>_<attr>"
        parts = col.split("_", 2)  # split into 3 parts: ["team", "id", "AttrName"]
        if len(parts) < 3:
            raise ValueError(f"Unexpected column format: {col}")
        team_id = parts[1]
        attr_name = parts[2]
        
        # Initialize dataframe for this team
        if team_id not in team_dfs:
            team_dfs[team_id] = merged_df[[index_col, col]].rename(columns={col: attr_name})
        else:
            team_dfs[team_id] = team_dfs[team_id].merge(
                merged_df[[index_col, col]].rename(columns={col: attr_name}),
                on=index_col,
                how="outer"
            )
    
    # Reorder each team_df to match gold_df attribute order
    for team_id in team_dfs:
        cols = [index_col] + attr_cols
        team_dfs[team_id] = team_dfs[team_id][cols]
    
    return team_dfs



## Functions Applied in Ranking, Bootstrapping, and winner determination

In [75]:
def metrics_on_rows(rows, gold_mat, team_mat):
    """Compute pearson and cosine between gold and team predictions on sampled rows."""
    g = gold_mat[rows, :].ravel()
    t = team_mat[rows, :].ravel()
    if np.std(t) == 0 or np.std(g) == 0:
        return np.nan, np.nan
    pearson, _ = pearsonr(g, t)
    cosine = 1 - cosine_similarity([g], [t])[0,0]  # lower = better
    return pearson, cosine

def baseline_and_bootstrap(
    gold_df,
    team_dfs,
    index_col="stimulus",
    sample_frac=1.0,
    N=10000,
    random_state=None
):
    """
    gold_df: dataframe with gold labels (must include index_col and attribute columns)
    team_dfs: dict[name -> dataframe] (each must include index_col and same attribute columns)
    """
    rng = np.random.default_rng(random_state)
    
    # Automatically infer attribute columns (everything except the index_col)
    attr_cols = [c for c in gold_df.columns if c != index_col]
    if not attr_cols:
        raise ValueError("No attribute columns found besides index_col.")
    
    # Align to common IDs
    gold_ids = gold_df[index_col].astype(str)
    team_ids = [set(df[index_col].astype(str)) for df in team_dfs.values()]
    common_ids_all = set(gold_ids).intersection(*team_ids)
    if not common_ids_all:
        raise ValueError("No overlapping stimuli across all teams.")
    
    keep_gold = gold_df[index_col].astype(str).isin(common_ids_all)
    gold_master = gold_df.loc[keep_gold, attr_cols].to_numpy()
    order_ids = gold_df.loc[keep_gold, index_col].astype(str).tolist()
    
    team_mats = {}
    for nm, df in team_dfs.items():
        sub = df[df[index_col].astype(str).isin(common_ids_all)]
        sub = sub.set_index(index_col).reindex(order_ids)
        team_mats[nm] = sub[attr_cols].to_numpy()
    
    team_names = list(team_mats.keys())
    n = gold_master.shape[0]
    
    # ==== BASELINE ====
    base_metrics = []
    for nm in team_names:
        p, c = metrics_on_rows(np.arange(n), gold_master, team_mats[nm])
        base_metrics.append([nm, p, c])
    baseline_df = pd.DataFrame(base_metrics, columns=["team","pearson","cosine"])
    
    baseline_df["rank_pearson"] = baseline_df["pearson"].rank(ascending=False, method="average")
    baseline_df["rank_cosine"]  = baseline_df["cosine"].rank(ascending=True,  method="average")
    baseline_df["average_rank"] = (baseline_df["rank_pearson"] + baseline_df["rank_cosine"]) / 2
    baseline_df = baseline_df.sort_values(
        ["average_rank","rank_pearson","rank_cosine","pearson","cosine"],
        ascending=[True,True,True,False,True]
    )
    
    top2 = baseline_df["team"].iloc[:2].tolist()
    
    # ==== BOOTSTRAP ====
    B = max(1, round(n * sample_frac))
    wins = 0
    ties = 0
    
    for _ in range(N):
        rows = rng.choice(n, size=B, replace=True)
        met = {nm: metrics_on_rows(rows, gold_master, mat) for nm, mat in team_mats.items()}
        pearsons = {nm: m[0] for nm, m in met.items()}
        cosines  = {nm: m[1] for nm, m in met.items()}
        
        df_tmp = pd.DataFrame({
            "team": team_names,
            "pearson": [pearsons[nm] for nm in team_names],
            "cosine": [cosines[nm] for nm in team_names]
        })
        df_tmp["rP"] = df_tmp["pearson"].rank(ascending=False, method="average")
        df_tmp["rC"] = df_tmp["cosine"].rank(ascending=True,  method="average")
        df_tmp["avg"] = (df_tmp["rP"] + df_tmp["rC"]) / 2
        avg_dict = df_tmp.set_index("team")["avg"].to_dict()
        
        a1, a2 = avg_dict[top2[0]], avg_dict[top2[1]]
        if np.isnan(a1) or np.isnan(a2):
            continue
        if a1 < a2: 
            wins += 1
        elif a1 == a2: 
            ties += 1
    
    p_win = (wins + 0.5 * ties) / N
    BF_odds = (p_win + 1e-8) / (1 - p_win + 1e-8)
    
    return baseline_df, {"p_win": p_win, "BF_odds": BF_odds,
                         "N": N, "rows_per_draw": B, "sample_frac": sample_frac}



## Task 1 Processing

In [None]:
subview_id = SUBMISSION_VIEWS[
        f"Task {1}"]
submissions_df = select_final_round_submissions(
        syn, subview_id, "Final Round DREAM Olfactory Mixtures Prediction Challenge 2025 - Task 1")

Downloading files: 100%|██████████| 3.91k/3.91k [00:00<00:00, 18.3kB/s, syn68879001]

Downloaded syn68879001 to /Users/mdiaz/.synapseCache/481/161875481/SYNAPSE_TABLE_QUERY_161875481.csv


Downloading files: 100%|██████████| 3.91k/3.91k [00:00<00:00, 18.0kB/s, syn68879001]


Downloading files:   0%|          | 0.00/2.72k [00:00<?, ?B/s, syn68878940]

In [77]:
submissions_df[1]

Unnamed: 0,submitterid,id,pearson_correlation,cosine,createdOn,createdOn_diff,pearson_rank,cosine_rank,final_rank
7,3516194,9756912,0.752053,0.176111,1754667730603,29868397,1.0,1.0,1.0
6,3506852,9756908,0.73026,0.179923,1754667145501,30453499,2.0,2.0,2.0
23,3550368,9756933,0.727444,0.184437,1754688407578,9191422,3.0,3.0,3.0


In [78]:
team_predictions = load_team_predictions(syn, submissions_df[1])
gold_df = load_goldstandard(syn, 1)
team_dfs = split_merged_to_team_dfs(team_predictions, gold_df, INDEX_COL)


In [79]:
baseline, bootstrap_summary = baseline_and_bootstrap(
    gold_df, team_dfs, INDEX_COL,
    sample_frac=1.0, N=10000, random_state=42
)

In [80]:
print(baseline)
print(bootstrap_summary)

        team   pearson    cosine  rank_pearson  rank_cosine  average_rank
1  3506852.0  0.046004  0.482542           1.0          2.0           1.5
0  3516194.0  0.040188  0.417884           2.0          1.0           1.5
2  3550368.0  0.034675  0.494238           3.0          3.0           3.0
{'p_win': 0.4003, 'BF_odds': 0.667500422419536, 'N': 10000, 'rows_per_draw': 31, 'sample_frac': 1.0}


## Task 2 processing

In [81]:
subview_id2 = SUBMISSION_VIEWS[
        f"Task {2}"]
submissions_df2 = select_final_round_submissions(
        syn, subview_id2, "Final Round DREAM Olfactory Mixtures Prediction Challenge 2025 - Task 2")

Downloading files:   0%|          | 0.00/1.00 [00:00<?, ?B/s, syn68878940]

Downloaded syn68878940 to /Users/mdiaz/.synapseCache/484/161875484/SYNAPSE_TABLE_QUERY_161875484.csv


Downloading files: 100%|██████████| 2.72k/2.72k [00:00<00:00, 29.0kB/s, syn68878940]


In [82]:
submissions_df2[1]

Unnamed: 0,submitterid,id,pearson_correlation,cosine,createdOn,createdOn_diff,pearson_rank,cosine_rank,final_rank
4,3506852,9756909,0.789749,0.133586,1754667217304,30381696,1.0,1.0,1.0
0,3319559,9756951,0.789684,0.136842,1754695242525,2356475,2.0,2.0,2.0
12,3550368,9756934,0.785382,0.137598,1754688452187,9146813,3.0,3.0,3.0


In [83]:
team_predictions2 = load_team_predictions(syn, submissions_df2[1])
gold_df2 = load_goldstandard(syn, 2)
team_dfs2 = split_merged_to_team_dfs(team_predictions2, gold_df2, INDEX_COL)

In [None]:
baseline2, bootstrap_summary2 = baseline_and_bootstrap(
    gold_df2, team_dfs2, INDEX_COL,
    sample_frac=1.0, N=10000, random_state=42
)

In [None]:
print(baseline2)
print(bootstrap_summary2)

      team   pearson    cosine  rank_pearson  rank_cosine  average_rank
2  3550368 -0.000368  0.462619           1.0          2.0           1.5
1  3319559 -0.004662  0.430524           2.0          1.0           1.5
0  3506852 -0.006974  0.467343           3.0          3.0           3.0
{'p_win': 0.4111, 'BF_odds': 0.6980811734066705, 'N': 10000, 'rows_per_draw': 130, 'sample_frac': 1.0}
