In [28]:
import pandas as pd
import plotly.express as px
import numpy as np
from sklearn.manifold import MDS
from sklearn.cross_decomposition import CCA
from itertools import combinations
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the distance matrix; its column headers are reused later as point labels.
distance_matrix_raw = pd.read_csv("../data/distance_matrix.csv")
labels = list(distance_matrix_raw.columns)
distance_matrix = distance_matrix_raw.to_numpy()

In [3]:
def run_many_seeds(dist_matrix, seeds, **mds_kwargs):
    """Fit MDS on a precomputed dissimilarity matrix once per seed and keep the best run.

    Parameters
    ----------
    dist_matrix : array-like
        Precomputed dissimilarities, passed unchanged to ``MDS.fit_transform``.
    seeds : iterable
        Values used as ``random_state``; one MDS fit is performed per value.
        Must contain at least one seed.
    **mds_kwargs
        Extra keyword arguments forwarded to ``MDS`` (e.g. ``n_components``).

    Returns
    -------
    tuple
        ``(best_coords, best_seed, best_stress, results, rank)`` where the
        first three describe the lowest-stress run, ``results`` is a list of
        ``(seed, stress, coords)`` and ``rank`` a list of ``(seed, stress)``,
        both in the order the seeds were supplied.

    Raises
    ------
    ValueError
        If ``seeds`` is empty.
    """
    # Materialise once so one-shot iterables work and emptiness can be checked
    # up front instead of failing later inside min() with a cryptic message.
    seeds = list(seeds)
    if not seeds:
        raise ValueError("run_many_seeds needs at least one seed; got an empty `seeds`")

    results = []
    for seed in seeds:
        mds = MDS(dissimilarity='precomputed',
                  random_state=seed,
                  **mds_kwargs)
        coords = mds.fit_transform(dist_matrix)
        results.append((seed, mds.stress_, coords))

    # Lightweight (seed, stress) view of the same runs, without the coordinates
    rank = [(seed, stress) for seed, stress, _ in results]

    # Lowest stress wins; on ties min() keeps the earliest seed
    best_seed, best_stress, best_coords = min(results, key=lambda run: run[1])

    return best_coords, best_seed, best_stress, results, rank



# Run MDS for 99 consecutive seeds (600 … 698) and keep the lowest-stress embedding.
seeds = range(600, 699)
# Pass the plain ndarray prepared at load time rather than the DataFrame;
# the values are the same.
coords, seed, stress, log, rank = run_many_seeds(distance_matrix, seeds,
                                           n_components=2,
                                           normalized_stress='auto',
                                           max_iter=300)
# (seed, stress) pairs ordered from lowest to highest stress
sorted_rank = sorted(rank, key=lambda pair: pair[1])
print(sorted_rank)


[(647, 36.14899100002157), (672, 36.168808204855715), (604, 36.17170420137036), (645, 36.18656168785771), (614, 36.18761408613946), (680, 36.219144047791254), (681, 36.22352007012533), (642, 36.22532126301649), (657, 36.22908004183385), (629, 36.23438371859026), (625, 36.25836126897329), (688, 36.25957050183476), (686, 36.26431780922391), (666, 36.26432867385557), (692, 36.26478545009342), (646, 36.27031552662681), (621, 36.27567104048174), (627, 36.284516987785395), (639, 36.28994660452686), (665, 36.294593926853544), (651, 36.31337754178899), (653, 36.3137560681352), (676, 36.322051802700855), (689, 36.32705576310068), (670, 36.3279360707246), (630, 36.34387854883242), (618, 36.346731533472614), (632, 36.34868885268246), (694, 36.35970357413367), (611, 36.36313405271154), (652, 36.3859658451029), (662, 36.3952781760564), (659, 36.39605570921588), (687, 36.403246686317594), (677, 36.40858189123222), (696, 36.42040684569623), (616, 36.421476326065715), (617, 36.43343960489805), (612, 3

In [None]:
# Plot the best, second-best and worst MDS runs, one figure each.
sorted_log = sorted(log, key=lambda t: t[1])

best = sorted_log[0]
worst = sorted_log[-1]
# First run whose stress is strictly higher than the best one. Falls back to
# None (instead of raising StopIteration) when no such run exists, e.g. when
# only one seed was tried or every run reached the same stress.
second_best = next((t for t in sorted_log[1:] if t[1] > best[1]), None)

selected = [("Best", best)]
if second_best is not None:
    selected.append(("Second Best", second_best))
selected.append(("Worst", worst))

# Loop variables are deliberately NOT called seed / stress / coords: those
# names hold the best run returned by run_many_seeds and must not be clobbered.
for run_name, (run_seed, run_stress, run_coords) in selected:
    mds_df = pd.DataFrame(run_coords, columns=["Dim1", "Dim2"])
    mds_df["Label"] = labels

    fig = px.scatter(
        mds_df, x="Dim1", y="Dim2", text="Label",
        title=f"{run_name} (Seed {run_seed}, Stress {run_stress:.4f})"
    )
    fig.update_traces(marker=dict(size=6, line=dict(width=1, color='black')))
    fig.update_layout(margin=dict(l=0, r=0, b=0, t=30))
    fig.show()

## CCA

In [None]:
# reference: https://stackoverflow.com/questions/69800500/how-to-calculate-correlation-coefficients-using-sklearn-cca-module

In [5]:
# Seeds of the ten lowest-stress runs, best first
top_10_seeds = [entry[0] for entry in sorted_rank[:10]]
top_10_seeds

[647, 672, 604, 645, 614, 680, 681, 642, 657, 629]

In [36]:
# Embeddings of the top seeds. Every run is already stored in `log` as
# (seed, stress, coords), so look the coordinates up there instead of refitting
# MDS with a second, hand-copied set of hyperparameters that could drift.
coords_by_seed = {run_seed: run_coords for run_seed, _, run_coords in log}
top_coords_dict = {s: coords_by_seed[s] for s in top_10_seeds}

# One square matrix per canonical component, sized from the seed list
n_top = len(top_10_seeds)
cca_matrix_1 = np.zeros((n_top, n_top))
cca_matrix_2 = np.zeros((n_top, n_top))
# Row/column position of each seed inside the matrices
seed_to_idx = {s: i for i, s in enumerate(top_10_seeds)}

In [38]:
# Pairwise CCA between the top MDS runs: record the correlation of the first
# and of the second pair of canonical variates for every pair of seeds.
seed_labels = [f'seed {s}' for s in top_10_seeds]

for s1, s2 in combinations(top_10_seeds, 2):
    X1, X2 = top_coords_dict[s1], top_coords_dict[s2]

    cca = CCA(n_components=2)
    X1_c, X2_c = cca.fit_transform(X1, X2)

    # Correlation between matching score columns, one value per component
    corr1, corr2 = (np.corrcoef(X1_c[:, k], X2_c[:, k])[0, 1] for k in range(2))

    # Fill both triangles so the matrices stay symmetric
    i, j = seed_to_idx[s1], seed_to_idx[s2]
    cca_matrix_1[i, j] = cca_matrix_1[j, i] = corr1
    cca_matrix_2[i, j] = cca_matrix_2[j, i] = corr2

# A run compared with itself is set to 1.0
for filled in (cca_matrix_1, cca_matrix_2):
    np.fill_diagonal(filled, 1.0)

df1 = pd.DataFrame(cca_matrix_1, index=seed_labels, columns=seed_labels)
df2 = pd.DataFrame(cca_matrix_2, index=seed_labels, columns=seed_labels)

In [39]:
# Pairwise correlations along the first canonical component (rows/columns: top-10 seeds)
df1

Unnamed: 0,seed 647,seed 672,seed 604,seed 645,seed 614,seed 680,seed 681,seed 642,seed 657,seed 629
seed 647,1.0,0.999959,0.999949,0.999969,0.999782,0.999641,0.999739,0.999941,0.999645,0.999927
seed 672,0.999959,1.0,0.999879,0.999955,0.999787,0.999745,0.999839,0.999856,0.999697,0.999893
seed 604,0.999949,0.999879,1.0,0.999934,0.999712,0.99974,0.999845,0.999875,0.999403,0.999917
seed 645,0.999969,0.999955,0.999934,1.0,0.999768,0.999716,0.999832,0.999877,0.999717,0.999934
seed 614,0.999782,0.999787,0.999712,0.999768,1.0,0.999756,0.999908,0.999954,0.999857,0.999896
seed 680,0.999641,0.999745,0.99974,0.999716,0.999756,1.0,0.999779,0.999932,0.999327,0.999834
seed 681,0.999739,0.999839,0.999845,0.999832,0.999908,0.999779,1.0,0.999909,0.999332,0.9999
seed 642,0.999941,0.999856,0.999875,0.999877,0.999954,0.999932,0.999909,1.0,0.999618,0.999882
seed 657,0.999645,0.999697,0.999403,0.999717,0.999857,0.999327,0.999332,0.999618,1.0,0.999694
seed 629,0.999927,0.999893,0.999917,0.999934,0.999896,0.999834,0.9999,0.999882,0.999694,1.0


In [40]:
# Pairwise correlations along the second canonical component (rows/columns: top-10 seeds)
df2

Unnamed: 0,seed 647,seed 672,seed 604,seed 645,seed 614,seed 680,seed 681,seed 642,seed 657,seed 629
seed 647,1.0,0.999643,0.999923,0.999546,0.999641,0.999221,0.998737,0.999058,0.99954,0.999058
seed 672,0.999643,1.0,0.999847,0.999749,0.998534,0.99802,0.996845,0.998128,0.998571,0.998501
seed 604,0.999923,0.999847,1.0,0.999737,0.999237,0.998824,0.998004,0.998848,0.999399,0.999061
seed 645,0.999546,0.999749,0.999737,1.0,0.998495,0.998291,0.9971,0.998236,0.998459,0.998571
seed 614,0.999641,0.998534,0.999237,0.998495,1.0,0.999078,0.999457,0.999076,0.999664,0.998432
seed 680,0.999221,0.99802,0.998824,0.998291,0.999078,1.0,0.99939,0.99869,0.999679,0.998931
seed 681,0.998737,0.996845,0.998004,0.9971,0.999457,0.99939,1.0,0.998957,0.999573,0.998233
seed 642,0.999058,0.998128,0.998848,0.998236,0.999076,0.99869,0.998957,1.0,0.999634,0.998591
seed 657,0.99954,0.998571,0.999399,0.998459,0.999664,0.999679,0.999573,0.999634,1.0,0.998153
seed 629,0.999058,0.998501,0.999061,0.998571,0.998432,0.998931,0.998233,0.998591,0.998153,1.0


In [None]:
### testing

In [None]:
# Scratch check on a single pair: the two lowest-stress seeds (647 and 672)
coords_647, coords_672 = top_coords_dict[647], top_coords_dict[672]
cca = CCA(n_components=2)
X1_c, X2_c = cca.fit_transform(coords_647, coords_672)

In [21]:
# NOTE(review): per the scikit-learn docs, CCA.score is the regressor-style R^2
# for predicting the second block from the first, not a canonical correlation —
# confirm this is the number wanted here.
score = cca.score(top_coords_dict[647], top_coords_dict[672])
score

0.9672557977080001

In [None]:
# Project the same pair the CCA was fitted on (seeds 647 and 672). The names
# X1 / X2 are leftovers from the pairwise loop and hold a different pair of
# seeds, so they must not be fed to this fitted model.
X1_c, X2_c = cca.transform(top_coords_dict[647], top_coords_dict[672])
# Correlation between matching canonical score columns, one per component
corrs = [np.corrcoef(X1_c[:, i], X2_c[:, i])[0, 1] for i in range(2)]
print(f"First: {corrs[0]:.4f}, Second: {corrs[1]:.4f}, Sum of squares: {sum(c**2 for c in corrs):.4f}")

In [20]:
# Fit CCA on the embeddings of seeds 647 and 672, then project both of them
# into the canonical space
best_pair = (top_coords_dict[647], top_coords_dict[672])
cca = CCA(n_components=2)
cca.fit(*best_pair)
X1_c, X2_c = cca.transform(*best_pair)

# Correlation between matching canonical variates: component 0, then component 1
first_corr, second_corr = (
    np.corrcoef(X1_c[:, k], X2_c[:, k])[0, 1] for k in range(2)
)

print(f"First canonical correlation:  {first_corr:.4f}")
print(f"Second canonical correlation: {second_corr:.4f}")


First canonical correlation:  1.0000
Second canonical correlation: 0.9996


In [None]:
# Heatmap of the first-component canonical correlations between the top seeds.
# Plots df1 (the first-component matrix); the previous bare `df` is not
# defined by any cell shown here and raised NameError on a fresh kernel.
plt.figure(figsize=(10, 8))
sns.heatmap(df1,
            annot=True,
            fmt=".5f",
            cmap='viridis',
            square=True,
            cbar_kws={'label': 'Canonical Corr'})
plt.title('Pairwise Canonical Correlation (First Component)')
plt.tight_layout()
plt.show()