# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA
import umap

# --- Configuration ---
# CSV file loaded by run_evaluation(), which reads its 'embedding', 'label'
# and 'name' columns.
EMBEDDING_PATH = "./results/embeddings.csv"

def _parse_embedding(text):
    """Parse one serialized embedding such as ``"[0.1 0.2]"`` or ``"[0.1, 0.2]"``."""
    # Commas are turned into whitespace so that both NumPy-style and
    # list-style strings are parsed completely instead of stopping at the
    # first comma.
    cleaned = str(text).strip().strip("[]").replace(",", " ")
    return np.fromstring(cleaned, sep=" ")


def run_evaluation():
    """Evaluate embedding separation with a PCA plot and a leave-one-out ranking.

    Reads the CSV at ``EMBEDDING_PATH`` (columns ``embedding``, ``label`` and
    ``name``), saves a 2-D PCA scatter plot to
    ``./results/figure_2a_pca.png`` and, for every row whose label is 1,
    prints the three most cosine-similar *other* rows.
    """
    # 1. Load data
    df = pd.read_csv(EMBEDDING_PATH)
    # Embeddings are stored as bracketed strings; convert them back to arrays.
    embeddings = np.array([_parse_embedding(raw) for raw in df['embedding']])
    labels = df['label'].values  # 1 for Positive, 0 for Hard Negative
    names = df['name'].values

    # 2. Functional separation via PCA
    pca = PCA(n_components=2)
    pca_result = pca.fit_transform(embeddings)

    plt.figure(figsize=(8, 6))
    sns.scatterplot(x=pca_result[:, 0], y=pca_result[:, 1], hue=labels, style=labels, s=100)
    plt.title("Intra-genomic Functional Separation (PCA)")
    plt.savefig("./results/figure_2a_pca.png")
    print("PCA plot generated.")

    # 3. Leave-one-out ranking strategy
    # e.g. testing whether POS_Bgy1 finds POS_Bgy2 as its top match
    sim_matrix = cosine_similarity(embeddings)
    positions = np.arange(len(names))

    for i, name in enumerate(names):
        if labels[i] != 1:  # Only positive entries are used as queries
            continue
        # Exclude the query itself by position rather than by dropping the
        # top-ranked row: with duplicate embeddings or floating-point ties the
        # query is not guaranteed to be ranked first.
        others = positions != i
        rankings = pd.Series(sim_matrix[i][others], index=names[others]).sort_values(ascending=False)
        print(f"\nQuery: {name}")
        print(f"Top 3 Matches:\n{rankings.head(3)}")
        # Manual check: the other positive is expected at rank 1-2.

# Run the evaluation only when this file is executed directly, not on import.
if __name__ == "__main__":
    run_evaluation()