# Meta-Analysis of 20 Orphan GPCRs

This notebook contains the final analysis and visualization of the orphan GPCR embeddings generated by the ESM-2 protein language model. The steps are as follows:

1.  **Load Data**: Import the pre-computed `embeddings.pt` file and the `target20.csv` metadata.
2.  **Prepare Data**: Align the embeddings with their corresponding receptor labels, keeping only chain A of each PDB entry.
3.  **Dimensionality Reduction**: Use UMAP to project the high-dimensional embeddings into a 2D space.
4.  **Clustering**: Apply K-Means to the 2D data to identify clusters.
5.  **Visualization & Interpretation**: Generate a labeled scatter plot and print the cluster compositions for analysis.

In [None]:
import os
import torch
import pandas as pd
import numpy as np
import umap
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Apply a single Matplotlib style sheet so the figures in this notebook share one look.
# NOTE(review): the 'seaborn-v0_8-*' style names appear to require Matplotlib >= 3.6 —
# confirm against the environment's installed version.
plt.style.use('seaborn-v0_8-whitegrid')

In [None]:
# 1. Load Data
#
# Reads the two inputs of the analysis:
#   - embeddings_data: the object stored in `embeddings.pt` (indexed later by "<PDB_ID>_A" keys)
#   - target_df: receptor metadata, one row per target

EMBEDDINGS_FILE = "embeddings.pt"
METADATA_FILE = "target20.csv"

# Fail early with an actionable message if the critical embeddings file is missing.
if not os.path.exists(EMBEDDINGS_FILE):
    raise FileNotFoundError(f"ERROR: {EMBEDDINGS_FILE} not found. Please run the ESM-2 generation step on Google Colab and place the file in the project root.")

# map_location="cpu": the file is generated on Colab, so if the tensors were saved from a
# GPU runtime they would otherwise fail to load on a CPU-only machine (and the later
# conversion to NumPy requires CPU tensors anyway).
# SECURITY: torch.load unpickles the file's contents — only load files you generated yourself.
embeddings_data = torch.load(EMBEDDINGS_FILE, map_location="cpu")

# Despite the .csv extension, the metadata is parsed as tab-separated with no header row;
# column names are supplied explicitly.
target_df = pd.read_csv(
    METADATA_FILE,
    sep='\t',
    header=None,
    names=['Receptor', 'PDB_ID', 'DOI', 'Year', 'Key_Mystery'],
)

print("Data loaded successfully.")

In [None]:
# 2. Prepare Data for Machine Learning
#
# Builds two parallel sequences from the metadata and the loaded embeddings:
#   - labels: receptor names, in metadata order
#   - embedding_matrix: one row per receptor, same order as `labels`

# Align embeddings with the metadata. We will only analyze the primary chain for each PDB entry,
# which is assumed to be stored under the key "<PDB_ID>_A".
labels = []
embeddings_list = []
skipped_entries = []
for receptor, pdb_id in zip(target_df['Receptor'], target_df['PDB_ID']):
    primary_chain_key = f"{pdb_id}_A"
    if primary_chain_key not in embeddings_data:
        # Record the miss instead of dropping it silently, so a short plot is explainable.
        skipped_entries.append(f"{receptor} ({primary_chain_key})")
        continue
    labels.append(receptor)
    # detach()/cpu() keep the NumPy conversion valid for tensors that track gradients
    # or live on a GPU; for plain CPU tensors they are no-ops.
    embeddings_list.append(embeddings_data[primary_chain_key].detach().cpu().numpy())

if skipped_entries:
    print(f"WARNING: no chain-A embedding found for {len(skipped_entries)} entries: {', '.join(skipped_entries)}")

# Without this guard an empty match set only surfaces later as an obscure failure
# inside the dimensionality-reduction step.
if not embeddings_list:
    raise ValueError(
        "No embeddings matched any PDB_ID from target20.csv; "
        "check that embeddings.pt uses '<PDB_ID>_A' keys."
    )

# Stack into a single NumPy array for scikit-learn; np.stack requires every embedding
# to have the same shape and raises a ValueError otherwise.
embedding_matrix = np.stack(embeddings_list)
if embedding_matrix.ndim != 2:
    raise ValueError(
        f"Expected one 1-D embedding vector per receptor, got a matrix of shape "
        f"{embedding_matrix.shape}."
    )

print(f"Aligned {len(labels)} receptors for analysis.")

In [None]:
# 3. Perform Dimensionality Reduction and Clustering
#
# Produces:
#   - umap_results: the UMAP projection of `embedding_matrix`
#   - clusters: the K-Means cluster index assigned to each projected point

# One shared seed so both stochastic steps give the same result on every run.
RANDOM_STATE = 42

# We hypothesize 5 distinct functional groups based on preliminary analysis.
N_CLUSTERS = 5

# K-Means cannot form more clusters than there are samples; check before the UMAP fit
# so the failure is immediate and the message names the actual cause.
n_samples = len(embedding_matrix)
if n_samples < N_CLUSTERS:
    raise ValueError(
        f"Only {n_samples} receptors were aligned, but N_CLUSTERS={N_CLUSTERS}; "
        "align more receptors or lower N_CLUSTERS."
    )

# UMAP is a powerful technique for visualizing high-dimensional data.
# n_neighbors balances local vs. global structure. min_dist controls how tightly points are packed.
reducer = umap.UMAP(n_neighbors=5, min_dist=0.3, random_state=RANDOM_STATE)
umap_results = reducer.fit_transform(embedding_matrix)

# Use K-Means to find clusters in the UMAP-reduced data (not in the original embedding space).
kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=RANDOM_STATE, n_init=10)
clusters = kmeans.fit_predict(umap_results)

print("UMAP and K-Means analysis complete.")

In [None]:
# 4. Create the Final Visualization
#
# Scatter plot of the 2D UMAP coordinates, colored by K-Means cluster, with each
# point annotated by its receptor name.

fig, ax = plt.subplots(figsize=(16, 12))
scatter = ax.scatter(
    umap_results[:, 0],
    umap_results[:, 1],
    c=clusters,
    cmap='viridis',
    s=150,
    alpha=0.7,
)

# Write each receptor's name 10 points above its marker, horizontally centered.
for point_idx in range(len(labels)):
    x_coord, y_coord = umap_results[point_idx, :2]
    ax.annotate(
        labels[point_idx],
        (x_coord, y_coord),
        textcoords="offset points",
        xytext=(0, 10),
        ha='center',
        fontsize=9,
    )

ax.set_title('UMAP Projection of Orphan GPCR Embeddings', fontsize=18)
ax.set_xlabel('UMAP Dimension 1', fontsize=12)
ax.set_ylabel('UMAP Dimension 2', fontsize=12)

# One legend entry per cluster color, anchored just outside the right edge of the axes.
legend_handles, legend_labels = scatter.legend_elements()
ax.legend(
    legend_handles,
    legend_labels,
    title="Clusters",
    bbox_to_anchor=(1.05, 1),
    loc='upper left',
)

fig.tight_layout()
plt.show()

In [None]:
# 5. Print Cluster Composition for Interpretation
#
# Groups receptor names by their assigned cluster index and prints one line per
# cluster, including clusters that ended up with no members.

cluster_map = {cluster_idx: [] for cluster_idx in range(N_CLUSTERS)}
for position in range(len(labels)):
    assigned_cluster = clusters[position]
    cluster_map[assigned_cluster].append(labels[position])

print("\n--- Cluster Composition ---")
for cluster_id in sorted(cluster_map):
    receptors = cluster_map[cluster_id]
    member_list = ', '.join(receptors)
    print(f"  Cluster {cluster_id}: {member_list}")