In [2]:
# Install required packages
# !pip install torch transformers sentence-transformers pandas

import pandas as pd
import torch
from sentence_transformers import SentenceTransformer, util
from google.colab import drive

# Make Google Drive visible to the Colab runtime.
MOUNT_POINT = '/content/drive'
drive.mount(MOUNT_POINT)

# Step 1. Read the merged pathway table from Drive.
DATASET_CSV = "/content/drive/MyDrive/merged_df.csv"
df = pd.read_csv(DATASET_CSV)

# Preview the first rows as the cell's rich output.
df.head()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0.1,Unnamed: 0,pathway_name,complex_ID,pathway_ID,top_level_pathway_ID,top_level_pathway_name,species
0,1,A tetrasaccharide linker sequence is required ...,,,,,
1,2,Abacavir ADME,,,,,
2,3,ABC transporter disorders,,,,,
3,4,ABC transporters in lipid homeostasis,R-HSA-1454940,R-HSA-1369062,R-HSA-382551,Transport of small molecules,Homo sapiens (Human)
4,5,ABC-family proteins mediated transport,R-HSA-5223347,R-HSA-382556,R-HSA-382551,Transport of small molecules,Homo sapiens (Human)


In [None]:
# Step 2. Separate labeled vs unlabeled
# Rows that already carry a top-level pathway name form the reference set;
# rows without one are the targets to label. Both frames get a fresh 0..n-1
# index so positions in the similarity matrix map directly onto rows.
has_label = df["top_level_pathway_name"].notna()
known_df = df[has_label].reset_index(drop=True)
unknown_df = df[~has_label].reset_index(drop=True)

print(f"Known: {len(known_df)} | Unknown: {len(unknown_df)}")

# Step 3. Load the BioBERT-based sentence-embedding model
model = SentenceTransformer("pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb")

# Step 4. Encode the pathway names
known_embeddings = model.encode(
    known_df["pathway_name"].tolist(), convert_to_tensor=True, show_progress_bar=True
)
unknown_embeddings = model.encode(
    unknown_df["pathway_name"].tolist(), convert_to_tensor=True, show_progress_bar=True
)

# Step 5. Compute cosine similarity matrix
# Rows correspond to unknown pathways, columns to known pathways.
cosine_scores = util.cos_sim(unknown_embeddings, known_embeddings)

# Step 6. Get top match for each unknown
# A single reduction over the "known" axis yields both the best score and the
# position of the known row that produced it.
best_match = torch.max(cosine_scores, dim=1)
best_match_idx = best_match.indices.cpu().numpy()
best_match_scores = best_match.values.cpu().numpy()

# Step 7. Assign predicted top-level names
# known_df was re-indexed 0..n-1 above, so a positional lookup into its label
# column is equivalent to a per-row .loc lookup.
unknown_df["predicted_top_level_pathway_name"] = (
    known_df["top_level_pathway_name"].to_numpy()[best_match_idx]
)
unknown_df["similarity_score"] = best_match_scores

# Step 8. Merge results
# Known rows have no prediction columns, so those cells are NaN after concat.
filled_df = pd.concat([known_df, unknown_df], ignore_index=True)

# Step 9. Save output
# The merged table and the predictions-only table go to separate files.
# Previously both were written to the same path, so the merged table was
# immediately overwritten by the predictions-only table.
filled_df.to_csv("Reactome_Pathway_Filled_BioBERT.csv", index=False)
unknown_df.to_csv("Reactome_Pathway_Predictions_BioBERT.csv", index=False)

# Step 10. Inspect top predictions
print(
    unknown_df[["pathway_name", "predicted_top_level_pathway_name", "similarity_score"]]
    .sort_values(by="similarity_score", ascending=False)
    .head(10)
)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Known: 1232 | Unknown: 438


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/691 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/433M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/412 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/39 [00:00<?, ?it/s]

Batches:   0%|          | 0/14 [00:00<?, ?it/s]

                                          pathway_name  \
137                FGFR3 ligand binding and activation   
135                FGFR2 ligand binding and activation   
133                FGFR1 ligand binding and activation   
414  Transcriptional activity of SMAD2/SMAD3:SMAD4 ...   
380                           Signaling by RAS mutants   
316      Regulation of TP53 Expression and Degradation   
326                     RNA Polymerase I Transcription   
322                                   RHO GTPase cycle   
327                    RNA Polymerase II Transcription   
353                       Signaling by FGFR in disease   

    predicted_top_level_pathway_name  similarity_score  
137              Signal Transduction          0.973647  
135              Signal Transduction          0.970761  
133              Signal Transduction          0.968780  
414              Signal Transduction          0.967442  
380                          Disease          0.966133  
316  Gene expressio