In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Step 1: Pretrained language model and data(frame) files prep for embedding computing per label

In [5]:
# install the language model to gnerate similarity values per label
!pip install -U sentence-transformers

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.11.0->sentence-transformers)
 

In [3]:
import pandas as pd
from sentence_transformers import SentenceTransformer

In [4]:
# load the dataframe files
# N.B. each column of df contains four columns: URI, Label, Definition, Text (label+definition)
df_to = pd.read_pickle("/content/drive/MyDrive/Projects/semantic_alignment_ontology/data/to.pkl")
df_go = pd.read_pickle("/content/drive/MyDrive/Projects/semantic_alignment_ontology/data/go.pkl")

Step 2: Computing vector embeddings

In [5]:
# Set up the pretrained language model
model = SentenceTransformer("all-MiniLM-L6-v2")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [6]:
# Compute sentence embeddings per label

# Encode the Text (Label+Definition) from Plant trait ontology
df_to["Embedding"] = df_to["Text"].apply(lambda x: model.encode(x, show_progress_bar=False))

In [12]:
# Also encode the Text from Gene Ontology (but had to do that step by step due to network failure)
df_go["Embedding"] = None  # initialize the column

In [39]:
for inx in range(45000,len(df_go["Text"])):
  text = df_go["Text"].iloc[inx]
  emb = model.encode(text, show_progress_bar=False)
  df_go.at[inx, "Embedding"] = emb

In [41]:
# Double check if embeddings are computed properly
import numpy as np
# shape
#print(np.array(df_to["Embedding"].iloc[0]).shape) # should be (384,) as the vector size is 384
print(np.array(df_go["Embedding"].iloc[48000])) # same
# one embedding?
#print(np.array(df_to["Text"].iloc[100]))
#print(np.array(df_to["Embedding"].iloc[100]))

[-6.06856607e-02 -1.29217189e-02 -1.17196385e-02 -2.63589900e-02
  7.80542046e-02  4.14247215e-02 -6.39015138e-02  5.33708744e-02
  5.70967831e-02  3.51323485e-02  8.50360692e-02  6.25061318e-02
 -4.00778651e-03  1.20000169e-02 -1.41202984e-02  7.59869069e-03
  9.28496663e-03  2.03721728e-02 -2.35474352e-02 -4.17999402e-02
  4.71388809e-02 -1.88359115e-02 -4.22661491e-02 -2.91018728e-02
  8.89484026e-03 -5.28580099e-02 -1.09820567e-01 -2.84810569e-02
  4.03946154e-02 -3.08265928e-02 -6.79805726e-02  6.36262819e-02
 -4.02635410e-02  3.74441370e-02  4.53279689e-02 -4.68401704e-03
  5.10313222e-03  1.95257273e-02 -2.04112418e-02 -9.97910742e-03
  2.47644112e-02 -2.44500376e-02 -8.13512132e-02 -1.15313930e-02
  1.18534463e-02  5.58284111e-02 -7.65291182e-03 -5.05726114e-02
 -8.23423564e-02 -5.16016735e-03  5.06386384e-02 -3.69884633e-02
 -1.50492257e-02  4.18375023e-02  5.80368526e-02  3.70299220e-02
 -4.28057415e-03  1.11212237e-02  1.97176393e-02 -1.84149276e-02
 -2.62689870e-03 -3.95794

In [42]:
# Save the df with embeddings for later
#df_to.to_pickle("/content/drive/MyDrive/Projects/semantic_alignment_ontology/data/to_embeddings.pkl")
df_go.to_pickle("/content/drive/MyDrive/Projects/semantic_alignment_ontology/data/go_embeddings.pkl")