In [1]:
# Mount Google Drive into the Colab VM so the project files are reachable.
from google.colab import drive
drive.mount('/content/drive')

import os

# Make the project folder the working directory; the relative paths used later
# in this notebook ('data/...', 'models/...') are resolved against it.
os.chdir("/content/drive/MyDrive/Colab Notebooks/Searched Ranking/")
# Echo the working directory as the cell output to confirm the chdir worked.
os.getcwd()

Mounted at /content/drive


'/content/drive/MyDrive/Colab Notebooks/Searched Ranking'

In [13]:
# !pip install pandas
# !pip install sentence_transformers
# !pip install faiss-gpu

In [3]:
import pandas as pd

# Used to create the dense document vectors.
import torch
import sentence_transformers
from sentence_transformers import SentenceTransformer

# Used to create and store the Faiss index.
import faiss
import numpy as np
import pickle
from pathlib import Path

# Report the versions of the key libraries, one per line, so the environment
# that produced the results below can be reconstructed.
print(
    *(lib.__version__ for lib in (torch, pd, sentence_transformers, faiss, np)),
    sep="\n",
)

2.0.1+cu118
1.5.3
2.2.2
1.7.2
1.23.5


In [4]:
def vector_search(query, model, index, num_results=10):
    """Encode a query with a sentence-embedding model and search a FAISS index.

    Args:
        query (str or sequence of str): A single query string, or several
            query strings to be searched in one call.
        model: Object exposing ``encode(list_of_str)`` that returns one
            embedding per input string (e.g. a ``SentenceTransformer``).
        index: Ready-to-use (already deserialized) FAISS index exposing
            ``search(vectors, k=...)``.
        num_results (int): Number of nearest neighbours to return per query.

    Returns:
        D (:obj:`numpy.array` of `float`): Distances between each query and
            its results, one row per query.
        I (:obj:`numpy.array` of `int`): IDs stored in the index for the
            results, one row per query.

    """
    # A bare string must be wrapped, not iterated: ``list("abc")`` yields
    # ['a', 'b', 'c'], which would embed every character as its own query.
    queries = [query] if isinstance(query, str) else list(query)
    vectors = np.array(model.encode(queries), dtype="float32")
    D, I = index.search(vectors, k=num_results)
    return D, I


def id2details(df, I, column):
    """Look up `column` values for every ID in the first row of `I`.

    Returns a list with one inner list per ID, holding the `column` values of
    the rows whose DataFrame index label equals that ID (empty if none match).
    """
    details = []
    for hit_id in I[0]:
        row_mask = df.index == hit_id
        details.append(list(df.loc[row_mask, column]))
    return details

In [5]:
# Load the curated papers table from a path relative to the current working
# directory.
# NOTE(review): the preview below shows 'Unnamed: 0.1' and 'Unnamed: 0' columns,
# which look like index columns written out by earlier saves — confirm and
# consider dropping them.
df = pd.read_csv('data/data_curated.csv')

# Preview the first rows (rendered as the cell output).
df.head()

Unnamed: 0.1,Unnamed: 0,title,abstract,index
0,0,Clinical features of culture-proven Mycoplasma...,OBJECTIVE: This retrospective chart review des...,0
1,1,Nitric oxide: a pro-inflammatory mediator in l...,Inflammatory diseases of the respiratory tract...,1
2,2,Surfactant protein-D and pulmonary host defense,Surfactant protein-D (SP-D) participates in th...,2
3,3,Role of endothelin-1 in lung disease,Endothelin-1 (ET-1) is a 21 amino acid peptide...,3
4,4,Gene expression in epithelial cells in respons...,Respiratory syncytial virus (RSV) and pneumoni...,4


In [6]:
# Load the pretrained sentence-embedding checkpoint 'paraphrase-MiniLM-L3-v2'.
# NOTE(review): this comment used to name distilbert-base-nli-stsb-mean-tokens, which is not what the code loads.

model = SentenceTransformer('paraphrase-MiniLM-L3-v2')
# Move the model to the GPU when CUDA is available; otherwise leave it where it was created.
if torch.cuda.is_available():
    model = model.to(torch.device("cuda"))
# Show which device the model ended up on.
print(model.device)

Downloading (…)d7125/.gitattributes:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)90f41d7125/README.md:   0%|          | 0.00/4.01k [00:00<?, ?B/s]

Downloading (…)f41d7125/config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/69.6M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)d7125/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

Downloading (…)90f41d7125/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)41d7125/modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

cuda:0


In [7]:
# Convert paper titles (not abstracts) to dense vectors and time the encoding.
import time

start_time = time.time()
# At most the first 100000 titles are encoded; any later step that pairs
# `embeddings` with rows of `df` has to use the same number of rows.
embeddings = model.encode(df.title.to_list()[:100000], show_progress_bar=True)
diff_normal = time.time() - start_time  # elapsed wall-clock seconds
print("Done after {:.2f}".format(diff_normal))

Batches:   0%|          | 0/3125 [00:00<?, ?it/s]

Done after 33.52


In [9]:
import os

def print_size_of_model(model, label=""):
    """Print and return the on-disk size of a model's serialized state_dict.

    The state_dict is written to a uniquely named temporary file, so nothing in
    the current working directory is overwritten or deleted, and the temporary
    file is removed even if saving or measuring fails.

    Args:
        model: Object exposing ``state_dict()`` (e.g. a ``torch.nn.Module``).
        label (str): Name printed next to the size.

    Returns:
        int: Size of the serialized state_dict in bytes.
    """
    import tempfile  # local import: only needed by this helper

    fd, tmp_path = tempfile.mkstemp(suffix=".p")
    os.close(fd)  # torch.save reopens the path itself
    try:
        torch.save(model.state_dict(), tmp_path)
        size = os.path.getsize(tmp_path)
    finally:
        os.remove(tmp_path)
    print("model: ",label,' \t','Size (MB):', size/1e6)
    return size

In [12]:
# Report the size of the serialized model weights; `f` receives the size in bytes.
f=print_size_of_model(model,"model_gpu")

model:  model_gpu  	 Size (MB): 69.578645


## Indexing with minimization on Euclidean distance

In [None]:
# Step 1: Change data type (a single float32 matrix, one row per encoded title)
embeddings = np.array(embeddings, dtype="float32")

# Step 2: Instantiate the index (L2 distance, sized to the embedding dimension)
index = faiss.IndexFlatL2(embeddings.shape[1])

# Step 3: Pass the index to IndexIDMap so vectors can be stored under our own IDs
index = faiss.IndexIDMap(index)

# Step 4: Add vectors and their IDs. Take exactly one ID per embedded row
# instead of a hard-coded row count: add_with_ids needs as many IDs as vectors,
# so a fixed slice breaks whenever the number of encoded titles changes.
paper_ids = df.index.values[: embeddings.shape[0]].astype("int64")
index.add_with_ids(embeddings, paper_ids)

print(f"Number of vectors in the Faiss index: {index.ntotal}")

Number of vectors in the Faiss index: 400000


In [None]:
# Searching the index

# Show the query paper (row label 555) so the hits can be judged by eye.
print(df.loc[555, "title"])
print(df.loc[555, "abstract"])
print()

# Retrieve the 10 nearest neighbours
# NOTE(review): embeddings[555] is positional while df.loc[555] is label-based;
# they refer to the same paper only if df keeps its default 0..n-1 index — confirm.
D, I = index.search(np.array([embeddings[555]]), k=10)
# NOTE(review): the values printed as "MAG paper IDs" are the IDs passed to
# add_with_ids when the index was built (df.index values).
print(f'L2 distance: {D.flatten().tolist()}\n\nMAG paper IDs: {I.flatten().tolist()}')

Gene Expression Profile during Chondrogenesis in Human Bone Marrow derived Mesenchymal Stem Cells using a cDNA Microarray
Mesenchymal stem cells (MSCs) have the capacity to proliferate and differentiate into multiple connective tissue lineages, which include cartilage, bone, and fat. Cartilage differentiation and chondrocyte maturation are required for normal skeletal development, but the intracellular pathways regulating this process remain largely unclear. This study was designed to identify novel genes that might help clarify the molecular mechanisms of chondrogenesis. Chondrogenesis was induced by culturing human bone marrow (BM) derived MSCs in micromass pellets in the presence of defined medium for 3, 7, 14 or 21 days. Several genes regulated during chondrogenesis were then identified by reverse transcriptase-polymerase chain reaction (RT-PCR). Using an ABI microarray system, we determined the differential gene expression profiles of differentiated chondrocytes and BM-MSCs. Norma

In [None]:
# Fetch the paper titles for the FAISS hits: each ID in the first row of `I`
# is matched against the DataFrame index labels.
id2details(df, I, 'title')

[['Gene Expression Profile during Chondrogenesis in Human Bone Marrow derived Mesenchymal Stem Cells using a cDNA Microarray'],
 ['Optimization of Gmp-compatible Biobanking of Allogeneic Bone Marrow-derived Clonal Mesenchymal Stromal Cells for Cell Therapy Applications'],
 ['Human Bone Marrow Mesenchymal Stem/Stromal Cells Exposed to an Inflammatory Environment Increase the Expression of ICAM-1 and Release Microvesicles Enriched in This Adhesive Molecule: Analysis of the Participation of TNF-α and IFN-γ'],
 ['Human bone marrow-derived mesenchymal cells differentiate and mature into endocrine pancreatic lineage in vivo.'],
 ['Efficient isolation and chondrogenic differentiation of adult mesenchymal stem cells with fibrin microbeads and micronized collagen sponges.'],
 ['Single cell transcriptomic analysis of human pluripotent stem cell chondrogenesis.'],
 ['Regulatory effects of miR-28 on osteogenic differentiation of human bone marrow mesenchymal stem cells'],
 ['Umbilical Cord Mesench

In [None]:
# Fetch the paper abstracts for the FAISS hits: each ID in the first row of `I`
# is matched against the DataFrame index labels.
id2details(df, I, 'abstract')

[['Mesenchymal stem cells (MSCs) have the capacity to proliferate and differentiate into multiple connective tissue lineages, which include cartilage, bone, and fat. Cartilage differentiation and chondrocyte maturation are required for normal skeletal development, but the intracellular pathways regulating this process remain largely unclear. This study was designed to identify novel genes that might help clarify the molecular mechanisms of chondrogenesis. Chondrogenesis was induced by culturing human bone marrow (BM) derived MSCs in micromass pellets in the presence of defined medium for 3, 7, 14 or 21 days. Several genes regulated during chondrogenesis were then identified by reverse transcriptase-polymerase chain reaction (RT-PCR). Using an ABI microarray system, we determined the differential gene expression profiles of differentiated chondrocytes and BM-MSCs. Normalization of this data resulted in the identification of 1,486 differentially expressed genes. To verify gene expression

In [None]:
# Serialise index and store it as a pickle.
# Create the output folder first so the save does not fail with
# FileNotFoundError when 'models/' is missing.
# NOTE: only unpickle this file when it comes from a trusted location.
index_path = Path("models") / "faiss_index.pickle"
index_path.parent.mkdir(parents=True, exist_ok=True)
with index_path.open("wb") as h:
    pickle.dump(faiss.serialize_index(index), h)

In [None]:
# https://github.com/facebookresearch/faiss/wiki/

# Using Annoy to create the search query structure

In [None]:
# !pip install annoy

Collecting annoy
  Downloading annoy-1.17.3.tar.gz (647 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/647.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m174.1/647.5 kB[0m [31m5.0 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━[0m [32m573.4/647.5 kB[0m [31m8.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m647.5/647.5 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: annoy
  Building wheel for annoy (setup.py) ... [?25l[?25hdone
  Created wheel for annoy: filename=annoy-1.17.3-cp310-cp310-linux_x86_64.whl size=552446 sha256=161b3ece5bc7b5574647e23fb1baa00c08c0d571dd862c28a2d0d4987b7a83aa
  Stored in directory: /root/.cache/pip/wheels/64/8a/da/f714bcf46c5efdcfcac0559e63370c21abe961c48e39

In [None]:
from annoy import AnnoyIndex
from sklearn.preprocessing import normalize

In [None]:
# Rescale the embeddings with sklearn's `normalize` (default arguments) before
# they are added to the Annoy index.
normalized_embed = normalize(embeddings)

In [None]:
# Define the dimension of your embeddings
embedding_dim = embeddings.shape[1]  # Assuming all embeddings have the same dimension

# Initialize the Annoy index
annoy_index = AnnoyIndex(embedding_dim, 'angular')  # 'angular' is a good choice for cosine similarity

# Annoy item ids are the row positions (0, 1, 2, ...) of the normalized
# embeddings; the slice caps the index at the first 400000 rows.
for i, emb in enumerate(normalized_embed[:400000]):
    annoy_index.add_item(i, emb)

In [None]:
# Build the Annoy forest from the items added so far.
# NOTE(review): no random seed is set here, so the trees may differ between
# runs — confirm whether reproducible neighbours matter.
n_trees = 100  # Number of trees in the index
annoy_index.build(n_trees)

True

In [None]:
# Query by item id: ask Annoy for the `num_neighbors` items closest to item 555.
# In the results displayed further down, the query item itself is the first hit.
num_neighbors = 10
query_index = 555
nearest_neighbors = annoy_index.get_nns_by_item(query_index, num_neighbors)

In [None]:
# For every Annoy hit, select the matching row(s) through the DataFrame's
# "index" column. Each list entry is a pandas Series, not a plain string.
nearest_neighbor_title = [
    df.loc[df["index"] == neighbor_id, "title"] for neighbor_id in nearest_neighbors
]

nearest_neighbor_abstract = [
    df.loc[df["index"] == neighbor_id, "abstract"] for neighbor_id in nearest_neighbors
]

In [None]:
# Display the title Series collected for each Annoy neighbour.
nearest_neighbor_title

[555    Gene Expression Profile during Chondrogenesis ...
 Name: title, dtype: object,
 21273    Regulatory effects of miR-28 on osteogenic dif...
 Name: title, dtype: object,
 97630    Single cell transcriptomic analysis of human p...
 Name: title, dtype: object,
 21566    Mesenchymal Stem Cell–Immune Cell Interaction ...
 Name: title, dtype: object,
 201164    Optimization of Gmp-compatible Biobanking of A...
 Name: title, dtype: object,
 100248    Human bone marrow-derived mesenchymal cells di...
 Name: title, dtype: object,
 278610    Mesenchymal stem cells: Biological characteris...
 Name: title, dtype: object,
 278611    Mesenchymal stem cells: Biological characteris...
 Name: title, dtype: object,
 68278    Mesenchymal stem cells: Biological characteris...
 Name: title, dtype: object,
 293553    Umbilical Cord Mesenchymal Stem Cell-Derived N...
 Name: title, dtype: object]

In [None]:
# Display the abstract Series collected for each Annoy neighbour.
nearest_neighbor_abstract

[555    Mesenchymal stem cells (MSCs) have the capacit...
 Name: abstract, dtype: object,
 21273    We aimed to assess the regulatory effects of m...
 Name: abstract, dtype: object,
 97630    The therapeutic application of human induced p...
 Name: abstract, dtype: object,
 21566    Critical bone defects and related delayed unio...
 Name: abstract, dtype: object,
 201164    Background & Aim: Allogeneic mesenchymal strom...
 Name: abstract, dtype: object,
 100248    BACKGROUND AIMS The scarcity of human islets f...
 Name: abstract, dtype: object,
 278610    Mesenchymal stem cells (MSCs) are multipotent ...
 Name: abstract, dtype: object,
 278611    Mesenchymal stem cells (MSCs) are multipotent ...
 Name: abstract, dtype: object,
 68278    Mesenchymal stem cells (MSCs) are multipotent ...
 Name: abstract, dtype: object,
 293553    Recombinant human bone morphogenetic protein 2...
 Name: abstract, dtype: object]

In [None]:
# Saving the tree for use
# Make sure the output folder exists before Annoy writes the file.
Path("models").mkdir(parents=True, exist_ok=True)

annoy_index.save('models/neighbor_tree.tree')

# To reload later, recreate the index with the same dimension and metric and
# load the file written above:
# f = embeddings.shape[1] # for paraphrase-MiniLM-L3-v2 vector dimensions are 384
# u = AnnoyIndex(f, 'angular')
# u.load('models/neighbor_tree.tree')

True