# *The missing link:*
# CAS Applied Data Science Final Project
# Matthias Rinderknecht

## Notebook 4: Searching for matches using the Faiss package

In [None]:
import pandas as pd
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
import time
import gc

### Load the corpus to be encoded and precompute the index using Faiss

In [14]:
#works, loads the corpus to be encoded from .csv

# Start the timer
start_time = time.time()

# Load the pre-trained model
model = SentenceTransformer("/Users/marinder/Documents/CAS_ADS/Final Project/Model_7/linkage")

# Load the corpus from a CSV file
corpus_df = pd.read_csv('/Users/marinder/Documents/CAS_ADS/Final Project/Source Data/final_source/csv/ICTRP_only(CH=true)_12820x23.csv')

# Specify the columns to be embedded for index search
text_columns = ["scientificTitle", "publicTitle", "interventions", "healthConditions"]  # Replace with your actual column names

# Concatenate the specified columns into a single text
corpus = corpus_df[text_columns].apply(lambda row: ' '.join(row.values.astype(str)), axis=1).tolist()

# Encode the corpus
corpus_embeddings = model.encode(corpus)

# Normalize the embeddings - to ensure that IP->cosine sim which is bounded to 1
faiss.normalize_L2(corpus_embeddings)
# Dimension of our vectors
d = corpus_embeddings.shape[1]

# Creating a FAISS index for inner product
index = faiss.IndexFlatIP(d)  # Use IndexFlatIP to search with inner product

# Adding normalized corpus embeddings to the index
index.add(corpus_embeddings)

# Save the index
faiss.write_index(index, 'ICTRP_only_(CH=true)12820x23(sT,pT,int,hC).index')

# Stop the timer
end_time = time.time()

# Calculate the execution time
execution_time = end_time - start_time

# Print the execution time
print(f"Execution time: {execution_time:.1f} seconds")
# Free up memory
del corpus_df, corpus, corpus_embeddings, model
gc.collect()

Execution time: 502.74 seconds


8714

### Link a row from the corpus df to every search row using the precomputed index

In [None]:
# Settings
MODEL = "/Users/marinder/Documents/CAS_ADS/Final Project/Model_7/linkage"
INDEX = "ICTRP_only_(CH=true)12820x23(sT,pT,int,hC).index"
CORPUS = "/Users/marinder/Documents/CAS_ADS/Final Project/Source Data/final_source/csv/ICTRP_only(CH=true)_12820x23.csv"
SEARCH = "/Users/marinder/Documents/CAS_ADS/Final Project/Source Data/final_source/csv/BASEC_without_ICTRP_1535x12.csv"
QUERY = ["layTitle", "layTitle", "intervention", "disease"]
RESULT = '/Users/marinder/Documents/CAS_ADS/Final Project/Merges/FAISS/result_5.csv'

# Start the timer
start_time = time.time()

# Load the pre-trained model
model = SentenceTransformer(MODEL)

# Load the index (=precomputed corpus)
index = faiss.read_index(INDEX)

# Load the corpus from a CSV file
corpus_df = pd.read_csv(CORPUS)

# Load the new data to search for
query_df = pd.read_csv(SEARCH)

# Specify the columns to be embedded for querying
query_columns = QUERY

# Concatenate the specified columns into a single text for each row
queries = query_df[query_columns].apply(lambda row: ' '.join(row.values.astype(str)), axis=1).tolist()

# Encode the queries
query_embeddings = model.encode(queries)

# Normalize the query embeddings
faiss.normalize_L2(query_embeddings)

# Search for similar embeddings
k = 1  # Retrieve the top match
distances, indices = index.search(query_embeddings, k)

#initialize counters
correct_matches = 0
total_rows = len(query_df)

# Rename overlapping columns in new_row
query_df.columns = [f"new_{col}" if col in corpus_df.columns else col for col in query_df.columns]

# Rename overlapping columns in corpus_df
corpus_df.columns = [f"corpus_{col}" if col in query_df.columns else col for col in corpus_df.columns]

# Initialize an empty DataFrame to store the results
result_df = pd.DataFrame(columns=list(query_df.columns) + list(corpus_df.columns) + ['score'])

# Process the search results
for i, row in query_df.iterrows():
    query_embedding = query_embeddings[i]
    query_distance, query_index = distances[i][0], indices[i][0]

    # Create a new row by combining the query row and the matched corpus row
    result_row_values = list(query_df.iloc[i]) + list(corpus_df.iloc[query_index]) + [query_distance]
    result_row_df = pd.DataFrame([result_row_values], columns=result_df.columns)

    # Append the result_row_df to result_df
    result_df = pd.concat([result_df, result_row_df], ignore_index=True)
    
print(query_df.shape)
print (result_df.shape)
print (corpus_df.shape)

#reorder columns
new_column_order = [
    "score",
    "snctpId",
    "whoId",
    "trialId",
    "layTitle",
    "scientificTitle",
    "publicTitle",
    "intervention",
    "interventions",
    "disease",
    "healthConditions",
    "laySummary",
    "basecId",
    "inclusionCriteria",
    "exclusionCriteria",
    "studysites",
    "studySitesOther",
    "tags",
    "countries",
    "secondaryId",
    "primarySponsor",
    "phase",
    "primaryOutcome",
    "publicContactAffiliation",]

result_df = result_df[new_column_order]

# Save the result_df to a CSV file
result_df.to_csv(RESULT, index=False)

# Stop the timer
end_time = time.time()

# Calculate the execution time
execution_time = end_time - start_time

# Print the execution time
print(f"Execution time: {execution_time:.1f} seconds")

## Code for single matches (choosing one specific row in the query and matching the best row from the corpus to it)

In [12]:
MODEL = "/Users/marinder/Documents/CAS_ADS/Final Project/Model_7/linkage"
INDEX = "ICTRP_only_(CH=true)12820x23(sT,pT,int,hC).index"
CORPUS = "/Users/marinder/Documents/CAS_ADS/Final Project/Source Data/final_source/csv/ICTRP_only(CH=true)_12820x23.csv"
SEARCH = "/Users/marinder/Documents/CAS_ADS/Final Project/Source Data/final_source/csv/BASEC_without_ICTRP_1535x12.csv"

# Load the pre-trained model
model = SentenceTransformer(MODEL)

# Load the index (=precomputed corpus)
index = faiss.read_index(INDEX)

# Load the corpus from a CSV file
corpus_df = pd.read_csv(CORPUS)

# Load the new data to search for
query_df = pd.read_csv(SEARCH)

In [23]:
QUERY = ["layTitle", "layTitle", "intervention", "disease"]

# Get the row number to match from user input
matchrow = int(input("Choose row number to match (0-1535): "))

# Start the timer
start_time = time.time()

# Get the row from the chosen row number
one_row = query_df.iloc[matchrow]
print(one_row)

# Specify the columns to be embedded for querying
query_columns = QUERY

# Concatenate the specified columns into a single text for the selected row
query_text = ' '.join(one_row[query_columns].astype(str))

# Encode the query text
query_embedding = model.encode([query_text])

# Normalize the query embedding
faiss.normalize_L2(query_embedding)

# Search for similar embeddings
k = 1  # Retrieve the top match
distances, indices = index.search(query_embedding, k)

# Initialize counters
correct_matches = 0
total_rows = 1  # Since we're processing only one row

# Process the search result
query_distance, query_index = distances[0], indices[0]

print("Query_index:", query_index)

# Check if the match is correct by comparing "whoId" from the query row with the corresponding corpus entry
print("WhoId:", one_row['whoId'])
print("TrialId", corpus_df.iloc[query_index]['trialId'])

# Print the top match details
print("Similarity:", query_distance)
print(corpus_df.iloc[query_index]['scientificTitle'])

# Stop the timer
end_time = time.time()

# Calculate the execution time
execution_time = end_time - start_time

# Print the execution time
print(f"Execution time: {execution_time:.2f} seconds")

Choose row number to match (0-249): 999
basecId                                                     2022-00157
snctpId                                                 SNCTP000004998
whoId                                                      NCT04790253
layTitle             PRophylaktische Hirnbestrahlung oder aktive MA...
laySummary           Patientinnen und Patienten mit kleinzelligem L...
disease                                   kleinzelligem Lungenkarzinom
intervention         Studienarm mit aktiver Überwachung:\r\nMRT des...
inclusionCriteria     Alter >= 18 Jahre\r\n Histologisch/zytologi...
exclusionCriteria    \tVorherige Strahlentherapie des Gehirns oder...
studysites                           Zürich, Bern, Lausanne, St Gallen
studySitesOther                                                    NaN
tags                                                       Lungenkrebs
Name: 999, dtype: object
Query_index: [153]
WhoId: NCT04790253
TrialId 153    NCT04790253
Name: trialId, dty