# **Verification Lab2 - NLP Medical Question Filtering**


***
# Setup

## Imports

In [1]:
# General imports
import os
from datetime import datetime
import time

import pandas as pd
import numpy as np
from pathlib import Path
import pickle
import uuid

# Plots
import matplotlib.pyplot as plt
plt.style.use('ggplot')

# Embedding
from sentence_transformers import SentenceTransformer



In [2]:
# Check Installs
#

import sys
import platform

print("System Information:")
print("=" * 50)
print(f"Python version: {sys.version}")
print(f"Platform: {platform.platform()}")
print(f"Architecture: {platform.architecture()}")
print()

conda_env = os.environ.get('CONDA_DEFAULT_ENV', 'Unknown')
print(f"Conda Environment: {conda_env}")
print()

!pip list | grep -E "(pandas|numpy|matplotlib|scikit-learn|tensorflow|tensorflow-metal|keras-tuner|sentence-transformers)"

System Information:
Python version: 3.11.0 | packaged by conda-forge | (main, Jan 14 2023, 12:25:12) [Clang 14.0.6 ]
Platform: macOS-15.6-arm64-arm-64bit
Architecture: ('64bit', '')

Conda Environment: CDT_DAIR_v1

keras-tuner             1.4.7
matplotlib              3.10.6
matplotlib-inline       0.1.7
numpy                   1.23.5
pandas                  2.3.2
scikit-learn            1.7.1
sentence-transformers   5.1.1
tensorflow              2.12.0
tensorflow-estimator    2.12.0


## Project Setup

In [3]:
# Project Setup
#

# ---- Run parameters ---------------------------------
run_name = 'Test_Run_2'
# -----------------------------------------------------

# The project root is the parent of the current working directory
local_project_folder = Path.cwd().parent

# The data folder must already exist; it is never created here
data_folder = local_project_folder / 'Data'
if not data_folder.exists():
    raise FileNotFoundError(f'{data_folder} does not exist')

# Same rule for the top-level results folder
results_folder = local_project_folder / 'Results'
if not results_folder.exists():
    raise FileNotFoundError(f'{results_folder} does not exist')

# Per-run results folder, created if missing
# (disabled alternative naming:
#  run_name = f"Run_{run_name}_{datetime.now().strftime('%Y%m%d')}")
run_results_folder = results_folder / run_name
run_results_folder.mkdir(parents=True, exist_ok=True)

# Remove the intermediates; only data_folder and run_results_folder remain defined
del local_project_folder
del results_folder
del run_name

***
# WIP - Feature Visualisation

In [None]:
# TODO: Visualise feature entanglement etc, using clustering

# Word embeddings disentanglement: https://aclanthology.org/2020.aacl-main.72.pdf
# look at PCA, t-SNE and UMAP eg https://voxel51.com/blog/how-to-visualize-your-data-with-dimension-reduction-techniques

# Full example of similarity etc: https://www.perplexity.ai/search/i-have-embedded-text-sentences-kJqSX900ReCGPgkvTZyLkQ?preview=1

In [5]:
# Load the original text and embeddings
# Later cells read the 'query' and 'query-embedding' columns from this frame.
# NOTE(review): unpickling can execute arbitrary code - only load a
# query_data_embeddings.pkl that this project wrote itself.
query_df = pd.read_pickle(run_results_folder.joinpath('query_data_embeddings.pkl'))


In [6]:
def compute_semantic_similarity(self):
    '''Compute row-wise similarity between original and PGD embeddings.

    Row ``i`` of ``self.original_embeddings`` is compared with row ``i`` of
    ``self.pgd_embeddings``. The computation uses NumPy only, so it does not
    depend on scikit-learn's pairwise helpers being in scope.

    Returns:
        dict with keys
        ``cosine_similarities``  - list of per-row cosine similarities,
        ``euclidean_distances``  - list of per-row Euclidean distances,
        ``mean_cosine_sim`` / ``std_cosine_sim``,
        ``mean_euclidean_dist`` / ``std_euclidean_dist``.
        For empty input the lists are empty and the four statistics are NaN.

    Raises:
        ValueError: if the two embedding arrays differ in shape, or are
            non-empty but not 2-D (rows = samples, columns = features).
    '''
    original = np.asarray(self.original_embeddings, dtype=float)
    perturbed = np.asarray(self.pgd_embeddings, dtype=float)

    if original.shape != perturbed.shape:
        raise ValueError(
            'original_embeddings and pgd_embeddings must have the same shape, '
            f'got {original.shape} and {perturbed.shape}'
        )

    if original.size == 0:
        # Nothing to compare: report NaN statistics without triggering
        # NumPy's "mean of empty slice" warning.
        nan = float('nan')
        return {
            'cosine_similarities': [],
            'euclidean_distances': [],
            'mean_cosine_sim': nan,
            'std_cosine_sim': nan,
            'mean_euclidean_dist': nan,
            'std_euclidean_dist': nan
        }

    if original.ndim != 2:
        raise ValueError(
            f'embeddings must be 2-D (samples x features), got {original.ndim}-D'
        )

    # Cosine similarity (most common for sentence embeddings)
    dot_products = np.einsum('ij,ij->i', original, perturbed)
    norm_products = np.linalg.norm(original, axis=1) * np.linalg.norm(perturbed, axis=1)
    # A zero vector has no direction; dividing by 1 yields a similarity of 0
    # (the dot product is also 0) instead of NaN.
    norm_products[norm_products == 0] = 1.0
    cosine_similarities = (dot_products / norm_products).tolist()

    # Euclidean distance
    euclidean_dists = np.linalg.norm(original - perturbed, axis=1).tolist()

    return {
        'cosine_similarities': cosine_similarities,
        'euclidean_distances': euclidean_dists,
        'mean_cosine_sim': np.mean(cosine_similarities),
        'std_cosine_sim': np.std(cosine_similarities),
        'mean_euclidean_dist': np.mean(euclidean_dists),
        'std_euclidean_dist': np.std(euclidean_dists)
    }


In [None]:
import torch

In [10]:
# Determine Semantic Similarity
#

def calc_semantic_similarity(embedding_model, orig_embeddings, compare_embeddings):
    """Score two sets of embeddings against each other with the model's own similarity.

    Args:
        embedding_model: object exposing ``similarity(a, b)`` whose result
            supports ``.cpu().numpy()``.
        orig_embeddings: first set of embeddings, passed through unchanged.
        compare_embeddings: second set of embeddings, passed through unchanged.

    Returns:
        The result of ``embedding_model.similarity`` moved to the CPU and
        converted to a NumPy array.
    """
    return embedding_model.similarity(orig_embeddings, compare_embeddings).cpu().numpy()


In [11]:
# Quick test
sbert_model = SentenceTransformer("all-MiniLM-L6-v2")

# Two small slices of the query frame to score against each other
test_a, test_b = query_df.iloc[5:8], query_df.iloc[9:12]
queries_a = test_a['query'].to_list()
queries_b = test_b['query'].to_list()
embeddings_a = np.asarray(test_a['query-embedding'].tolist())
embeddings_b = np.asarray(test_b['query-embedding'].tolist())

similarities = calc_semantic_similarity(sbert_model, embeddings_a, embeddings_b)

print(embeddings_a.shape, embeddings_b.shape)
print(similarities)

# Each query from the first slice, followed by its score against every
# query from the second slice
for idx_i, sentence in enumerate(queries_a):
    print(sentence)
    for idx_j, sentence2 in enumerate(queries_b):
        print(f" - {sentence2: <30}: {similarities[idx_i, idx_j]:.4f}")

(3, 384) (3, 384)
[[ 0.06831154  0.4459902   0.00077163]
 [ 0.15523376  0.02250158 -0.01895111]
 [ 0.09924722  0.0597299   0.16938508]]
Bad reaction to anti-depression meds, wondering if there's something underlying... Not sure how to ask
 - Help with a mole              : 0.0683
 - Why do I cry when I occasionally take valium (as prescribed)?: 0.4460
 - How to heal a sprain wrist (TFCC injury)?: 0.0008
What are these white bumps in my nostrils? Is it infected? They are sensitive to the touch.
 - Help with a mole              : 0.1552
 - Why do I cry when I occasionally take valium (as prescribed)?: 0.0225
 - How to heal a sprain wrist (TFCC injury)?: -0.0190
What can I do to avoid these nail infections?
 - Help with a mole              : 0.0992
 - Why do I cry when I occasionally take valium (as prescribed)?: 0.0597
 - How to heal a sprain wrist (TFCC injury)?: 0.1694


In [12]:
from sklearn.metrics.pairwise import cosine_similarity


# Quick test: the same query slices, scored with scikit-learn's cosine_similarity
test_a = query_df.iloc[5:8]
test_b = query_df.iloc[9:12]
queries_a = test_a['query'].tolist()
queries_b = test_b['query'].tolist()
embeddings_a = np.array(test_a['query-embedding'].to_list())
embeddings_b = np.array(test_b['query-embedding'].to_list())

sim_matrix = cosine_similarity(embeddings_a, embeddings_b)

print(embeddings_a.shape, embeddings_b.shape)
print(sim_matrix)

# Output the pairs with their score.
# Read from sim_matrix (computed in this cell), not from `similarities`, which
# is left over from the earlier SBERT cell and would hide any difference
# between the two methods.
for idx_i, sentence in enumerate(queries_a):
    print(sentence)
    for idx_j, sentence2 in enumerate(queries_b):
        print(f" - {sentence2: <30}: {sim_matrix[idx_i][idx_j]:.4f}")

(3, 384) (3, 384)
[[ 0.06831154  0.4459902   0.00077163]
 [ 0.15523376  0.02250158 -0.01895111]
 [ 0.09924722  0.0597299   0.16938508]]
Bad reaction to anti-depression meds, wondering if there's something underlying... Not sure how to ask
 - Help with a mole              : 0.0683
 - Why do I cry when I occasionally take valium (as prescribed)?: 0.4460
 - How to heal a sprain wrist (TFCC injury)?: 0.0008
What are these white bumps in my nostrils? Is it infected? They are sensitive to the touch.
 - Help with a mole              : 0.1552
 - Why do I cry when I occasionally take valium (as prescribed)?: 0.0225
 - How to heal a sprain wrist (TFCC injury)?: -0.0190
What can I do to avoid these nail infections?
 - Help with a mole              : 0.0992
 - Why do I cry when I occasionally take valium (as prescribed)?: 0.0597
 - How to heal a sprain wrist (TFCC injury)?: 0.1694


In [None]:
# Quick cosine similarity

# Three queries scored against a window of six
test_a, test_b = query_df.iloc[5:8], query_df.iloc[9:15]
queries_a = test_a['query'].to_list()
queries_b = test_b['query'].to_list()
embeddings_a = np.asarray(test_a['query-embedding'].tolist())
embeddings_b = np.asarray(test_b['query-embedding'].tolist())

# Compute cosine similarities with the model's own similarity function
sbert_model = SentenceTransformer("all-MiniLM-L6-v2")
similarities = sbert_model.similarity(embeddings_a, embeddings_b)

# Each query from the first slice, followed by its score against every
# query from the second slice
for idx_i, sentence in enumerate(queries_a):
    print(sentence)
    for idx_j, sentence2 in enumerate(queries_b):
        print(f" - {sentence2: <30}: {similarities[idx_i, idx_j]:.4f}")

In [None]:
# conda install umap-learn


In [None]:
# Dimension reduction and clustering libraries
import umap
# import hdbscan
import sklearn.cluster as cluster
from sklearn.metrics import adjusted_rand_score, adjusted_mutual_info_score

In [None]:
def compute_semantic_similarity(self):
    '''Compute row-wise similarity between original and PGD embeddings.

    Row ``i`` of ``self.original_embeddings`` is compared with row ``i`` of
    ``self.pgd_embeddings``. The computation uses NumPy only, so it does not
    depend on scikit-learn's pairwise helpers being in scope.

    Returns:
        dict with keys
        ``cosine_similarities``  - list of per-row cosine similarities,
        ``euclidean_distances``  - list of per-row Euclidean distances,
        ``mean_cosine_sim`` / ``std_cosine_sim``,
        ``mean_euclidean_dist`` / ``std_euclidean_dist``.
        For empty input the lists are empty and the four statistics are NaN.

    Raises:
        ValueError: if the two embedding arrays differ in shape, or are
            non-empty but not 2-D (rows = samples, columns = features).
    '''
    original = np.asarray(self.original_embeddings, dtype=float)
    perturbed = np.asarray(self.pgd_embeddings, dtype=float)

    if original.shape != perturbed.shape:
        raise ValueError(
            'original_embeddings and pgd_embeddings must have the same shape, '
            f'got {original.shape} and {perturbed.shape}'
        )

    if original.size == 0:
        # Nothing to compare: report NaN statistics without triggering
        # NumPy's "mean of empty slice" warning.
        nan = float('nan')
        return {
            'cosine_similarities': [],
            'euclidean_distances': [],
            'mean_cosine_sim': nan,
            'std_cosine_sim': nan,
            'mean_euclidean_dist': nan,
            'std_euclidean_dist': nan
        }

    if original.ndim != 2:
        raise ValueError(
            f'embeddings must be 2-D (samples x features), got {original.ndim}-D'
        )

    # Cosine similarity (most common for sentence embeddings)
    dot_products = np.einsum('ij,ij->i', original, perturbed)
    norm_products = np.linalg.norm(original, axis=1) * np.linalg.norm(perturbed, axis=1)
    # A zero vector has no direction; dividing by 1 yields a similarity of 0
    # (the dot product is also 0) instead of NaN.
    norm_products[norm_products == 0] = 1.0
    cosine_similarities = (dot_products / norm_products).tolist()

    # Euclidean distance
    euclidean_dists = np.linalg.norm(original - perturbed, axis=1).tolist()

    return {
        'cosine_similarities': cosine_similarities,
        'euclidean_distances': euclidean_dists,
        'mean_cosine_sim': np.mean(cosine_similarities),
        'std_cosine_sim': np.std(cosine_similarities),
        'mean_euclidean_dist': np.mean(euclidean_dists),
        'std_euclidean_dist': np.std(euclidean_dists)
    }
