In [7]:
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords
import torch
from transformers import AutoTokenizer, AutoModel
import numpy as np
import json

# Load the stop words and punctuation
nltk.download('stopwords')
stop_words = stopwords.words('english')
punctuation = string.punctuation

# Initialize the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('dmis-lab/biobert-v1.1')
model = AutoModel.from_pretrained('dmis-lab/biobert-v1.1')

# Define a function to clean the text data
def clean_text(text):
    # Convert the text to title case
    text = str(text).title()
    # Remove the punctuation
    text = ''.join([c for c in text if c not in punctuation])
    # Remove the stop words
    tokens = [token for token in text.split() if token.lower() not in stop_words]
    # Convert the tokens back to a string
    cleaned_text = ' '.join(tokens)
    return cleaned_text

# Define a function to calculate the cosine similarity between two embeddings
def cosine_similarity(a, b):
    dot_product = np.dot(a, b)
    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)
    return dot_product / (norm_a * norm_b)

#converting all column data to list..

def convert_df_to_list():
    all_data=[]
    for values in df.columns:
        listin=df[values].tolist()
        all_data.append(listin)
    complete_data = [element for innerList in all_data for element in innerList]
    return complete_data

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
similarity_scores

[]

In [8]:
# Define the path to the Excel files
path_to_files = r'C:\Users\HP\Downloads\carelon_files'

# Define the names of the Excel files
file_names = ['sheet_1.xlsx', 'sheet_2.xlsx', 'sheet_3.xlsx']

# Define the name of the sheet in each Excel file that contains the text data
sheet_name = 'Sheet1'

# Load and clean the text data from each Excel file
cleaned_data = []
for file_name in file_names:
    # Load the data from the Excel file into a Pandas DataFrame
    df = pd.read_excel(f'{path_to_files}/{file_name}', sheet_name=sheet_name)
    # Extract the relevant text data from the DataFrame
    text_data = convert_df_to_list()
    # Clean the text data
    cleaned_text_data = [clean_text(text) for text in text_data]
    # Tokenize the cleaned text data using the BioBERT tokenizer
    encoded_data = tokenizer(cleaned_text_data, padding=True, truncation=True, return_tensors='pt')
    # Get the contextualized embeddings for the cleaned text data using the BioBERT model
    with torch.no_grad():
        outputs = model(**encoded_data)
        embeddings = outputs.last_hidden_state.mean(dim=1)
    # Append the embeddings to the cleaned data list
    cleaned_data.append(embeddings.numpy())

In [11]:
cleaned_data[0]

array([[ 0.01367003,  0.03860356, -0.34865388, ..., -0.13681906,
         0.0144262 , -0.16563576],
       [-0.40226692, -0.12229208, -0.03835834, ...,  0.15528215,
         0.1065686 , -0.18299991],
       [-0.08860499, -0.15265234, -0.13606878, ...,  0.14138635,
         0.002732  ,  0.17150973],
       ...,
       [-0.0593734 ,  0.00131973, -0.29684758, ...,  0.3032733 ,
        -0.1957117 , -0.05673091],
       [-0.16588202,  0.03362194, -0.24173486, ...,  0.22262587,
         0.02308082,  0.18965939],
       [-0.16588202,  0.03362194, -0.24173486, ...,  0.22262587,
         0.02308082,  0.18965939]], dtype=float32)

In [13]:
# Concatenate the cleaned data into a single NumPy array
cleaned_data = np.concatenate(cleaned_data, axis=0)
cleaned_data

array([ 0.01367003,  0.03860356, -0.34865388, ..., -0.07210014,
        0.19417426,  0.26070973], dtype=float32)

In [None]:
# Define the search term
search_term = 'cancer'

# Tokenize the search term using the BioBERT tokenizer
cleaned_search_term = clean_text(search_term)
encoded_search_term = tokenizer(cleaned_search_term, padding=True, truncation=True, return_tensors='pt')

# Get the contextualized embedding for the search term using the BioBERT model
with torch.no_grad():
    search_embedding = model(**encoded_search_term)[0][0].numpy()

# Calculate the cosine similarity between the search term embedding and each sentence embedding
similarity_scores = []
for sentence_embedding in cleaned_data:
    similarity = cosine_similarity(search_embedding, sentence_embedding)
    similarity_scores.append(similarity)
