In [1]:
import tensorflow as tf
from tensorflow.python.client import device_lib
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [2]:
device_lib.list_local_devices()

[name: "/device:CPU:0"
 device_type: "CPU"
 memory_limit: 268435456
 locality {
 }
 incarnation: 10437684093684072020
 xla_global_id: -1,
 name: "/device:GPU:0"
 device_type: "GPU"
 memory_limit: 2926942619
 locality {
   bus_id: 1
   links {
   }
 }
 incarnation: 8437519852791075006
 physical_device_desc: "device: 0, name: Quadro P620, pci bus id: 0000:01:00.0, compute capability: 6.1"
 xla_global_id: 416903419]

In [3]:
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords
import torch
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer,util
import numpy as np
import json
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.filterwarnings('ignore')
stop_words=stopwords.words('english')
punctuation=string.punctuation

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
def save_custom_embeddings(model_path,corpus_embeddings):
    #store sentences & embeddings on disc
    with open(model_path+'\\'+'embeddings.pkl',"wb") as fout:
        pickle.dump({'Sentences':corpus, 'embeddings': corpus_embeddings},fout)
    print("saved Custom embeddings")

def load_custom_embeddings(model_path):
    with open(model_path+'/embeddings.pkl',"rb") as fin:
        stored_data = pickle.load(fin)
        stored_sentences = stored_data['Sentences']
        stored_embeddings = stored_data['embeddings']
    return stored_sentences,stored_embeddings

def get_embeddings(sentence):
    #encode sentence to get sentence embeddings
    sentence_embedding=model.encode(sentence, convert_to_tensor=True)
    return sentence_embedding

def sentence_similarity_scores(sentence_embedding,
                              custom_embeddings,
                              stored_sentences,
                              top_k,
                              input_sentence):
    #computing similarity scores with the corpus
    cos_scores= util.pytorch_cos_sim(sentence_embedding, custom_embeddings)[0]
    #sort the results in decreasing order and get the first top_k
    top_results = np.argpartition(-cos_scores, range(top_k))[0:top_k]
    print("sentence :", input_sentence, "\n")
    print("Top", top_k, "most similar sentences in corpus")
    results={}
    for idx in top_results[0:top_k]:
        print(stored_sentences[idx],"(scores:%4f)" % (cos_scores[idx]))
        results[f"sentence{idx}"]= ({"predicted_sentence": stored_sentences[idx],"Scores" : float(cos_scores[idx])})
    return results

def clean_text(text):
    # Convert the text to title case
    text = str(text).title()
    # Remove the punctuation
    text = ''.join([c for c in text if c not in punctuation])
    # Remove the stop words
    tokens = [token for token in text.split() if token.lower() not in stop_words]
    # Convert the tokens back to a string
    cleaned_text = ' '.join(tokens)
    return cleaned_text


def concate_column_text(data):
    df["concated_text"]=df[["Category Name","Service name","Service Classification"]].astype(str).agg(' '.join,axis=1)
    return df["concated_text"]


def convert_column_to_list(data):
    data=data.tolist()
    return data

def convert_df_to_list(data):
    all_data=[]
    corpus=[]
    for values in df.columns:
        listin=df[values].tolist()
        all_data.append(listin)
    complete_data = [element for innerList in all_data for element in innerList]
    for word in complete_data:
        if word not in corpus:
            corpus.append(word)
    return corpus

In [5]:
# Define the path to the Excel files
path_to_files = r"C:\Users\AL44096\Documents\NLS_excel_files"

# Define the names of the Excel files
file_names = ['National_Inc_Exc_Classification.xlsx'
              ]

# Define the name of the sheet in each Excel file that contains the text data
#sheet_name = 'Sheet1'

# Load and clean the text data from each Excel file
cleaned_data = []
for file_name in file_names:
    # Load the data from the Excel file into a Pandas DataFrame
    df = pd.read_excel(f'{path_to_files}/{file_name}')
    data=concate_column_text(df)
    #cleaned_data.append(data)
    # Extract the relevant text data from the DataFrame
    text_data = convert_column_to_list(data)
    # Clean the text data
    cleaned_text_data = [clean_text(text) for text in text_data]
    cleaned_data.append(cleaned_text_data)
corpus = [element for innerList in cleaned_data for element in innerList]

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42097 entries, 0 to 42096
Data columns (total 6 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Category Name             42097 non-null  object
 1   Service name              42097 non-null  object
 2   Inclusion/Exclusion Name  42097 non-null  object
 3   Type                      42097 non-null  object
 4   Service Classification    42097 non-null  object
 5   concated_text             42097 non-null  object
dtypes: object(6)
memory usage: 1.9+ MB


In [7]:
from datasets import Dataset

comments_dataset = Dataset.from_pandas(df)
comments_dataset

Dataset({
    features: ['Category Name', 'Service name', 'Inclusion/Exclusion Name', 'Type', 'Service Classification', 'concated_text'],
    num_rows: 42097
})

In [8]:
comments_dataset['concated_text'][0]

'ABA Therapy ABA Therapy Inpatient Professional Applied Behavioral Analysis'

In [9]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [10]:
embedding = get_embeddings(comments_dataset["concated_text"][0])
embedding.shape

torch.Size([384])

In [11]:
import numpy
embeddings_dataset = comments_dataset.map(
    lambda x: {"embeddings": get_embeddings(x["concated_text"]).numpy()}
)

                                                                                                                       

In [12]:
embeddings_dataset

Dataset({
    features: ['Category Name', 'Service name', 'Inclusion/Exclusion Name', 'Type', 'Service Classification', 'concated_text', 'embeddings'],
    num_rows: 42097
})

In [13]:
embeddings_dataset.add_faiss_index(column="embeddings")

100%|█████████████████████████████████████████████████████████████████████████████████| 43/43 [00:00<00:00, 161.99it/s]


Dataset({
    features: ['Category Name', 'Service name', 'Inclusion/Exclusion Name', 'Type', 'Service Classification', 'concated_text', 'embeddings'],
    num_rows: 42097
})

In [14]:
search_term = "WheelChair"
question_embedding = get_embeddings([search_term]).numpy()
question_embedding.shape

(1, 384)

In [15]:
scores, samples = embeddings_dataset.get_nearest_examples(
    "embeddings", question_embedding, k=5
)

In [16]:
import pandas as pd

samples_df = pd.DataFrame.from_dict(samples)
samples_df["scores"] = scores
samples_df.sort_values("scores", ascending=False, inplace=True)

In [17]:
for _, row in samples_df.iterrows():
    #print(row)
    #print(f"concated_text: {row.concated_text}")
    print(f"SCORE: {row.scores}")
    print(f"Service name: {row['Service name']}")

SCORE: 0.7771551609039307
Service name: Durable Medical Equipment
SCORE: 0.6895759105682373
Service name: Medical Supply - Base
SCORE: 0.6782286167144775
Service name: Medical Supply - Major Medical
SCORE: 0.5872913599014282
Service name: Ground Ambulance
SCORE: 0.5733773708343506
Service name: Air Ambulance
