In [1]:
import tensorflow as tf
from tensorflow.python.client import device_lib
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [2]:
device_lib.list_local_devices()

[name: "/device:CPU:0"
 device_type: "CPU"
 memory_limit: 268435456
 locality {
 }
 incarnation: 3880007756055668052
 xla_global_id: -1,
 name: "/device:GPU:0"
 device_type: "GPU"
 memory_limit: 2926942619
 locality {
   bus_id: 1
   links {
   }
 }
 incarnation: 7201225017112348005
 physical_device_desc: "device: 0, name: Quadro P620, pci bus id: 0000:01:00.0, compute capability: 6.1"
 xla_global_id: 416903419]

In [3]:
pwd

'C:\\Users\\AL44096'

In [4]:
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords
import torch
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer,util
import numpy as np
import json
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.filterwarnings('ignore')
stop_words=stopwords.words('english')
punctuation=string.punctuation

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
def save_custom_embeddings(model_path,corpus_embeddings):
    #store sentences & embeddings on disc
    with open(model_path+'\\'+'embeddings.pkl',"wb") as fout:
        pickle.dump({'Sentences':corpus, 'embeddings': corpus_embeddings},fout)
    print("saved Custom embeddings")

def load_custom_embeddings(model_path):
    with open(model_path+'/embeddings.pkl',"rb") as fin:
        stored_data = pickle.load(fin)
        stored_sentences = stored_data['Sentences']
        stored_embeddings = stored_data['embeddings']
    return stored_sentences,stored_embeddings

def encode_sentence(model,sentence):
    #encode sentence to get sentence embeddings
    sentence_embedding=model.encode(sentence, convert_to_tensor=True)
    return sentence_embedding

def sentence_similarity_scores(sentence_embedding,
                              custom_embeddings,
                              stored_sentences,
                              top_k,
                              input_sentence):
    #computing similarity scores with the corpus
    cos_scores= util.pytorch_cos_sim(sentence_embedding, custom_embeddings)[0]
    #sort the results in decreasing order and get the first top_k
    top_results = np.argpartition(-cos_scores, range(top_k))[0:top_k]
    print("sentence :", input_sentence, "\n")
    print("Top", top_k, "most similar sentences in corpus")
    results={}
    for idx in top_results[0:top_k]:
        print(stored_sentences[idx],"(scores:%4f)" % (cos_scores[idx]))
        results[f"sentence{idx}"]= ({"predicted_sentence": stored_sentences[idx],"Scores" : float(cos_scores[idx])})
    return results

def clean_text(text):
    # Convert the text to title case
    text = str(text).title()
    # Remove the punctuation
    text = ''.join([c for c in text if c not in punctuation])
    # Remove the stop words
    tokens = [token for token in text.split() if token.lower() not in stop_words]
    # Convert the tokens back to a string
    cleaned_text = ' '.join(tokens)
    return cleaned_text


def concate_column_text(data):
    df["concated_text"]=df[["Category Name","Service name","Service Classification"]].astype(str).agg(' '.join,axis=1)
    return df["concated_text"]


def convert_column_to_list(data):
    data=data.tolist()
    return data

def convert_df_to_list(data):
    all_data=[]
    corpus=[]
    for values in df.columns:
        listin=df[values].tolist()
        all_data.append(listin)
    complete_data = [element for innerList in all_data for element in innerList]
    for word in complete_data:
        if word not in corpus:
            corpus.append(word)
    return corpus

In [6]:
# Define the path to the Excel files
path_to_files = r"C:\Users\AL44096\Documents\NLS_excel_files"

# Define the names of the Excel files
file_names = ['National_Inc_Exc_Classification.xlsx'
              ]

# Define the name of the sheet in each Excel file that contains the text data
#sheet_name = 'Sheet1'

# Load and clean the text data from each Excel file
cleaned_data = []
for file_name in file_names:
    # Load the data from the Excel file into a Pandas DataFrame
    df = pd.read_excel(f'{path_to_files}/{file_name}')
    data=concate_column_text(df)
    #cleaned_data.append(data)
    # Extract the relevant text data from the DataFrame
    text_data = convert_column_to_list(data)
    # Clean the text data
    cleaned_text_data = [clean_text(text) for text in text_data]
    cleaned_data.append(cleaned_text_data)
corpus = [element for innerList in cleaned_data for element in innerList]

In [7]:
corpus

['Aba Therapy Aba Therapy Inpatient Professional Applied Behavioral Analysis',
 'Aba Therapy Aba Therapy Inpatient Professional Applied Behavioral Analysis',
 'Aba Therapy Aba Therapy Office Professional Applied Behavioral Analysis',
 'Aba Therapy Aba Therapy Office Professional Applied Behavioral Analysis',
 'Aba Therapy Aba Therapy Outpatient Institutional Applied Behavioral Analysis',
 'Aba Therapy Aba Therapy Outpatient Institutional Applied Behavioral Analysis',
 'Aba Therapy Aba Therapy Outpatient Professional Applied Behavioral Analysis',
 'Aba Therapy Aba Therapy Outpatient Professional Applied Behavioral Analysis',
 'Aba Therapy Aba Therapy Inpatient Professional Applied Behavioral Analysis',
 'Aba Therapy Aba Therapy Office Professional Applied Behavioral Analysis',
 'Aba Therapy Aba Therapy Outpatient Institutional Applied Behavioral Analysis',
 'Aba Therapy Aba Therapy Outpatient Professional Applied Behavioral Analysis',
 'Aba Therapy Aba Therapy Inpatient Professional App

In [8]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [9]:
corpus_embeddings =  model.encode(corpus, convert_to_tensor=True)

In [10]:
corpus_embeddings

tensor([[-0.0325,  0.0726, -0.0241,  ..., -0.0065, -0.0062, -0.0449],
        [-0.0325,  0.0726, -0.0241,  ..., -0.0065, -0.0062, -0.0449],
        [-0.0481,  0.0700, -0.0213,  ..., -0.0014,  0.0204, -0.0162],
        ...,
        [ 0.0465,  0.0386,  0.0292,  ..., -0.0381,  0.0059, -0.0973],
        [ 0.0221,  0.0156, -0.0216,  ...,  0.0050, -0.0572, -0.0493],
        [ 0.0318,  0.0643,  0.0154,  ..., -0.0094, -0.0055, -0.0193]])

In [11]:
import pickle
model_path= r"C:\Users\AL44096\Documents\sentence_model"
save_custom_embeddings(model_path,corpus_embeddings)

saved Custom embeddings


In [12]:
stored_sentences,stored_embeddings=load_custom_embeddings(model_path)

In [13]:
stored_embeddings

tensor([[-0.0325,  0.0726, -0.0241,  ..., -0.0065, -0.0062, -0.0449],
        [-0.0325,  0.0726, -0.0241,  ..., -0.0065, -0.0062, -0.0449],
        [-0.0481,  0.0700, -0.0213,  ..., -0.0014,  0.0204, -0.0162],
        ...,
        [ 0.0465,  0.0386,  0.0292,  ..., -0.0381,  0.0059, -0.0973],
        [ 0.0221,  0.0156, -0.0216,  ...,  0.0050, -0.0572, -0.0493],
        [ 0.0318,  0.0643,  0.0154,  ..., -0.0094, -0.0055, -0.0193]])

# searching through faiss index

In [14]:
d = stored_embeddings.shape[1]

In [15]:
import faiss
index = faiss.IndexFlatL2(d)

In [16]:
index.is_trained

True

In [17]:
index.add(stored_embeddings)

In [18]:
index.ntotal

42097

In [19]:
k = 10
xq = model.encode(["WheelChair"])

In [20]:
%%time
D, I = index.search(xq, k)  # search
print(I)

[[ 3159  3160 11394 11393 11389 11390 11392  2929  2928 10861]]
CPU times: total: 0 ns
Wall time: 9.91 ms


In [21]:
[f'{i}: {corpus[i]}' for i in I[0]]

['3159: Ambulance Air Ambulance Wheelchair',
 '3160: Ambulance Ground Ambulance Wheelchair',
 '11394: Durable Medical Equipment Medical Supply Major Medical Wheelchair',
 '11393: Durable Medical Equipment Medical Supply Base Wheelchair',
 '11389: Durable Medical Equipment Durable Medical Equipment Wheelchair',
 '11390: Durable Medical Equipment Durable Medical Equipment Wheelchair',
 '11392: Durable Medical Equipment Durable Medical Equipment Major Medical Wheelchair',
 '2929: Ambulance Ground Ambulance Wheelchair Vans',
 '2928: Ambulance Air Ambulance Wheelchair Vans',
 '10861: Durable Medical Equipment Durable Medical Equipment Motorized Wheelchairs Purchase']

In [22]:
similar_sentences=[corpus[idx] for idx in I[0]]
similar_probabilities=[1-distance for distance in D[0]]

In [23]:
for sentence, probability in zip(similar_sentences,similar_probabilities):
    print("Similar Sentence :", sentence)
    print("probability :", probability)
    print()

Similar Sentence : Ambulance Air Ambulance Wheelchair
probability : 0.4266226291656494

Similar Sentence : Ambulance Ground Ambulance Wheelchair
probability : 0.4127086400985718

Similar Sentence : Durable Medical Equipment Medical Supply Major Medical Wheelchair
probability : 0.3398250341415405

Similar Sentence : Durable Medical Equipment Medical Supply Base Wheelchair
probability : 0.2941264510154724

Similar Sentence : Durable Medical Equipment Durable Medical Equipment Wheelchair
probability : 0.22284471988677979

Similar Sentence : Durable Medical Equipment Durable Medical Equipment Wheelchair
probability : 0.22284471988677979

Similar Sentence : Durable Medical Equipment Durable Medical Equipment Major Medical Wheelchair
probability : 0.20170152187347412

Similar Sentence : Ambulance Ground Ambulance Wheelchair Vans
probability : 0.19548887014389038

Similar Sentence : Ambulance Air Ambulance Wheelchair Vans
probability : 0.18857061862945557

Similar Sentence : Durable Medical E

# searching by creating an clusters

In [24]:
nlist = 50  # how many cells
quantizer = faiss.IndexFlatL2(d)
index = faiss.IndexIVFFlat(quantizer, d, nlist)

In [25]:
index.is_trained

False

In [26]:
index.train(stored_embeddings)
index.is_trained # check if index is now trained

True

In [27]:
index.add(stored_embeddings)
index.ntotal  # number of embeddings indexed

42097

In [28]:
%%time
D, I = index.search(xq, k)  # search
print(I)

[[33298 36411 36416 33306 33325 33301 34484 36383 33297 33322]]
CPU times: total: 0 ns
Wall time: 746 µs


In [29]:
[f'{i}: {corpus[i]}' for i in I[0]]

['33298: Therapies Occupational Therapy Office Professional First 60 Visits Cerebral Palsy',
 '36411: Therapies Occupational Therapy Office Professional First 60 Visits Spinal Manipulation',
 '36416: Therapies Occupational Therapy Outpatient Institutional First 60 Visits Spinal Manipulation',
 '33306: Therapies Physical Therapy Office Professional Cerebral Palsy',
 '33325: Therapies Physical Therapy Office Professional Cerebral Palsy',
 '33301: Therapies Occupational Therapy Outpatient Institutional First 60 Visits Cerebral Palsy',
 '34484: Therapies Physical Therapy Office Professional Intellectal Disability',
 '36383: Therapies Occupational Therapy Office Professional Specialist Spinal Manipulation',
 '33297: Therapies Occupational Therapy Office Professional Cerebral Palsy',
 '33322: Therapies Occupational Therapy Office Professional Cerebral Palsy']

In [30]:
similar_sentences=[corpus[idx] for idx in I[0]]
similar_probabilities=[1-distance for distance in D[0]]

In [31]:
for sentence, probability in zip(similar_sentences,similar_probabilities):
    print("Similar Sentence :", sentence)
    print("probability :", probability)
    print()

Similar Sentence : Therapies Occupational Therapy Office Professional First 60 Visits Cerebral Palsy
probability : -0.2512485980987549

Similar Sentence : Therapies Occupational Therapy Office Professional First 60 Visits Spinal Manipulation
probability : -0.25322437286376953

Similar Sentence : Therapies Occupational Therapy Outpatient Institutional First 60 Visits Spinal Manipulation
probability : -0.2608950138092041

Similar Sentence : Therapies Physical Therapy Office Professional Cerebral Palsy
probability : -0.26337242126464844

Similar Sentence : Therapies Physical Therapy Office Professional Cerebral Palsy
probability : -0.26337242126464844

Similar Sentence : Therapies Occupational Therapy Outpatient Institutional First 60 Visits Cerebral Palsy
probability : -0.2672005891799927

Similar Sentence : Therapies Physical Therapy Office Professional Intellectal Disability
probability : -0.2729761600494385

Similar Sentence : Therapies Occupational Therapy Office Professional Special

# Searching by all the 10 nearest clusters

In [32]:
index.nprobe = 10

In [33]:
%%time
D, I = index.search(xq, k)  # search
print(I)
print(D)

[[ 3159  3160 11394 11393 11389 11390 11392  2929  2928 10861]]
[[0.5733774  0.58729136 0.66017497 0.70587355 0.7771553  0.7771553
  0.7982985  0.80451113 0.8114294  0.81894016]]
CPU times: total: 0 ns
Wall time: 4.99 ms


In [34]:
similar_sentences=[corpus[idx] for idx in I[0]]
similar_probabilities=[1-distance for distance in D[0]]

In [35]:
for sentence, probability in zip(similar_sentences,similar_probabilities):
    print("Similar Sentence :", sentence)
    print("probability :", probability)
    print()

Similar Sentence : Ambulance Air Ambulance Wheelchair
probability : 0.4266226291656494

Similar Sentence : Ambulance Ground Ambulance Wheelchair
probability : 0.4127086400985718

Similar Sentence : Durable Medical Equipment Medical Supply Major Medical Wheelchair
probability : 0.3398250341415405

Similar Sentence : Durable Medical Equipment Medical Supply Base Wheelchair
probability : 0.2941264510154724

Similar Sentence : Durable Medical Equipment Durable Medical Equipment Wheelchair
probability : 0.22284471988677979

Similar Sentence : Durable Medical Equipment Durable Medical Equipment Wheelchair
probability : 0.22284471988677979

Similar Sentence : Durable Medical Equipment Durable Medical Equipment Major Medical Wheelchair
probability : 0.20170152187347412

Similar Sentence : Ambulance Ground Ambulance Wheelchair Vans
probability : 0.19548887014389038

Similar Sentence : Ambulance Air Ambulance Wheelchair Vans
probability : 0.18857061862945557

Similar Sentence : Durable Medical E

In [36]:
[f'{i}: {corpus[i]}' for i in I[0]]

['3159: Ambulance Air Ambulance Wheelchair',
 '3160: Ambulance Ground Ambulance Wheelchair',
 '11394: Durable Medical Equipment Medical Supply Major Medical Wheelchair',
 '11393: Durable Medical Equipment Medical Supply Base Wheelchair',
 '11389: Durable Medical Equipment Durable Medical Equipment Wheelchair',
 '11390: Durable Medical Equipment Durable Medical Equipment Wheelchair',
 '11392: Durable Medical Equipment Durable Medical Equipment Major Medical Wheelchair',
 '2929: Ambulance Ground Ambulance Wheelchair Vans',
 '2928: Ambulance Air Ambulance Wheelchair Vans',
 '10861: Durable Medical Equipment Durable Medical Equipment Motorized Wheelchairs Purchase']

In [37]:
p=[corpus[i] for i in I[0]]
p

['Ambulance Air Ambulance Wheelchair',
 'Ambulance Ground Ambulance Wheelchair',
 'Durable Medical Equipment Medical Supply Major Medical Wheelchair',
 'Durable Medical Equipment Medical Supply Base Wheelchair',
 'Durable Medical Equipment Durable Medical Equipment Wheelchair',
 'Durable Medical Equipment Durable Medical Equipment Wheelchair',
 'Durable Medical Equipment Durable Medical Equipment Major Medical Wheelchair',
 'Ambulance Ground Ambulance Wheelchair Vans',
 'Ambulance Air Ambulance Wheelchair Vans',
 'Durable Medical Equipment Durable Medical Equipment Motorized Wheelchairs Purchase']

In [38]:
for i in p:
    y=df.loc[df["concated_text"]==i]['Service name']#.values[0]#item()#iloc[0]
    print(y)#.to_string())

3159    Air Ambulance
Name: Service name, dtype: object
3160    Ground Ambulance
Name: Service name, dtype: object
Series([], Name: Service name, dtype: object)
Series([], Name: Service name, dtype: object)
11389    Durable Medical Equipment
11390    Durable Medical Equipment
Name: Service name, dtype: object
11389    Durable Medical Equipment
11390    Durable Medical Equipment
Name: Service name, dtype: object
Series([], Name: Service name, dtype: object)
2929    Ground Ambulance
Name: Service name, dtype: object
2928    Air Ambulance
Name: Service name, dtype: object
Series([], Name: Service name, dtype: object)


In [39]:
df.iloc[11394]['Service name']

'Medical Supply - Major Medical'