In [1]:
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords
import torch
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer,util
import numpy as np
import json
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.filterwarnings('ignore')
stop_words=stopwords.words('english')
punctuation=string.punctuation

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def save_custom_embeddings(model_path,corpus_embeddings):
    #store sentences & embeddings on disc
    with open(model_path+'\\'+'embeddings.pkl',"wb") as fout:
        pickle.dump({'Sentences':corpus, 'embeddings': corpus_embeddings},fout)
    print("saved Custom embeddings")

def load_custom_embeddings(model_path):
    with open(model_path+'/embeddings.pkl',"rb") as fin:
        stored_data = pickle.load(fin)
        stored_sentences = stored_data['Sentences']
        stored_embeddings = stored_data['embeddings']
    return stored_sentences,stored_embeddings

def get_embeddings(sentence):
    #encode sentence to get sentence embeddings
    sentence_embedding=model.encode(sentence, convert_to_tensor=True)
    return sentence_embedding

def sentence_similarity_scores(sentence_embedding,
                              custom_embeddings,
                              stored_sentences,
                              top_k,
                              input_sentence):
    #computing similarity scores with the corpus
    cos_scores= util.pytorch_cos_sim(sentence_embedding, custom_embeddings)[0]
    #sort the results in decreasing order and get the first top_k
    top_results = np.argpartition(-cos_scores, range(top_k))[0:top_k]
    print("sentence :", input_sentence, "\n")
    print("Top", top_k, "most similar sentences in corpus")
    results={}
    for idx in top_results[0:top_k]:
        print(stored_sentences[idx],"(scores:%4f)" % (cos_scores[idx]))
        results[f"sentence{idx}"]= ({"predicted_sentence": stored_sentences[idx],"Scores" : float(cos_scores[idx])})
    return results

def clean_text(text):
    # Convert the text to title case
    text = str(text).title()
    # Remove the punctuation
    text = ''.join([c for c in text if c not in punctuation])
    # Remove the stop words
    tokens = [token for token in text.split() if token.lower() not in stop_words]
    # Convert the tokens back to a string
    cleaned_text = ' '.join(tokens)
    return cleaned_text


def concate_column_text(data):
    df["concated_text"]=df[["Category Name","Service name","Service Classification"]].astype(str).agg(' '.join,axis=1)
    return df["concated_text"]


def convert_column_to_list(data):
    data=data.tolist()
    return data

def convert_df_to_list(data):
    all_data=[]
    corpus=[]
    for values in df.columns:
        listin=df[values].tolist()
        all_data.append(listin)
    complete_data = [element for innerList in all_data for element in innerList]
    for word in complete_data:
        if word not in corpus:
            corpus.append(word)
    return corpus

In [4]:
###Collecting the data from Mongodb
import pymongo
from pymongo import MongoClient

In [5]:
#Establish a connection to MongoDB
username="src_nlpbimdbg_sit"
password="Gh1FFBzFNHbQJoQ"
connection_uri = f'mongodb://{username}:{password}@ip-10-189-32-138.993514063544.us-east-2.awsdns.internal.das:37043/'  

client = MongoClient(connection_uri, ssl=True, tls=True) #tlsCAFile=rootChain, authSource='bpconadsDB')
#client = MongoClient('mongodb://localhost:27017')


db=client['nlpbimdbg']
collection=db['test_NLS2_benefits']

In [6]:
documents=collection.find()
data=[]
for document in documents:
    data.append(document)

In [7]:
df=pd.DataFrame(data)

In [8]:
df.head()

Unnamed: 0,_id,benefit_name,category,market_name,category:market,benefit_description,audit
0,644fbbc50f93dbb489bdb6d2,Virtual Office Exam Visit - Primary Care Physi...,[Physician / Medical Services],[IND],[Physician / Medical Services:IND],\n\nA Virtual Office Exam Visit - Primary Care...,"[{'user_id': 123, 'status': 'in review', 'new_..."
1,644fbbc50f93dbb489bdb6d3,Vision - Routine Vision Services - Primary Car...,[Physician / Medical Services],[IND],[Physician / Medical Services:IND],\n\nRoutine vision services are those that are...,
2,644fbbc60f93dbb489bdb6d4,Routine Eye Exam - Ages 18 Years and Under,"[Physician / Medical Services, Vision Exam]",[IND],"[Physician / Medical Services:IND, Vision Exam...",\n\nA routine eye exam for someone 18 years or...,
3,644fbbc60f93dbb489bdb6d5,Flu Immunization - Ages 6 Years and Over,[Physician / Medical Services],[IND],[Physician / Medical Services:IND],\n\nFlu immunization is a benefit that helps p...,
4,644fbbc60f93dbb489bdb6d6,Vision - Routine Vision Services - Specialist ...,[Physician / Medical Services],[IND],[Physician / Medical Services:IND],\n\nThis medical benefit provides coverage for...,


In [9]:
df=df.drop('_id', axis=1)

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16231 entries, 0 to 16230
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   benefit_name         16230 non-null  object
 1   category             16231 non-null  object
 2   market_name          16231 non-null  object
 3   category:market      16231 non-null  object
 4   benefit_description  1470 non-null   object
 5   audit                1 non-null      object
dtypes: object(6)
memory usage: 761.0+ KB


In [11]:
df = df[df['benefit_description'].notna()]

In [12]:
df.shape

(1470, 6)

In [20]:
train_data=df['benefit_description'].tolist()
train_data = [item.strip() for item in train_data]
train_data

['A Virtual Office Exam Visit - Primary Care Physician Non Chiropractor - Remaining Visit(s) is a medical benefit that allows the patient to see a primary care physician without having to go to their office. The visit is done virtually, using either a computer or a phone.',
 'Routine vision services are those that are considered necessary for the maintenance of good vision and eye health. These services may include comprehensive eye exams, contact lens fittings, and refractive surgery consultations.',
 'A routine eye exam for someone 18 years or younger would generally include a vision test and an evaluation of the health of the eyes. The doctor may also check for signs of common eye problems such as nearsightedness, farsightedness, astigmatism, and presbyopia.',
 'Flu immunization is a benefit that helps protect people from the flu, or influenza. The flu is a serious respiratory illness that can lead to hospitalization and even death. Every year, thousands of people in the United Stat

In [None]:
from datasets import Dataset

comments_dataset = Dataset.from_pandas(df)
comments_dataset

In [18]:
from sentence_transformers import SentenceTransformer, models
from torch import nn

## Step 1: use an existing language model
word_embedding_model = models.Transformer('sentence-transformers/all-MiniLM-L6-v2',max_seq_length=256)

## Step 2: use a pool function over the token embeddings
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
#pooling_model = models.Pooling(word_embedding_dimension=250)
dense_model = models.Dense(in_features=pooling_model.get_sentence_embedding_dimension(), out_features=256, activation_function=nn.Tanh())

model = SentenceTransformer(modules=[word_embedding_model, pooling_model, dense_model])

## Join steps 1 and 2 using the modules argument
#model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

In [21]:
from sentence_transformers import SentenceTransformer, InputExample
from sentence_transformers import models, losses
from torch.utils.data import DataLoader

# # Define your sentence transformer model using CLS pooling
# model_name = 'distilroberta-base'
# word_embedding_model = models.Transformer(model_name, max_seq_length=32)
# pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
# model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# Define a list with sentences (1k - 100k sentences)
# train_sentences = ["Your set of sentences",
#                    "Model will automatically add the noise",
#                    "And re-construct it",
#                    "You should provide at least 1k sentences"]

# Convert train sentences to sentence pairs
train_data = [InputExample(texts=[s, s]) for s in train_data]

# DataLoader to batch your data
train_dataloader = DataLoader(train_data, batch_size=128, shuffle=True)

# Use the denoising auto-encoder loss
train_loss = losses.MultipleNegativesRankingLoss(model)

# Call the fit method
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=1,
    show_progress_bar=True
)

model.summary()



Epoch:   0%|                                                                                     | 0/1 [00:00<?, ?it/s]
Iteration:   0%|                                                                                | 0/12 [00:00<?, ?it/s][A
Iteration:   8%|██████                                                                  | 1/12 [00:34<06:20, 34.57s/it][A
Iteration:  17%|████████████                                                            | 2/12 [00:51<04:13, 25.39s/it][A
Iteration:  25%|██████████████████                                                      | 3/12 [01:11<03:31, 23.53s/it][A
Iteration:  33%|████████████████████████                                                | 4/12 [01:32<03:02, 22.86s/it][A
Iteration:  42%|██████████████████████████████                                          | 5/12 [01:55<02:39, 22.83s/it][A
Iteration:  50%|████████████████████████████████████                                    | 6/12 [02:14<02:13, 22.25s/it][A
Iteration:  58%|███

AttributeError: 'SentenceTransformer' object has no attribute 'summary'

In [24]:
model.save(r'C:\Users\AL44096\Documents\fine_tune_model')

In [25]:
fine_tune_model = SentenceTransformer(r'C:\Users\AL44096\Documents\fine_tune_model')

In [26]:
def get_fine_embeddings(sentence):
    #encode sentence to get sentence embeddings
    sentence_embedding=fine_tune_model.encode(sentence, convert_to_tensor=True)
    return sentence_embedding

In [27]:
from datasets import Dataset

comments_dataset = Dataset.from_pandas(df)
comments_dataset

Dataset({
    features: ['benefit_name', 'category', 'market_name', 'category:market', 'benefit_description', 'audit', '__index_level_0__'],
    num_rows: 1470
})

In [29]:
import numpy
embeddings_dataset = comments_dataset.map(
    lambda x: {"embeddings": get_fine_embeddings(x["benefit_description"]).numpy()}
)

                                                                                                                       

In [30]:
embeddings_dataset.add_faiss_index(column="embeddings")

100%|███████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 136.58it/s]


Dataset({
    features: ['benefit_name', 'category', 'market_name', 'category:market', 'benefit_description', 'audit', '__index_level_0__', 'embeddings'],
    num_rows: 1470
})

In [37]:
question = "WheelChair"
question_embedding = get_fine_embeddings([question]).numpy()
question_embedding.shape

(1, 256)

In [38]:
scores, samples = embeddings_dataset.get_nearest_examples(
    "embeddings", question_embedding, k=10
)

In [39]:
import pandas as pd

samples_df = pd.DataFrame.from_dict(samples)
samples_df["scores"] = scores
samples_df.sort_values("scores", ascending=False, inplace=True)

In [40]:
for _, row in samples_df.iterrows():
    #print(row)
    print(f"Benefit_name: {row['benefit_name']}")
    print(f"SCORE: {row.scores}")
    print('\n')
    print(f"benefit_description: {row.benefit_description}")
    print('\n')

Benefit_name: Other Testing - Professional - Walk in Center Walk in Doctor's Office Primary care Physician
SCORE: 11.816832542419434


benefit_description: 's Office

This medical benefit allows the patient to be seen by a professional at a walk-in center, doctor's office, or primary care physician's office.


Benefit_name: Exam / Visit - Walk in Center Walk in Doctor's Office Specialist
SCORE: 11.730724334716797


benefit_description:     Exam / Visit - Walk in Center Walk in Doctor's Office Specialist is a medical benefit that covers the cost of a visit to a specialist at a walk-in center or doctor's office. This type of visit may include video conferencing, telephone calls, or secure messaging with a doctor or other qualified healthcare professional. The benefit pays for the cost of the visit and any associated services such as lab tests or medications.


Benefit_name: Foot Exam - Diabetes or Peripheral Vascular Disease - Specialist Mental Health - First 3 Visit(s)
SCORE: 11.7142820