In [1]:
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords
import torch
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer,util
import numpy as np
import json
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.filterwarnings('ignore')
stop_words=stopwords.words('english')
punctuation=string.punctuation

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def save_custom_embeddings(model_path,corpus_embeddings):
    #store sentences & embeddings on disc
    with open(model_path+'\\'+'embeddings.pkl',"wb") as fout:
        pickle.dump({'Sentences':corpus, 'embeddings': corpus_embeddings},fout)
    print("saved Custom embeddings")

def load_custom_embeddings(model_path):
    with open(model_path+'/embeddings.pkl',"rb") as fin:
        stored_data = pickle.load(fin)
        stored_sentences = stored_data['Sentences']
        stored_embeddings = stored_data['embeddings']
    return stored_sentences,stored_embeddings

def get_embeddings(sentence):
    #encode sentence to get sentence embeddings
    sentence_embedding=model.encode(sentence, convert_to_tensor=True)
    return sentence_embedding

def sentence_similarity_scores(sentence_embedding,
                              custom_embeddings,
                              stored_sentences,
                              top_k,
                              input_sentence):
    #computing similarity scores with the corpus
    cos_scores= util.pytorch_cos_sim(sentence_embedding, custom_embeddings)[0]
    #sort the results in decreasing order and get the first top_k
    top_results = np.argpartition(-cos_scores, range(top_k))[0:top_k]
    print("sentence :", input_sentence, "\n")
    print("Top", top_k, "most similar sentences in corpus")
    results={}
    for idx in top_results[0:top_k]:
        print(stored_sentences[idx],"(scores:%4f)" % (cos_scores[idx]))
        results[f"sentence{idx}"]= ({"predicted_sentence": stored_sentences[idx],"Scores" : float(cos_scores[idx])})
    return results

def clean_text(text):
    # Convert the text to title case
    text = str(text).title()
    # Remove the punctuation
    text = ''.join([c for c in text if c not in punctuation])
    # Remove the stop words
    tokens = [token for token in text.split() if token.lower() not in stop_words]
    # Convert the tokens back to a string
    cleaned_text = ' '.join(tokens)
    return cleaned_text


def concate_column_text(data):
    df["concated_text"]=df[["Category Name","Service name","Service Classification"]].astype(str).agg(' '.join,axis=1)
    return df["concated_text"]


def convert_column_to_list(data):
    data=data.tolist()
    return data

def convert_df_to_list(data):
    all_data=[]
    corpus=[]
    for values in df.columns:
        listin=df[values].tolist()
        all_data.append(listin)
    complete_data = [element for innerList in all_data for element in innerList]
    for word in complete_data:
        if word not in corpus:
            corpus.append(word)
    return corpus

In [3]:
###Collecting the data from Mongodb
import pymongo
from pymongo import MongoClient

In [49]:
#Establish a connection to MongoDB
username="src_nlpbimdbg_sit"
password="Gh1FFBzFNHbQJoQ"
connection_uri = f'mongodb://{username}:{password}@ip-10-189-32-138.993514063544.us-east-2.awsdns.internal.das:37043/'  

client = MongoClient(connection_uri, ssl=True, tls=True) #tlsCAFile=rootChain, authSource='bpconadsDB')
#client = MongoClient('mongodb://localhost:27017')


db=client['nlpbimdbg']
collection=db['test_NLS2_benefits']

In [65]:
documents=collection.find()
data=[]
for document in documents:
    data.append(document)

In [66]:
data=pd.DataFrame(data)

In [67]:
data.head()

Unnamed: 0,_id,benefit_name,category,market_name,category:market,benefit_description,audit
0,644fbbc50f93dbb489bdb6d2,Virtual Office Exam Visit - Primary Care Physi...,[Physician / Medical Services],[IND],[Physician / Medical Services:IND],\n\nA Virtual Office Exam Visit - Primary Care...,"[{'user_id': 123, 'status': 'in review', 'new_..."
1,644fbbc50f93dbb489bdb6d3,Vision - Routine Vision Services - Primary Car...,[Physician / Medical Services],[IND],[Physician / Medical Services:IND],\n\nRoutine vision services are those that are...,
2,644fbbc60f93dbb489bdb6d4,Routine Eye Exam - Ages 18 Years and Under,"[Physician / Medical Services, Vision Exam]",[IND],"[Physician / Medical Services:IND, Vision Exam...",\n\nA routine eye exam for someone 18 years or...,
3,644fbbc60f93dbb489bdb6d5,Flu Immunization - Ages 6 Years and Over,[Physician / Medical Services],[IND],[Physician / Medical Services:IND],\n\nFlu immunization is a benefit that helps p...,
4,644fbbc60f93dbb489bdb6d6,Vision - Routine Vision Services - Specialist ...,[Physician / Medical Services],[IND],[Physician / Medical Services:IND],\n\nThis medical benefit provides coverage for...,


In [68]:
data=data.drop(['_id','audit'], axis=1)

In [69]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16231 entries, 0 to 16230
Data columns (total 5 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   benefit_name         16230 non-null  object
 1   category             16231 non-null  object
 2   market_name          16231 non-null  object
 3   category:market      16231 non-null  object
 4   benefit_description  1470 non-null   object
dtypes: object(5)
memory usage: 634.1+ KB


In [70]:
data = data[data['benefit_description'].notna()]

In [71]:
data.shape

(1470, 5)

In [72]:
train_data=data['benefit_description'].tolist()
train_data = [item.strip() for item in train_data]
train_data

['A Virtual Office Exam Visit - Primary Care Physician Non Chiropractor - Remaining Visit(s) is a medical benefit that allows the patient to see a primary care physician without having to go to their office. The visit is done virtually, using either a computer or a phone.',
 'Routine vision services are those that are considered necessary for the maintenance of good vision and eye health. These services may include comprehensive eye exams, contact lens fittings, and refractive surgery consultations.',
 'A routine eye exam for someone 18 years or younger would generally include a vision test and an evaluation of the health of the eyes. The doctor may also check for signs of common eye problems such as nearsightedness, farsightedness, astigmatism, and presbyopia.',
 'Flu immunization is a benefit that helps protect people from the flu, or influenza. The flu is a serious respiratory illness that can lead to hospitalization and even death. Every year, thousands of people in the United Stat

In [14]:
# Define the path to the Excel files
path_to_files = r"C:\Users\AL44096\Documents\NLS_excel_files"

# Define the names of the Excel files
file_names = ['Benefit_description_long.xlsx'
              ]

# Define the name of the sheet in each Excel file that contains the text data
#sheet_name = 'Sheet1'

# Load and clean the text data from each Excel file
cleaned_data = []
for file_name in file_names:
    # Load the data from the Excel file into a Pandas DataFrame
    df = pd.read_excel(f'{path_to_files}/{file_name}')
    #data=concate_column_text(df)
    data=df['Description']
    #cleaned_data.append(data)
    # Extract the relevant text data from the DataFrame
    text_data = convert_column_to_list(data)
    # Clean the text data
    cleaned_text_data = [clean_text(text) for text in text_data]
    cleaned_data.append(cleaned_text_data)
corpus = [element for innerList in cleaned_data for element in innerList]

In [24]:
df.head()

Unnamed: 0.1,Unnamed: 0,Benefit_name,Web_link,Description
0,0,AAI,https://www.insuranceopedia.com/definition/579...,An Accredited Advisor in Insurance is an insur...
1,1,AAI,https://www.insuranceopedia.com/definition/579...,Many insurance professionals choose to pursue ...
2,2,Absolute Assignment,https://www.insuranceopedia.com/definition/551...,Absolute assignment refers to a policyholder t...
3,3,Absolute Assignment,https://www.insuranceopedia.com/definition/551...,There are a number of reasons why a policyhold...
4,4,Absolute Beneficiary,https://www.insuranceopedia.com/definition/552...,An absolute beneficiary is a beneficiary of a ...


In [25]:
df=df.groupby(['Benefit_name','Web_link'])['Description'].sum()

In [26]:
df=df.to_frame()
len(df)

2588

In [27]:
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Description
Benefit_name,Web_link,Unnamed: 2_level_1
AAI,https://www.insuranceopedia.com/definition/579/accredited-advisor-in-insurance-aai,An Accredited Advisor in Insurance is an insur...
ACV,https://www.insuranceopedia.com/definition/4964/actual-cash-value-acv-insurance,Actual cash value (ACV) is one way that insura...
AD&D,https://www.insuranceopedia.com/definition/5004/accidental-death-and-dismemberment-insurance-add,Accidental death and dismemberment (AD&D) insu...
ADA,https://www.insuranceopedia.com/definition/740/americans-with-disabilities-act-ada,Americans with Disabilities Act (ADA) is a law...
ADB,https://www.insuranceopedia.com/definition/4984/accelerated-death-benefit-adb,An accelerated death benefit (ADB) is a supple...


In [30]:
train_df=df['Description'].tolist()
train_df = [item.strip() for item in train_df]
train_df

['An Accredited Advisor in Insurance is an insurance professional who has advanced knowledge of insurance topics and the insurance industry. In order to earn the status and the credential of being an Accredited Advisor in Insurance, an insurance professional must pass three examinations given by the Insurance Institute of America.Many insurance professionals choose to pursue the status of Accredited Advisor in Insurance because it can increase earnings. In this way, this credential is similar to many other credentials in a wide variety of fields. Accredited Advisors in Insurance are considered to be significantly more knowledgeable about insurance topics than standard insurance agents or other equivalent insurance professionals. The services of Accredited Advisors in Insurance can be useful if an insurance company is dealing with a complicated client or complicated insurance scenario.',
 "Actual cash value (ACV) is one way that insurance companies measure the worth of assets for an ins

In [None]:
# from datasets import Dataset

# comments_dataset = Dataset.from_pandas(df)
# comments_dataset

In [17]:
from sentence_transformers import SentenceTransformer, models
from torch import nn

## Step 1: use an existing language model
word_embedding_model = models.Transformer('sentence-transformers/all-MiniLM-L6-v2',max_seq_length=256)

## Step 2: use a pool function over the token embeddings
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),'cls')
#pooling_model = models.Pooling(word_embedding_dimension=250)
dense_model = models.Dense(in_features=pooling_model.get_sentence_embedding_dimension(), out_features=256, activation_function=nn.Tanh())

model = SentenceTransformer(modules=[word_embedding_model, pooling_model, dense_model])

## Join steps 1 and 2 using the modules argument
#model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

In [31]:
from sentence_transformers import SentenceTransformer, InputExample
from sentence_transformers import models, losses
from torch.utils.data import DataLoader

# # Define your sentence transformer model using CLS pooling
# model_name = 'distilroberta-base'
# word_embedding_model = models.Transformer(model_name, max_seq_length=32)
# pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
# model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# Define a list with sentences (1k - 100k sentences)
# train_sentences = ["Your set of sentences",
#                    "Model will automatically add the noise",
#                    "And re-construct it",
#                    "You should provide at least 1k sentences"]

# Convert train sentences to sentence pairs
train_data = [InputExample(texts=[s, s]) for s in train_df]

# DataLoader to batch your data
train_dataloader = DataLoader(train_data, batch_size=128, shuffle=True)

# Use the denoising auto-encoder loss
train_loss = losses.MultipleNegativesRankingLoss(model)

# Call the fit method
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=1,
    show_progress_bar=True
)

#model.summary()



Epoch:   0%|                                                                                     | 0/1 [00:00<?, ?it/s]
Iteration:   0%|                                                                                | 0/21 [00:00<?, ?it/s][A
Iteration:   5%|███▍                                                                    | 1/21 [01:02<20:44, 62.24s/it][A
Iteration:  10%|██████▊                                                                 | 2/21 [01:54<18:05, 57.12s/it][A
Iteration:  14%|██████████▎                                                             | 3/21 [02:50<17:00, 56.71s/it][A
Iteration:  19%|█████████████▋                                                          | 4/21 [03:45<15:57, 56.33s/it][A
Iteration:  24%|█████████████████▏                                                      | 5/21 [04:46<15:15, 57.23s/it][A
Iteration:  29%|████████████████████▌                                                   | 6/21 [05:45<14:25, 57.69s/it][A
Iteration:  33%|███

In [32]:
model.save(r'C:\Users\AL44096\Documents\Long_fine_tune_model')

In [33]:
fine_tune_model = SentenceTransformer(r'C:\Users\AL44096\Documents\Long_fine_tune_model')

In [34]:
def get_fine_embeddings(sentence):
    #encode sentence to get sentence embeddings
    sentence_embedding=fine_tune_model.encode(sentence, convert_to_tensor=True)
    return sentence_embedding

In [35]:
from datasets import Dataset

comments_dataset = Dataset.from_pandas(df)
comments_dataset

Dataset({
    features: ['Description', 'Benefit_name', 'Web_link'],
    num_rows: 2588
})

In [36]:
import numpy
embeddings_dataset = comments_dataset.map(
    lambda x: {"embeddings": get_fine_embeddings(x["Description"]).numpy()}
)

                                                                                                                       

In [84]:
embeddings_dataset

Dataset({
    features: ['benefit_name', 'category', 'market_name', 'category:market', 'benefit_description', '__index_level_0__', 'embeddings'],
    num_rows: 1470
})

In [37]:
embeddings_dataset.add_faiss_index(column="embeddings")

100%|███████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 471.73it/s]


Dataset({
    features: ['Description', 'Benefit_name', 'Web_link', 'embeddings'],
    num_rows: 2588
})

In [38]:
question = "WheelChair"
question_embedding = get_fine_embeddings([question]).numpy()
question_embedding.shape

(1, 256)

In [39]:
scores, samples = embeddings_dataset.get_nearest_examples(
    "embeddings", question_embedding, k=10
)

In [40]:
import pandas as pd

samples_df = pd.DataFrame.from_dict(samples)
samples_df["scores"] = scores
samples_df.sort_values("scores", ascending=False, inplace=True)

In [42]:
for _, row in samples_df.iterrows():
    #print(row)
    print(f"Benefit_name: {row['Benefit_name']}")
    print(f"SCORE: {row.scores}")
    print('\n')
    print(f"benefit_description: {row.Description}")
    print('\n')

Benefit_name: Assistive Technology
SCORE: 5.721487045288086


benefit_description: Assistive technology refers to devices and equipment that can help people with disabilities perform activities of daily living and other tasks. Anything from special pens, to furniture, to text-to-speech software can qualify as assistive technology. These devices are often used in educational settings to make classroom learning and activities accessible for students with disabilities.Whether or not an insurer will cover certain types of assistive technology can be extremely relevant for individuals with disabilities, since his technology may be an important factor in improving their quality of life. Assistive technology, however, can also be very expensive. Some insurers, then, may hesitate to cover every single type.


Benefit_name: Any Occupation Income Protection
SCORE: 5.584429740905762


benefit_description: Any occupation income protection is a disability insurance policy that provides benefits to 

In [73]:
data.shape

(1470, 5)

In [74]:
data.head(2)

Unnamed: 0,benefit_name,category,market_name,category:market,benefit_description
0,Virtual Office Exam Visit - Primary Care Physi...,[Physician / Medical Services],[IND],[Physician / Medical Services:IND],\n\nA Virtual Office Exam Visit - Primary Care...
1,Vision - Routine Vision Services - Primary Car...,[Physician / Medical Services],[IND],[Physician / Medical Services:IND],\n\nRoutine vision services are those that are...


In [75]:
from datasets import Dataset

comments_dataset = Dataset.from_pandas(data)
comments_dataset

Dataset({
    features: ['benefit_name', 'category', 'market_name', 'category:market', 'benefit_description', '__index_level_0__'],
    num_rows: 1470
})

In [77]:
import numpy
embeddings_dataset = comments_dataset.map(
    lambda x: {"embeddings": get_fine_embeddings(x["benefit_description"]).numpy()}
)

                                                                                                                       

In [78]:
embeddings_dataset.add_faiss_index(column="embeddings")

100%|███████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 469.56it/s]


Dataset({
    features: ['benefit_name', 'category', 'market_name', 'category:market', 'benefit_description', '__index_level_0__', 'embeddings'],
    num_rows: 1470
})

In [79]:
question = "WheelChair"
question_embedding = get_fine_embeddings([question]).numpy()
question_embedding.shape

(1, 256)

In [80]:
scores, samples = embeddings_dataset.get_nearest_examples(
    "embeddings", question_embedding, k=10
)

In [81]:
import pandas as pd

samples_df = pd.DataFrame.from_dict(samples)
samples_df["scores"] = scores
samples_df.sort_values("scores", ascending=False, inplace=True)

In [83]:
for _, row in samples_df.iterrows():
    #print(row)
    print(f"Benefit_name: {row['benefit_name']}")
    print(f"SCORE: {row.scores}")
    print('\n')
    print(f"benefit_description: {row.benefit_description}")
    print('\n')

Benefit_name: Other Online Web Visit - WI Specialist
SCORE: 6.023708343505859


benefit_description: 

This medical benefit is defined as a visit to a specialist that is conducted online, through a web-based interface. This type of visit may be used for consultation, diagnosis, and treatment purposes.


Benefit_name: Exam / Visit - Non Diagnostic Same Day Services
SCORE: 6.009596347808838


benefit_description: 

This medical benefit provides coverage for an exam or visit that is non-diagnostic and is provided on the same day as the request for services.


Benefit_name: Routine Exam Visit - Ages 35 Years and Under
SCORE: 6.008851051330566


benefit_description: 

This is a medical visit that is part of a routine check-up for people who are aged 35 years and under. This visit may include a physical examination, as well as tests and screenings for various conditions.


Benefit_name: Foot Exam - Diabetes or Peripheral Vascular Disease - Primary Care Physician - First 2 Visit(s)
SCORE: 5.9