In [1]:
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords
import torch
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer,util
import numpy as np
import json
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.filterwarnings('ignore')
stop_words=stopwords.words('english')
punctuation=string.punctuation

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def save_custom_embeddings(model_path,corpus_embeddings):
    #store sentences & embeddings on disc
    with open(model_path+'\\'+'embeddings.pkl',"wb") as fout:
        pickle.dump({'Sentences':corpus, 'embeddings': corpus_embeddings},fout)
    print("saved Custom embeddings")

def load_custom_embeddings(model_path):
    with open(model_path+'/embeddings.pkl',"rb") as fin:
        stored_data = pickle.load(fin)
        stored_sentences = stored_data['Sentences']
        stored_embeddings = stored_data['embeddings']
    return stored_sentences,stored_embeddings

def get_embeddings(sentence):
    #encode sentence to get sentence embeddings
    sentence_embedding=model.encode(sentence, convert_to_tensor=True)
    return sentence_embedding

def sentence_similarity_scores(sentence_embedding,
                              custom_embeddings,
                              stored_sentences,
                              top_k,
                              input_sentence):
    #computing similarity scores with the corpus
    cos_scores= util.pytorch_cos_sim(sentence_embedding, custom_embeddings)[0]
    #sort the results in decreasing order and get the first top_k
    top_results = np.argpartition(-cos_scores, range(top_k))[0:top_k]
    print("sentence :", input_sentence, "\n")
    print("Top", top_k, "most similar sentences in corpus")
    results={}
    for idx in top_results[0:top_k]:
        print(stored_sentences[idx],"(scores:%4f)" % (cos_scores[idx]))
        results[f"sentence{idx}"]= ({"predicted_sentence": stored_sentences[idx],"Scores" : float(cos_scores[idx])})
    return results

def clean_text(text):
    # Convert the text to title case
    text = str(text).title()
    # Remove the punctuation
    text = ''.join([c for c in text if c not in punctuation])
    # Remove the stop words
    tokens = [token for token in text.split() if token.lower() not in stop_words]
    # Convert the tokens back to a string
    cleaned_text = ' '.join(tokens)
    return cleaned_text


def concate_column_text(data):
    df["concated_text"]=df[["Category Name","Service name","Service Classification"]].astype(str).agg(' '.join,axis=1)
    return df["concated_text"]


def convert_column_to_list(data):
    data=data.tolist()
    return data

def convert_df_to_list(data):
    all_data=[]
    corpus=[]
    for values in df.columns:
        listin=df[values].tolist()
        all_data.append(listin)
    complete_data = [element for innerList in all_data for element in innerList]
    for word in complete_data:
        if word not in corpus:
            corpus.append(word)
    return corpus

In [3]:
# Define the path to the Excel files
path_to_files = r"C:\Users\AL44096\Documents\NLS_excel_files"

# Define the names of the Excel files
file_names = ['Benefit_description_long.xlsx'
              ]

# Define the name of the sheet in each Excel file that contains the text data
#sheet_name = 'Sheet1'

# Load and clean the text data from each Excel file
cleaned_data = []
for file_name in file_names:
    # Load the data from the Excel file into a Pandas DataFrame
    df = pd.read_excel(f'{path_to_files}/{file_name}')
    #data=concate_column_text(df)
    data=df['Description']
    #cleaned_data.append(data)
    # Extract the relevant text data from the DataFrame
    text_data = convert_column_to_list(data)
    # Clean the text data
    cleaned_text_data = [clean_text(text) for text in text_data]
    cleaned_data.append(cleaned_text_data)
corpus = [element for innerList in cleaned_data for element in innerList]

In [4]:
corpus

['Accredited Advisor Insurance Insurance Professional Advanced Knowledge Insurance Topics Insurance Industry Order Earn Status Credential Accredited Advisor Insurance Insurance Professional Must Pass Three Examinations Given Insurance Institute America',
 'Many Insurance Professionals Choose Pursue Status Accredited Advisor Insurance Increase Earnings Way Credential Similar Many Credentials Wide Variety Fields Accredited Advisors Insurance Considered Significantly Knowledgeable Insurance Topics Standard Insurance Agents Equivalent Insurance Professionals Services Accredited Advisors Insurance Useful Insurance Company Dealing Complicated Client Complicated Insurance Scenario',
 'Absolute Assignment Refers Policyholder Transferring Ownership Policy Another Party Transfer Means Coverage Within Policy Go Newly Named Party Original Owner Policy State Reasons Need Stipulate Conditions Transfer',
 'Number Reasons Policyholder Transfers Rights Policy Another Person Entity Might Think Gift Some

In [5]:
#len(df['concated_text'])
df=df.groupby(['Benefit_name','Web_link'])['Description'].sum()

In [6]:
#df = df.drop(columns=['Unnamed: 0'])
df=df.to_frame()
pd.set_option("display.max_colwidth", None)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Description
Benefit_name,Web_link,Unnamed: 2_level_1
AAI,https://www.insuranceopedia.com/definition/579/accredited-advisor-in-insurance-aai,"An Accredited Advisor in Insurance is an insurance professional who has advanced knowledge of insurance topics and the insurance industry. In order to earn the status and the credential of being an Accredited Advisor in Insurance, an insurance professional must pass three examinations given by the Insurance Institute of America.Many insurance professionals choose to pursue the status of Accredited Advisor in Insurance because it can increase earnings. In this way, this credential is similar to many other credentials in a wide variety of fields. Accredited Advisors in Insurance are considered to be significantly more knowledgeable about insurance topics than standard insurance agents or other equivalent insurance professionals. The services of Accredited Advisors in Insurance can be useful if an insurance company is dealing with a complicated client or complicated insurance scenario."
ACV,https://www.insuranceopedia.com/definition/4964/actual-cash-value-acv-insurance,"Actual cash value (ACV) is one way that insurance companies measure the worth of assets for an insurance claim. They consider a fair market price of what the asset could have been sold for on the day it was lost, stolen, or destroyed. This typically comes out to a lower amount than the policyholder originally paid for the asset because assets lose value over time due to depreciation and wear and tear.For example, if a policyholder wrecks their 2011 car and the insurance policy covers the actual cash value, the insurance company would pay the amount equal to the car's worth at the time of the accident, which is determined by subtracting factors like depreciation from the replacement cost. This differs from replacement-cost system, which would pay the insured enough to buy a new, replacement asset."
AD&D,https://www.insuranceopedia.com/definition/5004/accidental-death-and-dismemberment-insurance-add,"Accidental death and dismemberment (AD&D) insurance is optional coverage added to a health or life insurance policy that pays out benefits for policyholders who die due to an accident as opposed to natural causes or who lose a bodily limb or eyesight. In case of the former, the coverage may call for a payment of double the death benefit, which is known as a double indemnity clause, while with the latter, the policyholder is generally entitled to partial amounts, depending on the policy.Accidental death & dismemberment (AD&D) coverage may be important for those who may have dangerous occupations, but in general, it is only useful to a handful of people. Therefore, be sure to read the terms and details carefully. For instance, the rider may stipulate that death must occur within a certain time after an accident or that the policyholder must lose both sets of limbs or eyesight in both eyes to qualify for the full benefit."
ADA,https://www.insuranceopedia.com/definition/740/americans-with-disabilities-act-ada,"Americans with Disabilities Act (ADA) is a law passed by the US congress in 1990 which prohibits discrimination against American citizens with physical or mental disabilities in all areas of life, including employment, education, and insurance.Disability was first addressed in 1954 by Social Security when it allowed retirement and survivor benefits to people who became disabled. In 1956, it allowed regular benefits to people with disability. In 1996, disability benefits were withheld for people whose disability stemmed from drug addiction or alcoholism."
ADB,https://www.insuranceopedia.com/definition/4984/accelerated-death-benefit-adb,"An accelerated death benefit (ADB) is a supplemental benefit or rider to a life insurance contract. It entitles the policyholder to cash advances from the policy's death benefit in case they get diagnosed with a terminal illness.An accelerated death benefit allows you to use some of your death benefit to pay the costs of certain medical expenses, such as treatments to stay alive and long-term care. It saw its start in the 1980s to help AIDs patients with their health care costs. However, an ADB increases premium charges, and using it decreases the death benefit payable to your beneficiaries by a certain percentage."
...,...,...
Yearly Renewable Term Plan of Reinsurance (YRTPR),https://www.insuranceopedia.com/definition/4911/yearly-renewable-term-plan-of-reinsurance-yrtpr,"A yearly renewable term plan of reinsurance is a type of proportional reinsurance under which mortality risks are ceded by a primary insurer (ceding company) to a reinsurer. InYRTPR, the net amount at risk for the amount above the primary insurer's retention limit on a life insurance policy. Reinsurance premiums for the net amount at risk are renewable every year under the renewable term plan of reinsurance.In the reinsurance agreement, the ceding company and the reinsurer agree on how the policy net amount at risk will be shared between them. The ceding company then prepares a schedule of the net amount at risk for each policy year. The reinsurer develops a schedule of yearly renewable term premium rates for reinsurance on the ceding company's schedule. The ceding insurer will then pay the reinsurer the established premiums for the appropriate net amounts at risk each year. In the occurence of a claim, the reinsurer would remit payment for the assumed portion of the policy's net amount at risk."
Years Certain Annuity,https://www.insuranceopedia.com/definition/4912/years-certain-annuity,"A years certain annuity is an insurance product that typically pays the policyholder a specific monthly income for a designated number of years, regardless of the number of years for which the policyholder lives. Therefore, if the policyholder dies during the time frame selected for the years certain annuity, the beneficiary named in the policy will continue to receive annuity payments until the expiration date.Years certain annuity is also known as term certain annuity and period certain annuity.Many people prefer purchasing a life annuity that provides payouts for the duration of the policyholder's life. However, the advantage of a years certain annuity is that it provides larger monthly payouts to the policyholder for a designated number of years. This annuity involves certain risks as well, such as the risk of the policyholder outliving the designated monthly payment period. This is why people will purchase a years certain annuity only once they know they'll have another source of income during their retirement years."
Years of Service,https://www.insuranceopedia.com/definition/4913/years-of-service,"Years of service is commonly used for_x000D_\nrecording working experience within an employee’s profession. Specifically, it refers to the length_x000D_\nof employment, which is measured to determine eligibility, vesting, and benefits_x000D_\nlevels for employee participants in tax-qualified pension plans. In terms of qualifying for beneftis, it is often a requirement that years of_x000D_\nservice be continuous without a break.Although years of service is used for benefits purposes, it is also used to reward employees. No form of recognition is as important_x000D_\nto the success of any organization as recognition of an employee’s years of_x000D_\nservice. Recognizing employees for the total years they have dedicated to an_x000D_\norganization is an excellent way for a company to show they value the dedication and_x000D_\nhard work of all the individuals in the company."
Zone System,https://www.insuranceopedia.com/definition/4923/zone-system,"The zone system is an triennial examination system the National Association of Insurance Commissioners (NAIC) developed to help states regulate insurance companies. It divides the United States into various geographical zones, and teams from states in each zone examine insurance companies in their respective zones to make sure they are financially in good shape.Typically, states accept the results of these examinations, and the zone system saves them from having to conduct their own. Moreover, for insurance companies that do business in multiple states in different zones, regulators from different zones work together to avoid reviewing them more than once."


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 2588 entries, ('AAI', 'https://www.insuranceopedia.com/definition/579/accredited-advisor-in-insurance-aai') to ('schedule of insurance', 'https://www.insuranceopedia.com/definition/495/policy-schedule-schedule-of-insurance')
Data columns (total 1 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Description  2588 non-null   object
dtypes: object(1)
memory usage: 68.7+ KB


In [8]:
from datasets import Dataset

comments_dataset = Dataset.from_pandas(df)
comments_dataset

Dataset({
    features: ['Description', 'Benefit_name', 'Web_link'],
    num_rows: 2588
})

In [9]:
#comments_dataset['concated_text'][0]

In [10]:
# from sentence_transformers import SentenceTransformer
# model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [12]:
from sentence_transformers import SentenceTransformer, models

## Step 1: use an existing language model
word_embedding_model = models.Transformer('sentence-transformers/all-MiniLM-L6-v2')

## Step 2: use a pool function over the token embeddings
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())

## Join steps 1 and 2 using the modules argument
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

In [13]:
# embedding = get_embeddings(comments_dataset["concated_text"][0])
# embedding.shape

In [14]:
comments_dataset

Dataset({
    features: ['Description', 'Benefit_name', 'Web_link'],
    num_rows: 2588
})

In [15]:
import numpy
embeddings_dataset = comments_dataset.map(
    lambda x: {"embeddings": get_embeddings(x["Description"]).numpy()}
)

                                                                                                                       

In [16]:
embeddings_dataset

Dataset({
    features: ['Description', 'Benefit_name', 'Web_link', 'embeddings'],
    num_rows: 2588
})

In [17]:
#df2=pd.DataFrame(embeddings_dataset)

In [18]:
#df2.head()

In [19]:
len(embeddings_dataset['embeddings'])

2588

In [20]:
embeddings_dataset.add_faiss_index(column="embeddings")

100%|███████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 156.63it/s]


Dataset({
    features: ['Description', 'Benefit_name', 'Web_link', 'embeddings'],
    num_rows: 2588
})

In [21]:
question = "WheelChair"
question_embedding = get_embeddings([question]).numpy()
question_embedding.shape

(1, 384)

In [22]:
scores, samples = embeddings_dataset.get_nearest_examples(
    "embeddings", question_embedding, k=10
)

In [23]:
import pandas as pd

samples_df = pd.DataFrame.from_dict(samples)
samples_df["scores"] = scores
samples_df.sort_values("scores", ascending=False, inplace=True)

In [24]:
for _, row in samples_df.iterrows():
    #print(row)
    print(f"Service name: {row['Benefit_name']}")
    print(f"SCORE: {row.scores}")
    print('\n')
    print(f"concated_text: {row.Description}")
    print('\n')

Service name: DI
SCORE: 59.79560852050781


concated_text: Disability income is income that a person receives as a benefit of disability insurance. In other words, it is money people who become disabled receive either from the Social Security Administration or a private insurance company. The purpose of disability income is to provide disabled people a means to live by supplementing their finances with a portion of the income they would be earning if they were fully healthy.Serious disabilities can prevent a person from earning an income. In such cases, disabled people could quickly go bankrupt if they do not have some sort of insurance coverage. For example, a construction worker would not be able to work on a construction site if he has been temporarily paralyzed. Some disabilities are temporary, and some are permanent. There are disability income options for both types of disabilities.


Service name: Disability Income (DI)
SCORE: 59.79560852050781


concated_text: Disability income

# searching through faiss index

In [None]:
d = stored_embeddings.shape[1]

In [None]:
import faiss
index = faiss.IndexFlatL2(d)

In [None]:
index.is_trained

In [None]:
index.add(stored_embeddings)

In [None]:
index.ntotal

In [None]:
k = 10
xq = model.encode(["WheelChair"])

In [None]:
%%time
D, I = index.search(xq, k)  # search
print(I)

In [None]:
[f'{i}: {corpus[i]}' for i in I[0]]

In [None]:
similar_sentences=[corpus[idx] for idx in I[0]]
similar_probabilities=[1-distance for distance in D[0]]

In [None]:
for sentence, probability in zip(similar_sentences,similar_probabilities):
    print("Similar Sentence :", sentence)
    print("probability :", probability)
    print()

# searching by creating an clusters

In [None]:
nlist = 50  # how many cells
quantizer = faiss.IndexFlatL2(d)
index = faiss.IndexIVFFlat(quantizer, d, nlist)

In [None]:
index.is_trained

In [None]:
index.train(stored_embeddings)
index.is_trained # check if index is now trained

In [None]:
index.add(stored_embeddings)
index.ntotal  # number of embeddings indexed

In [None]:
%%time
D, I = index.search(xq, k)  # search
print(I)

In [None]:
[f'{i}: {corpus[i]}' for i in I[0]]

In [None]:
similar_sentences=[corpus[idx] for idx in I[0]]
similar_probabilities=[1-distance for distance in D[0]]

In [None]:
for sentence, probability in zip(similar_sentences,similar_probabilities):
    print("Similar Sentence :", sentence)
    print("probability :", probability)
    print()

# Searching by all the 10 nearest clusters

In [None]:
index.nprobe = 10

In [None]:
%%time
D, I = index.search(xq, k)  # search
print(I)
print(D)

In [None]:
similar_sentences=[corpus[idx] for idx in I[0]]
similar_probabilities=[1-distance for distance in D[0]]

In [None]:
for sentence, probability in zip(similar_sentences,similar_probabilities):
    print("Similar Sentence :", sentence)
    print("probability :", probability)
    print()

In [None]:
[f'{i}: {corpus[i]}' for i in I[0]]

In [None]:
p=[corpus[i] for i in I[0]]
p

In [None]:

# Below are some quick examples.
# Extract column values by using DataFrame.loc[] property.
df2=df.loc[df['Fee'] == 30000, 'Courses']

# To get First Element by using .iloc[] method.
df2=df.loc[df['Fee'] == 30000, 'Courses'].iloc[0]

# Extract column values by DataFrame.item() method
df2=df.loc[df['Fee'] == 30000, 'Courses'].item()

# Using DataFrame.query() method extract column values.
df2=df.query('Fee==25000')['Courses']

# Using DataFrame.values() property.
df2=df[df['Fee']==22000]['Courses'].values[0]

# Other example.
df2=df[df['Fee']==22000]['Courses']


In [None]:
for i in p:
    y=df.loc[df["concated_text"]==i]['Service name']#.values[0]#item()#iloc[0]
    print(y)#.to_string())

In [None]:
df.iloc[11394]['Service name']