In [None]:
!pip install nltk

In [1]:
import pandas as pd
import re
import numpy as np
import nltk
from nltk.corpus import stopwords
import gensim
from sklearn.metrics.pairwise import cosine_similarity

In [8]:
# Sample corpus
documents = ['Machine learning is the study of computer algorithms that improve automatically through experience.\
Machine learning algorithms build a mathematical model based on sample data, known as training data.\
The discipline of machine learning employs various approaches to teach computers to accomplish tasks \
where no fully satisfactory algorithm is available.',
'Machine learning is closely related to computational statistics, which focuses on making predictions using computers.\
The study of mathematical optimization delivers methods, theory and application domains to the field of machine learning.',
'Machine learning involves computers discovering how they can perform tasks without being explicitly programmed to do so. \
It involves computers learning from data provided so that they carry out certain tasks.',
'Machine learning approaches are traditionally divided into three broad categories, depending on the nature of the "signal"\
or "feedback" available to the learning system: Supervised, Unsupervised and Reinforcement',
'Software engineering is the systematic application of engineering approaches to the development of software.\
Software engineering is a computing discipline.',
'A software engineer creates programs based on logic for the computer to execute. A software engineer has to be more concerned\
about the correctness of the program in all the cases. Meanwhile, a data scientist is comfortable with uncertainty and variability.\
Developing a machine learning application is more iterative and explorative process than software engineering.'
]

documents_df=pd.DataFrame(documents,columns=['documents'])

In [13]:
# loading pre-trained embeddings, each word is represented as a 300 dimensional vector
W2V_PATH="GoogleNews-vectors-negative300.bin.gz"
model_w2v = gensim.models.KeyedVectors.load_word2vec_format(W2V_PATH, binary=True)

In [14]:
model_w2v.most_similar('machine')

[('machines', 0.7677487730979919),
 ('wringer_washing', 0.5941920280456543),
 ('machinery', 0.5260839462280273),
 ('Automated_teller', 0.5225658416748047),
 ('roundish_pinhole_shape', 0.5103495121002197),
 ('Machine', 0.5063987374305725),
 ('extrude_liquefied_cornmeal', 0.5044564008712769),
 ('ma_chine', 0.4988497197628021),
 ('CNC_lathe', 0.49585121870040894),
 ('Computer_Numerically_Controlled', 0.4957743287086487)]

### Using Doc2Vec

In [17]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Shreyas\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [18]:
tagged_data = [TaggedDocument(words=word_tokenize(doc), tags=[i]) for i, doc in enumerate(documents_df.documents)]
model_d2v = Doc2Vec(vector_size=100,alpha=0.025, min_count=1)

In [19]:
model_d2v.build_vocab(tagged_data)

for epoch in range(100):
    model_d2v.train(tagged_data,
                total_examples=model_d2v.corpus_count,
                epochs=model_d2v.epochs)

In [20]:
document_embeddings=np.zeros((documents_df.shape[0],100))

for i in range(len(document_embeddings)):
    document_embeddings[i]=model_d2v.docvecs[i]
    
    
pairwise_similarities=cosine_similarity(document_embeddings)

In [21]:
pairwise_similarities

array([[1.        , 0.19253999, 0.26544734, 0.14571559, 0.32287185,
        0.03162567],
       [0.19253999, 1.        , 0.25993996, 0.31261803, 0.22999214,
        0.18096651],
       [0.26544734, 0.25993996, 1.        , 0.24850267, 0.19638185,
        0.20111244],
       [0.14571559, 0.31261803, 0.24850267, 1.        , 0.33226055,
        0.16981034],
       [0.32287185, 0.22999214, 0.19638185, 0.33226055, 1.        ,
        0.40751086],
       [0.03162567, 0.18096651, 0.20111244, 0.16981034, 0.40751086,
        1.        ]])

In [23]:
def most_similar(doc_id,similarity_matrix,matrix):
    print (f'Document: {documents_df.iloc[doc_id]["documents"]}')
    print ('\n')
    print ('Similar Documents:')
    if matrix=='Cosine Similarity':
        similar_ix=np.argsort(similarity_matrix[doc_id])[::-1]
    elif matrix=='Euclidean Distance':
        similar_ix=np.argsort(similarity_matrix[doc_id])
    for ix in similar_ix:
        if ix==doc_id:
            continue
        print('\n')
        print (f'Document: {documents_df.iloc[ix]["documents"]}')
        print (f'{matrix} : {similarity_matrix[doc_id][ix]}')

In [27]:
most_similar(0,pairwise_similarities,'Cosine Similarity')

Document: Machine learning is the study of computer algorithms that improve automatically through experience.Machine learning algorithms build a mathematical model based on sample data, known as training data.The discipline of machine learning employs various approaches to teach computers to accomplish tasks where no fully satisfactory algorithm is available.


Similar Documents:


Document: Software engineering is the systematic application of engineering approaches to the development of software.Software engineering is a computing discipline.
Cosine Similarity : 0.3228718543759047


Document: Machine learning involves computers discovering how they can perform tasks without being explicitly programmed to do so. It involves computers learning from data provided so that they carry out certain tasks.
Cosine Similarity : 0.265447341351164


Document: Machine learning is closely related to computational statistics, which focuses on making predictions using computers.The study of mathemati

### Applying Doc2Vec on Master Data

In [1]:
!pip install sentence-transformers



In [13]:
from sentence_transformers import SentenceTransformer, util
import torch

In [14]:
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')

In [7]:
master_data = pd.read_excel('Master_Data.xlsx')

In [8]:
master_data.head()

Unnamed: 0,QuestionId,Question,Answer,AnswerDetails1,AnswerDetails2,CategoryId,CategoryName,AppId,AppName
0,21,password not working,Click on Login Assistance on Rekonnect Home pa...,https://rekonnect.in.kworld.kpmg.com/,https://rekonnect.in.kworld.kpmg.com/,1,Login / Password,1,Rekonnect
1,23,email for password reset not received,Please raise a Service Request on Support Cent...,http://supportcentral.in.kworld.kpmg.com/MDLSe...,http://supportcentral.in.kworld.kpmg.com/MDLSe...,1,Login / Password,1,Rekonnect
2,24,creation of new cost centre for ki,Requirements: 1. Business COO/HOD Approval ...,,,2,Projects - Cost centre,1,Rekonnect
3,25,rename of cost centre for ki,Requirements: 1. Business COO/HOD Approval ...,,,2,Projects - Cost centre,1,Rekonnect
4,26,project not appearing in the list,1. Project should be in Approved / Pre-Closeou...,ReKonnect Project Manager > Online Project Adm...,https://rekonnect.in.kworld.kpmg.com/,3,Projects - Project administration,1,Rekonnect


In [9]:
master_data['Question_Cleaned'] = master_data['Question'].apply(lambda x: x.lower().strip())

In [10]:
list_questions = list(master_data['Question_Cleaned'])

In [11]:
list_questions[:10]

['password not working',
 'email for password reset not received',
 'creation of new cost centre for ki',
 'rename of cost centre for ki',
 'project not appearing in the list',
 'hsn not appearing',
 'invoicing request is showing in error',
 'invoice pdf not received',
 'error in submission of an event through the online event creation screen',
 'event is showing pending approval but pending approver name is not getting displayed']

In [15]:
corpus_embeddings = sbert_model.encode(list_questions, convert_to_tensor=True)

In [16]:
user_query = "some issue when invoice request is raised"

In [18]:
query_embedding = sbert_model.encode(user_query, convert_to_tensor=True)

In [20]:
# We use cosine-similarity and torch.topk to find the highest 5 scores
cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]
top_results = torch.topk(cos_scores, k=5)

In [21]:
cos_scores

tensor([0.0447, 0.1449, 0.1177, 0.1104, 0.0869, 0.0461, 0.7344, 0.6493, 0.2257,
        0.1769, 0.1483, 0.1645, 0.2149, 0.2276, 0.0569, 0.1955, 0.3783, 0.2858,
        0.2566, 0.3409, 0.3436, 0.2340, 0.0868, 0.2899, 0.1905, 0.2953, 0.1338,
        0.1521, 0.1803, 0.1349, 0.1561, 0.2593, 0.0569])

In [23]:
top_results

torch.return_types.topk(
values=tensor([0.7344, 0.6493, 0.3783, 0.3436, 0.3409]),
indices=tensor([ 6,  7, 16, 20, 19]))

In [25]:
print("Query:", user_query)
print("\nTop 5 most similar sentences in corpus:")

for score, idx in zip(top_results[0], top_results[1]):
    print(list_questions[idx], "(Score: {:.4f})".format(score))

Query: some issue when invoice request is raised

Top 5 most similar sentences in corpus:
invoicing request is showing in error (Score: 0.7344)
invoice pdf not received (Score: 0.6493)
is hard copy of supporting bills to be submitted for expense claims (Score: 0.3783)
i have misplaced my hardcopy receipt of the expense claim what should i upload on concur to submit my claim (Score: 0.3436)
how can i access the scanned copy of supporting bills for my past claims submitted on concur (Score: 0.3409)


## Asymmetric Semantic Search

In [15]:
from sentence_transformers import SentenceTransformer, util
import torch

In [27]:
model = SentenceTransformer('D://NLP//Document_Similarity//msmarco-distilroberta-base-v2//0_Transformer//')



In [5]:
sbert_model = SentenceTransformer('D://NLP//Document_Similarity//bert-base-nli-mean-tokens//0_BERT//')

Some weights of the model checkpoint at D://NLP//Document_Similarity//bert-base-nli-mean-tokens//0_BERT// were not used when initializing BertModel: ['classifier.weight', 'classifier.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
master_data = pd.read_excel('Master_Data.xlsx')

In [7]:
master_data.head()

Unnamed: 0,QuestionId,Question,Answer,AnswerDetails1,AnswerDetails2,CategoryId,CategoryName,AppId,AppName
0,21,password not working,Click on Login Assistance on Rekonnect Home pa...,https://rekonnect.in.kworld.kpmg.com/,https://rekonnect.in.kworld.kpmg.com/,1,Login / Password,1,Rekonnect
1,23,email for password reset not received,Please raise a Service Request on Support Cent...,http://supportcentral.in.kworld.kpmg.com/MDLSe...,http://supportcentral.in.kworld.kpmg.com/MDLSe...,1,Login / Password,1,Rekonnect
2,24,creation of new cost centre for ki,Requirements: 1. Business COO/HOD Approval ...,,,2,Projects - Cost centre,1,Rekonnect
3,25,rename of cost centre for ki,Requirements: 1. Business COO/HOD Approval ...,,,2,Projects - Cost centre,1,Rekonnect
4,26,project not appearing in the list,1. Project should be in Approved / Pre-Closeou...,ReKonnect Project Manager > Online Project Adm...,https://rekonnect.in.kworld.kpmg.com/,3,Projects - Project administration,1,Rekonnect


In [8]:
master_data['Question_Cleaned'] = master_data['Question'].apply(lambda x: x.lower().strip())

In [9]:
list_questions = list(master_data['Question_Cleaned'])

In [10]:
list_questions[:10]

['password not working',
 'email for password reset not received',
 'creation of new cost centre for ki',
 'rename of cost centre for ki',
 'project not appearing in the list',
 'hsn not appearing',
 'invoicing request is showing in error',
 'invoice pdf not received',
 'error in submission of an event through the online event creation screen',
 'event is showing pending approval but pending approver name is not getting displayed']

In [11]:
corpus_embeddings_symmetric = sbert_model.encode(list_questions, convert_to_tensor=True)

In [12]:
user_query = "login issues in rekonnect"

In [13]:
query_embedding_symmetric = sbert_model.encode(user_query, convert_to_tensor=True)

In [17]:
# We use cosine-similarity and torch.topk to find the highest 5 scores
cos_scores_symmetric = util.cos_sim(query_embedding_symmetric, corpus_embeddings_symmetric)[0]
top_results_symmetric = torch.topk(cos_scores_symmetric, k=5)

In [18]:
cos_scores_symmetric

tensor([0.4964, 0.4708, 0.6595, 0.6720, 0.4701, 0.3695, 0.6656, 0.4515, 0.7538,
        0.4748, 0.5367, 0.4679, 0.8236, 0.6046, 0.7695, 0.5285, 0.5988, 0.5484,
        0.5797, 0.5403, 0.5480, 0.2796, 0.5913, 0.5754, 0.5645, 0.5988, 0.4576,
        0.5727, 0.6208, 0.5398, 0.5246, 0.5389, 0.5222])

In [19]:
top_results_symmetric

torch.return_types.topk(
values=tensor([0.8236, 0.7695, 0.7538, 0.6720, 0.6656]),
indices=tensor([12, 14,  8,  3,  6]))

In [20]:
print("Query:", user_query)
print("\nTop 5 most similar sentences in corpus:")

for score, idx in zip(top_results_symmetric[0], top_results_symmetric[1]):
    print(list_questions[idx], "(Score: {:.4f})".format(score))

Query: login issues in rekonnect

Top 5 most similar sentences in corpus:
data issue in reports (Score: 0.8236)
how to access concur website (Score: 0.7695)
error in submission of an event through the online event creation screen (Score: 0.7538)
rename of cost centre for ki (Score: 0.6720)
invoicing request is showing in error (Score: 0.6656)


In [28]:
# Create corpus embeddings of asymmetric semantic search...
corpus_embeddings_asymmetric = model.encode(list_questions, convert_to_tensor=True)

In [29]:
# Create a query embedding using asymmetric pre-trained model...
query_embedding_asymmetric = model.encode(user_query, convert_to_tensor=True)

In [30]:
# We use cosine-similarity and torch.topk to find the highest 5 scores
cos_scores_asymmetric = util.cos_sim(query_embedding_asymmetric, corpus_embeddings_asymmetric)[0]
top_results_asymmetric = torch.topk(cos_scores_asymmetric, k=5)

In [31]:
cos_scores_asymmetric

tensor([ 0.4641,  0.3808,  0.1301,  0.1882,  0.0976,  0.2175,  0.3154,  0.1506,
         0.3544,  0.1901,  0.2710,  0.7011,  0.2081,  0.2147,  0.2432,  0.0734,
         0.0720,  0.1000,  0.0282,  0.2107,  0.2548, -0.0399,  0.1833,  0.1417,
         0.0611,  0.2708,  0.1094,  0.1234,  0.1086,  0.2507,  0.1414,  0.1481,
         0.2449])

In [25]:
top_results_asymmetric

torch.return_types.topk(
values=tensor([0.7011, 0.4641, 0.3808, 0.3544, 0.3154]),
indices=tensor([11,  0,  1,  8,  6]))

In [32]:
print("Query:", user_query)
print("\nTop 5 most similar sentences in corpus:")

for score, idx in zip(top_results_asymmetric[0], top_results_asymmetric[1]):
    print(list_questions[idx], "(Score: {:.4f})".format(score))

Query: login issues in rekonnect

Top 5 most similar sentences in corpus:
not able to open rekonnect forms to run reports (Score: 0.7011)
password not working (Score: 0.4641)
email for password reset not received (Score: 0.3808)
error in submission of an event through the online event creation screen (Score: 0.3544)
invoicing request is showing in error (Score: 0.3154)


--------------------------------------------------------------------------------------------------------------------------