In [1]:
import numpy as np
import pandas as pd
import neattext.functions as nfx
from sentence_transformers import SentenceTransformer

In [2]:
model = SentenceTransformer('bert-base-nli-mean-tokens')

In [3]:
ques = pd.read_csv('../data/yahoo_ques.csv')
ques.head()

Unnamed: 0,Questions
0,Why does Zebras have stripes?
1,What did the itsy bitsy sipder climb up?
2,What is the difference between a Bachelors and...
3,Why do women get PMS?
4,If your co-worker is guilty of unsanitary hygi...


In [4]:
ques['cleaned'] = ques['Questions'].apply(lambda x: nfx.clean_text(x, stopwords=True, contractions=True, multiple_whitespaces=True, special_char=True))
ques['cleaned'].head()

0                                       zebras stripes
1                           itsy bitsy sipder climb up
2                  difference bachelors masters degree
3                                            women pms
4    coworker guilty unsanitary hygiene right tell ...
Name: cleaned, dtype: object

In [5]:
ques['cleaned'][0:5].apply(lambda x: model.encode(x))

0    [0.5275992, -0.16937539, -0.30812833, 0.477792...
1    [0.014226267, 0.050527595, 1.226585, 1.0195134...
2    [0.18681271, 1.177461, 1.4966441, -0.05073864,...
3    [0.19621634, -0.18382294, 1.279156, 0.45463634...
4    [0.1242681, 1.1586038, 1.0560055, 0.30857694, ...
Name: cleaned, dtype: object

In [6]:
all_embeddings = ques['cleaned'].apply(lambda x: model.encode(x))

In [7]:
np.save('../data/embeddings.npy', all_embeddings)

In [8]:
all_embeddings = np.load('../data/embeddings.npy', allow_pickle=True)
all_embeddings[1]

array([ 1.42262671e-02,  5.05275950e-02,  1.22658503e+00,  1.01951337e+00,
        5.56340218e-01,  2.86098629e-01, -3.22406143e-01,  6.21487081e-01,
       -9.72093716e-02,  1.24134660e-01, -3.70167911e-01,  6.18201375e-01,
        4.65373665e-01,  5.84072709e-01,  1.18697807e-01, -4.17465046e-02,
       -3.91286701e-01, -2.04564244e-01,  5.03466010e-01, -7.02143371e-01,
       -7.58592725e-01, -2.32180089e-01, -1.36773139e-01, -7.87366927e-01,
        2.97716081e-01, -4.78926986e-01, -1.29051477e-01, -4.21024501e-01,
       -3.05679023e-01,  2.15544179e-01, -8.57690334e-01,  3.67646307e-01,
        9.87406552e-01, -6.21574879e-01, -9.61540997e-01,  1.62482873e-01,
       -4.55587022e-02,  1.00997970e-01,  2.78395653e-01, -1.35586977e+00,
        1.13480902e+00, -1.41860634e-01,  4.74572271e-01, -5.43581486e-01,
       -1.34460723e+00, -1.27914578e-01,  9.93721001e-03, -3.16192925e-01,
        1.66490600e-01, -8.45498741e-01, -3.14808667e-01,  7.20589906e-02,
        9.38415542e-05, -

In [9]:
from sklearn.metrics.pairwise import cosine_similarity

In [10]:
query = 'How is bachelors degree different from Master degree?'
query = nfx.clean_text(query, stopwords=True, contractions=True, multiple_whitespaces=True, special_char=True)
query_embedding = model.encode(query)

In [15]:
similarity_scores = []
cos_sim = cosine_similarity([query_embedding], [all_embeddings[2]])
cos_sim

array([[0.887739]], dtype=float32)

In [16]:
def check_similarity(query, data_frame):

    similarity_score = []

    query = nfx.clean_text(query, stopwords=True, contractions=True, multiple_whitespaces=True, special_char=True)
    query_embedding = model.encode(query)
    
    for val in range(len(data_frame)):
        print()
        cos_sim = cosine_similarity([query_embedding], [all_embeddings[val]])
        similarity_score.append(cos_sim)
    
    data_frame['similarity_score'] = similarity_score
    

    return data_frame.sort_values(by=['similarity_score'], ascending=False)

In [17]:
ques = ques.head()

new = check_similarity('How is bachelors degree different from Master degree?', ques)
new = new[['Questions', 'similarity_score']]
print(new.head())






                                           Questions similarity_score
2  What is the difference between a Bachelors and...     [[0.887739]]
4  If your co-worker is guilty of unsanitary hygi...   [[0.36938083]]
3                              Why do women get PMS?    [[0.3149332]]
1           What did the itsy bitsy sipder climb up?    [[0.2724582]]
0                      Why does Zebras have stripes?   [[0.23423356]]
