In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv('Precily_Text_Similarity.csv')
data.shape

(3000, 2)

In [3]:
data.sample(5)

Unnamed: 0,text1,text2
668,visa row mandarin made sir john the top civil ...,japan economy slides to recession the japanese...
2065,brown ally rejects budget spree chancellor gor...,howard backs stem cell research michael howard...
894,european medal chances improve what have the e...,beckham virus spotted on the net virus writers...
419,moody joins up with england lewis moody has fl...,buyers snap up jet airways shares investors h...
2127,hantuchova in dubai last eight daniela hantuch...,ireland surge past scots ireland maintained th...


In [4]:
from sklearn.metrics.pairwise import cosine_similarity
import spacy
nlp = spacy.load("en_core_web_sm")

def text_processing(sentence):
    """
    Lemmatize, lowercase, remove numbers and stop words
    
    Args:
      sentence: The sentence we want to process.
    
    Returns:
      A list of processed words
    """
    sentence = [token.lemma_.lower()
                for token in nlp(sentence) 
                if token.is_alpha and not token.is_stop]
    
    return sentence


def cos_sim(sentence1_emb, sentence2_emb):
    """
    Cosine similarity between two columns of sentence embeddings
    
    Args:
      sentence1_emb: sentence1 embedding column
      sentence2_emb: sentence2 embedding column
    
    Returns:
      The row-wise cosine similarity between the two columns.
      For instance is sentence1_emb=[a,b,c] and sentence2_emb=[x,y,z]
      Then the result is [cosine_similarity(a,x), cosine_similarity(b,y), cosine_similarity(c,z)]
    """
    cos_sim = cosine_similarity(sentence1_emb, sentence2_emb)
    return np.diag(cos_sim)

In [36]:

import textdistance

def jaccard_sim(row):
    # Text Processing
    sentence1 = text_processing(row['sentence1'])
    sentence2 = text_processing(row['sentence2'])
    
    # Jaccard similarity
    return textdistance.jaccard.normalized_similarity(sentence1, sentence2)

In [15]:
jac_sim(data.loc[40])

0.05514705882352944

In [12]:
data.loc[40]

text1    strong quarterly growth for nike nike has repo...
text2    s korean credit card firm rescued south korea ...
Name: 40, dtype: object

In [16]:
def jaccard_similarity(rows):
    x = rows['text1']
    y = rows['text2']
    intersection_cardinality = len(set.intersection(*[set(x), set(y)]))
    union_cardinality = len(set.union(*[set(x), set(y)]))
    return intersection_cardinality/float(union_cardinality)

In [17]:
jaccard_similarity(data.loc[40])

0.8888888888888888

In [18]:
jaccard_similarity(data.loc[0])

0.9024390243902439

In [19]:
jaccard_similarity(data.loc[2])

0.8181818181818182

In [20]:
jaccard_similarity(data.loc[4])

0.7555555555555555

In [21]:
data['Jaccard_Similarity'] = data.apply(jaccard_similarity, axis=1)
data

Unnamed: 0,text1,text2,Jaccard_Similarity
0,broadband challenges tv viewing the number of ...,gardener wins double in glasgow britain s jaso...,0.902439
1,rap boss arrested over drug find rap mogul mar...,amnesty chief laments war failure the lack of ...,0.833333
2,player burn-out worries robinson england coach...,hanks greeted at wintry premiere hollywood sta...,0.818182
3,hearts of oak 3-2 cotonsport hearts of oak set...,redford s vision of sundance despite sporting ...,0.850000
4,sir paul rocks super bowl crowds sir paul mcca...,mauresmo opens with victory in la amelie maure...,0.755556
...,...,...,...
2995,uk directors guild nominees named martin scors...,steel firm to cut 45 000 jobs mittal steel ...,0.744186
2996,u2 to play at grammy awards show irish rock ba...,israel looks to us for bank chief israel has a...,0.736842
2997,pountney handed ban and fine northampton coach...,india and iran in gas export deal india has si...,0.738095
2998,belle named best scottish band belle & sebas...,mido makes third apology ahmed mido hossam h...,0.761905


In [25]:
data['Jaccard_Similarity'].where(data['Jaccard_Similarity'] <=0.5).count()

0

In [26]:
from math import sqrt, pow, exp
 
def squared_sum(x):
  """ return 3 rounded square rooted value """
 
  return round(sqrt(sum([a*a for a in x])),3)
 
def euclidean_distance(x,y):
  """ return euclidean distance between two lists """
 
  return sqrt(sum(pow(a-b,2) for a, b in zip(x, y)))


In [30]:
embedding_1 = [nlp(sentence).vector for sentence in data.loc[0]['text1']]
embedding_2 = [nlp(sentence).vector for sentence in data.loc[0]['text2']]

In [31]:
embedding_1

[array([-0.08310531, -0.05679482, -0.05277053, -0.7940125 , -0.5292963 ,
         0.575765  , -0.34914535,  0.9151904 ,  1.611188  ,  0.8179959 ,
         0.31001425, -1.2549795 , -1.3442781 , -0.19086576,  0.5743794 ,
         0.80717707, -0.832759  ,  2.3676863 ,  1.7068115 , -1.4398224 ,
         0.8194555 , -0.5670279 ,  0.12317449, -0.60115814,  0.6810997 ,
         0.36340708,  1.064837  , -1.0692973 ,  0.80645263, -1.396902  ,
        -0.0909555 , -0.49731615, -0.04021319, -0.7127582 , -1.396044  ,
         1.9289479 , -1.5209076 ,  1.5592377 ,  1.2441747 ,  1.3195524 ,
         1.1379244 , -1.0756297 , -0.44830394, -0.4652268 ,  1.1024586 ,
        -0.7745434 ,  1.1185619 ,  0.20712793,  0.44258898, -0.25387532,
        -0.5430198 , -2.8183236 ,  0.07169244, -1.1519094 , -0.23732594,
        -0.6647427 , -0.22337002, -0.76448345, -0.1254778 , -0.31970504,
         0.3764056 , -1.405665  ,  1.5764298 , -0.09185337,  0.51820344,
        -0.17269751, -0.28293592, -0.09982419, -0.1

In [32]:
from datasets import load_dataset
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas()

# Load the English STSB dataset
stsb_dataset = load_dataset('stsb_multi_mt', 'en')
stsb_train = pd.DataFrame(stsb_dataset['train'])
stsb_test = pd.DataFrame(stsb_dataset['test'])

# Check loaded data
print(stsb_train.shape, stsb_test.shape)
stsb_test.head()

Downloading builder script:   0%|          | 0.00/7.43k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/19.0k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/6.05k [00:00<?, ?B/s]

Downloading and preparing dataset stsb_multi_mt/en (download: 1.02 MiB, generated: 1.06 MiB, post-processed: Unknown size, total: 2.08 MiB) to C:/Users/Ajmer/.cache/huggingface/datasets/stsb_multi_mt/en/1.0.0/a5d260e4b7aa82d1ab7379523a005a366d9b124c76a5a5cf0c4c5365458b0ba9...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/229k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/74.0k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/52.9k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5749 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1379 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/1500 [00:00<?, ? examples/s]

Dataset stsb_multi_mt downloaded and prepared to C:/Users/Ajmer/.cache/huggingface/datasets/stsb_multi_mt/en/1.0.0/a5d260e4b7aa82d1ab7379523a005a366d9b124c76a5a5cf0c4c5365458b0ba9. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

(5749, 3) (1379, 3)


Unnamed: 0,sentence1,sentence2,similarity_score
0,A girl is styling her hair.,A girl is brushing her hair.,2.5
1,A group of men play soccer on the beach.,A group of boys are playing soccer on the beach.,3.6
2,One woman is measuring another woman's ankle.,A woman measures another woman's ankle.,5.0
3,A man is cutting up a cucumber.,A man is slicing a cucumber.,4.2
4,A man is playing a harp.,A man is playing a keyboard.,1.5


In [37]:
stsb_test['Jaccard_score'] = stsb_test.progress_apply(jaccard_sim, axis=1)

100%|██████████████████████████████████████████████████████████████████████████████| 1379/1379 [00:32<00:00, 42.26it/s]


In [38]:
stsb_test

Unnamed: 0,sentence1,sentence2,similarity_score,Jaccard_score
0,A girl is styling her hair.,A girl is brushing her hair.,2.5,0.500000
1,A group of men play soccer on the beach.,A group of boys are playing soccer on the beach.,3.6,0.666667
2,One woman is measuring another woman's ankle.,A woman measures another woman's ankle.,5.0,1.000000
3,A man is cutting up a cucumber.,A man is slicing a cucumber.,4.2,0.500000
4,A man is playing a harp.,A man is playing a keyboard.,1.5,0.500000
...,...,...,...,...
1374,"Philippines, Canada pledge to further boost re...",Philippines saves 100 after ferry sinks,0.0,0.125000
1375,Israel bars Palestinians from Jerusalem's Old ...,"Two-state solution between Palestinians, Israe...",1.0,0.200000
1376,How much do you know about Secret Service?,Lawmakers from both sides express outrage at S...,1.0,0.285714
1377,Obama Struggles to Soothe Saudi Fears As Iran ...,Myanmar Struggles to Finalize Voter Lists for ...,0.0,0.071429


In [39]:

from sklearn.feature_extraction.text import TfidfVectorizer
model = TfidfVectorizer(lowercase=True, stop_words='english')

# Train the model
X_train = pd.concat([stsb_train['sentence1'], stsb_train['sentence2']]).unique()
model.fit(X_train)

# Generate Embeddings on Test
sentence1_emb = model.transform(stsb_test['sentence1'])
sentence2_emb = model.transform(stsb_test['sentence2'])

# Cosine Similarity
stsb_test['TFIDF_cosine_score'] = cos_sim(sentence1_emb, sentence2_emb)


In [40]:
stsb_test

Unnamed: 0,sentence1,sentence2,similarity_score,Jaccard_score,TFIDF_cosine_score
0,A girl is styling her hair.,A girl is brushing her hair.,2.5,0.500000,0.490640
1,A group of men play soccer on the beach.,A group of boys are playing soccer on the beach.,3.6,0.666667,0.613080
2,One woman is measuring another woman's ankle.,A woman measures another woman's ankle.,5.0,1.000000,0.705323
3,A man is cutting up a cucumber.,A man is slicing a cucumber.,4.2,0.500000,0.692110
4,A man is playing a harp.,A man is playing a keyboard.,1.5,0.500000,0.349763
...,...,...,...,...,...
1374,"Philippines, Canada pledge to further boost re...",Philippines saves 100 after ferry sinks,0.0,0.125000,0.153509
1375,Israel bars Palestinians from Jerusalem's Old ...,"Two-state solution between Palestinians, Israe...",1.0,0.200000,0.287041
1376,How much do you know about Secret Service?,Lawmakers from both sides express outrage at S...,1.0,0.285714,0.445120
1377,Obama Struggles to Soothe Saudi Fears As Iran ...,Myanmar Struggles to Finalize Voter Lists for ...,0.0,0.071429,0.000000


In [1]:
import requests

In [2]:
url = 'http://127.0.0.1:5000/'
params ={'text1': 'that movie was boring', 'text2': 'Movie was decent'}
response = requests.get(url, params)
response.json()

{'similarity score': 0.5031026124151314}