Semantic Textual Similarity using RoBERTa-Large via SBERT. Finds the top 10 semantically similar sentences for a given query from a corpus of 6000+ sentences. Output removed due to data sensitivity.

In [1]:
import pandas as pd

# Raise pandas' display limits so long text columns and large tables are
# rendered without truncation in the notebook output.
pd.options.display.max_colwidth = 1500
pd.options.display.max_rows = 3000

In [2]:
# Load the commodity reference index. Later cells read its 'Code' and
# 'Refined_Description' columns, so the frame must stay bound to `df`.
commodity_index_csv = '~/Desktop/Experiments/COMMODITY/CSV/newh0.csv'
df = pd.read_csv(commodity_index_csv)
df

Unnamed: 0,Classification,Code,Description,Refined_Description,Code Parent,Level,isLeaf
0,H0,81050,"Fruit, edible; kiwifruit, fresh",kiwi fruit,810,6,1
1,H0,10111,"Horses; live, pure-bred breeding animals",horses live purebred breeding animals,101,6,1
2,H0,10119,"Horses; live, other than pure-bred breeding animals",horses live purebred breeding animals,101,6,1
3,H0,10120,"Asses, mules and hinnies; live",asses mules hinnies live,101,6,1
4,H0,10210,"Bovine animals; live, pure-bred breeding animals",bovine animals live purebred breeding animals,102,6,1
...,...,...,...,...,...,...,...
6190,H0,970400,"Stamps; postage or revenue; stamp-postmarks, first-day covers, postal stationery (stamped paper) and like, used, or if unused not of current or new issue in the country to which they are destined",stamps postage revenue stamppostmarks firstday covers postal stationery used,9704,6,1
6191,H0,9705,"Collections and collectors' pieces; of zoological, botanical, mineralogical, anatomical, historical, archaeological, palaeontological, ethnographic or numismatic interest",collections collectors pieces zoological botanical mineralogical anatomical historical archaeological palaeontological,97,4,0
6192,H0,970500,"Collections and collectors' pieces; of zoological, botanical, mineralogical, anatomical, historical, archaeological, palaeontological, ethnographic or numismatic interest",collections collectors pieces zoological botanical mineralogical anatomical historical archaeological palaeontological,9705,6,1
6193,H0,9706,Antiques; of an age exceeding one hundred years,antiques age one hundred years,97,4,0


In [None]:
# Load the preprocessed dataset.
# NOTE(review): no visible cell reads `df2` afterwards; presumably `text_dict`
# (used by the evaluation cell) is derived from it — confirm.
preprocessed_csv = '~path/Experiments/CSV/preprocesseddata.csv'
df2 = pd.read_csv(preprocessed_csv)
df2

In [None]:
from sentence_transformers import SentenceTransformer, util
import numpy as np
import torch
import re

# Corpus: one refined description per row of the commodity reference index.
corpus = df['Refined_Description'].tolist()

# Load the sentence-embedding model from the given path and encode the whole
# corpus once, so the evaluation loop only has to encode each query.
embedder = SentenceTransformer('~path/Transformers/stsb-roberta-large')
corpus_embeddings = embedder.encode(corpus, convert_to_tensor=True)

In [None]:
# Evaluate top-k semantic retrieval for every query and report accuracy,
# recall and a rank histogram.
#
# NOTE(review): `text_dict` is not defined in any visible cell. From its use
# below it is indexed by the query text and yields an expected label; the label
# is presumably of the form "<refined description>{<code>..." — confirm.
queries = text_dict

# Find the closest 10 semantically similar sentences of the corpus for each query sentence based on cosine similarity
top_k = 10

# hist_dict[rank] counts how often the expected description was retrieved at
# that 1-based rank. Derived from top_k so the two cannot drift apart.
hist_dict = {rank: 0 for rank in range(1, top_k + 1)}

i = 0             # number of queries processed
count = 0         # queries whose expected description appears in the top-k
total_recall = 0  # sum of per-query recall with zero recalls replaced by 1.0
real_recall = 0   # sum of unmodified per-query recall

# Loop-invariant: integer value of the first four characters of every code.
# Computed once instead of once per query.
code_prefixes = df['Code'].map(lambda code: int(str(code)[:4]))

# torch.topk raises if k exceeds the number of corpus entries.
k = min(top_k, len(corpus))

for query in queries:
    # Digits are stripped from the query text before it is embedded.
    inquery = ''.join([s for s in query if not s.isdigit()])
    query_embedding = embedder.encode(inquery, convert_to_tensor=True)
    cos_scores = util.pytorch_cos_sim(query_embedding, corpus_embeddings)[0]
    cos_scores = cos_scores.cpu()
    top_results = torch.topk(cos_scores, k=k)
    print("\n\n=========="+str(i)+"============\n\n")
    print("Query:", inquery)
    expected = queries[query]
    print("\nExpected:",expected)
    # Everything from the first '{' onwards is dropped to get the description.
    expected_string = re.sub('{.*$','',expected).strip()
    print("\nTop 10 most similar sentences in corpus:")

    match = 0
    all_results = []
    ranked_hits = zip(top_results[0].tolist(), top_results[1].tolist())
    for j, (score, idx) in enumerate(ranked_hits, start=1):
        candidate = corpus[idx].strip()
        print(candidate, "(Score: %.4f)" % (score))
        all_results.append(candidate)
        if candidate == expected_string:
            match = match + 1
            hist_dict[j] += 1
    # A query counts as one hit no matter how many ranks matched
    # (the corpus can contain duplicate descriptions).
    if match:
        count = count + 1
    i = i + 1

    # Relevant set: reference rows whose 4-character code prefix has the same
    # integer value as the first four digits found in the expected label.
    # The SAME row set is used for the numerator and the denominator, so
    # recall always lies in [0, 1].
    expected_number = re.sub(r'\D+','',expected)
    expected_number_four = expected_number[:4]
    if expected_number_four:
        relevant_rows = df[code_prefixes == int(expected_number_four)]
    else:
        # No digits in the expected label: nothing can be relevant.
        relevant_rows = df.iloc[0:0]

    #Recall total
    recall_total = len(relevant_rows)

    #Recall relevant
    all_relevant_desc = relevant_rows['Refined_Description'].to_list()
    recall_relevant = sum(desc in all_results for desc in all_relevant_desc)

    #Recall
    # An empty relevant set yields recall 0 instead of a ZeroDivisionError.
    recall = recall_relevant/recall_total if recall_total else 0.0
    real_recall+=recall
    # NOTE(review): a zero recall is replaced by 1.0 before being added to
    # total_recall, which inflates "Total Recall" (and the F1 built on it).
    # Kept as in the original because "Real Recall" is reported separately —
    # confirm this substitution is intended.
    if(recall==0.0):
        recall=1.0
    total_recall+=recall
    print("\n recall = "+str(recall))

print("\n\n==========--------------------==========\n\n")
if i == 0:
    # Avoid dividing by zero when there is nothing to evaluate.
    print("No queries to evaluate.")
else:
    # `count` is reported as-is; adding 1 to it would overstate the hit count
    # and could push accuracy above 100%.
    print("Bert Siamse: "+str(count)+" out of "+str(i))
    accuracy = float((count/i)*100)
    print("Accuracy: "+str(accuracy))
    r_recall = float((real_recall/i)*100)
    print("Real Recall:"+str(r_recall))
    t_recall = float((total_recall/i)*100)
    print("Total Recall:"+str(t_recall))
    # t_recall is strictly positive here (every per-query term is > 0),
    # so the denominator cannot be zero.
    F1 = 2 * (accuracy * t_recall) / (accuracy + t_recall)
    print("F1 Score:"+str(F1))
print("\n\n==========--------------------==========\n\n")
print(hist_dict)

In [None]:
import matplotlib.pyplot as plt
# Bar chart of hist_dict: x is the rank (dict key), height is the number of
# times the expected description was found at that rank (dict value).
width = 1.0
ranks = list(hist_dict.keys())
hits_per_rank = list(hist_dict.values())
plt.bar(ranks, hits_per_rank, width=width, color='#115ed9')


SBERT Paper: @inproceedings{reimers-2019-sentence-bert,
    title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
    author = "Reimers, Nils and Gurevych, Iryna",
    booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
    month = "11",
    year = "2019",
    publisher = "Association for Computational Linguistics",
    url = "http://arxiv.org/abs/1908.10084",
}