Semantic Textual Similarity (STS) based text classification using Google's Universal Sentence Encoder, implemented with TensorFlow-GPU, NVIDIA CUDA and cuDNN.

Task: Find the top 10 most semantically similar sentences for a given short query sentence from a corpus of short, messy and unstructured English sentences.

Tags: challenging text-classification, short unstructured and tangled texts, classification scenario with six thousand plus classes, GPU supported Transfer Learning (TL) approach

In [1]:
import pandas as pd
import torch
import numpy as np
from sentence_transformers import util
import tensorflow as tf
import tensorflow_hub as hub
# Widen the pandas display limits so long sentences and large result tables are not truncated.
pd.options.display.max_colwidth = 1500
pd.options.display.max_rows = 3000
# Record which TensorFlow build the notebook ran against.
print(tf.__version__)

2.0.0


In [2]:
# Make only the first physical GPU visible to TensorFlow, then report the device counts.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        tf.config.experimental.set_visible_devices(gpus[0], 'GPU')
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPU")
    except RuntimeError as err:
        # Device visibility can only be changed before the GPUs are initialized.
        print(err)

1 Physical GPUs, 1 Logical GPU


In [3]:
# Enable on-demand GPU memory growth for every physical GPU.
# NOTE(review): the output recorded for this cell is "Physical devices cannot be
# modified after being initialized", i.e. the setting was rejected in that run.
# Presumably this cell has to execute before anything that initializes the GPUs
# (such as an earlier list_logical_devices call) - confirm the cell order.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # The growth flag has to be the same on all GPUs, so apply it to each one.
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as err:
        # Memory growth can only be configured before the GPUs are initialized.
        print(err)

Physical devices cannot be modified after being initialized


In [None]:
# Cap TensorFlow at 1 GB (memory_limit=1024) on the first GPU by configuring a
# single virtual device on it, then report the device counts.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        tf.config.experimental.set_virtual_device_configuration(
            gpus[0],
            [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=1024)],
        )
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as err:
        # Virtual devices can only be configured before the GPUs are initialized.
        print(err)


In [15]:
# Map every query key to its expected label. The key is `filtered_text` with the
# `random` column appended (presumably a numeric suffix that keeps otherwise
# identical texts as separate entries - confirm against where dfinal is built).
text_dict = pd.Series(
    data=dfinal.Expected.values,
    index=dfinal.filtered_text + dfinal.random,
).to_dict()
print(len(text_dict))

9684


In [16]:
# Handle of the Universal Sentence Encoder module passed to hub.load
# (presumably a local directory, since it is not a URL - confirm).
module_url = "universal-sentence-encoder-large_5"
model = hub.load(module_url)
print("module %s loaded" % module_url)


def embed(input):
    """Return whatever the loaded encoder ``model`` produces for ``input``.

    The parameter name is kept as-is for existing callers; note that it shadows
    the ``input`` builtin inside this function.
    """
    return model(input)

module universal-sentence-encoder-large_5 loaded


In [17]:
# Embed every corpus sentence in one call and keep the result as a torch tensor
# (TensorFlow output -> NumPy array -> torch) for the cosine-similarity search.
corpus = df['Refined_Description'].to_list()
corpus_embeddings = torch.from_numpy(embed(corpus).numpy())

In [None]:

import re

# Evaluate top-k retrieval: for every query in text_dict, embed it, rank the
# corpus by cosine similarity, and accumulate
#   * count      - queries whose expected sentence appears among the top-k hits,
#   * hist_dict  - how often the expected sentence was found at each rank,
#   * recall     - share of corpus rows with the same 4-digit code prefix as the
#                  expected label whose description was retrieved.
queries = text_dict

# Find the closest 10 sentences of the corpus for each query sentence based on cosine similarity
top_k = 10
# np.argpartition raises when asked for more positions than there are scores,
# so never request more neighbours than the corpus holds.
n_neighbours = min(top_k, len(corpus))
# hist_dict[r] = number of times the expected sentence was found at rank r (1-based).
hist_dict = {rank: 0 for rank in range(1, top_k + 1)}

i = 0
count = 0
total_recall = 0
real_recall = 0

# Loop-invariant helpers, built once instead of once per query.
brace_suffix = re.compile(r'{.*$')   # everything from the first '{' to the end
non_digits = re.compile(r'\D+')      # raw string: '\D' is not a valid str escape
# Integer value of the first four characters of every code, in df row order.
# Like the original per-query comparison, this raises ValueError if a code does
# not start with digits.
code_prefixes = np.array([int(code[:4]) for code in df['Code'].astype(str)])

for query, expected in queries.items():
    # Digits are dropped from the key before embedding.
    inquery = [''.join(ch for ch in query if not ch.isdigit())]
    query_embedding = torch.from_numpy(embed(inquery).numpy())
    cos_scores = util.pytorch_cos_sim(query_embedding, corpus_embeddings)[0].cpu()

    # Passing a range as kth puts the first n_neighbours positions in fully
    # sorted order, i.e. best match first.
    top_results = np.argpartition(-cos_scores, range(n_neighbours))[:n_neighbours]
    print(f"\n\n=========={i}============\n\n")
    print("Query:", inquery)
    print("\nExpected:", expected)
    expected_string = brace_suffix.sub('', expected).strip()
    print(f"\nTop {top_k} most similar sentences in corpus:")

    all_results = []
    matched = False
    for rank, idx in enumerate(top_results, start=1):
        candidate = corpus[idx].strip()
        print(candidate, "(Score: %.4f)" % (cos_scores[idx]))
        all_results.append(candidate)
        if candidate == expected_string:
            matched = True
            hist_dict[rank] += 1
    # A query counts once, however many of its hits equal the expected sentence.
    if matched:
        count += 1
    i += 1

    # Recall: relevant rows are those whose 4-digit code prefix equals the one
    # found in the expected label. The same mask feeds both the denominator and
    # the numerator so the two can never disagree.
    print(expected)
    expected_digits = non_digits.sub('', expected)[:4]
    if expected_digits:
        relevant_mask = code_prefixes == int(expected_digits)
        recall_total = int(relevant_mask.sum())
    else:
        # No digits in the expected label: there is no code to compare against.
        recall_total = 0

    if recall_total:
        retrieved = set(all_results)
        relevant_descriptions = df.loc[relevant_mask, 'Refined_Description']
        # Retrieved sentences were stripped above, so strip here as well.
        recall_relevant = sum(desc.strip() in retrieved for desc in relevant_descriptions)
        recall = recall_relevant / recall_total
    else:
        # Nothing relevant in the corpus: treat recall as 0 rather than dividing by zero.
        recall = 0.0

    real_recall += recall
    # NOTE(review): a recall of 0 is replaced by 1.0 before it is added to the
    # "Total Recall" aggregate and printed; only "Real Recall" keeps the raw
    # value. This inflates Total Recall and the F1 derived from it - confirm
    # that this is intended.
    if recall == 0.0:
        recall = 1.0
    total_recall += recall
    print("\n recall = " + str(recall))

print("\n\n==========--------------------==========\n\n")
# count is already the number of matched queries; the former "+ 1" overstated it.
print(f"Sentence Encoder: {count} out of {i}")
accuracy = float((count / i) * 100) if i else 0.0
print("Accuracy: " + str(accuracy))
r_recall = float((real_recall / i) * 100) if i else 0.0
print("Real Recall:" + str(r_recall))
t_recall = float((total_recall / i) * 100) if i else 0.0
print("Total Recall:" + str(t_recall))
# Harmonic mean of the top-k hit rate ("accuracy") and Total Recall; 0 when both are 0.
F1 = 2 * (accuracy * t_recall) / (accuracy + t_recall) if (accuracy + t_recall) else 0.0
print("F1 Score:" + str(F1))
print("\n\n==========--------------------==========\n\n")
print(hist_dict)

In [None]:
import matplotlib.pyplot as plt

# Bar chart of hist_dict: one bar per rank, bar height = hits recorded at that rank.
width = 1.0
plt.bar(
    x=hist_dict.keys(),
    height=hist_dict.values(),
    width=width,
    color='#115ed9',
)

USE on TF-Hub: https://tfhub.dev/google/universal-sentence-encoder/1 
Paper: Daniel Cer, Yinfei Yang, Sheng-yi Kong, Nan Hua, Nicole Limtiaco, Rhomni St. John, Noah Constant, Mario Guajardo-Céspedes, Steve Yuan, Chris Tar, Yun-Hsuan Sung, Brian Strope, Ray Kurzweil. Universal Sentence Encoder. arXiv:1803.11175, 2018.