# Qualitatively Evaluate a pre-trained DistilBERT model on a couple of sentence pairs 

## https://github.com/huggingface/transformers

In [1]:
import string
import numpy as np
import Levenshtein
import tensorflow as tf
import transformers
from transformers import TFDistilBertForSequenceClassification
from transformers import DistilBertTokenizer

In [2]:
def hash_word(word,avail,hash_dict):
    try:
        return (hash_dict[word],avail,hash_dict)
    except:
        character=np.random.choice(avail,size=1)[0]
        hash_dict[word]=character
        avail.remove(character)
        return (hash_dict[word],avail,hash_dict)
    
#We represent each word in the pair of sentences by a unique randomly selected character
def hash_sentence(sentence1,sentence2):
    hash_dict={}
    avail=[character for character in string.printable]
    st1=''
    for word in sentence1.split():
        st1_add,avail,hash_dict=hash_word(word, avail,hash_dict)
        st1=st1+st1_add
    st2=''
    for word in sentence2.split():
        st2_add,avail,hash_dict=hash_word(word, avail,hash_dict)
        st2=st2+st2_add
    return (st1,st2)

#We convert string to an ordered string containing its unique characters only
def bow(st):
    x=list(set(st))
    x.sort()
    y=''
    for i in x:
        y=y+i
    return (y)

In [5]:
def pre_proc_check(sentence1,sentence2):
    st1,st2=hash_sentence(sentence1,sentence2)
    l1=len(st1)
    l2=len(st2)
    l_word_dist=Levenshtein.distance(sentence1,sentence2)
    l_bow_word_dist=Levenshtein.distance(bow(st1),bow(st2))
    if len(set.intersection(set(st1),set(st2)))>=3:
        print('Minimum number of common words okay')
    else:
        print('Minimum number of common words NOT okay')
    if (l1>=1) and (l1<=50) and (l2>=1) and (l2<=50):
        print('Length okay')
    else:
        print('Length NOT okay')
    if (l1/l2<=1.5) or (l2/l1<=1.5):
        print('Length ratio okay')
    else:
        print('Length ratio NOT okay')
    if (l_word_dist>=1) and (l_word_dist<=20):
        print('Word-based Levenshtein edit distance okay')
    else:
        print('Word-based Levenshtein edit distance NOT okay')
    if l_bow_word_dist>=8:
        print('Bag of words lexical distance okay')
    else:
        print('Bag of words lexical distance NOT okay')
    return

In [7]:
model = TFDistilBertForSequenceClassification.from_pretrained('./data/paraphrase/')
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

In [12]:
s1x = "One never forgets cycling."
s1y="Cycling is a skill that stays with you forever."
pre_proc_check(s1x,s1y)

Minimum number of common words NOT okay
Length okay
Length ratio okay
Word-based Levenshtein edit distance NOT okay
Bag of words lexical distance okay


In [13]:
MAX_LENGTH=max(len(s1x),len(s1y))
inputs1_x_y= tokenizer.encode_plus(s1x, s1y,add_special_tokens=True,truncation_strategy='do_not_truncate',max_length=MAX_LENGTH, pad_to_max_length=True,return_tensors='tf')
print('The probability that the second sentence is a paraphrase of the first sentence:',tf.nn.softmax(model(inputs1_x_y['input_ids'])[0][0])[1].numpy())

The probability that the second sentence is a paraphrase of the first sentence: 0.16632225


In [14]:
s2x = "Comics are a great source of enjoyment for the reader."
s2y = "It is great fun to read comics."
pre_proc_check(s2x,s2y)

Minimum number of common words NOT okay
Length okay
Length ratio okay
Word-based Levenshtein edit distance NOT okay
Bag of words lexical distance okay


In [15]:
MAX_LENGTH=max(len(s2x),len(s2y))
inputs2_x_y= tokenizer.encode_plus(s2x, s2y,add_special_tokens=True,truncation_strategy='do_not_truncate',max_length=MAX_LENGTH, pad_to_max_length=True,return_tensors='tf')
print('The probability that the second sentence is a paraphrase of the first sentence:',tf.nn.softmax(model(inputs2_x_y['input_ids'])[0][0])[1].numpy())

The probability that the second sentence is a paraphrase of the first sentence: 0.582867
