In [1]:
import torch
import numpy as np
import pandas as pd
from scipy.spatial.distance import cdist
from transformers import BertConfig, BertForSequenceClassification, BertTokenizer
import collections as ct
import time
import utils

Using TensorFlow backend.


In [2]:
# Load the question table from disk and preview its first rows.
sentences = pd.read_csv("data/sentences.csv")
sentences.head()

Unnamed: 0,question,embedding
0,What is the step by step guide to invest in sh...,"[2054, 2003, 1996, 3357, 2011, 3357, 5009, 200..."
1,What is the step by step guide to invest in sh...,"[2054, 2003, 1996, 3357, 2011, 3357, 5009, 200..."
2,What is the story of Kohinoor (Koh-i-Noor) Dia...,"[2054, 2003, 1996, 2466, 1997, 12849, 10606, 1..."
3,What would happen if the Indian government sto...,"[2054, 2052, 4148, 2065, 1996, 2796, 2231, 103..."
4,How can I increase the speed of my internet co...,"[2129, 2064, 1045, 3623, 1996, 3177, 1997, 202..."


In [3]:
def list_from_string(emb):
    '''
    Parses the string form of a list of integers, e.g. "[2054, 2003, 1996]".
    - input: string with the integers between brackets, separated by commas
    - output: list of ints; the empty list when the brackets hold nothing ("[]")
    '''
    body = emb[1:-1].strip()
    if not body:
        # always hand back a list, so callers get one type for every row
        return []
    # int() ignores the blanks around each number, so both "1, 2" and "1,2" parse
    return [int(token) for token in body.split(',')]

# Sanity check on one row: print the type of the parsed value and of its first element.
emb = list_from_string(sentences.iloc[5,1])
print(type(emb))
print(type(emb[0]))

<class 'list'>
<class 'int'>


In [4]:
# Column at position 1 of the table (presumably 'embedding': the token ids, still stored as strings).
embeddings = sentences.iloc[:,1]

In [5]:
# Column at position 0 of the table (presumably 'question': the question texts).
questions = sentences.iloc[:,0]

In [16]:
# List the label and value of every question stored as a float instead of
# as text (a missing question shows up here as nan).
for i, q in questions.items():
    if type(q) is not float:
        continue
    print(i, "    ", q)

174363      nan
493339      nan


In [22]:
# Show every column of the rows labelled 174363 and 493339.
print(sentences.loc[174363,:])
print(sentences.loc[493339,:])

question     NaN
embedding     []
Name: 174363, dtype: object
question     NaN
embedding     []
Name: 493339, dtype: object


In [6]:
# Parse every stored embedding string once, so later cells can work on the parsed values.
embeddings_aux = [list_from_string(raw) for raw in embeddings]

In [7]:
# Build the tokenizer that belongs to the pretrained uncased BERT base checkpoint.
# NOTE(review): model_class is assigned here but not used in the visible cells.
model_class = BertForSequenceClassification
tokenizer_class = BertTokenizer
pretrained_model = 'bert-base-uncased'
tokenizer = tokenizer_class.from_pretrained(pretrained_model)

In [8]:
def embed (sent):
    '''
    Encodes a sentence with the module-level tokenizer.
    - input: sentence consisting in a string
    - output: list of token ids, encoded without special tokens

    The result is NOT padded or truncated: the standardization to length 34
    below is commented out, so the list is as long as the tokenizer makes it.
    '''
    emb = tokenizer.encode(sent, add_special_tokens=False)
    
    # Disabled: standardization to exactly 34 ids (truncate, or pad with 0)
    # if len(emb) > 34:
    #     emb = emb[:34]
    # else:
    #     emb += [0] * (34 - len(emb))

    return emb

# Quick check of embed on a short sentence.
sent = "Is Harry?"
print(embed(sent))

[2003, 4302, 1029]


In [9]:
def most_similar_v1(sent, length=34):
    '''
    It finds the most similar sentence in the dataset, comparing the token
    ids of the sentences with the euclidean distance
    - input: sentence consisting in a string
             length: positive number of token ids each sentence is cut or
             zero-padded to, so that all the vectors have the same size
    - output: index of the most similar sentence in the dataset

    Sentences at distance 0 are only reported as present in the list; the
    result is the closest sentence at a distance greater than 0.
    Raises ValueError if no sentence is at a distance greater than 0.
    '''
    def fit(tokens):
        # cuts or zero-pads a sequence of ids to exactly `length` elements
        tokens = list(tokens[:length])
        return tokens + [0] * (length - len(tokens))

    # cdist needs rectangular numeric input, so the parsed id lists are copied
    # row by row into one zero-padded matrix (filled in place to avoid a huge
    # temporary list of lists)
    matrix = np.zeros((len(embeddings_aux), length))
    for row, tokens in zip(matrix, embeddings_aux):
        row[:] = fit(tokens)
    d = cdist(matrix, [fit(embed(sent))]).ravel()

    present = bool(np.any(d == 0))
    candidates = np.flatnonzero(d > 0)
    if candidates.size == 0:
        raise ValueError("there is no sentence at a distance greater than 0")
    # argmin keeps the first of several equally close sentences
    i = int(candidates[np.argmin(d[candidates])])
    D = d[i]

    print("Most similar sentence to: ", sent)
    if present:
        print("The sentence is present in the list")
    print("id:  ", i+1, ",   position in the table:  ", i, sep="")
    print("sentence: ", sentences.iloc[i,0])
    print("distance: ", float(D),"\n")
    
    return i


def similarity(sample, ref):
    '''
    Percentage of elements shared by two bags (multisets) of tokens
    - input: sample: Counter (token -> number of occurrences)
             ref: iterable of tokens, counted here
    - output: number from 0 to 100: size of the multiset intersection divided
              by the size of the bigger bag, times 100. It is 0 when both
              bags are empty
    '''
    ref = ct.Counter(ref)
    n = max(sum(sample.values()), sum(ref.values()))
    if n == 0:
        # both bags are empty: nothing is shared, and s/n would divide by zero
        return 0.0
    # a Counter answers 0 for tokens it does not hold, so tokens missing in ref add nothing
    s = sum(min(count, ref[el]) for el, count in sample.items())
    return s/n*100


def most_similar(sent):
    '''
    It finds the most similar sentence in the dataset, comparing the bags of
    token ids of the sentences
    - input: sentence consisting in a string
    - output: index (position in the table) of the most similar sentence in
              the dataset; it can be used directly with sentences.iloc

    Sentences with a similarity of 100 are only reported as present in the
    list; the result is the best sentence with a similarity below 100.
    Raises ValueError if no sentence has a similarity below 100.
    '''
    sample = ct.Counter(embed(sent))

    present = False
    i, s = None, -np.inf
    # scores are consumed one by one, there is no need to keep them all
    for j, ref in enumerate(embeddings_aux):
        sim = similarity(sample, ref)
        if sim == 100:
            present = True
        elif sim > s:
            s = sim
            i = j

    if i is None:
        raise ValueError("there is no sentence with a similarity below 100")

    print("Most similar sentence to: ", sent)
    if present:
        print("The sentence is present in the list!")
    print("id:  ", i+1, ",   position in the table:  ", i, sep="")
    print("sentence: ", sentences.iloc[i,0])
    print("similarity: ", "{:.2f}".format(s), "%\n", sep="")
    
    # the position, not the 1-based id: this is what the docstring promises
    # and what callers index sentences.iloc with
    return i

In [8]:
# First query: run both versions on the same sentence and time each call.
sent = "Who was more voted in the presidential elections, Hillary Clinton or Donald Trump?"
time1 = time.time()
_ = most_similar_v1(sent)
time2 = time.time()
_ = most_similar(sent)
time3 = time.time()

print("First version lasted: ", utils.format_time(time2-time1))
print("Second version lasted: ", utils.format_time(time3-time2), "\n \n")


# Second query, not timed.
sent = "Is Harry Potter in love with Hermione?"
_ = most_similar_v1(sent)
_ = most_similar(sent)
print("")

# Third query: the value returned by most_similar is then used as a row
# position to fetch a sentence of the dataset, which is queried in turn.
sent = "Which is the best rock band in history?"
_ = most_similar_v1(sent)
i = most_similar(sent)
_ = most_similar(sentences.iloc[i,0])

Most similar sentence to:  Who was more voted in the presidential elections, Hillary Clinton or Donald Trump?
index:  32422
sentence:  Who will be the next president of USA: Hillary Clinton or Donald Trump?
distance:  4519.969247682997 

Most similar sentence to:  Who was more voted in the presidential elections, Hillary Clinton or Donald Trump?
index:  80453
sentence:  Who will win the US presidential elections 2016: Hillary Clinton or Donald Trump?
similarity: 66.67%

First version lasted:  0:00:22
Second version lasted:  0:00:47 
 

Most similar sentence to:  Is Harry Potter in love with Hermione?
index:  260736
sentence:  What does IMO mean in a text message?
distance:  2550.2176377713336 

Most similar sentence to:  Is Harry Potter in love with Hermione?
index:  156559
sentence:  Why don't Hermione fall in love with Harry?
similarity: 61.54%


Most similar sentence to:  Which is the best rock band in history?
index:  225975
sentence:  What are the best video games to play?
distanc

In [11]:
# One more query, run only through most_similar.
sent = "Are aliens green or grey?"
_ = most_similar(sent)

Most similar sentence to:  Are aliens green or grey?
id:  489982,   position in the table:  489981
sentence:  Are my eyes hazel or green?
similarity: 57.14%



In [5]:
# Decode a few single token ids back to text, one per line.
for token_id in (2053, 2054, 4301, 4302):
    print(tokenizer.decode([token_id]))

no
what
thoughts
harry
