In [65]:
import spacy
import numpy as np
import torch
# Load the 'en_core_web_lg' spaCy pipeline once; in the cells shown here only
# its tokenizer is used.
nlp_spacy = spacy.load('en_core_web_lg')

In [66]:
"""
Bag of Words Approach
"""

# Reuse the loaded pipeline's tokenizer directly; `tokenize` below calls it
# on raw sentence strings.
tokenizer = nlp_spacy.tokenizer

def tokenize(sentence):
    """Return the lowercased, non-punctuation tokens of ``sentence``.

    Args:
        sentence: Text passed to the module-level ``tokenizer``.

    Returns:
        A list of lowercase token strings, in the order the tokenizer
        produced them, with tokens flagged ``is_punct`` removed.
    """
    return [tok.text.lower() for tok in tokenizer(sentence) if not tok.is_punct]

def s2v_factory_bow(sentences):
    """Build a bag-of-words sentence encoder whose vocabulary comes from ``sentences``.

    The vocabulary is every distinct token that ``tokenize`` yields for the
    given sentences. Tokens are sorted before column indices are assigned so
    that the column layout is identical on every run; iterating a ``set`` of
    strings directly gives an order that can change between interpreter
    sessions because of string hash randomization.

    Args:
        sentences: Iterable of strings used to build the vocabulary.

    Returns:
        A function ``sent2vec_bow(sentences)`` that encodes a sequence of
        strings as a ``(len(sentences), vocabulary_size)`` NumPy array of
        token counts.
    """
    vocabulary = set()
    for sentence in sentences:
        vocabulary.update(tokenize(sentence))
    # Sorted so that token -> column assignment is reproducible across runs.
    token_index = {token: column for column, token in enumerate(sorted(vocabulary))}
    vocabulary_size = len(token_index)

    def sent2vec_bow(sentences):
        """Encode ``sentences`` as rows of per-token counts.

        Tokens that are not in the vocabulary captured by the factory are
        ignored, so a sentence made only of unseen tokens yields an all-zero row.
        """
        encodings = np.zeros((len(sentences), vocabulary_size))
        for row, sentence in enumerate(sentences):
            for token in tokenize(sentence):
                column = token_index.get(token)
                if column is not None:
                    encodings[row, column] += 1
        return encodings

    return sent2vec_bow

In [67]:
"""
SBERT Approach
"""

from sentence_transformers import SentenceTransformer, util
# Instantiate the 'all-MiniLM-L6-v2' sentence-transformer once at module level
# so that `sent2vec_sbert` reuses the same object on every call.
model = SentenceTransformer('all-MiniLM-L6-v2')

def sent2vec_sbert(sentences):
    """Encode ``sentences`` with the module-level SentenceTransformer ``model``.

    Args:
        sentences: The texts to embed; forwarded unchanged to ``model.encode``.

    Returns:
        Whatever ``model.encode`` returns for the given input.
    """
    embeddings = model.encode(sentences)
    return embeddings


In [68]:
def rank_questions(questions, sentences, sent2vec):
    """Order questions by how closely each one matches its best sentence.

    Both inputs are embedded with ``sent2vec``, every question is compared
    against every sentence with ``util.cos_sim``, and each question is scored
    by the maximum value in its row of the result.

    Args:
        questions: Texts to rank.
        sentences: Reference texts the questions are compared against.
        sent2vec: Callable that maps a list of texts to embeddings accepted
            by ``util.cos_sim``.

    Returns:
        A tensor of indices into ``questions``, highest-scoring first.
    """
    question_vecs = sent2vec(questions)
    sentence_vecs = sent2vec(sentences)
    # Rows are expected to correspond to questions and columns to sentences.
    similarity = util.cos_sim(question_vecs, sentence_vecs)
    best_match = similarity.max(dim=1).values
    return best_match.argsort(descending=True)

In [72]:
# Toy reference sentences that each question is scored against.
sentences = ["I like Chinese food.", "No you do not!", "Potato is a yummy food to eat."]
# Candidate questions to be ranked.
questions = ["Do you like to eat food?", "Do I also like Chinese food?", "Is food something you eat?",
             "Am I someone who like good poetry?"]

# Rank the questions using SBERT embeddings.
result_sbert = rank_questions(questions, sentences, sent2vec_sbert)
# Rank the questions using bag-of-words counts. The vocabulary is built from
# `sentences` only, so question tokens that never occur in a sentence are ignored.
result_bow = rank_questions(questions, sentences, s2v_factory_bow(sentences))

In [73]:
# Each tensor holds indices into `questions`, ordered from highest to lowest
# score as computed by `rank_questions`.
print(result_sbert)
print(result_bow)



tensor([1, 0, 2, 3])
tensor([1, 3, 2, 0])
