In [46]:
from datasets import load_dataset

dataset = load_dataset('ms_marco', 'v1.1')


Found cached dataset ms_marco (C:/Users/Admin/.cache/huggingface/datasets/ms_marco/v1.1/1.1.0/b6a62715fa5219aea5275dd3556601004cd63945cb63e36e022f77bb3cbbca84)


  0%|          | 0/3 [00:00<?, ?it/s]

In [47]:
subset = dataset['test']

In [48]:
corpus = []

for sample in subset:
    query_type = sample['query_type']
    if query_type != 'entity':
        continue

In [49]:
query_id = sample['query_id']
query_str = sample['query']
passages_dict = sample['passages']
is_selected_lst = passages_dict['is_selected']
passages_text_lst = passages_dict['passage_text']

corpus += passages_text_lst

In [50]:
def tokenize(text):
    return text.split()

def create_dictionary(corpus):
    dictionary = []
    for doc in corpus:
        normalized_doc = text_normalize(doc)
        tokens = tokenize(normalized_doc)
        for token in tokens:
            if token not in dictionary:
                dictionary.append(token)

    return dictionary


In [51]:
def vectorize(text, dictionary):
    word_count_dict = {word: 0 for word in dictionary}
    normalized_text = text_normalize(text)
    tokens = tokenize(normalized_text)
    for token in tokens:
        try:
            word_count_dict[token] += 1
        except:
            pass 
        #cache error
        #print error 

    vector = list(word_count_dict.values())

    return vector 

In [52]:
def create_doc_term_matrix(corpus, dictionary):
    doc_term_matrix = {}
    for idx, doc in enumerate(corpus):
        vector = vectorize(doc, dictionary)
        doc_term_matrix[(doc, idx)] = vector

    return doc_term_matrix


def lowercase(text):
    return text.lower()
import string 

remove_chars = string.punctuation
def remove_punctuation(text):
    for char in remove_chars:
        text = text.replace(char, '')

    return text
from nltk.corpus import stopwords

stopwords_lst = stopwords.words('english')

def remove_stopwords(text):
    tokens = tokenize(text)
    non_stopwords_lst = [
        token for token in tokens \
            if token not in stopwords_lst
    ]
    new_text = ' '.join(non_stopwords_lst)

    return new_text

In [53]:
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
def stemming(text):
    tokens = tokenize(text)
    stemmed_lst = [
        stemmer.stem(token) for token in tokens
    ]
    new_text = ' '.join(stemmed_lst)

    return new_text

In [54]:
from scipy import spatial
def similarity(a, b):
    return 1 - spatial.distance.cosine(a, b)


In [55]:
def ranking(query, dictionary, doc_term_matrix):
    query_vec = vectorize(query, dictionary)
    scores = []
    for doc_info, doc_vec in doc_term_matrix.items():
        sim = similarity(query_vec, doc_vec)
        scores.append((sim, doc_info))
    scores.sort(reverse= True)

    return scores

In [56]:
query_lst = ['what is the official language in Fiji']
top_k = 10
dictionary = create_dictionary(corpus)
doc_term_matrix = create_doc_term_matrix(corpus, dictionary)
for query in query_lst:
    scores = ranking(query, dictionary, doc_term_matrix)
    print(f'Query: {query}')
    print('=== Relevant docs ===')
    for idx in range(top_k):
        doc_score = scores[idx][0]
        doc_content = scores[idx][1][0]

        print(f'Top {idx + 1}; Scores: {doc_score:.4f}')
        print(doc_content)
        print('\n')

Query: what is the official language in Fiji
=== Relevant docs ===
Top 1; Scores: 1.0000
While methamphetamine and MDMA -- also known as ecstasy or molly -- are technically synthetic drugs because they are made entirely from chemicals, the term synthetic drugs usually refers to bath salts and synthetic marijuana that are often sold in stores and online as household items. 


Top 2; Scores: 1.0000
There’s “bath salts,” there’s synthetic marijuana and there’s drugs that used to be called “designer drugs.” Now all these classes of drugs are starting to be grouped together with the designation “synthetics.”. Synthetic marijuana is sold as Spice or K2 or one of a dozen other names. As reported by the European Monitoring Center for Drugs and Drug Addiction, a new, deadly substance is being abused by those seeking a thrill in the UK and in Europe. This new drug is a synthetic nicknamed 4,4’-DMAR, 4-methyl-euphoria, 4-methyl-U4Euh, 4-M-4-MAR, 4,4-dimethylaminorex or Serotoni.


Top 3; Scores: 

In [57]:
import torch
from sentence_transformers import SentenceTransformer

model =  SentenceTransformer('all-MiniLM-L6-v2')
corpus_embeddings = model.encode(corpus, convert_to_tensor=True)

In [58]:
from sentence_transformers import util 

def similarity(query_embeddings, corpus_embeddings):
    return util.cos_sim(query_embeddings, corpus_embeddings)[0]



In [59]:
def ranking(query, top_k=10):
    query_embeddings = model.encode(
        query, 
        convert_to_tensor=True
    )
    cos_scores = similarity(
        query_embeddings, 
        corpus_embeddings
    )
    top_results = torch.topk(cos_scores, k=top_k)

    return top_results

In [60]:
custom_queries = ['what is the official language in Fiji']

top_k = min(5, len(corpus))
for query in custom_queries:
    top_results = ranking(query, top_k)

    print('Query: ', query)
    print('\n==================')
    print(f'Top (top_k) most similar sentence in corpus:\n')

    for idx, (score, doc_idx) in enumerate(
        zip(top_results[0], top_results[1])
    ):
        print(f'Document rank (idx + 1):')
        print(corpus[doc_idx], f'\n(Score: {score:.4f})', '\n')

Query:  what is the official language in Fiji

Top (top_k) most similar sentence in corpus:

Document rank (idx + 1):
Bath salts and synthetic marijuana such as K2 and Spice are classified as synthetic drugs, which are sold in retail outlets and via the Internet under many different brand labels as plant food and herbal incense, respectively.  
(Score: 0.0391) 

Document rank (idx + 1):
DrugFacts: Synthetic Cathinones (“Bath Salts”). The term “bath salts” refers to an emerging family of drugs containing one or more synthetic chemicals related to cathinone, an amphetamine-like stimulant found naturally in the khat plant. Early indications are that synthetic cathinones have a high abuse and addiction potential. In a study of the rewarding and reinforcing effects of MDPV, rats showed self-administration patterns and escalation of drug intake nearly identical to methamphetamine. 
(Score: 0.0076) 

Document rank (idx + 1):
Overview and History. Synthetic cannabinoids, commonly known as “syn