In [239]:
import wikipedia
import torch
from transformers import AutoTokenizer, AutoModel, AutoModelForQuestionAnswering, pipeline
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer, util
from tqdm import tqdm
from scipy.special import softmax
from sklearn.cluster import KMeans
import os
device = "cuda:0" if torch.cuda.is_available() else "cpu"

In [2]:
covid = wikipedia.page("Coronavirus").content
london = wikipedia.page("London").content
japan = wikipedia.page("Japan").content
china = wikipedia.page("China").content
football = wikipedia.page("football").content
sports = wikipedia.page("sports").content

In [12]:
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/multi-qa-mpnet-base-dot-v1")
model = AutoModel.from_pretrained("sentence-transformers/multi-qa-mpnet-base-dot-v1").to(device).eval()

In [13]:
model_name = "phiyodr/roberta-large-finetuned-squad2"

tokenizer_ans = AutoTokenizer.from_pretrained(model_name)
model_ans = AutoModelForQuestionAnswering.from_pretrained(model_name).eval()
pipe = pipeline("question-answering",model_ans,tokenizer =tokenizer_ans)

In [14]:
"""def get_top_answers(possible_starts,possible_ends,input_ids):
    answers = []
    for start,end in zip(possible_starts,possible_ends):
        answer = tokenizer_ans.convert_tokens_to_string(tokenizer_ans.convert_ids_to_tokens(input_ids[start:end+1]))
        answers.append(answer)
    return answers  

def answer_that(query, passage, n_answers=2):  
    
    inputs = tokenizer_ans.encode_plus(query, passage, add_special_tokens=True, return_tensors="pt")
    
    outputs = model_ans(**inputs)
    
    start_scores = outputs.start_logits
    end_scores = outputs.end_logits
    
    possible_starts = np.argsort(start_scores.cpu().detach().numpy()).flatten()[::-1][:n_answers]
    possible_ends = np.argsort(end_scores.cpu().detach().numpy()).flatten()[::-1][:n_answers]
   
    answer_start = torch.argmax(start_scores)  
    answer_end = torch.argmax(end_scores) + 1  
    
    input_ids = inputs["input_ids"].tolist()[0]
    answer = tokenizer_ans.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))
    answers = get_top_answers(possible_starts,possible_ends,input_ids)
    return answers

%%time
probs = softmax(sorted(scores,reverse = True))
for i,(passage,_) in enumerate(doc_score_pairs[:5]):
    try:
        print("Passage probability is:",probs[i])
        print(answer_that(query,passage,n_answers=1))
        
    except Exception as e:
        print(e)
        break"""

'def get_top_answers(possible_starts,possible_ends,input_ids):\n    answers = []\n    for start,end in zip(possible_starts,possible_ends):\n        answer = tokenizer_ans.convert_tokens_to_string(tokenizer_ans.convert_ids_to_tokens(input_ids[start:end+1]))\n        answers.append(answer)\n    return answers  \n\ndef answer_that(query, passage, n_answers=2):  \n    \n    inputs = tokenizer_ans.encode_plus(query, passage, add_special_tokens=True, return_tensors="pt")\n    \n    outputs = model_ans(**inputs)\n    \n    start_scores = outputs.start_logits\n    end_scores = outputs.end_logits\n    \n    possible_starts = np.argsort(start_scores.cpu().detach().numpy()).flatten()[::-1][:n_answers]\n    possible_ends = np.argsort(end_scores.cpu().detach().numpy()).flatten()[::-1][:n_answers]\n   \n    answer_start = torch.argmax(start_scores)  \n    answer_end = torch.argmax(end_scores) + 1  \n    \n    input_ids = inputs["input_ids"].tolist()[0]\n    answer = tokenizer_ans.convert_tokens_to_s

In [358]:
def cls_pooling(model_output):
    return model_output.last_hidden_state[:,0]

def encode_query(query):
    encoded_input = tokenizer(query, truncation=True, return_tensors='pt').to(device)

    with torch.no_grad():
        model_output = model(**encoded_input, return_dict=True)

    embeddings = cls_pooling(model_output)

    return embeddings.to(torch.float16)


def encode_docs(docs,maxlen = 64, stride = 32):
    encoded_input = []
    embeddings = []
    spans = []
    labels = []
    label = 0
    
    for text in tqdm(docs):
        text = text.split(" ")
        if len(text) < maxlen:
            text = " ".join(text)
            encoded_input.append(tokenizer(text,  return_tensors='pt', truncation = True).to(device))
            spans.append(text)
            labels.append(label)
        else:
            num_iters = int(len(text)/maxlen)
            for i in range(num_iters):
                if i == 0:
                    temp_text = " ".join(text[i*maxlen:(i+1)*maxlen+stride])
                else:
                    temp_text = " ".join(text[(i-1)*maxlen:(i)*maxlen][-stride:] + text[i*maxlen:(i+1)*maxlen])
                
                encoded_input.append(tokenizer(temp_text, return_tensors='pt', truncation = True).to(device))
                spans.append(temp_text)
                labels.append(label)
        label+=1
   
    with torch.no_grad():
        for encoded in tqdm(encoded_input):
            model_output = model(**encoded, return_dict=True)
            embeddings.append(cls_pooling(model_output))

    return np.float32(torch.stack(embeddings).transpose(0, 1).cpu().detach()), spans, encoded_input, labels 


In [359]:
if 'emb.npy' in os.listdir():
    emb_dicto = np.load('emb.npy',allow_pickle='TRUE').item()
    res_dicto = np.load('spans.npy',allow_pickle='TRUE').item()
    doc_emb = np.array(list(emb_dicto.values()))
    doc_text = list(res_dicto.values())
else:
    docs = [london, covid, japan, china, football, sports]
    doc_emb, doc_text,encoded_input, labels = encode_docs(docs)
    np.save('emb.npy',dict(zip(list(range(len(doc_emb))),doc_emb))) 
    np.save('spans.npy',dict(zip(list(range(len(doc_text))),doc_text))) 


100%|██████████| 6/6 [00:00<00:00, 11.79it/s]
100%|██████████| 856/856 [00:09<00:00, 88.71it/s]


In [363]:
%%time
#query = "What is one child policy?"
#query = "How many Summer Games has London hosted?"
#query = "What is the current population of China?"
query = "How many people live in London?"
#query = "What is the population of London?"
#query = "What is the highest mountain?"
query_emb = encode_query(query).cpu()

scores = np.matmul(query_emb, doc_emb.reshape(-1,768).transpose(1,0))[0].tolist()
doc_score_pairs = list(zip(doc_text, scores))
doc_score_pairs = sorted(doc_score_pairs, key=lambda x: x[1], reverse=True)

for doc, score in doc_score_pairs[:5]:
    print()
    print(score, "\n",doc)


22.933631896972656 
 people in 2011, while its wider metropolitan area had a population of 12–14 million, depending on the definition used. According to Eurostat, London is the second most populous metropolitan area in Europe. A net 726,000 immigrants arrived there in the period 1991–2001.The region covers 1,579 square kilometres (610 sq mi), giving a population density of 5,177 inhabitants per square kilometre (13,410/sq mi), more than ten times that of any other British region. In population terms, London is the 19th largest city and the 18th largest metropolitan region.


=== Age structure and median age ===
Children younger than 14

22.107393264770508 
 into residential areas at night to take advantage of London's green spaces.


== Demography ==

The 2011 census recorded that 2,998,264 people or 36.7% of London's population were foreign-born making it the city with the second largest immigrant population after New York, in terms of absolute numbers. About 69% of children born in 

In [365]:
%%time
k = 5
probs = softmax(sorted(scores,reverse = True)[:k])
for i, (passage, _) in enumerate(doc_score_pairs[:k]):
    ans = pipe(query, passage)
    print("P(answer|query,passage): {}, P(passage|query): {}".format(ans["score"]*probs[i],probs[i]))
    print("Answer:", ans["answer"], "P(answer|passage):",ans["score"])
    print()

P(answer|query,passage): 1.7910067202462876e-06, P(passage|query): 0.42592900637915004
Answer: 12–14 million, P(answer|passage): 4.204941888019675e-06

P(answer|query,passage): 0.00011236944825303919, P(passage|query): 0.18642593866291862
Answer: 36.7% of London's population were foreign-born P(answer|passage): 0.0006027565104886889

P(answer|query,passage): 0.09802841292079759, P(passage|query): 0.1637024002086898
Answer: 7,172,036 P(answer|passage): 0.5988208651542664

P(answer|query,passage): 2.53292467022337e-06, P(passage|query): 0.15005395661924167
Answer: 8,173,941 P(answer|passage): 1.6880092516657896e-05

P(answer|query,passage): 0.002450631556800161, P(passage|query): 0.0738886981299984
Answer: 14,040,163 P(answer|passage): 0.03316652774810791

Wall time: 2.26 s


In [366]:
%%time
best = 0
best_i = 0
doc_T = doc_emb.reshape(-1,768).transpose(1,0)
doc_r = doc_emb.reshape(-1,768)
for i in range(doc_T.shape[1]):
    candidate = np.mean(np.matmul(doc_r[i,:], doc_T))
    if candidate > best:
        best = candidate
        best_i = i
best_i,best



Wall time: 75.6 ms


(634, 14.96799)

In [263]:
kmeans = KMeans(n_clusters=int(np.sqrt(doc_emb.shape[0])), random_state=0).fit(doc_emb.cpu().view(-1,768))
for i,n in enumerate(kmeans.predict(doc_emb.cpu().view(-1,768))):
    pass