In [1]:
import wikipedia
import torch
from transformers import AutoTokenizer, AutoModel
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer, util
from tqdm import tqdm

In [135]:
covid = wikipedia.page("Coronavirus").content
london = wikipedia.page("London").content
japan = wikipedia.page("Japan").content
china = wikipedia.page("China").content
football = wikipedia.page("football").content
sports = wikipedia.page("sports").content

In [104]:
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/multi-qa-mpnet-base-dot-v1")
model = AutoModel.from_pretrained("sentence-transformers/multi-qa-mpnet-base-dot-v1")


In [217]:
def cls_pooling(model_output):
    return model_output.last_hidden_state[:,0]

def encode_query(query):
    encoded_input = tokenizer(query, truncation=True, return_tensors='pt')

    with torch.no_grad():
        model_output = model(**encoded_input, return_dict=True)

    embeddings = cls_pooling(model_output)

    return embeddings


def encode_docs(docs,maxlen = 64, stride = 32):
    encoded_input = []
    embeddings = []
    spans = []
    
    for text in tqdm(docs):
        text = text.split(" ")
        if len(text) < maxlen:
            text = " ".join(text)
            encoded_input.append(tokenizer(text,  return_tensors='pt', truncation = True))
            spans.append(text)
        else:
            num_iters = int(len(text)/maxlen)
            for i in range(num_iters):
                if i == 0:
                    temp_text = " ".join(text[i*maxlen:(i+1)*maxlen+stride])
                else:
                    temp_text = " ".join(text[(i-1)*maxlen:(i)*maxlen][-stride:] + text[i*maxlen:(i+1)*maxlen])
                
                encoded_input.append(tokenizer(temp_text, return_tensors='pt', truncation = True))
                spans.append(temp_text)
    
    with torch.no_grad():
        for encoded in tqdm(encoded_input):
            model_output = model(**encoded, return_dict=True)
            embeddings.append(cls_pooling(model_output))

    return torch.stack(embeddings), spans


In [218]:
docs = [london, covid, japan, china, football, sports]
doc_emb, doc_text = encode_docs(docs)

100%|██████████| 6/6 [00:00<00:00,  9.05it/s]
100%|██████████| 855/855 [05:52<00:00,  2.42it/s]


In [241]:
%%time
#query = "What is one child policy?"
#query = "How many Summer Games has London hosted?"
#query = "What is the current population of China?"
#query = "How many people live in London?"
#query = "Highest mountain"
query_emb = encode_query(query)

scores = torch.mm(query_emb, doc_emb.reshape(-1,doc_emb[0][0].shape[0]).transpose(0, 1))[0].cpu().tolist()
doc_score_pairs = list(zip(doc_text, scores))
doc_score_pairs = sorted(doc_score_pairs, key=lambda x: x[1], reverse=True)

for doc, score in doc_score_pairs[:5]:
    print()
    print(score, "\n",doc)


22.93402862548828 
 people in 2011, while its wider metropolitan area had a population of 12–14 million, depending on the definition used. According to Eurostat, London is the second most populous metropolitan area in Europe. A net 726,000 immigrants arrived there in the period 1991–2001.The region covers 1,579 square kilometres (610 sq mi), giving a population density of 5,177 inhabitants per square kilometre (13,410/sq mi), more than ten times that of any other British region. In population terms, London is the 19th largest city and the 18th largest metropolitan region.


=== Age structure and median age ===
Children younger than 14

22.107194900512695 
 into residential areas at night to take advantage of London's green spaces.


== Demography ==

The 2011 census recorded that 2,998,264 people or 36.7% of London's population were foreign-born making it the city with the second largest immigrant population after New York, in terms of absolute numbers. About 69% of children born in L