In [14]:
# Imports used throughout the notebook
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Sentence-embedding model used to encode the questions
model = SentenceTransformer('all-mpnet-base-v2')

# Load the question dataset (it includes the translated question text) and preview the first rows
df = pd.read_csv('../data/translated_questions.csv')
df.head()

Unnamed: 0,id,typeid,kategoriid,statusid,offentlighedskode,titel,dato,modtagelsesdato,frigivelsesdato,paragraf,paragrafnummer,spørgsmålsordlyd,spørgsmålstitel,spørgsmålsid,procedurenummer,grundnotatstatus,dagsordenudgavenummer,opdateringsdato,answer_id,translated_questions
0,2,16,14,1,O,"Er ministeren enig i vurderingen af, at grønla...",2014-01-02T00:00:00,,2014-01-02T11:36:24,,,,,,,,,2015-05-29T11:11:34.047,3.0,Does the minister agree with the assessment th...
1,4,16,14,1,O,"Hvad agter ministeren at foretage sig, så kost...",2014-01-02T00:00:00,,2014-01-20T10:12:24,,,,,,,,,2015-05-29T11:11:32.563,5.0,What does the minister intend to do so that bo...
2,16,16,14,1,O,"Mener ministeren, at det er fair, at det offen...",2014-01-03T00:00:00,,2014-01-20T10:14:03,,,,,,,,,2015-05-29T11:11:15.453,19.0,Does the minister think that it is fair that t...
3,18,16,14,1,O,"Mener ministeren, at det vil være en god ide a...",2014-01-03T00:00:00,,2014-01-20T10:13:46,,,,,,,,,2015-05-29T11:11:30.75,17.0,Does the minister think that it would be a goo...
4,31,16,14,1,O,"Vil udenrigsministeren i lyset af, at danske m...",2014-01-03T00:00:00,,2014-01-03T13:53:04,,,,,,,,,2015-05-29T11:11:26.343,62.0,In light of the fact that the Danish media hav...


In [24]:
# Collect the identifiers of every question for which an answer file exists.
# Each *.txt file in the translated-answers folder contributes its base name
# with the trailing ".txt" extension removed.
import os
import glob
import re
from pathlib import Path
files = glob.glob('../data/translated/*.txt')
files = [os.path.basename(x) for x in files]
# Path.stem strips only the final suffix. The previous unanchored
# re.sub(r'\.txt', '', x) removed every ".txt" occurrence, including ones in
# the middle of a file name.
answer_ids = [Path(x).stem for x in files]


In [47]:
# Discard every entry of answer_ids that ends in '.tmp'
answer_ids = [
    stem
    for stem in answer_ids
    if not stem.endswith('.tmp')
]

In [48]:
# Turn every remaining identifier into an int
answer_ids = list(map(int, answer_ids))

In [49]:
# Cast answer_id to pandas' nullable integer dtype: present values become
# integers while missing values stay missing (<NA>).
df['answer_id'] = df['answer_id'].astype(pd.Int64Dtype())


In [53]:
# Boolean mask of the rows whose answer_id appears in answer_ids
mask = df['answer_id'].isin(answer_ids)
# Translated question text of exactly those rows, as a plain Python list
new_corpus = df.loc[mask, 'translated_questions'].tolist()



In [51]:
# Number of rows selected by the mask
int(mask.sum())

7590

In [54]:
# Sanity check: number of questions collected in new_corpus
len(new_corpus)

7590

In [68]:
# Peek at the first ten questions in the corpus
new_corpus[:10]

['Does the minister agree with the assessment that Greenlandic mineral projects arouse interest abroad, and in this connection can the minister guarantee that there have been no NSA wiretapping or wiretapping by other stakeholders of Greenlandic politicians and civil servants?',
 'What does the minister intend to do so that boarding schools etc. continue to be able to expel pupils who have taken illegal drugs and who are not visibly affected?',
 'Does the minister think that it is fair that the public sector, on the one hand, makes demands on citizens and businesses to be able to communicate digitally with the public sector, when the public sector itself does not provide the opportunity for digital communication, as is the case with the lack of opportunity for digital agreement on NemID employee signature for companies other than personally owned?',
 'Does the minister think that it would be a good idea to expand the Central Business Register so that it becomes possible to implement di

In [56]:
# Embed every question in the corpus once, up front.
# convert_to_tensor is passed as False here; set it to True to work with
# PyTorch tensors in the future.
encoded_corpus = model.encode(
    new_corpus,
    convert_to_tensor=False,
)

Great success. It works now.

In [57]:
def find_nearest_questions(new_question, top_k=5):
    """Return the corpus questions most similar to ``new_question``.

    The question is embedded with the notebook-level ``model`` and compared,
    by cosine similarity, against the pre-computed ``encoded_corpus``.

    Args:
        new_question: Text of the question to look up.
        top_k: How many matches to return (default 5).

    Returns:
        A list of ``(question_text, position)`` tuples ordered from most to
        least similar, where ``position`` is the entry's index in
        ``new_corpus``.
    """
    # Embed the query and shape it as a single-row 2-D array for cosine_similarity
    query_vector = model.encode([new_question])[0].reshape(1, -1)

    # One similarity score per corpus entry
    scores = cosine_similarity(query_vector, encoded_corpus)[0]

    # argsort is ascending, so reverse it and keep the first top_k positions
    best_positions = scores.argsort()[::-1][:top_k]

    return [(new_corpus[pos], pos) for pos in best_positions]

In [17]:
# Try the search with a general-knowledge question.
# NOTE(review): the recorded output for this cell lists indices above 7590
# (the corpus size reported earlier), so it looks like it came from a run
# against a different corpus; re-run to confirm.
new_question = "What is the capital of France?"
nearest_questions = find_nearest_questions(new_question)

print("Nearest questions for: ", new_question)
for match_text, match_position in nearest_questions:
    print("- ", match_text, " (index: ", match_position, ")")

Nearest questions for:  What is the capital of France?
-  About a European federal state.  (index:  41924 )
-  About the Muslim Brotherhood in France.  (index:  10274 )
-  About France providing state aid to foreign companies that establish themselves in France.  (index:  42746 )
-  Will the foreign minister explain France&#39;s economic interests in Iran.  (index:  25574 )
-  About the European Council meeting in Nice.  (index:  42204 )


In [61]:
# Try the search with a policy-style question
new_question = "Is animal welfare important?"
nearest_questions = find_nearest_questions(new_question)

print("Nearest questions for: ", new_question)
for match_text, match_position in nearest_questions:
    print("- ", match_text, " (index: ", match_position, ")")

Nearest questions for:  Is animal welfare important?
-  Is it the minister&#39;s assessment that the bureaucracy and the plethora of regulations imposed on Danish farmers have gone too far, when e.g. the guidance on fertilization and harmony rules requires 145 pages of review, and the minister considers that there are limits to how large the regulatory burdens and application requirements for e.g. obtaining EU funding, it is reasonable to impose on ordinary people practicing their liberal professions?  (index:  5624 )
-  What is the minister&#39;s position on the fact that you are taxed on a free car, but not on a free car with a driver, and does the minister intend to change this bias in the law?  (index:  7341 )
-  Does the minister intend to have a discussion with the Social Appeals Board about whether any suspicion of unwellness in a child should be investigated and a municipality should not be refused further action?  (index:  3575 )
-  How does the minister relate to Maj Invest (

In [65]:
# Count the total number of NLTK word tokens across new_corpus.
# NOTE(review): these are NLTK word tokens; a model's own tokenizer may
# produce a different count, so confirm before using this for cost estimates.
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
# Tokenize each question and flatten all tokens into one list in a single pass
tokenized_corpus = [token for text in new_corpus for token in word_tokenize(text)]
len(tokenized_corpus)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rune7\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


399251

In [69]:
# Back-of-the-envelope estimate: amount / 1000 * rate.
# NOTE(review): presumably a token count and a price per 1,000 tokens.
# TODO confirm and record where both numbers come from.
amount = 1608000
rate_per_thousand = 0.0020
(amount / 1000) * rate_per_thousand

3.216