# Load data

In [None]:
import glob
import os

In [None]:
# Read every speech of session 77 (2022) into a list of records.
# Each record holds the country identifier (the file name without its
# "_77_2022.txt" suffix) and the full text of the speech.
data = []
# sorted() makes the record order reproducible; glob order is arbitrary.
for n in sorted(glob.glob("un/TXT/Session 77 - 2022/*.txt")):
    # The context manager closes the handle promptly instead of leaking it;
    # the explicit encoding avoids depending on the platform default.
    with open(n, encoding="utf-8") as f:
        text = f.read()
    country = os.path.basename(n).replace("_77_2022.txt", "")
    data.append({"country": country, "text": text})

In [None]:
import pandas as pd

In [None]:
# Show up to 500 characters per cell so the speech text is readable in the preview.
pd.set_option('display.max_colwidth', 500)
# Build the table from the loaded records; the "country" and "text" keys become columns.
df = pd.DataFrame(data)
df

# Sentence segmentation

In [None]:
import spacy

In [None]:
!python -m spacy download en_core_web_sm

In [None]:
# Load the "en_core_web_sm" spaCy pipeline; it is used below for sentence segmentation.
nlp = spacy.load("en_core_web_sm")

In [None]:
from tqdm.auto import tqdm

In [None]:
# Split every speech into sentences with spaCy.
# The original per-text loop took about 30 seconds. nlp.pipe streams the texts
# through the pipeline in batches, which spaCy recommends over calling nlp()
# once per text; the resulting docs are the same.
sentences = []
for doc in tqdm(nlp.pipe(df["text"]), total=len(df)):
    # Keep each sentence as a plain string without surrounding whitespace.
    sentences.extend(str(sentence).strip() for sentence in doc.sents)

In [None]:
# Inspect the extracted sentences.
sentences

In [None]:
# Total number of sentences collected from all speeches.
len(sentences)

In [None]:
# Persist the sentences as a single "@@@"-delimited text file.
# The context manager guarantees the file is flushed and closed; the explicit
# encoding keeps the output independent of the platform default.
with open("sentences.txt", "w", encoding="utf-8") as f:
    f.write("@@@".join(sentences))

# Encode sentences

In [None]:
!pip install sentence_transformers

In [None]:
from sentence_transformers import SentenceTransformer
# Load the pretrained "multi-qa-MiniLM-L6-cos-v1" sentence-embedding model.
model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')

In [None]:
# Embed every sentence with the first model.
# Can take a minute or two depending on CPU/GPU configuration.
sembeddings = model.encode(sentences, show_progress_bar=True)

In [None]:
import numpy as np
# Store the first model's embeddings on disk; np.save opens and closes the
# target file itself when it is given a path.
np.save("sentences.npy", sembeddings)

In [None]:
# Check the dimensions of the embedding array
# (presumably number of sentences x embedding size — TODO confirm).
sembeddings.shape

In [None]:
# Load a second pretrained model, "multi-qa-mpnet-base-dot-v1"; it is queried alongside the first one below.
model2 = SentenceTransformer('multi-qa-mpnet-base-dot-v1')

In [None]:
# Embed the same sentences with the second model.
sembeddings2 = model2.encode(sentences, show_progress_bar=True)

In [None]:
# Store the second model's embeddings on disk; np.save opens and closes the
# target file itself when it is given a path.
np.save("sentences2.npy", sembeddings2)

# Retrieval

In [None]:
def search(query, text, corpus_embeddings, model, top=20):
    """Return the `top` corpus entries that score highest against `query`.

    Args:
        query: Query passed to ``model.encode``.
        text: Sequence of corpus entries, indexed in the same order as the
            rows of ``corpus_embeddings``.
        corpus_embeddings: Array of corpus vectors; its dot product with the
            query embedding must yield one score per corpus entry.
        model: Object whose ``encode(query)`` returns the query embedding.
            It should be the model that produced ``corpus_embeddings``.
        top: Maximum number of hits to return.

    Returns:
        A ``pandas.DataFrame`` with the columns ``text`` and ``score``,
        ordered by descending score. The columns are present even when
        there are no hits.
    """
    # Encode the query into the same vector space as the corpus.
    question_embedding = model.encode(query)

    # Score every corpus entry by dot product. This equals cosine similarity
    # only when both sides are unit-length vectors; otherwise it is a raw
    # dot-product score.
    sim = np.dot(corpus_embeddings, question_embedding)

    # Indices of the best-scoring entries, highest score first.
    best = sim.argsort()[::-1][:top]
    hits = [{"text": text[i], "score": sim[i]} for i in best]

    # Name the columns explicitly so an empty result keeps the same schema.
    return pd.DataFrame(hits, columns=["text", "score"])

In [None]:
# NOTE(review): 0 presumably turns off column truncation so full sentences are
# shown; pandas documents None as the "unlimited" value — confirm for the
# installed version.
pd.set_option('display.max_colwidth', 0)

In [None]:
# Query the first model's embeddings; the query is encoded with the same model.
search("The climate crisis is worse in poorer countries", sentences, sembeddings, model)

In [None]:
# Same query against the second model's embeddings, for comparison.
search("The climate crisis is worse in poorer countries", sentences, sembeddings2, model2)

In [None]:
# German-language query against the second model's embeddings; presumably a
# probe of cross-lingual retrieval — TODO confirm.
search("Die Klimakrise betrifft vor allem ärmere Länder", sentences, sembeddings2, model2)