In [1]:
import pandas as pd
from nltk.tokenize import sent_tokenize
pd.set_option('display.max_colwidth', None)

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

import warnings
warnings.filterwarnings("ignore")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def get_new_df(df):
    """Split every FAQ question into sentences, one output row per sentence.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'Question' column of strings.

    Returns
    -------
    pandas.DataFrame
        Two columns:
        * 'Answer' - a single sentence taken from df['Question']. Despite
          the column name, this is question text, not answer text.
        * 'Index'  - the 0-based row position in ``df`` that the sentence
          came from.
    """
    texts = []
    indexes = []
    # Walk the rows by position so the function also works when df does not
    # carry the default 0..n-1 index (e.g. after filtering or sorting).
    for position, question in enumerate(df['Question'].tolist()):
        sentences = sent_tokenize(question)
        texts.extend(sentences)
        indexes.extend([position] * len(sentences))

    # Build the frame in one go instead of filling an empty frame column by column.
    return pd.DataFrame({'Answer': texts, 'Index': indexes})
    

In [3]:
# Load the FAQ table and the test questions (paths are relative to the
# notebook's working directory).
df = pd.read_csv("../data/FAQs.csv")
test_df = pd.read_csv("../data/FAQs_test.csv")
# One row per sentence of df['Question'], tagged with the source row number.
new_df = get_new_df(df)

In [4]:
# NOTE: the 'Answer' column holds sentences taken from df['Question']
# (see get_new_df); 'Index' is the FAQ row each sentence came from.
new_df

Unnamed: 0,Answer,Index
0,When was Albert Einstein born?,0
1,Where was he born?,1
2,When did he die?,2
3,Who were his parents?,3
4,Did he have any sisters and brothers?,4
5,Did he marry and have children?,5
6,Where did he receive his education?,6
7,When was Albert Einstein awarded the Nobel Prize in Physics?,7
8,Did Albert Einstein attend the Nobel Prize Award Ceremony?,8
9,For what did he receive the Nobel Prize?,9


In [5]:
# Test questions that will be matched against the FAQ questions.
test_df

Unnamed: 0,Question
0,What is the date of his death?
1,Did Einstein have siblings?
2,Who was his wife?
3,What was Einstein's father's name?
4,At what institutions did he study?


In [6]:
import re
def clean(text):
    """Rewrite "sibling"/"siblings" as "brother and sister".

    The match accepts a lower- or upper-case initial letter and an optional
    trailing "s". The replacement is always the lower-case phrase
    "brother and sister".

    Parameters
    ----------
    text : str
        Input sentence.

    Returns
    -------
    str
        The sentence with every occurrence replaced; unchanged if none.
    """
    # `s?` makes the plural optional. The previous `[s]` was a one-character
    # class, so it required the trailing "s" and singular "sibling" was skipped.
    return re.sub(r'[Ss]iblings?', r'brother and sister', text)

In [7]:
# Apply the wording normalisation from clean() to the FAQ sentences and to the
# test questions. Both frames are overwritten in place.
new_df['Answer'] = new_df['Answer'].map(clean)
test_df['Question'] = test_df['Question'].map(clean)

## Preprocessing

In [8]:
import textacy
import spacy

In [9]:
def extract_lemmas(doc, **kwargs):
    """Collect the lemma of every token that textacy.extract.words yields.

    Parameters
    ----------
    doc
        Document object handed to ``textacy.extract.words``.
    **kwargs
        Forwarded unchanged to ``textacy.extract.words``.

    Returns
    -------
    list
        The ``lemma_`` attribute of each yielded token, in order.
    """
    lemmas = []
    for token in textacy.extract.words(doc, **kwargs):
        lemmas.append(token.lemma_)
    return lemmas


def extract_nlp(doc):
    """Build the dictionary of NLP features for one document.

    The only feature is 'lemmas', obtained from
    ``extract_lemmas(doc, filter_stops=False)``.

    Returns
    -------
    dict
        Maps feature name to the list of extracted values.
    """
    features = {}
    features['lemmas'] = extract_lemmas(doc, filter_stops=False)
    return features


In [10]:
def preprocess(df, process_column, nlp=None):
    """Add one space-joined text column per NLP feature to ``df``.

    Each value of ``df[process_column]`` is run through the spaCy pipeline and
    ``extract_nlp``. Every feature that ``extract_nlp`` returns (currently
    'lemmas') becomes a column holding its items joined by single spaces.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to annotate. It is modified in place.
    process_column : str
        Name of the text column to process.
    nlp : optional
        An already loaded spaCy pipeline. When omitted, 'en_core_web_md' is
        loaded, as before. Passing one avoids reloading the model on every call.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the feature columns added or overwritten.
    """
    if nlp is None:
        nlp = spacy.load('en_core_web_md')

    # Probe with an empty document to learn which feature columns exist.
    nlp_columns = list(extract_nlp(nlp.make_doc('')).keys())

    # One feature dict per row, in row order.
    features = [extract_nlp(doc) for doc in nlp.pipe(df[process_column])]

    # Assign each column in one step. The previous per-cell write
    # `df[col].iloc[j] = values` was chained assignment, which triggers
    # SettingWithCopyWarning and does not update `df` under pandas
    # Copy-on-Write. This also replaces the deprecated DataFrame.applymap.
    for col in nlp_columns:
        df[col] = [' '.join(row[col]) for row in features]

    return df

In [11]:
# preprocess() adds a 'lemmas' column to the frame it receives and returns that
# same frame, so the *_processed names refer to new_df / test_df themselves.
new_df_processed = preprocess(new_df,'Answer')
test_df_processed = preprocess(test_df,'Question')

In [12]:
# Inspect the test questions together with their space-joined lemmas.
test_df_processed

Unnamed: 0,Question,lemmas
0,What is the date of his death?,what be the date of his death
1,Did Einstein have brother and sister?,do Einstein have brother and sister
2,Who was his wife?,who be his wife
3,What was Einstein's father's name?,what be Einstein 's father 's name
4,At what institutions did he study?,at what institution do he study


In [13]:
# Load the sentence-embedding model and embed every FAQ sentence once.
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
# NOTE: new_df['Answer'] holds question sentences (see get_new_df), so test
# questions are compared against FAQ questions, not against answer text.
answers = new_df['Answer']
answer_embedding = model.encode(answers)

In [14]:
def check_test_set(test_df):
    """Print each test question followed by the best-matching FAQ answer.

    Every question is embedded with the module-level ``model`` and compared by
    cosine similarity against ``answer_embedding``. The closest sentence's
    'Index' value in ``new_df`` selects the row of ``df`` whose 'Answer' is
    printed.

    Parameters
    ----------
    test_df : pandas.DataFrame
        Frame with a 'Question' column of strings.

    Returns
    -------
    None
        Output is written with ``print``; nothing is printed for an empty frame.
    """
    questions = test_df['Question'].tolist()
    if not questions:
        # Nothing to match, and cosine_similarity rejects empty input.
        return

    # Encode all questions in one call instead of one model call per row.
    question_embeddings = model.encode(questions)
    best_matches = cosine_similarity(question_embeddings, answer_embedding).argmax(axis=1)

    for question, match in zip(questions, best_matches):
        # Positional lookups: `match` is a row position in answer_embedding /
        # new_df, and 'Index' is a row position in df.
        faq_row = new_df['Index'].iloc[match]
        print(question)
        print(df['Answer'].iloc[faq_row])
        print("\n")

In [15]:
# Print every test question with the FAQ answer chosen for it.
check_test_set(test_df)

What is the date of his death?
He died 18 April 1955 in Princeton, New Jersey, USA.


Did Einstein have brother and sister?
He had one sister named Maja.


Who was his wife?
He was married to Mileva Marić between 1903 and 1919. They had three children, Lieserl (born 1902), Hans Albert (born 1904) and Eduard (born 1910). He married Elsa Löwenthal in 1919 and they lived together until her death in 1936.


What was Einstein's father's name?
Albert Einstein was born on 14 March 1879.


At what institutions did he study?
He received his main education at the following schools: Catholic elementary school in Munich, Germany (1885-1888)Luitpold Gymnasium in Munich, Germany (1888-1894) Cantonal school in Aarau, Switzerland (1895-1896) Swiss Federal Institute of Technology in Zurich, Switzerland (1896-1900) Ph.D. from Zurich University, Switzerland (1905)


