In [29]:
import pandas as pd
from nltk.tokenize import sent_tokenize
pd.set_option('display.max_colwidth', None)

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

import warnings
warnings.filterwarnings("ignore")

In [30]:
def get_new_df(df):
    """Split every answer into sentences, producing one output row per sentence.

    Args:
        df: DataFrame with an 'Answer' column holding the text to split.

    Returns:
        A new DataFrame with a fresh 0..n-1 index and two columns:
        'Answer' (a single sentence) and 'Index' (the index label of the row
        in ``df`` that the sentence was taken from).
    """
    texts = []
    indexes = []
    # Walk (label, value) pairs instead of range(len(df)): a positional counter
    # used as a label breaks (KeyError / wrong row) as soon as ``df`` does not
    # carry a default RangeIndex. Storing the label keeps 'Index' usable for
    # label-based lookups such as df['Answer'][label].
    for label, answer in df['Answer'].items():
        for sentence in sent_tokenize(answer):
            texts.append(sentence)
            indexes.append(label)

    # Build the frame in one step rather than filling an empty one column by column.
    return pd.DataFrame({'Answer': texts, 'Index': indexes}, columns=['Answer', 'Index'])

In [31]:
# Load the FAQ table (its 'Answer' column is used below) and the test questions.
df = pd.read_csv("./data/FAQs.csv")
test_df = pd.read_csv("./data/FAQs_test.csv")
# Sentence-level view of the answers: one row per sentence, with 'Index'
# pointing back at the row of df the sentence came from.
new_df = get_new_df(df)

## Preprocessing

In [32]:
import textacy
import spacy

In [33]:
def extract_lemmas(doc, **kwargs):
    """Collect the lemma of every word that textacy extracts from ``doc``.

    Args:
        doc: Document handed to ``textacy.extract.words``
            (presumably a spaCy ``Doc`` -- confirm against callers).
        **kwargs: Forwarded unchanged to ``textacy.extract.words``.

    Returns:
        list: The ``lemma_`` attribute of each extracted token, in extraction order.
    """
    lemmas = []
    for token in textacy.extract.words(doc, **kwargs):
        lemmas.append(token.lemma_)
    return lemmas


def extract_nlp(doc):
    """Compute the per-document NLP features used by the notebook.

    Args:
        doc: Document passed straight through to ``extract_lemmas``.

    Returns:
        dict: Maps a feature name to its values. Currently the only entry is
        'lemmas', the lemmas of ``doc`` extracted with ``filter_stops=False``.
    """
    features = dict(lemmas=extract_lemmas(doc, filter_stops=False))
    return features


In [34]:
def preprocess(df, process_column, nlp=None):
    """Add one text column per NLP feature (currently 'lemmas') to ``df``.

    Every value of ``df[process_column]`` is run through the spaCy pipeline,
    ``extract_nlp`` is applied to the resulting document, and each feature's
    items are joined with single spaces.

    Args:
        df: DataFrame to process. It is modified in place: the feature
            columns are added to (or overwritten on) this very object.
        process_column: Name of the column holding the text to analyse.
        nlp: Optional, already loaded spaCy pipeline. When omitted,
            'en_core_web_md' is loaded on every call, which is slow; pass a
            shared pipeline to avoid reloading it.

    Returns:
        The same ``df`` object, with the feature columns filled in.
    """
    if nlp is None:
        nlp = spacy.load('en_core_web_md')

    # Run the extractor on an empty document just to learn the feature names.
    nlp_columns = list(extract_nlp(nlp.make_doc('')).keys())

    # One feature dict per row, in row order.
    features = [extract_nlp(doc) for doc in nlp.pipe(df[process_column])]

    # Assign whole columns at once. The previous `df[col].iloc[j] = values`
    # was chained assignment, which raises SettingWithCopyWarning and writes
    # nothing at all under pandas Copy-on-Write. Joining here also replaces
    # the deprecated DataFrame.applymap pass.
    for col in nlp_columns:
        df[col] = [' '.join(row[col]) for row in features]

    return df

In [35]:
# Add a 'lemmas' column to the sentence-level answers and to the test questions.
# preprocess() writes the column into the frame it is given and returns that
# same frame, so each *_processed name refers to the same object as its input.
new_df_processed = preprocess(new_df,'Answer')
test_df_processed = preprocess(test_df,'Question')

In [36]:
# Display the sentence-level answers next to their lemmatised text.
new_df_processed

Unnamed: 0,Answer,Index,lemmas
0,Albert Einstein was born on 14 March 1879.,0,Albert Einstein be bear on 14 March 1879
1,"He was born in Ulm, Germany.",1,he be bear in Ulm Germany
2,"He died 18 April 1955 in Princeton, New Jersey, USA.",2,he die 18 April 1955 in Princeton New Jersey USA
3,His father was Hermann Einstein and his mother was Pauline Einstein (born Koch).,3,his father be Hermann Einstein and his mother be Pauline Einstein bear Koch
4,He had one sister named Maja.,4,he have one sister name Maja
5,He was married to Mileva Marić between 1903 and 1919.,5,he be married to Mileva Marić between 1903 and 1919
6,"They had three children, Lieserl (born 1902), Hans Albert (born 1904) and Eduard (born 1910).",5,they have three child Lieserl bear 1902 Hans Albert bear 1904 and Eduard bear 1910
7,He married Elsa Löwenthal in 1919 and they lived together until her death in 1936.,5,he marry Elsa Löwenthal in 1919 and they live together until her death in 1936
8,"He received his main education at the following schools: Catholic elementary school in Munich, Germany (1885-1888)Luitpold Gymnasium in Munich, Germany (1888-1894) Cantonal school in Aarau, Switzerland (1895-1896) Swiss Federal Institute of Technology in Zurich, Switzerland (1896-1900) Ph.D. from Zurich University, Switzerland (1905)",6,he receive his main education at the follow school catholic elementary school in Munich Germany 1885 1888)luitpold Gymnasium in Munich Germany 1888 1894 cantonal school in Aarau Switzerland 1895 1896 Swiss Federal Institute of Technology in Zurich Switzerland 1896 1900 ph.d. from Zurich University Switzerland 1905
9,"The Nobel Prize Awarding Institution, the Royal Swedish Academy of Sciences, decided to reserve the Nobel Prize in Physics in 1921, and therefore no Physics Prize was awarded that year.",7,the Nobel Prize Awarding Institution the Royal Swedish Academy of Sciences decide to reserve the Nobel Prize in Physics in 1921 and therefore no Physics Prize be award that year


In [37]:
# Load the sentence-embedding model and embed every answer sentence once.
# The embeddings are computed from the raw 'Answer' text, not from the
# 'lemmas' column produced by preprocess().
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
answers = new_df['Answer']
# encode() is documented for a string or a list of strings; hand it a plain
# list rather than a pandas Series so it never has to index by Series label.
answer_embedding = model.encode(answers.tolist())

In [38]:
def check_test_set(test_df):
    """Print, for each test question, the FAQ answer that matches it best.

    Each question is embedded and compared (cosine similarity) with every
    answer-sentence embedding; the full answer that the best-scoring sentence
    belongs to is printed after the question, followed by blank lines.

    Relies on notebook-level names: ``model`` (sentence encoder),
    ``answer_embedding`` (one embedding per row of ``new_df``), ``new_df``
    (sentence rows with an 'Index' column) and ``df`` (the full answers).

    Args:
        test_df: DataFrame with a 'Question' column.

    Returns:
        None. The results are written to standard output.
    """
    # Iterate over the values themselves: looking questions up with a
    # range(len(...)) counter fails when test_df has no default RangeIndex.
    for target_qs in test_df['Question']:
        target_qs_embedding = model.encode(target_qs)

        # argmax over the single-row similarity matrix gives the position of
        # the most similar answer sentence.
        best_position = cosine_similarity(
            [target_qs_embedding],
            answer_embedding
        ).argmax()

        # best_position is positional, so read it with .iloc; the stored
        # value identifies the row of df the sentence came from.
        source_row = new_df['Index'].iloc[best_position]

        print(target_qs)
        print(df['Answer'][source_row])
        print("\n")

In [39]:
# Print the best-matching FAQ answer for every test question.
check_test_set(test_df_processed)

What is the date of his death?
He died 18 April 1955 in Princeton, New Jersey, USA.


Did Einstein have siblings?
His father was Hermann Einstein and his mother was Pauline Einstein (born Koch).


Who was his wife?
He was married to Mileva Marić between 1903 and 1919. They had three children, Lieserl (born 1902), Hans Albert (born 1904) and Eduard (born 1910). He married Elsa Löwenthal in 1919 and they lived together until her death in 1936.


What was Einstein's father's name?
His father was Hermann Einstein and his mother was Pauline Einstein (born Koch).


At what institutions did he study?
He received his main education at the following schools: Catholic elementary school in Munich, Germany (1885-1888)Luitpold Gymnasium in Munich, Germany (1888-1894) Cantonal school in Aarau, Switzerland (1895-1896) Swiss Federal Institute of Technology in Zurich, Switzerland (1896-1900) Ph.D. from Zurich University, Switzerland (1905)


