In [43]:
import math

import gensim
import numpy as np
import pandas as pd
import spacy
import textacy
from gensim.models import KeyedVectors, Word2Vec
from scipy.spatial import distance
from tqdm import tqdm

pd.set_option('display.max_colwidth', None)

In [44]:
df = pd.read_csv("./data/FAQs.csv",encoding='utf-8')

In [45]:
df

Unnamed: 0,Question,Answer
0,When was Albert Einstein born?,Albert Einstein was born on 14 March 1879.
1,Where was he born?,"He was born in Ulm, Germany."
2,When did he die?,"He died 18 April 1955 in Princeton, New Jersey, USA."
3,Who were his parents?,His father was Hermann Einstein and his mother was Pauline Einstein (born Koch).
4,Did he have any sisters and brothers?,He had one sister named Maja.
5,Did he marry and have children?,"He was married to Mileva Marić between 1903 and 1919. They had three children, Lieserl (born 1902), Hans Albert (born 1904) and Eduard (born 1910). He married Elsa Löwenthal in 1919 and they lived together until her death in 1936."
6,Where did he receive his education?,"He received his main education at the following schools: Catholic elementary school in Munich, Germany (1885-1888)Luitpold Gymnasium in Munich, Germany (1888-1894) Cantonal school in Aarau, Switzerland (1895-1896) Swiss Federal Institute of Technology in Zurich, Switzerland (1896-1900) Ph.D. from Zurich University, Switzerland (1905)"
7,When was Albert Einstein awarded the Nobel Prize in Physics?,"The Nobel Prize Awarding Institution, the Royal Swedish Academy of Sciences, decided to reserve the Nobel Prize in Physics in 1921, and therefore no Physics Prize was awarded that year. According to the statutes, a reserved prize can be awarded the year after, and Albert Einstein was awarded the 1921 Nobel Prize in Physics in 1922."
8,Did Albert Einstein attend the Nobel Prize Award Ceremony?,"The Nobel Prize was announced on 9 November 1922. Being too remote from Sweden, Albert Einstein could not attend the Nobel Prize Award Ceremony in Stockholm on 10 December the same year."
9,For what did he receive the Nobel Prize?,"Einstein was rewarded for his many contributions to theoretical physics, and especially for his discovery of the law of the photoelectric effect."


In [46]:
test_df = pd.read_csv("./data/FAQs_test.csv")

In [47]:
test_df

Unnamed: 0,Question
0,What is the date of his death?
1,Did Einstein have siblings?
2,Who was his wife?
3,What was Einstein's father's name?
4,At what institutions did he study?


### Training own Embedding

#### Extracting Lemmas

In [48]:
def extract_lemmas(doc,**kwargs):
    return [t.lemma_ for t in textacy.extract.words(doc,**kwargs)]

In [49]:
def extract_nlp(doc):
    return {
        'lemmas': extract_lemmas(doc,
                                 exclude_pos=['PART', 'PUNCT',
                                              'DET', 'PRON', 'SYM', 'SPACE'],
                                 filter_stops=False),
    }


In [50]:
nlp = spacy.load('en_core_web_sm')

In [51]:
nlp_columns = list(extract_nlp(nlp.make_doc('')).keys())
print(nlp_columns)

['lemmas']


In [52]:
for col in nlp_columns:
    df[col] = None

In [53]:
batch_size = 50
batches = math.ceil(len(df) / batch_size)

for i in tqdm(range(0, len(df), batch_size), total=batches):
    docs = nlp.pipe(df['Question'][i:i+batch_size])

    for j, doc in enumerate(docs):
        for col, values in extract_nlp(doc).items():
            df[col].iloc[i+j] = values

100%|██████████| 1/1 [00:00<00:00, 52.63it/s]


In [54]:
df[nlp_columns] = df[nlp_columns].applymap(lambda items: ' '.join(items))

In [55]:
df

Unnamed: 0,Question,Answer,lemmas
0,When was Albert Einstein born?,Albert Einstein was born on 14 March 1879.,when be Albert Einstein bear
1,Where was he born?,"He was born in Ulm, Germany.",where be bear
2,When did he die?,"He died 18 April 1955 in Princeton, New Jersey, USA.",when do die
3,Who were his parents?,His father was Hermann Einstein and his mother was Pauline Einstein (born Koch).,be parent
4,Did he have any sisters and brothers?,He had one sister named Maja.,do have sister and brother
5,Did he marry and have children?,"He was married to Mileva Marić between 1903 and 1919. They had three children, Lieserl (born 1902), Hans Albert (born 1904) and Eduard (born 1910). He married Elsa Löwenthal in 1919 and they lived together until her death in 1936.",do marry and have child
6,Where did he receive his education?,"He received his main education at the following schools: Catholic elementary school in Munich, Germany (1885-1888)Luitpold Gymnasium in Munich, Germany (1888-1894) Cantonal school in Aarau, Switzerland (1895-1896) Swiss Federal Institute of Technology in Zurich, Switzerland (1896-1900) Ph.D. from Zurich University, Switzerland (1905)",where do receive education
7,When was Albert Einstein awarded the Nobel Prize in Physics?,"The Nobel Prize Awarding Institution, the Royal Swedish Academy of Sciences, decided to reserve the Nobel Prize in Physics in 1921, and therefore no Physics Prize was awarded that year. According to the statutes, a reserved prize can be awarded the year after, and Albert Einstein was awarded the 1921 Nobel Prize in Physics in 1922.",when be Albert Einstein award Nobel Prize in Physics
8,Did Albert Einstein attend the Nobel Prize Award Ceremony?,"The Nobel Prize was announced on 9 November 1922. Being too remote from Sweden, Albert Einstein could not attend the Nobel Prize Award Ceremony in Stockholm on 10 December the same year.",do Albert Einstein attend Nobel Prize Award Ceremony
9,For what did he receive the Nobel Prize?,"Einstein was rewarded for his many contributions to theoretical physics, and especially for his discovery of the law of the photoelectric effect.",for do receive Nobel Prize


In [56]:
# lower casing the tokens
df['lemmas'] = df['lemmas'].str.lower().str.split()

In [57]:
questions = df['lemmas']

In [58]:
questions

0                                [when, be, albert, einstein, bear]
1                                                 [where, be, bear]
2                                                   [when, do, die]
3                                                      [be, parent]
4                                  [do, have, sister, and, brother]
5                                     [do, marry, and, have, child]
6                                   [where, do, receive, education]
7    [when, be, albert, einstein, award, nobel, prize, in, physics]
8     [do, albert, einstein, attend, nobel, prize, award, ceremony]
9                                  [for, do, receive, nobel, prize]
Name: lemmas, dtype: object

#### Training Models with Gensim

In [59]:
model = Word2Vec(
    questions,  # tokenized input sentences
    vector_size=100,  # size of word vectors (default 100)
    window=3,  # context window size (default 5)
    sg=0,  # use skip-gram (default 0 = CBOW)
    min_count=1,  # ignore infrequent words (default 5)
    epochs=5  # number of epochs (default 5)
)

In [60]:
len(model.wv)

25

In [61]:
# saving the model
model.wv.save_word2vec_format("./models/test_w2v.bin",binary=True)

In [62]:
# loading the KeyedVectors
model = KeyedVectors.load_word2vec_format("./models/test_w2v.bin",binary=True)

In [71]:
model.most_similar('brother')

[('receive', 0.10194532573223114),
 ('ceremony', 0.08790984749794006),
 ('attend', 0.07373413443565369),
 ('albert', 0.048132069408893585),
 ('prize', 0.0426732674241066),
 ('have', 0.037428487092256546),
 ('marry', 0.02644934691488743),
 ('for', 0.016616368666291237),
 ('bear', 0.01109641045331955),
 ('where', 0.00639458978548646)]

## Sentence Similarity: Approach 1

In [64]:
# extracting word vectos
embeddings_index = {}

for word,vector in zip(model.index_to_key,model.vectors):
    coefs = np.asarray(vector)
    embeddings_index[word]=coefs

In [65]:
def avg_feature_vector(sentence, model, num_features):
    words = sentence.split()
    #feature vector is initialized as an empty array
    feature_vec = np.zeros((num_features, ))
    n_words = 0
    for word in words:
        if word in embeddings_index.keys():
            n_words += 1
            feature_vec = np.add(feature_vec, model[word])
    if (n_words > 0):
        feature_vec = np.divide(feature_vec, n_words)
    return feature_vec

In [66]:
s1_afv = avg_feature_vector(
    'When did he die?', model=embeddings_index, num_features=100)
s2_afv = avg_feature_vector(
    'What is the date of his death?', model=embeddings_index, num_features=100)
cos = distance.cosine(s1_afv, s2_afv)
print(cos)

0


  dist = 1.0 - uv / np.sqrt(uu * vv)


## Sentence Similarity: Appraoch 2

In [67]:
target_sentence = "What is the date of his death?"

In [68]:
questions_similarity = np.zeros(len(questions))

In [69]:
target_qs_words = [w for w in target_sentence.split() if w in model.index_to_key]

In [70]:
target_qs_words

[]

Note:
- Could not solve the OOV problem with manually trained embedding, which is not at all surprising... as the the vocab size is rather very small!
- Theres a way to handle it though, by using data augementation!