# Question Answering model - Information Retrival

### Import Libraries

In [7]:
import os
from tqdm import tqdm
import pandas as pd
import numpy as np
import itertools
import swifter
from scipy.spatial.distance import cosine
from collections import Counter
from nltk.tokenize import sent_tokenize
from sklearn.metrics.pairwise import cosine_similarity
from gensim.utils import simple_preprocess
from gensim.models.word2vec import Word2Vec
from gensim.models.fasttext import FastText
from gensim.models import KeyedVectors
from gensim.parsing.preprocessing import remove_stopwords, preprocess_string
import gensim.downloader
tqdm.pandas()

### Load Dataset

In [8]:
path_dir =  os.path.dirname(os.getcwd())
train_df = pd.read_csv(os.path.join(path_dir,r'data\interim\train_data.csv'))
val_df = pd.read_csv(os.path.join(path_dir,r'data\interim\val_data.csv'))
train_df.drop('Unnamed: 0',axis=1,inplace = True)
val_df.drop('Unnamed: 0',axis=1,inplace = True)
train_df.head(5)

Unnamed: 0,id,title,context,question,answer,answer_start,is_impossible
0,56be85543aeaaa14008c9063,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce start becoming popular?,in the late 1990s,269,False
1,56be85543aeaaa14008c9065,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,What areas did Beyonce compete in when she was...,singing and dancing,207,False
2,56be85543aeaaa14008c9066,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce leave Destiny's Child and bec...,2003,526,False
3,56bf6b0f3aeaaa14008c9601,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In what city and state did Beyonce grow up?,"Houston, Texas",166,False
4,56bf6b0f3aeaaa14008c9602,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In which decade did Beyonce become famous?,late 1990s,276,False


### Get whole answer sentences

In [9]:
def get_answer_context(df):
    length_context = 0
    answer = ""

    for sentence in sent_tokenize(df.context):
        length_context += len(sentence) + 1
        if df.answer_start <= length_context:
            if len(sentence) >= len(str(df.answer)):
                if answer == "":
                    return sentence
                else:
                    return answer + " " + sentence
            else:
                answer += sentence

In [10]:
train_df['answer_sentences'] = train_df.progress_apply(lambda row: get_answer_context(row),axis = 1)
val_df['answer_sentences'] = val_df.progress_apply(lambda row: get_answer_context(row),axis = 1)

100%|███████████████████████████| 86820/86820 [00:13<00:00, 6624.22it/s]
100%|███████████████████████████| 20302/20302 [00:02<00:00, 8102.40it/s]


In [11]:
train_df.head()

Unnamed: 0,id,title,context,question,answer,answer_start,is_impossible,answer_sentences
0,56be85543aeaaa14008c9063,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce start becoming popular?,in the late 1990s,269,False,"Born and raised in Houston, Texas, she perform..."
1,56be85543aeaaa14008c9065,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,What areas did Beyonce compete in when she was...,singing and dancing,207,False,"Born and raised in Houston, Texas, she perform..."
2,56be85543aeaaa14008c9066,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce leave Destiny's Child and bec...,2003,526,False,Their hiatus saw the release of Beyoncé's debu...
3,56bf6b0f3aeaaa14008c9601,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In what city and state did Beyonce grow up?,"Houston, Texas",166,False,"Born and raised in Houston, Texas, she perform..."
4,56bf6b0f3aeaaa14008c9602,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In which decade did Beyonce become famous?,late 1990s,276,False,"Born and raised in Houston, Texas, she perform..."


### Preprocess context

In [12]:
context_df = pd.DataFrame(train_df['context'].unique().tolist(),columns=['context'])
context_df['processed'] = context_df['context'].progress_apply(lambda x: simple_preprocess(x))

question_df = pd.DataFrame(train_df['question'].unique().tolist(),columns=['question'])
question_df['processed'] = question_df['question'].progress_apply(lambda x: simple_preprocess(x))


100%|██████████████████████████| 18877/18877 [00:01<00:00, 10953.26it/s]
100%|██████████████████████████| 86768/86768 [00:00<00:00, 92444.06it/s]


### Training a word2vec model

In [13]:
train_sentences = context_df['processed'].tolist() + question_df['processed'].tolist()
train_words = list(itertools.chain(*train_sentences))
low_word_count = 1
word_count_dict = Counter(train_words)
low_freq_words = [k for k, v in word_count_dict.items() if v == low_word_count]
UNK = '<UNK>'
processed_train_sentences = [[word if word not in low_freq_words else UNK for word in sentence]
                            for sentence in tqdm(train_sentences)]


100%|██████████████████████████| 105645/105645 [17:10<00:00, 102.51it/s]


In [9]:
from gensim.models.callbacks import CallbackAny2Vec

# init callback class
class callback(CallbackAny2Vec):
    """
    Callback to print loss after each epoch
    """
    def __init__(self):
        self.epoch = 0

    def on_epoch_end(self, model):
        loss = model.get_latest_training_loss()
        if self.epoch == 0:
            print('Loss after epoch {}: {}'.format(self.epoch, loss))
        else:
            print('Loss after epoch {}: {}'.format(self.epoch, loss- self.loss_previous_step))
        self.epoch += 1
        self.loss_previous_step = loss

In [374]:
vector_size = 300
w2v_model = Word2Vec(min_count=20, 
                     window = 20,
                     vector_size = vector_size,
                     workers=10)

w2v_model.build_vocab(processed_train_sentences)
words = w2v_model.wv.key_to_index.keys()
vocab_size = len(words)
print("Vocab size", vocab_size)
# Train Word Embeddings
w2v_model.train(processed_train_sentences, 
                total_examples=w2v_model.corpus_count, 
                epochs=350, 
                report_delay=1,
                compute_loss = True, # set compute_loss = True
                callbacks=[callback()]) 
print(w2v_model.get_latest_training_loss())

Vocab size 10414
Loss after epoch 0: 508614.78125
Loss after epoch 1: 447039.09375
Loss after epoch 2: 436167.125
Loss after epoch 3: 402817.375
Loss after epoch 4: 386353.125
Loss after epoch 5: 366140.5
Loss after epoch 6: 357714.75
Loss after epoch 7: 352439.5
Loss after epoch 8: 345756.75
Loss after epoch 9: 355441.75
Loss after epoch 10: 330795.25
Loss after epoch 11: 305028.5
Loss after epoch 12: 322829.0
Loss after epoch 13: 308551.0
Loss after epoch 14: 317928.5
Loss after epoch 15: 307165.0
Loss after epoch 16: 305995.0
Loss after epoch 17: 315734.0
Loss after epoch 18: 302671.5
Loss after epoch 19: 305039.0
Loss after epoch 20: 301068.0
Loss after epoch 21: 305026.5
Loss after epoch 22: 304666.5
Loss after epoch 23: 301880.5
Loss after epoch 24: 280917.0
Loss after epoch 25: 287391.0
Loss after epoch 26: 278628.0
Loss after epoch 27: 278125.0
Loss after epoch 28: 288754.0
Loss after epoch 29: 281670.0
Loss after epoch 30: 299383.0
Loss after epoch 31: 279910.0
Loss after epoc

Loss after epoch 267: 135284.0
Loss after epoch 268: 135116.0
Loss after epoch 269: 139432.0
Loss after epoch 270: 139712.0
Loss after epoch 271: 137572.0
Loss after epoch 272: 128348.0
Loss after epoch 273: 131448.0
Loss after epoch 274: 134184.0
Loss after epoch 275: 131588.0
Loss after epoch 276: 137476.0
Loss after epoch 277: 133940.0
Loss after epoch 278: 131924.0
Loss after epoch 279: 124240.0
Loss after epoch 280: 129612.0
Loss after epoch 281: 127428.0
Loss after epoch 282: 122960.0
Loss after epoch 283: 125820.0
Loss after epoch 284: 126860.0
Loss after epoch 285: 128540.0
Loss after epoch 286: 132484.0
Loss after epoch 287: 118700.0
Loss after epoch 288: 117960.0
Loss after epoch 289: 126368.0
Loss after epoch 290: 123104.0
Loss after epoch 291: 115880.0
Loss after epoch 292: 114596.0
Loss after epoch 293: 115772.0
Loss after epoch 294: 118188.0
Loss after epoch 295: 119352.0
Loss after epoch 296: 113560.0
Loss after epoch 297: 111892.0
Loss after epoch 298: 111172.0
Loss aft

In [375]:
w2v_model.wv.most_similar(positive="time")

[('least', 0.40335699915885925),
 ('expense', 0.2993525564670563),
 ('age', 0.29746243357658386),
 ('level', 0.274740070104599),
 ('point', 0.2606557607650757),
 ('rate', 0.2550274431705475),
 ('times', 0.2353343516588211),
 ('night', 0.23298636078834534),
 ('beginning', 0.22899189591407776),
 ('end', 0.22744593024253845)]

In [376]:
'their' in w2v_model.wv.key_to_index

True

In [409]:
def avg_sentence_vector(words, model, num_features):
    if isinstance(model,gensim.models.word2vec.Word2Vec):
        word_vec_model = model.wv
    else:
        word_vec_model = model
    index2word_set = word_vec_model.index_to_key 
    #function to average all words vectors in a given paragraph
    featureVec = np.zeros((num_features,), dtype="float32")
    nwords = 0

    for word in words:
        if word in index2word_set:
            nwords = nwords+1
            featureVec = np.add(featureVec, word_vec_model[word])

    if nwords>0:
        featureVec = np.divide(featureVec, nwords)
#     print(featureVec)
    return featureVec

In [378]:
# avg_sentence_vector(train_df['question'].tolist()[0].split(),w2v_model,100) 
# avg_sentence_vector(train_df['question'].tolist()[1].split(),w2v_model,100)

In [426]:
def get_cosine_similarity(context,question,model,vector_size=300):
    if isinstance(model,gensim.models.word2vec.Word2Vec):
        vocab = model.wv.key_to_index
    else:
        vocab = model.key_to_index
#     print(context,question)
    context_sents = sent_tokenize(context)
#     print(context_sents)
    processed_context = [simple_preprocess(sent) for sent in context_sents]
    processed_context = [[word if word in vocab else UNK for word in processed_context_sent]\
                         for processed_context_sent in processed_context]
#     print(processed_context)
    processed_question = simple_preprocess(question)
    processed_question = [word if word in vocab else UNK for word in processed_question]
    
    context_vectors = [np.array(avg_sentence_vector(processed_context_sent,model,vector_size)).reshape(1,-1) for processed_context_sent in processed_context]
    question_vector  = np.array(avg_sentence_vector(processed_question,model,vector_size)).reshape(1,-1)
#     print(len(context_vectors[0]))
#     print(cosine_similarity(np.array(context_vectors[0]).reshape(1,-1),np.array(question_vector).reshape(1,-1)))
    
    cosine_sim_list = [cosine_similarity(context_sent_vector,question_vector) for context_sent_vector in context_vectors]
    
#     print(f"Cosine scores: {cosine_sim_list}")
    max_cosine_sim = max(cosine_sim_list)
    predicted_answer = context_sents[np.argmax(cosine_sim_list)]
    return max_cosine_sim, predicted_answer

In [407]:
sample_context = train_df['context'].tolist()[0]
sample_question = train_df['question'].tolist()[1]
print(f"C:{sample_context}")
print(f"Q: {sample_question}")
get_cosine_similarity(sample_context,sample_question,w2v_model)

C:Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".
Q: What areas did Beyonce compete in when she was growing up?


(array([[0.21846901]], dtype=float32),
 "Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child.")

In [381]:
temp_df = train_df.head(5)

In [382]:
temp_df[['consine_sim','predicted_answer']] = temp_df[['context','question']]\
.progress_apply(lambda x: get_cosine_similarity(x[0],x[1],w2v_model),axis=1,result_type="expand")
temp_df

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 218.47it/s]

Cosine scores: [array([[0.04423445]], dtype=float32), array([[0.16496505]], dtype=float32), array([[0.0983622]], dtype=float32), array([[0.22614476]], dtype=float32)]
Cosine scores: [array([[0.09085849]], dtype=float32), array([[0.21846901]], dtype=float32), array([[0.12052556]], dtype=float32), array([[0.20577396]], dtype=float32)]
Cosine scores: [array([[0.33140668]], dtype=float32), array([[0.50080836]], dtype=float32), array([[0.28680265]], dtype=float32), array([[0.44583791]], dtype=float32)]
Cosine scores: [array([[0.0683279]], dtype=float32), array([[0.14804739]], dtype=float32), array([[0.14860673]], dtype=float32), array([[0.1707281]], dtype=float32)]
Cosine scores: [array([[0.18426214]], dtype=float32), array([[0.22673973]], dtype=float32), array([[0.2526833]], dtype=float32), array([[0.2705803]], dtype=float32)]



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df[['consine_sim','predicted_answer']] = temp_df[['context','question']]\
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df[['consine_sim','predicted_answer']] = temp_df[['context','question']]\


Unnamed: 0,id,title,context,question,answer,answer_start,is_impossible,answer_sentences,consine_sim,predicted_answer
0,56be85543aeaaa14008c9063,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce start becoming popular?,in the late 1990s,269,False,"Born and raised in Houston, Texas, she perform...",[[0.22614476]],Their hiatus saw the release of Beyoncé's debu...
1,56be85543aeaaa14008c9065,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,What areas did Beyonce compete in when she was...,singing and dancing,207,False,"Born and raised in Houston, Texas, she perform...",[[0.21846901]],"Born and raised in Houston, Texas, she perform..."
2,56be85543aeaaa14008c9066,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce leave Destiny's Child and bec...,2003,526,False,Their hiatus saw the release of Beyoncé's debu...,[[0.50080836]],"Born and raised in Houston, Texas, she perform..."
3,56bf6b0f3aeaaa14008c9601,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In what city and state did Beyonce grow up?,"Houston, Texas",166,False,"Born and raised in Houston, Texas, she perform...",[[0.1707281]],Their hiatus saw the release of Beyoncé's debu...
4,56bf6b0f3aeaaa14008c9602,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In which decade did Beyonce become famous?,late 1990s,276,False,"Born and raised in Houston, Texas, she perform...",[[0.2705803]],Their hiatus saw the release of Beyoncé's debu...


### Evaluvate results

#### On Train Set

In [395]:
train_df[['consine_sim','predicted_answer']] = train_df[['context','question']]\
.progress_apply(lambda x: get_cosine_similarity(x[0],x[1],w2v_model),axis=1,result_type="expand")
train_df.head(2)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 86820/86820 [03:52<00:00, 374.01it/s]


Unnamed: 0,id,title,context,question,answer,answer_start,is_impossible,answer_sentences,consine_sim,predicted_answer,correct_prediction
0,56be85543aeaaa14008c9063,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce start becoming popular?,in the late 1990s,269,False,"Born and raised in Houston, Texas, she perform...",[[0.22614476]],Their hiatus saw the release of Beyoncé's debu...,False
1,56be85543aeaaa14008c9065,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,What areas did Beyonce compete in when she was...,singing and dancing,207,False,"Born and raised in Houston, Texas, she perform...",[[0.21846901]],"Born and raised in Houston, Texas, she perform...",True
2,56be85543aeaaa14008c9066,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce leave Destiny's Child and bec...,2003,526,False,Their hiatus saw the release of Beyoncé's debu...,[[0.50080836]],"Born and raised in Houston, Texas, she perform...",False
3,56bf6b0f3aeaaa14008c9601,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In what city and state did Beyonce grow up?,"Houston, Texas",166,False,"Born and raised in Houston, Texas, she perform...",[[0.1707281]],Their hiatus saw the release of Beyoncé's debu...,False
4,56bf6b0f3aeaaa14008c9602,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In which decade did Beyonce become famous?,late 1990s,276,False,"Born and raised in Houston, Texas, she perform...",[[0.2705803]],Their hiatus saw the release of Beyoncé's debu...,False
...,...,...,...,...,...,...,...,...,...,...,...
86815,5735d259012e2f140011a09d,Kathmandu,"Kathmandu Metropolitan City (KMC), in order to...",In what US state did Kathmandu first establish...,Oregon,229,False,KMC's first international relationship was est...,[[0.4740299]],KMC's first international relationship was est...,True
86816,5735d259012e2f140011a09e,Kathmandu,"Kathmandu Metropolitan City (KMC), in order to...",What was Yangon previously known as?,Rangoon,414,False,This activity has been further enhanced by est...,[[0.34433782]],KMC's first international relationship was est...,False
86817,5735d259012e2f140011a09f,Kathmandu,"Kathmandu Metropolitan City (KMC), in order to...",With what Belorussian city does Kathmandu have...,Minsk,476,False,This activity has been further enhanced by est...,[[0.3677879]],"Kathmandu Metropolitan City (KMC), in order to...",False
86818,5735d259012e2f140011a0a0,Kathmandu,"Kathmandu Metropolitan City (KMC), in order to...",In what year did Kathmandu create its initial ...,1975,199,False,KMC's first international relationship was est...,[[0.41204268]],"Kathmandu Metropolitan City (KMC), in order to...",False


In [388]:
train_df['correct_prediction'] = train_df['answer_sentences'] == train_df['predicted_answer']
train_df['correct_prediction'].value_counts()

True     61856
False    24964
Name: correct_prediction, dtype: int64

In [391]:
print(f"accuracy: {train_df[train_df['correct_prediction']].shape[0]/train_df.shape[0]}")

accuracy: 0.7124625662289795


#### On Validation set

In [392]:
val_df[['consine_sim','predicted_answer']] = val_df[['context','question']]\
.progress_apply(lambda x: get_cosine_similarity(x[0],x[1],w2v_model),axis=1,result_type="expand")
val_df.head(2)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 20302/20302 [00:56<00:00, 359.02it/s]


Unnamed: 0,id,title,context,question,answer,answer_start,is_impossible,answer_sentences,consine_sim,predicted_answer
0,56ddde6b9a695914005b9628,Normans,The Normans (Norman: Nourmands; French: Norman...,In what country is Normandy located?,France,159,False,The Normans (Norman: Nourmands; French: Norman...,[[0.2944552]],The Normans (Norman: Nourmands; French: Norman...
1,56ddde6b9a695914005b9628,Normans,The Normans (Norman: Nourmands; French: Norman...,In what country is Normandy located?,France,159,False,The Normans (Norman: Nourmands; French: Norman...,[[0.2944552]],The Normans (Norman: Nourmands; French: Norman...


In [393]:
val_df['correct_prediction'] = val_df['answer_sentences'] == val_df['predicted_answer']
val_df['correct_prediction'].value_counts()

True     14714
False     5588
Name: correct_prediction, dtype: int64

In [394]:
print(f"accuracy: {val_df[val_df['correct_prediction']].shape[0]/val_df.shape[0]}")

accuracy: 0.7247561816569796


### Download word2vec model google

In [41]:
print(list(gensim.downloader.info()['models'].keys()))

['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


In [411]:
google_model = gensim.downloader.load('word2vec-google-news-300')

In [416]:
sample_context = train_df['context'].tolist()[0]
sample_question = train_df['question'].tolist()[0]
print(f"C:{sample_context}")
print(f"Q: {sample_question}")
get_cosine_similarity(sample_context,sample_question,google_model)

C:Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".
Q: When did Beyonce start becoming popular?
[['<UNK>', '<UNK>', '<UNK>', 'carter', '<UNK>', 'bee', 'yon', 'say', 'born', 'september', 'is', 'an', 'american', 'singer', 'songwriter', 'record', 'producer', '<UNK>', 'actress'], ['born', '<UNK>', 'raised', 'in', 'houston', 'texas', 'she', 'performed', 'in'

(array([[0.6053659]], dtype=float32),
 "Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time.")

In [425]:
google_model['world'].shape

(300,)

### Evaluvate results

In [447]:
def avg_sentence_vector(words, model, num_features):
    if isinstance(model,gensim.models.word2vec.Word2Vec):
        word_vec_model = model.wv
    else:
        word_vec_model = model
    index2word_set = word_vec_model.index_to_key 
    #function to average all words vectors in a given paragraph
    featureVec = np.zeros((num_features,), dtype="float32")
    nwords = 0

    for word in words:
        if word in index2word_set:
            nwords = nwords+1
            featureVec = np.add(featureVec, word_vec_model[word])

    if nwords>0:
        featureVec = np.divide(featureVec, nwords)
#     print(featureVec)
    return featureVec
def get_context_vector(context,model,vector_size=300):
    if isinstance(model,gensim.models.word2vec.Word2Vec):
        vocab = model.wv.key_to_index
    else:
        vocab = model.key_to_index
    context_sents = sent_tokenize(context)
    processed_context = [simple_preprocess(sent) for sent in context_sents]
    processed_context = [[word if word in vocab else UNK for word in processed_context_sent]\
                         for processed_context_sent in processed_context]
    context_vectors = [np.array(avg_sentence_vector(processed_context_sent,model,vector_size)).reshape(1,-1) for processed_context_sent in processed_context]
    
    return context_vectors
    
def get_cosine_similarity(context,context_vectors,question,model,vector_size=300):
    context_sents = sent_tokenize(context)
    
    if isinstance(model,gensim.models.word2vec.Word2Vec):
        vocab = model.wv.key_to_index
    else:
        vocab = model.key_to_index
        
    processed_question = simple_preprocess(question)
    processed_question = [word if word in vocab else UNK for word in processed_question]
    
    question_vector  = np.array(avg_sentence_vector(processed_question,model,vector_size)).reshape(1,-1)
    
    cosine_sim_list = [cosine_similarity(context_sent_vector,question_vector) for context_sent_vector in context_vectors]
    
    max_cosine_sim = max(cosine_sim_list)
    predicted_answer = context_sents[np.argmax(cosine_sim_list)]
    return max_cosine_sim, predicted_answer

In [449]:
temp_df['context_vec'] = temp_df['context'].swifter\
.progress_bar(enable=True, desc=None).apply(lambda x: get_context_vector(x,google_model))

temp_df[['consine_sim','predicted_answer']] = temp_df[['context','context_vec','question']]\
.swifter.progress_bar(enable=True, desc=None)\
.apply(lambda x: get_cosine_similarity(x[0],x[1],x[2],google_model,300),axis=1,result_type="expand")
temp_df.head(2)

Pandas Apply:   0%|          | 0/5 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df['context_vec'] = temp_df['context'].swifter\


Pandas Apply:   0%|          | 0/5 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df[['consine_sim','predicted_answer']] = temp_df[['context','context_vec','question']]\


Unnamed: 0,id,title,context,question,answer,answer_start,is_impossible,answer_sentences,consine_sim,predicted_answer,context_vec
0,56be85543aeaaa14008c9063,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce start becoming popular?,in the late 1990s,269,False,"Born and raised in Houston, Texas, she perform...",[[0.6053659]],"Managed by her father, Mathew Knowles, the gro...","[[[-0.011311122, -0.024881635, -0.053231377, 0..."
1,56be85543aeaaa14008c9065,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,What areas did Beyonce compete in when she was...,singing and dancing,207,False,"Born and raised in Houston, Texas, she perform...",[[0.6998993]],"Managed by her father, Mathew Knowles, the gro...","[[[-0.011311122, -0.024881635, -0.053231377, 0..."


In [450]:
# temp_df[['consine_sim','predicted_answer']] = temp_df[['context','question']]\
# .swifter.progress_bar(enable=True, desc=None)\
# .apply(lambda x: get_cosine_similarity(x[0],x[1],google_model,300),axis=1,result_type="expand")
# temp_df.head(2)

#### On val set 

In [457]:
val_df['context_vec'] = val_df['context'].swifter\
.progress_bar(enable=True, desc=None).apply(lambda x: get_context_vector(x,google_model))

Pandas Apply:   0%|          | 0/20302 [00:00<?, ?it/s]

In [458]:
val_df[['consine_sim','predicted_answer']] = val_df[['context','context_vec','question']]\
.swifter.progress_bar(enable=True, desc=None)\
.apply(lambda x: get_cosine_similarity(x[0],x[1],x[2],google_model,300),axis=1,result_type="expand")
val_df.head(2)

Pandas Apply:   0%|          | 0/20302 [00:00<?, ?it/s]

Unnamed: 0,id,title,context,question,answer,answer_start,is_impossible,answer_sentences,consine_sim,predicted_answer,correct_prediction,context_vec
0,56ddde6b9a695914005b9628,Normans,The Normans (Norman: Nourmands; French: Norman...,In what country is Normandy located?,France,159,False,The Normans (Norman: Nourmands; French: Norman...,[[0.50003517]],The distinct cultural and ethnic identity of t...,True,"[[[0.064170435, 0.075368784, 0.09860872, 0.118..."
1,56ddde6b9a695914005b9628,Normans,The Normans (Norman: Nourmands; French: Norman...,In what country is Normandy located?,France,159,False,The Normans (Norman: Nourmands; French: Norman...,[[0.50003517]],The distinct cultural and ethnic identity of t...,True,"[[[0.064170435, 0.075368784, 0.09860872, 0.118..."


In [462]:
val_df['correct_prediction'] = val_df['answer_sentences'] == val_df['predicted_answer']
val_df['correct_prediction'].value_counts()

True     14660
False     5642
Name: correct_prediction, dtype: int64

In [465]:
print(f"accuracy: {val_df[val_df['correct_prediction']].shape[0]/val_df.shape[0]}")

accuracy: 0.7220963451876662


### Training a fasttext model

In [30]:
vector_size = 400
fast_text_model = FastText(min_count=1, 
                     window = 5,
                     vector_size = vector_size,
                     sg = 1,
                     hs = 1,
                     workers=10)
fast_text_model.build_vocab(train_sentences)
words = fast_text_model.wv.key_to_index.keys()
vocab_size = len(words)
print("Vocab size", vocab_size)
# Train Word Embeddings
fast_text_model.train(train_sentences, 
                total_examples=fast_text_model.corpus_count, 
                epochs=500, 
                report_delay=1,
                compute_loss = True,) # set compute_loss = True
#                 callbacks=[callback()]) 
print(fast_text_model.get_latest_training_loss())

Vocab size 78414
0.0


In [31]:
fast_text_model.wv.most_similar(positive="time")

[('period', 0.46009957790374756),
 ('gamedaily', 0.3613353967666626),
 ('during', 0.34807538986206055),
 ('beatle', 0.3249432444572449),
 ('same', 0.3225545287132263),
 ('periods', 0.32006415724754333),
 ('johnathon', 0.319020539522171),
 ('span', 0.31888797879219055),
 ('ascession', 0.31213393807411194),
 ('accension', 0.30582237243652344)]

In [32]:
print(fast_text_model.wv.n_similarity(['sushi', 'shop'], ['japanese', 'restaurant']))

0.4006621


In [33]:
def get_cosine_similarity(context,question,model,vector_size=300):
    if isinstance(model,gensim.models.word2vec.Word2Vec):
        vocab = model.wv.key_to_index
    else:
        vocab = model.key_to_index
    context_sents = sent_tokenize(context)
    
    processed_context = [simple_preprocess(sent) for sent in context_sents]
#     processed_context = [[word if word in vocab else UNK for word in processed_context_sent]\
#                          for processed_context_sent in processed_context]
    processed_question = simple_preprocess(question)
#     print(processed_context)
#     processed_question = [word if word in vocab else UNK for word in processed_question]
    
#     context_vectors = [np.array(avg_sentence_vector(processed_context_sent,model,vector_size)).reshape(1,-1) for processed_context_sent in processed_context]
#     question_vector  = np.array(avg_sentence_vector(processed_question,model,vector_size)).reshape(1,-1)
#     print(len(context_vectors[0]))
#     print(cosine_similarity(np.array(context_vectors[0]).reshape(1,-1),np.array(question_vector).reshape(1,-1)))
    
#     cosine_sim_list = [cosine_similarity(context_sent_vector,question_vector) for context_sent_vector in context_vectors]
        #     print(f"Cosine scores: {cosine_sim_list}")
    cosine_sim_list = [model.wv.n_similarity(context_sent,processed_question) for context_sent in processed_context if len(context_sent) > 0]
#     print(cosine_sim_list)
    max_cosine_sim = max(cosine_sim_list)
    predicted_answer = context_sents[np.argmax(cosine_sim_list)]
    return max_cosine_sim, predicted_answer

In [34]:
sample_context = train_df['context'].tolist()[0]
sample_question = train_df['question'].tolist()[1]
print(f"C:{sample_context}")
print(f"Q: {sample_question}")
get_cosine_similarity(sample_context,sample_question,fast_text_model)

C:Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".
Q: What areas did Beyonce compete in when she was growing up?


(0.5806109,
 "Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child.")

In [35]:
train_df[['consine_sim','predicted_answer']] = train_df[['context','question']]\
.progress_apply(lambda x: get_cosine_similarity(x[0],x[1],fast_text_model,100),axis=1,result_type="expand")
train_df.head(2)

100%|███████████████████████████| 86820/86820 [01:01<00:00, 1422.03it/s]


Unnamed: 0,id,title,context,question,answer,answer_start,is_impossible,answer_sentences,consine_sim,predicted_answer,correct_prediction
0,56be85543aeaaa14008c9063,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce start becoming popular?,in the late 1990s,269,False,"Born and raised in Houston, Texas, she perform...",0.508958,"Born and raised in Houston, Texas, she perform...",True
1,56be85543aeaaa14008c9065,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,What areas did Beyonce compete in when she was...,singing and dancing,207,False,"Born and raised in Houston, Texas, she perform...",0.580611,"Born and raised in Houston, Texas, she perform...",True


In [36]:
train_df['correct_prediction'] = train_df['answer_sentences'] == train_df['predicted_answer']
train_df['correct_prediction'].value_counts()

True     61594
False    25226
Name: correct_prediction, dtype: int64

In [37]:
print(f"accuracy: {train_df[train_df['correct_prediction']].shape[0]/train_df.shape[0]}")

accuracy: 0.7094448283805574


In [38]:
val_df[['consine_sim','predicted_answer']] = val_df[['context','question']]\
.progress_apply(lambda x: get_cosine_similarity(x[0],x[1],fast_text_model),axis=1,result_type="expand")
val_df.head(2)

100%|████████████████████████████| 20302/20302 [00:22<00:00, 919.60it/s]


Unnamed: 0,id,title,context,question,answer,answer_start,is_impossible,answer_sentences,consine_sim,predicted_answer,correct_prediction
0,56ddde6b9a695914005b9628,Normans,The Normans (Norman: Nourmands; French: Norman...,In what country is Normandy located?,France,159,False,The Normans (Norman: Nourmands; French: Norman...,0.621735,The Normans (Norman: Nourmands; French: Norman...,True
1,56ddde6b9a695914005b9628,Normans,The Normans (Norman: Nourmands; French: Norman...,In what country is Normandy located?,France,159,False,The Normans (Norman: Nourmands; French: Norman...,0.621735,The Normans (Norman: Nourmands; French: Norman...,True


In [39]:
val_df['correct_prediction'] = val_df['answer_sentences'] == val_df['predicted_answer']
val_df['correct_prediction'].value_counts()

True     14381
False     5921
Name: correct_prediction, dtype: int64

In [40]:
print(f"accuracy: {val_df[val_df['correct_prediction']].shape[0]/val_df.shape[0]}")

accuracy: 0.7083538567628805


#### Download and use GloVe

In [42]:
print(list(gensim.downloader.info()['models'].keys()))

['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


In [43]:
glove_model = gensim.downloader.load('glove-wiki-gigaword-300')



IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [44]:
def avg_sentence_vector(words, model, num_features):
    if isinstance(model,gensim.models.word2vec.Word2Vec):
        word_vec_model = model.wv
    else:
        word_vec_model = model
    index2word_set = word_vec_model.index_to_key 
    #function to average all words vectors in a given paragraph
    featureVec = np.zeros((num_features,), dtype="float32")
    nwords = 0

    for word in words:
        if word in index2word_set:
            nwords = nwords+1
            featureVec = np.add(featureVec, word_vec_model[word])

    if nwords>0:
        featureVec = np.divide(featureVec, nwords)
#     print(featureVec)
    return featureVec
def get_context_vector(context,model,vector_size=300):
    if isinstance(model,gensim.models.word2vec.Word2Vec):
        vocab = model.wv.key_to_index
    else:
        vocab = model.key_to_index
    context_sents = sent_tokenize(context)
    processed_context = [simple_preprocess(sent) for sent in context_sents]
    processed_context = [[word if word in vocab else UNK for word in processed_context_sent]\
                         for processed_context_sent in processed_context]
    context_vectors = [np.array(avg_sentence_vector(processed_context_sent,model,vector_size)).reshape(1,-1) for processed_context_sent in processed_context]
    
    return context_vectors
    
def get_cosine_similarity(context,context_vectors,question,model,vector_size=300):
    context_sents = sent_tokenize(context)
    
    if isinstance(model,gensim.models.word2vec.Word2Vec):
        vocab = model.wv.key_to_index
    else:
        vocab = model.key_to_index
        
    processed_question = simple_preprocess(question)
    processed_question = [word if word in vocab else UNK for word in processed_question]
    
    question_vector  = np.array(avg_sentence_vector(processed_question,model,vector_size)).reshape(1,-1)
    
    cosine_sim_list = [cosine_similarity(context_sent_vector,question_vector) for context_sent_vector in context_vectors]
    
    max_cosine_sim = max(cosine_sim_list)
    predicted_answer = context_sents[np.argmax(cosine_sim_list)]
    return max_cosine_sim, predicted_answer

In [45]:
val_df['context_vec'] = val_df['context'].swifter\
.progress_bar(enable=True, desc=None).apply(lambda x: get_context_vector(x,glove_model))

Pandas Apply:   0%|          | 0/20302 [00:00<?, ?it/s]

In [46]:
val_df['correct_prediction'] = val_df['answer_sentences'] == val_df['predicted_answer']
val_df['correct_prediction'].value_counts()

True     14381
False     5921
Name: correct_prediction, dtype: int64

In [47]:
print(f"accuracy: {val_df[val_df['correct_prediction']].shape[0]/val_df.shape[0]}")

accuracy: 0.7083538567628805
