## Content
1. [Loading preprocessed data](#1.-Loading-preprocessed-data)  
2. [LR using pretrained word embeddings](#2.-LR-using-pretrained-word-embeddings)  
3. [BM25](#3.-BM25)
4. [String kernels](#4.-String-kernels)
5. [Classification {combined methods}](#5.-Classification-{combined-methods})
6. [Evaluation](#6.-Evaluation)

## 1. Loading preprocessed data

In [1]:
import evaluation
import json
import gensim
import numpy as np
import pandas as pd

from input_output import load_data, write_results
from pprint import pprint
from sklearn.linear_model import LogisticRegression
from tqdm import tqdm

# Directory prefix of the preprocessed SQuAD JSON files loaded in the next cell.
IN_PATH = '../data/squad/'
# word2vec file in binary format; read by get_pretrained_word_embeddings with binary=True.
WORD2VEC_PATH = '../word_embeddings/GoogleNews-vectors-negative300.bin'
# Lazy cache for the word2vec model: stays None until get_pretrained_word_embeddings
# loads it on its first call.
word2vec_model = None

In [2]:
# Load the preprocessed train and dev splits. Later cells walk them as
# dataset['data'] -> article['paragraphs'] -> paragraph['qas'].
train = load_data(IN_PATH + 'train-v1.1-preprocessed.json')
dev = load_data(IN_PATH + 'dev-v1.1-preprocessed.json')

## 2. LR using pretrained word embeddings

In [3]:
def get_pretrained_word_embeddings(sentence, word_emb_dims=300):
    """Represent a tokenised sentence as the mean of its word2vec vectors.

    The word2vec model is loaded from ``WORD2VEC_PATH`` on the first call and
    cached in the module-level ``word2vec_model`` so later calls reuse it.

    Parameters
    ----------
    sentence : iterable of str
        Tokens of the sentence.
    word_emb_dims : int, optional
        Length of the returned vector; it has to match the dimensionality of
        the loaded model (default 300).

    Returns
    -------
    numpy.ndarray
        Average of the vectors of the tokens found in the model, or an
        all-zero vector of length ``word_emb_dims`` when none is found.
    """
    global word2vec_model

    if word2vec_model is None:
        print('load word2vec_model')
        word2vec_model = gensim.models.KeyedVectors.load_word2vec_format(WORD2VEC_PATH, binary=True)

    word_embeddings = np.zeros(word_emb_dims)
    word_emb_nr = 0

    for word in sentence:
        # Use membership / item access on the KeyedVectors object itself rather
        # than `vocab.has_key(word)`: dict.has_key does not exist on Python 3
        # and the `vocab` attribute was removed in gensim 4.
        if word in word2vec_model:
            word_embeddings += word2vec_model[word]
            word_emb_nr += 1

    # A sentence is represented by the average of its word embeddings; when no
    # token is known the zero vector is returned unchanged.
    if word_emb_nr != 0:
        word_embeddings = word_embeddings / word_emb_nr

    return word_embeddings

In [4]:
def get_features(dataset, q_a_combination_method):
    """Build one feature vector per (question, candidate sentence) pair.

    Parameters
    ----------
    dataset : dict
        Walked as ``dataset['data']`` -> ``article['paragraphs']`` ->
        ``paragraph['qas']``. Each paragraph provides its tokenised sentences
        under ``'context_sentences_lemmas_without_stopwords'`` and each
        question provides ``'question_lemmas_without_stopwords'`` and
        ``'answers'`` (dicts carrying an ``'answer_label'``).
    q_a_combination_method : str
        How the question and sentence embeddings are combined. One of
        ``'concatenation'``, ``'diff_abs'``, ``'diff_sqr'``, ``'sum'``,
        ``'dot_product'`` (an element-wise product, not a scalar),
        ``'min'`` or ``'max'``.

    Returns
    -------
    tuple
        ``(train_x, train_y, train_steps, train_labels)`` where ``train_x`` is
        the flat list of feature vectors, ``train_y`` the matching booleans
        (True when the sentence index is one of the question's answer
        labels), ``train_steps`` the number of candidate sentences per
        question and ``train_labels`` the set of answer labels per question.

    Raises
    ------
    ValueError
        If ``q_a_combination_method`` is not one of the supported names.
        (Previously an unknown name silently produced empty feature vectors.)
    """
    combiners = {
        'concatenation': lambda question, sentence: np.concatenate((question, sentence)),
        'diff_abs': lambda question, sentence: np.abs(question - sentence),
        'diff_sqr': lambda question, sentence: np.square(question - sentence),
        'sum': lambda question, sentence: question + sentence,
        'dot_product': lambda question, sentence: question * sentence,
        'min': np.minimum,
        'max': np.maximum,
    }
    if q_a_combination_method not in combiners:
        raise ValueError('unknown q_a_combination_method %r; expected one of %s'
                         % (q_a_combination_method, sorted(combiners)))
    combine = combiners[q_a_combination_method]

    train_x = list()
    train_y = list()
    train_steps = list()
    train_labels = list()

    for article in tqdm(dataset['data']):
        for qas_context in article['paragraphs']:
            questions = qas_context['qas']
            if not questions:
                continue

            # The candidate sentences belong to the paragraph, not to the
            # question, so embed them once per paragraph instead of once per
            # question.
            candidate_sentences_embeddings = [
                get_pretrained_word_embeddings(sentence)
                for sentence in qas_context['context_sentences_lemmas_without_stopwords']
            ]

            for qas in questions:
                question_embedding = get_pretrained_word_embeddings(
                    qas['question_lemmas_without_stopwords'])

                labels = {d['answer_label'] for d in qas['answers']}
                train_labels.append(labels)
                train_steps.append(len(candidate_sentences_embeddings))

                for i, sentence_embedding in enumerate(candidate_sentences_embeddings):
                    train_x.append(combine(question_embedding, sentence_embedding))
                    # The label is True when sentence i contains an answer.
                    train_y.append(i in labels)

    return (train_x, train_y, train_steps, train_labels)

In [5]:
# Warm-up call: triggers the one-time word2vec load (see the 'load word2vec_model'
# message below); the trailing ';' hides the returned vector.
get_pretrained_word_embeddings(['hello', 'world']);

load word2vec_model


In [6]:
def logistic_regression(train_x, train_y, test_x):
    """Fit a logistic-regression classifier and score ``test_x`` with it.

    Parameters
    ----------
    train_x, train_y
        Training feature vectors and their labels, passed to ``fit``.
    test_x
        Feature vectors to score.

    Returns
    -------
    tuple
        ``(lr_model, pred)``: the fitted model and column 1 of
        ``lr_model.predict_proba(test_x)``, i.e. the probability of the second
        class in ``lr_model.classes_``.
    """
    # The dual formulation is only implemented by the liblinear solver. Pin the
    # solver explicitly so the call does not depend on scikit-learn's default
    # solver (which is no longer liblinear in recent releases).
    # NOTE(review): scikit-learn's docs suggest dual=False when
    # n_samples > n_features; dual=True is kept to match the recorded results.
    lr_model = LogisticRegression(C=1, dual=True, solver='liblinear')
    lr_model.fit(train_x, train_y)
    pred = lr_model.predict_proba(test_x)[:, 1]

    return (lr_model, pred)

In [7]:
def split_scores_for_each_question(y_pred, steps_lengths):
    """Regroup a flat sequence of scores into one list per question.

    ``steps_lengths[k]`` is the number of consecutive entries of ``y_pred``
    that belong to question ``k``; the groups are returned in that order.
    """
    grouped = []
    offset = 0

    for n_candidates in steps_lengths:
        grouped.append([y_pred[offset + k] for k in range(n_candidates)])
        offset += n_candidates

    return grouped

In [8]:
def run_LR_and_save_pred_scores(q_a_combination_method, data):
    """Train one LR model per combination method and store its predictions.

    For every name in ``q_a_combination_method`` the features of the global
    ``train`` and ``dev`` datasets are extracted, a model is fitted on the
    train features, and the dev and train probabilities are written to
    ``data['LR']['dev_preds'][name]`` and ``data['LR']['train_preds'][name]``.
    ``data['LR']`` is reset at the start. The steps, labels and targets (which
    do not depend on the combination method) are copied into ``data`` during
    the first iteration only. Nothing is returned; ``data`` is modified in
    place.
    """
    lr_store = {'dev_preds': {}, 'train_preds': {}}
    data['LR'] = lr_store

    for position, q_a_comb in enumerate(q_a_combination_method):
        # Feature extraction for both splits.
        train_x, train_y, train_steps, train_labels = get_features(train, q_a_comb)
        dev_x, dev_y, dev_steps, dev_labels = get_features(dev, q_a_comb)
        print(q_a_comb, len(train_x[0]))

        # Fit on train; score dev with the helper, then train with the same model.
        lr_model, dev_preds = logistic_regression(train_x, train_y, dev_x)
        train_preds = lr_model.predict_proba(train_x)[:, 1]

        if position == 0:
            shared_entries = (
                ('train_steps', train_steps),
                ('dev_steps', dev_steps),
                ('train_labels', train_labels),
                ('dev_labels', dev_labels),
                ('train_y', train_y),
                ('dev_y', dev_y),
            )
            for key, value in shared_entries:
                data[key] = value

        lr_store['dev_preds'][q_a_comb] = dev_preds
        lr_store['train_preds'][q_a_comb] = train_preds

    print('done')

In [9]:
# Shared results container filled by the following sections: one entry per base
# method ('LR', 'BM25', 'string_kernels') plus the per-question steps, the
# answer labels and the flat targets of the dev and train splits.
data = { 
        'LR': {}, 'BM25': {}, 'string_kernels': {}, 
        'dev_steps': [], 'train_steps': [],
        'dev_labels': [], 'train_labels': [],
        'dev_y': [], 'train_y': []
       }

In [10]:
# Combination methods evaluated in this run; 'concatenation' and 'sum', which
# get_features also supports, are left out here.
q_a_combination_method = ['diff_abs', 'diff_sqr', 
                          'dot_product', 'min', 'max']

# Fits one LR model per method and stores its dev/train predictions in data['LR'].
run_LR_and_save_pred_scores(q_a_combination_method, data)

100%|██████████| 442/442 [00:18<00:00, 23.46it/s]
100%|██████████| 48/48 [00:02<00:00, 20.24it/s]


('diff_abs', 300)


100%|██████████| 442/442 [00:19<00:00, 22.97it/s]
100%|██████████| 48/48 [00:02<00:00, 20.25it/s]


('diff_sqr', 300)


100%|██████████| 442/442 [00:18<00:00, 24.04it/s]
100%|██████████| 48/48 [00:02<00:00, 21.63it/s]


('dot_product', 300)


100%|██████████| 442/442 [00:18<00:00, 23.72it/s]
100%|██████████| 48/48 [00:02<00:00, 20.27it/s]


('min', 300)


100%|██████████| 442/442 [00:18<00:00, 23.77it/s]
100%|██████████| 48/48 [00:02<00:00, 20.25it/s]


('max', 300)
done


In [11]:
# Evaluates the stored LR predictions; the trailing ';' suppresses the cell output.
data['LR'];

## 3. BM25

In [12]:
from gensim.summarization.bm25 import BM25

In [13]:
def get_bm25_individual_scores(dataset):
    """Score every candidate sentence of every question with BM25.

    For each paragraph a BM25 index is built over its tokenised sentences
    (``'context_sentences_lemmas_without_stopwords'``); each question of that
    paragraph (``'question_lemmas_without_stopwords'``) is then scored against
    all of those sentences.

    Parameters
    ----------
    dataset : dict
        Walked as ``dataset['data']`` -> ``article['paragraphs']`` ->
        ``paragraph['qas']``.

    Returns
    -------
    list
        Flat list of scores, ordered question by question and, within a
        question, sentence by sentence.
    """
    bm25_scores = list()

    for article in tqdm(dataset['data']):
        for qas_context in article['paragraphs']:
            questions = qas_context['qas']
            if not questions:
                # No question: nothing to score, so no index is built.
                continue

            # The index only depends on the paragraph's sentences, so build it
            # once per paragraph instead of once per question.
            # NOTE(review): assumes BM25.get_scores does not modify the index - confirm.
            candidate_sentences = qas_context['context_sentences_lemmas_without_stopwords']
            bm25 = BM25(candidate_sentences)

            for qas in questions:
                question_sentence = qas['question_lemmas_without_stopwords']
                # The literal 1 is get_scores' second positional argument
                # (presumably `average_idf` in the gensim version used here -
                # TODO confirm).
                bm25_scores += bm25.get_scores(question_sentence, 1)

    return bm25_scores

In [14]:
# Flat BM25 scores for both splits, one entry per (question, candidate sentence) pair.
data['BM25'] = {'dev_preds': [], 'train_preds': []}
data['BM25']['dev_preds'] = get_bm25_individual_scores(dev)
data['BM25']['train_preds'] = get_bm25_individual_scores(train)

100%|██████████| 48/48 [00:00<00:00, 52.18it/s]
100%|██████████| 442/442 [00:07<00:00, 59.68it/s]


In [15]:
# Evaluates the stored BM25 scores; the trailing ';' suppresses the cell output.
data['BM25'];

## 4. String kernels

In [16]:
from string_kernels import *

In [17]:
def kernel_scores_for_a_given_question(question_words, context_sentences, kernel_type):
    """Compute one string-kernel value per candidate sentence.

    ``kernel_type`` selects the kernel: ``'spectrum_kernel'``,
    ``'presence_kernel'`` or ``'intersection_kernel'``. Any other value yields
    a score of 0 for every sentence. The scores are returned as a list in the
    order of ``context_sentences``.
    """
    def kernel_value_for(candidate_sentence):
        # Resolve the kernel per call so that only the selected kernel
        # function is ever looked up.
        if kernel_type == 'spectrum_kernel':
            return spectrum_kernel_value(question_words, candidate_sentence)
        if kernel_type == 'presence_kernel':
            return presence_kernel_value(question_words, candidate_sentence)
        if kernel_type == 'intersection_kernel':
            return intersection_kernel_value(question_words, candidate_sentence)
        return 0

    return [kernel_value_for(candidate_sentence) for candidate_sentence in context_sentences]

In [18]:
def get_kernel_scores(dataset, method):
    """Collect the string-kernel scores of every question in ``dataset``.

    Each question is scored against all sentences of its paragraph with the
    kernel named by ``method``; the per-question score lists are concatenated
    into one flat list that follows the dataset order.
    """
    all_scores = []

    for article in dataset['data']:
        for paragraph in article['paragraphs']:
            for qas in paragraph['qas']:
                all_scores.extend(kernel_scores_for_a_given_question(
                    qas['question_lemmas_without_stopwords'],
                    paragraph['context_sentences_lemmas_without_stopwords'],
                    method))

    return all_scores

In [19]:
# Presence- and intersection-kernel scores for both splits, keyed by kernel name.
# The spectrum kernel, which kernel_scores_for_a_given_question also supports,
# is not computed here.
data['string_kernels'] = {'dev_preds': {}, 'train_preds': {}}
data['string_kernels']['dev_preds']['presence_kernel'] = get_kernel_scores(dev, 'presence_kernel')
data['string_kernels']['train_preds']['presence_kernel'] = get_kernel_scores(train, 'presence_kernel')
data['string_kernels']['dev_preds']['intersection_kernel'] = get_kernel_scores(dev, 'intersection_kernel')
data['string_kernels']['train_preds']['intersection_kernel'] = get_kernel_scores(train, 'intersection_kernel')

In [20]:
# Evaluates the stored kernel scores; the trailing ';' suppresses the cell output.
data['string_kernels'];

In [21]:
from nltk import word_tokenize, pos_tag
from nltk.corpus import wordnet as wn
 

def penn_to_wn(tag):
    """Map a Penn Treebank POS tag to a one-letter WordNet POS code.

    Tags starting with N, V, J and R map to 'n', 'v', 'a' and 'r'
    respectively; every other tag maps to None.
    """
    prefix_to_wordnet = (('N', 'n'), ('V', 'v'), ('J', 'a'), ('R', 'r'))

    for prefix, wordnet_pos in prefix_to_wordnet:
        if tag.startswith(prefix):
            return wordnet_pos

    return None
 

def tagged_to_synset(word, tag):
    """Return the first WordNet synset of ``word`` for the POS implied by ``tag``.

    Parameters
    ----------
    word : str
        The token to look up.
    tag : str
        Penn Treebank POS tag of the token.

    Returns
    -------
    The first element of ``wn.synsets(word, wn_tag)``, or None when the tag
    has no WordNet equivalent or the lookup returns no synset.
    """
    wn_tag = penn_to_wn(tag)
    if wn_tag is None:
        return None

    # Check for an empty result explicitly instead of wrapping the lookup in a
    # bare `except:`, which also hid unrelated failures (for example a failing
    # WordNet lookup) by turning them into None.
    synsets = wn.synsets(word, wn_tag)
    return synsets[0] if synsets else None

 
def sentence_similarity(sentence1, sentence2):
    """Compute a directed WordNet similarity from ``sentence1`` to ``sentence2``.

    Both sentences (plain strings) are tokenised and POS-tagged, and each
    token is mapped to a synset via ``tagged_to_synset``; tokens without a
    synset are dropped. For every synset of ``sentence1`` the best
    ``path_similarity`` against the synsets of ``sentence2`` is taken, and the
    result is the average of those best values.

    Returns
    -------
    float
        The average best similarity, or 0.0 when nothing could be compared.
        The measure is not symmetric in its arguments.
    """
    # Tokenize and tag
    tagged1 = pos_tag(word_tokenize(sentence1))
    tagged2 = pos_tag(word_tokenize(sentence2))

    # Get the synsets for the tagged words
    synsets1 = [tagged_to_synset(*tagged_word) for tagged_word in tagged1]
    synsets2 = [tagged_to_synset(*tagged_word) for tagged_word in tagged2]

    # Filter out the Nones
    synsets1 = [ss for ss in synsets1 if ss]
    synsets2 = [ss for ss in synsets2 if ss]

    score, count = 0.0, 0

    # For each word in the first sentence
    for synset in synsets1:
        if synsets2:
            similarities = [synset.path_similarity(ss) for ss in synsets2]
            # path_similarity can return None. Drop those values before taking
            # the maximum: comparing None with a float raises TypeError on
            # Python 3 (Python 2 silently ordered None below every number).
            comparable = [value for value in similarities if value is not None]
            if not comparable:
                # No similarity could be computed for this word: skip it.
                continue
            best_score = max(comparable)
        else:
            # Nothing to compare against: the word counts with similarity 0.
            best_score = 0

        score += best_score
        count += 1

    # Average the values
    if count > 0:
        score /= count

    return score


def symmetric_sentence_similarity(sentence1, sentence2):
    """Average the WordNet sentence similarity taken in both directions."""
    forward = sentence_similarity(sentence1, sentence2)
    backward = sentence_similarity(sentence2, sentence1)
    return (forward + backward) / 2

In [22]:
def sentence_similarity_scores(question_words, candidate_answers_words):
    """Score a question against each candidate sentence with WordNet similarity.

    Both the question and every candidate are given as token lists; they are
    joined with single spaces before being compared. The scores are returned
    as a list in candidate order.
    """
    question_sentence = " ".join(question_words)

    return [
        symmetric_sentence_similarity(question_sentence, " ".join(candidate_words))
        for candidate_words in candidate_answers_words
    ]

In [23]:
def get_sentence_similarity_scores(dataset, method=None):
    """Collect the WordNet sentence-similarity scores of every question.

    Parameters
    ----------
    dataset : dict
        Walked as ``dataset['data']`` -> ``article['paragraphs']`` ->
        ``paragraph['qas']``.
    method : optional
        Unused; kept so the signature stays parallel to ``get_kernel_scores``.

    Returns
    -------
    list
        Flat list of scores, ordered question by question and, within a
        question, sentence by sentence.
    """
    # The accumulator must not be called `sentence_similarity_scores`: a local
    # of that name shadows the helper function and makes the call below fail
    # with "'list' object is not callable".
    similarity_scores = list()

    for article in dataset['data']:
        for qas_context in article['paragraphs']:
            for qas in qas_context['qas']:
                # trying to keep the same notation
                question_words = qas['question_lemmas_without_stopwords']
                candidate_answers_words = qas_context['context_sentences_lemmas_without_stopwords']

                similarity_scores += sentence_similarity_scores(question_words,
                                                                candidate_answers_words)

    return similarity_scores

## 5. Classification {combined methods}

In [24]:
# Stack the eight base-method score vectors (five LR variants, BM25, two string
# kernels) as rows and transpose, so that each row describes one
# (question, candidate sentence) pair and each column one base method.
# This relies on every base method having walked the dataset in the same order.
train_x_combined = np.vstack((data['LR']['train_preds']['diff_abs'],
                              data['LR']['train_preds']['diff_sqr'],
                              data['LR']['train_preds']['dot_product'],
                              data['LR']['train_preds']['min'],
                              data['LR']['train_preds']['max'],
                              data['BM25']['train_preds'],
                              data['string_kernels']['train_preds']['presence_kernel'],
                              data['string_kernels']['train_preds']['intersection_kernel'])).T
# Targets, per-question candidate counts and answer labels saved by
# run_LR_and_save_pred_scores.
train_y_combined = data['train_y']
train_steps_combined = data['train_steps']
train_labels_combined = data['train_labels']

In [25]:
# Same layout as the train matrix: one row per (question, candidate sentence)
# pair of the dev split, one column per base method, in the same column order.
dev_x_combined = np.vstack((data['LR']['dev_preds']['diff_abs'],
                            data['LR']['dev_preds']['diff_sqr'],
                            data['LR']['dev_preds']['dot_product'],
                            data['LR']['dev_preds']['min'],
                            data['LR']['dev_preds']['max'],
                            data['BM25']['dev_preds'],
                            data['string_kernels']['dev_preds']['presence_kernel'],
                            data['string_kernels']['dev_preds']['intersection_kernel'])).T
# Targets, per-question candidate counts and answer labels saved by
# run_LR_and_save_pred_scores.
dev_y_combined = data['dev_y']
dev_steps_combined = data['dev_steps']
dev_labels_combined = data['dev_labels']

## 6. Evaluation

In [26]:
def extract_scores(y_pred, steps_lengths):
    """Rank the candidate sentences of each question by predicted score.

    ``y_pred`` is the flat sequence of scores and ``steps_lengths[k]`` the
    number of consecutive entries belonging to question ``k``. For every
    question the local sentence indices (0-based) are returned sorted from the
    highest to the lowest score; equal scores keep ascending index order.
    """
    ranked_ids_per_question = []
    offset = 0

    for n_candidates in steps_lengths:
        ranked_ids = sorted(range(n_candidates),
                            key=lambda local_idx: y_pred[offset + local_idx],
                            reverse=True)
        ranked_ids_per_question.append(ranked_ids)
        offset += n_candidates

    return ranked_ids_per_question

In [27]:
# dev set
# Fit the combining classifier on the stacked train features and score the dev pairs.
(lr_model, dev_preds) = logistic_regression(train_x_combined,
                                            train_y_combined,
                                            dev_x_combined)
# Turn the flat dev scores into one ranked list of sentence indices per question.
dev_pred_scores_id = extract_scores(dev_preds, dev_steps_combined)

dev_results = evaluation.get_results(dev_pred_scores_id, 
                                     dev_labels_combined, 
                                     'dev combined')
write_results(dev_results)

# train set, just for curiosity
# NOTE(review): these are in-sample scores - the same train pairs were used to
# fit both the base LR models and this combining model.
train_preds = lr_model.predict_proba(train_x_combined)[:, 1]
train_pred_scores_id = extract_scores(train_preds, 
                                      train_steps_combined)

train_results = evaluation.get_results(train_pred_scores_id, 
                                       train_labels_combined, 
                                       'train combined')
write_results(train_results)

Method: dev combined
AvgPrec@1: 0.83793755913 (std = 0.368508081498)
MAP: 0.898180673612 (std = 0.219387619495)


Method: train combined
AvgPrec@1: 0.804164431101 (std = 0.396842536597)
MAP: 0.882941739795 (std = 0.243246791108)


