In [1]:
import time
import json
import nltk

import scipy.sparse
import numpy as np

from gensim import models
from lib import graph, coarsening, utils

## Generate domain-specific word embeddings

In [2]:
def get_sentences(data_path, data_type):
    """Tokenize every context sentence and question in one SQuAD-format JSON file.

    Args:
        data_path: Path template with a single ``{}`` placeholder; the file
            that is read is ``data_path.format(data_type)``.
        data_type: Value substituted into ``data_path`` (the calls in this
            notebook pass split names such as ``'train'`` or ``'dev'``).

    Returns:
        A list of token lists. For each paragraph, in file order, it holds one
        entry per sentence of the paragraph's ``context`` followed by one
        entry per question in the paragraph's ``qas``.
    """
    # Use a context manager so the handle is closed as soon as parsing is
    # done, and decode explicitly as UTF-8 instead of relying on the
    # platform's default encoding.
    with open(data_path.format(data_type), 'r', encoding='utf-8') as data_file:
        data = json.load(data_file)

    sentences = []
    for article in data['data']:
        for para in article['paragraphs']:
            # Rewrite '' and `` pairs as a double quote followed by a space
            # before the text is split into sentences.
            context = para['context'].replace("''", '" ').replace("``", '" ')
            sentences.extend(
                nltk.word_tokenize(sentence)
                for sentence in nltk.sent_tokenize(context)
            )
            sentences.extend(
                nltk.word_tokenize(qa['question']) for qa in para['qas']
            )
    return sentences

def generate_word_embeddings(data_path, datasets):
    """Train a Word2Vec model on the tokenized text of several dataset files.

    Args:
        data_path: Path template forwarded unchanged to ``get_sentences``.
        datasets: Iterable of values, each forwarded to ``get_sentences`` as
            its ``data_type`` argument.

    Returns:
        The ``models.Word2Vec`` instance built from the concatenation of all
        token lists, in the order the datasets were given.
    """
    corpus = []
    for split in datasets:
        corpus.extend(get_sentences(data_path, split))
    return models.Word2Vec(corpus)

In [3]:
# Embeddings trained on the SQuAD v1.1 train and dev files; the "{}" in the
# path template is filled in with each split name.
# NOTE(review): absolute, user-specific path -- adjust for your machine.
squad_path = "/Users/dthai/data/squad/{}-v1.1.json"
squad_w2v = generate_word_embeddings(data_path=squad_path, datasets=['train', 'dev'])
print("Vocab size {}.".format(len(squad_w2v.wv.vocab)))

Vocab size 32255.


In [4]:
# Embeddings trained on the NewsQA train, dev and test files; the "{}" in the
# path template is filled in with each split name.
# NOTE(review): absolute, user-specific path -- adjust for your machine.
newsqa_path = "/Users/dthai/data/newsqa/{}-v1.1.json"
newsqa_w2v = generate_word_embeddings(data_path=newsqa_path, datasets=['train', 'dev', 'test'])
print("Vocab size {}.".format(len(newsqa_w2v.wv.vocab)))

Vocab size 47538.


In [5]:
# Words present in both the SQuAD and the NewsQA model vocabularies.
overlap = set(squad_w2v.wv.vocab).intersection(newsqa_w2v.wv.vocab)
print("Number of overlapping vocab {}.".format(len(overlap)))

Number of overlapping vocab 21808.


## Feature graphs

In [8]:
# Passed as `k` to graph.distance_sklearn_metrics when the feature graph is built.
number_edges = 16
# Passed as `levels` to coarsening.coarsen when the feature graph is built.
coarsening_levels = 0

In [13]:
def get_feature_graph(word_emb):
    """Build graph Laplacians from the word vectors of a trained embedding model.

    Every word in ``word_emb.wv.vocab`` contributes one row to an embedding
    matrix. That matrix is passed to ``graph.distance_sklearn_metrics`` (cosine
    metric, ``k=number_edges``), the result to ``graph.adjacency``, and the
    adjacency matrix to ``coarsening.coarsen`` with
    ``levels=coarsening_levels``. The edge counts and the elapsed CPU time are
    printed along the way.

    Reads the notebook-level settings ``number_edges`` and
    ``coarsening_levels``.

    Args:
        word_emb: Model exposing ``vector_size`` and a ``wv`` attribute that
            has a ``vocab`` mapping and supports ``wv[word]`` lookup.

    Returns:
        A list with one ``graph.laplacian(..., normalized=True)`` result per
        graph returned by ``coarsening.coarsen``.
    """
    vocab = word_emb.wv.vocab
    embeddings = np.empty((len(vocab), word_emb.vector_size))
    for row, word in enumerate(vocab):
        # Look the vector up through `.wv`, matching the vocabulary access
        # above; indexing the model object directly is deprecated in gensim.
        embeddings[row, :] = word_emb.wv[word]

    t_start = time.process_time()
    dist, idx = graph.distance_sklearn_metrics(embeddings, k=number_edges, metric='cosine')
    adjacency = graph.adjacency(dist, idx)
    print("{} > {} edges".format(adjacency.nnz//2, number_edges*embeddings.shape[0]//2))
    adjacency = graph.replace_random_edges(adjacency, 0)
    # The second value returned by coarsen is not needed here.
    graphs, _perm = coarsening.coarsen(adjacency, levels=coarsening_levels, self_connections=False)
    laplacians = [graph.laplacian(level_graph, normalized=True) for level_graph in graphs]
    print('Execution time: {:.2f}s'.format(time.process_time() - t_start))
    return laplacians

In [None]:
# Build the feature-graph Laplacians through get_feature_graph instead of an
# inline copy of its body: the inline version read `embeddings`, which is only
# a local variable of get_feature_graph and is not defined by any cell shown
# in this notebook, so it raised NameError on a fresh kernel.
# NOTE(review): the original cell did not say which model it targeted; both
# trained models are processed here -- drop one call if only one is needed.
squad_laplacians = get_feature_graph(squad_w2v)
newsqa_laplacians = get_feature_graph(newsqa_w2v)