# Installs and Downloads

In [None]:
!pip install fasttext
!pip install transformers

In [None]:
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M-subword.zip
!unzip crawl-300d-2M-subword.zip
!rm -f crawl-300d-2M-subword.vec

# Load Data

In [3]:
import json

data_filename = 'iclr_2017.json'

with open(data_filename) as fd:
    data = json.load(fd)

In [4]:
print(data[0]) # data example

{'id': '304', 'title': 'Making Neural Programming Architectures Generalize via Recursion', 'abstract': 'Empirically, neural networks that attempt to learn programs from data have exhibited poor generalizability. Moreover, it has traditionally been difficult to reason about the behavior of these models beyond a certain level of input complexity. In order to address these issues, we propose augmenting neural architectures with a key abstraction: recursion. As an application, we implement recursion in the Neural Programmer-Interpreter framework on four tasks: grade-school addition, bubble sort, topological sort, and quicksort. We demonstrate superior generalizability and interpretability with small amounts of training data. Recursion divides the problem into smaller pieces and drastically reduces the domain of each neural network component, making it tractable to prove guarantees about the overall system’s behavior. Our experience suggests that in order for neural architectures to robustl

# Binary Word Embeddings

In [5]:
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

nltk.download('stopwords') # download stopwords if necessary
stop_words = set(stopwords.words('english'))
porter_stemmer = PorterStemmer()

# preprocessing function
def process_text(s):
    s = s.lower() # lower text
    s = re.sub(r'\d+', '', s) # remove numbers
    s = s.translate(str.maketrans('', '', string.punctuation)) # remove punctuation
    s = s.split() # split whitespace
    s = [porter_stemmer.stem(w) for w in s if w not in stop_words] # remove stop words
    return s

all_words = set()
for p in data:
    all_words = all_words.union(set(process_text(p['abstract'])))
all_words = sorted(all_words)
print(all_words) # print created vocabulary

all_words_map = {}
for i in range(len(all_words)):
    all_words_map[all_words[i]] = i

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['abandon', 'abil', 'abl', 'ablat', 'abovement', 'absenc', 'absolut', 'absorb', 'abstract', 'ac', 'acceler', 'accept', 'access', 'accommod', 'accompani', 'accomplish', 'accord', 'accordingli', 'account', 'accumul', 'accur', 'accuraci', 'achiev', 'achromat', 'acoust', 'acquir', 'acquisit', 'across', 'act', 'action', 'actionvalu', 'activ', 'actor', 'actorcrit', 'actor–crit', 'actual', 'actuat', 'acuiti', 'acycl', 'ad', 'adabn', 'adagrad', 'adam', 'adapt', 'adarad', 'add', 'addit', 'additionari', 'address', 'adem', 'adequ', 'adjust', 'admit', 'adopt', 'adult', 'advanc', 'advantag', 'advent', 'advers', 'adversari', 'advertis', 'advic', 'aec', 'aevb', 'affect', 'affin', 'afford', 'afirst', 'aforement', 'africa', 'agent', 'agentdesign', 'agentenviron', 'agents’', 'agent’', 'aggreg', 'agnost', 'ai', 'aid', 'aim', 'aka', 'al', 'albedo', 'albeit', 'ale', 'alexnet', 'alexnetlevel', 'algebra', 'algorithm', 'algorithms—wher', 'ali', 'alic', 'align', 'allcnn', 'allconvolut', 'allevi', 'alloc', 'all

In [6]:
import numpy as np

binary_word_embeddings = np.zeros((len(data), len(all_words)))

# id: index written in data, pos: position in data
id_to_pos = {}
pos_to_id = {}

for i in range(len(data)):
    id_to_pos[data[i]['id']] = i
    pos_to_id[i] = data[i]['id']
    
    for w in process_text(data[i]['abstract']):
        binary_word_embeddings[i][all_words_map[w]] = 1.
    
    binary_word_embeddings[i] /= np.sqrt(np.sum(binary_word_embeddings[i]**2))

In [7]:
# save index mappings
import json

with open("id_to_pos.json", "w") as fd:
    json.dump(id_to_pos, fd)

with open("pos_to_id.json", "w") as fd:
    json.dump(pos_to_id, fd)

# save embeddings
np.save("binary_word_embeddings.npy", binary_word_embeddings)

In [8]:
# nearest neighbors
from scipy.spatial import KDTree

tree = KDTree(binary_word_embeddings)

dists, idxs = tree.query(binary_word_embeddings[0], k=10)
print("Nearest Neighbors of: \"%s\"" % (data[0]['title']))
for i in range(len(idxs)):
    print("\t%d: \"%s\"" % (i+1, data[idxs[i]]['title']))
print()

dists, idxs = tree.query(binary_word_embeddings[10], k=10)
print("Nearest Neighbors of: \"%s\"" % (data[10]['title']))
for i in range(len(idxs)):
    print("\t%d: \"%s\"" % (i+1, data[idxs[i]]['title']))
print()

dists, idxs = tree.query(binary_word_embeddings[100], k=10)
print("Nearest Neighbors of: \"%s\"" % (data[100]['title']))
for i in range(len(idxs)):
    print("\t%d: \"%s\"" % (i+1, data[idxs[i]]['title']))

Nearest Neighbors of: "Making Neural Programming Architectures Generalize via Recursion"
	1: "Making Neural Programming Architectures Generalize via Recursion"
	2: "Programming With a Differentiable Forth Interpreter"
	3: "Learning Continuous Semantic Representations of Symbolic Expressions"
	4: "On Detecting Adversarial Perturbations"
	5: "Fast Adaptation in Generative Models with Generative Matching Networks"
	6: "Recurrent Inference Machines for Solving Inverse Problems"
	7: "Variational Recurrent Adversarial Deep Domain Adaptation"
	8: "Adversarial Feature Learning"
	9: "Deep Predictive Coding Networks for Video Prediction and Unsupervised Learning"
	10: "GRAM: Graph-based Attention Model for Healthcare Representation Learning"

Nearest Neighbors of: "Amortised MAP Inference for Image Super-resolution"
	1: "Amortised MAP Inference for Image Super-resolution"
	2: "Deep unsupervised learning through spatial contrasting"
	3: "Recurrent Inference Machines for Solving Inverse Problems"


In [9]:
# reduce binary embeddings using pca
from sklearn import decomposition

pca = decomposition.PCA(n_components=300)
pca.fit(binary_word_embeddings)
reduced_word_embeddings = pca.transform(binary_word_embeddings)

for i in range(len(data)):
    reduced_word_embeddings[i] /= np.sqrt(np.sum(reduced_word_embeddings[i]**2))

In [10]:
# save embeddings
np.save("reduced_word_embeddings.npy", reduced_word_embeddings)

In [11]:
# nearest neighbors
from scipy.spatial import KDTree

tree = KDTree(reduced_word_embeddings)

dists, idxs = tree.query(reduced_word_embeddings[0], k=10)
print("Nearest Neighbors of: \"%s\"" % (data[0]['title']))
for i in range(len(idxs)):
    print("\t%d: \"%s\"" % (i+1, data[idxs[i]]['title']))
print()

dists, idxs = tree.query(reduced_word_embeddings[10], k=10)
print("Nearest Neighbors of: \"%s\"" % (data[10]['title']))
for i in range(len(idxs)):
    print("\t%d: \"%s\"" % (i+1, data[idxs[i]]['title']))
print()

dists, idxs = tree.query(reduced_word_embeddings[100], k=10)
print("Nearest Neighbors of: \"%s\"" % (data[100]['title']))
for i in range(len(idxs)):
    print("\t%d: \"%s\"" % (i+1, data[idxs[i]]['title']))

Nearest Neighbors of: "Making Neural Programming Architectures Generalize via Recursion"
	1: "Making Neural Programming Architectures Generalize via Recursion"
	2: "Programming With a Differentiable Forth Interpreter"
	3: "On Detecting Adversarial Perturbations"
	4: "Learning Continuous Semantic Representations of Symbolic Expressions"
	5: "PixelCNN++: Improving the PixelCNN with Discretized Logistic Mixture Likelihood and Other Modifications"
	6: "Fast Adaptation in Generative Models with Generative Matching Networks"
	7: "Deep Predictive Coding Networks for Video Prediction and Unsupervised Learning"
	8: "A recurrent neural network without chaos"
	9: "Program Synthesis for Character Level Language Modeling"
	10: "Visualizing Deep Neural Network Decisions: Prediction Difference Analysis"

Nearest Neighbors of: "Amortised MAP Inference for Image Super-resolution"
	1: "Amortised MAP Inference for Image Super-resolution"
	2: "Energy-based Generative Adversarial Networks"
	3: "Recurrent I

# FastText Embeddings

In [12]:
import fasttext
model = fasttext.load_model("crawl-300d-2M-subword.bin")



In [13]:
import nltk
from nltk.tokenize import word_tokenize

nltk.download('punkt')

fasttext_embeddings = np.zeros((len(data), 300))

for i in range(len(data)):
    ws = word_tokenize(data[i]["abstract"])
    for j in range(len(ws)):
        fasttext_embeddings[i] += model.get_word_vector(ws[j])
    fasttext_embeddings[i] /= np.sqrt(np.sum(fasttext_embeddings[i]**2))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [14]:
# save embeddings
np.save("fasttext_embeddings.npy", fasttext_embeddings)

In [15]:
# nearest neighbors
from scipy.spatial import KDTree

tree = KDTree(fasttext_embeddings)

dists, idxs = tree.query(fasttext_embeddings[0], k=10)
print("Nearest Neighbors of: \"%s\"" % (data[0]['title']))
for i in range(len(idxs)):
    print("\t%d: \"%s\"" % (i+1, data[idxs[i]]['title']))
print()

dists, idxs = tree.query(fasttext_embeddings[10], k=10)
print("Nearest Neighbors of: \"%s\"" % (data[10]['title']))
for i in range(len(idxs)):
    print("\t%d: \"%s\"" % (i+1, data[idxs[i]]['title']))
print()

dists, idxs = tree.query(fasttext_embeddings[100], k=10)
print("Nearest Neighbors of: \"%s\"" % (data[100]['title']))
for i in range(len(idxs)):
    print("\t%d: \"%s\"" % (i+1, data[idxs[i]]['title']))

Nearest Neighbors of: "Making Neural Programming Architectures Generalize via Recursion"
	1: "Making Neural Programming Architectures Generalize via Recursion"
	2: "Dropout with Expectation-linear Regularization"
	3: "Multi-label learning with the RNNs for Fashion Search"
	4: "Recurrent Inference Machines for Solving Inverse Problems"
	5: "An Analysis of Feature Regularization for Low-shot Learning"
	6: "Combining policy gradient and Q-learning"
	7: "Revisiting Batch Normalization For Practical Domain Adaptation"
	8: "FractalNet: Ultra-Deep Neural Networks without Residuals"
	9: "Structured Attention Networks"
	10: "ParMAC: distributed optimisation of nested functions, with application to binary autoencoders"

Nearest Neighbors of: "Amortised MAP Inference for Image Super-resolution"
	1: "Amortised MAP Inference for Image Super-resolution"
	2: "Generative Adversarial Networks for Image Steganography"
	3: "Hierarchical compositional feature learning"
	4: "Generative Multi-Adversarial Ne

In [16]:
del model # empty ram

# BERT Embeddings

In [17]:
from transformers import BertTokenizer, BertModel
import torch

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")

bert_embeddings = np.zeros((len(data), 768))

for i in range(len(data)):
    inputs = tokenizer(data[i]['abstract'], return_tensors="pt")
    outputs = model(**inputs)
    last_hidden_states = outputs.last_hidden_state[0]

    bert_embeddings[i] = last_hidden_states.sum(axis=0).detach().numpy()
    bert_embeddings[i] /= np.sqrt(np.sum(bert_embeddings[i]**2))

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [18]:
# save embeddings
np.save("bert_embeddings.npy", bert_embeddings)

In [19]:
# nearest neighbors
from scipy.spatial import KDTree

tree = KDTree(bert_embeddings)

dists, idxs = tree.query(bert_embeddings[0], k=10)
print("Nearest Neighbors of: \"%s\"" % (data[0]['title']))
for i in range(len(idxs)):
    print("\t%d: \"%s\"" % (i+1, data[idxs[i]]['title']))
print()

dists, idxs = tree.query(bert_embeddings[10], k=10)
print("Nearest Neighbors of: \"%s\"" % (data[10]['title']))
for i in range(len(idxs)):
    print("\t%d: \"%s\"" % (i+1, data[idxs[i]]['title']))
print()

dists, idxs = tree.query(bert_embeddings[100], k=10)
print("Nearest Neighbors of: \"%s\"" % (data[100]['title']))
for i in range(len(idxs)):
    print("\t%d: \"%s\"" % (i+1, data[idxs[i]]['title']))

Nearest Neighbors of: "Making Neural Programming Architectures Generalize via Recursion"
	1: "Making Neural Programming Architectures Generalize via Recursion"
	2: "Programming With a Differentiable Forth Interpreter"
	3: "Dataset Augmentation in Feature Space"
	4: "Learning Continuous Semantic Representations of Symbolic Expressions"
	5: "Intelligible Language Modeling with Input Switched Affine Networks"
	6: "Recurrent Inference Machines for Solving Inverse Problems"
	7: "Neuro-Symbolic Program Synthesis"
	8: "Learning a Static Analyzer: A Case Study on a Toy Language"
	9: "Simple Black-Box Adversarial Perturbations for Deep Networks"
	10: "Revisiting Batch Normalization For Practical Domain Adaptation"

Nearest Neighbors of: "Amortised MAP Inference for Image Super-resolution"
	1: "Amortised MAP Inference for Image Super-resolution"
	2: "The Variational Walkback Algorithm"
	3: "Learning to Discover Sparse Graphical Models"
	4: "Submodular Sum-product Networks for Scene Understanding