In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import os, gc, re
import pickle, copy
from model import *
from data import *
from utils import *
import matplotlib.pyplot as plt
from IPython import display

In [2]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [15]:
conf = SmallConfig()
# Source files (one per language) that the loader should read.
conf.include_files = [
    'assembly.txt',
    "c.txt",
    "clojure.txt",
    "cpp.txt",
    "csharp.txt",
    "css.txt",
    "html.txt",
    "java.txt",
    "js.txt",
    "php.txt",
    "python.txt",
    "r.txt",
    "ruby.txt",
]

# Line counts were used by older versions of this loader but are now redundant,
# except for a (not very accurate) per-file epoch estimate. Counting lines in large
# text files is slow, so everything connected to it has been commented out.
# Uncomment the lines below (and the progress printout in the training cell)
# if you want that estimate back.

# with open(os.path.join(os.path.join(conf.path,"preprocessed"), "line_count.pkl"),'rb') as f:
#     conf.line_count = pickle.load(f)

loader = LangInput(conf)

In [17]:
# Default initializer for every variable created inside the "Model" scope:
# uniform over [-conf.init_scale, conf.init_scale].
initializer = tf.random_uniform_initializer(minval=-conf.init_scale,
                                            maxval=conf.init_scale)

# Drop whatever is in the default graph so re-running this cell rebuilds the
# network from scratch instead of colliding with existing variables.
tf.reset_default_graph()

with tf.variable_scope("Model", reuse=False, initializer=initializer):
    # NOTE(review): the second argument is presumably a training flag -- confirm in model.py.
    model = LangNet(loader, True)

In [18]:
# Open a session on the current default graph and initialise all of its variables.
sess = tf.Session()
sess.run(tf.global_variables_initializer())

# Running counters (both start at zero).
costs = 0
iters = 0
def acc(outs, labels, last=None):
    """Return the percentage of positions at which `outs` equals `labels`.

    Only columns from index `last` onwards (along axis 1) are compared; the
    leading `last` columns are ignored.

    Args:
        outs: 2-D array of predicted class ids.
        labels: 2-D array of reference class ids, same shape as `outs`.
        last: number of leading columns to skip. When omitted, the current
            value of `conf.ignore_first` is used. It is looked up at call time
            so that re-creating `conf` is picked up without redefining `acc`.

    Returns:
        Accuracy as a float in [0, 100] (NaN if no columns remain to compare).
    """
    if last is None:
        last = conf.ignore_first
    matches = np.equal(outs[:, last:], labels[:, last:])
    # np.mean always divides in floating point and by the number of elements that
    # were actually compared, so the result cannot be truncated by integer
    # division or scaled by a mismatched element count.
    return np.mean(matches) * 100

# Saver for checkpointing / restoring the graph's variables.
saver = tf.train.Saver()
# Global step counter used as the x-axis of the TensorBoard summaries.
step = 0

In [21]:
# Load previously trained weights into the live session (overwrites the fresh initialisation).
saver.restore(sess, save_path='out/lang_net.more.final')

INFO:tensorflow:Restoring parameters from out/lang_net.more.final


In [26]:
# Index of this training run; selects the TensorBoard log directory.
run = 1
writer = tf.summary.FileWriter("logs/train/run%d" % run, graph=sess.graph)

# Mask of shape [batch_size, num_steps]: 0 for the first `ignore_first` steps, 1 afterwards.
mask = np.ones([conf.batch_size, conf.num_steps], np.float32)
mask[:, :conf.ignore_first] = 0

# Snapshot of the model's initial recurrent state, fed back in on every step.
state = sess.run(model.initial_state)

# Optional class-index remapping; lookups use remap.get(i, i), so an empty
# dict maps every class index to itself.
remap = dict()

In [None]:
# Tensors/ops evaluated on every training step.
fetches = {
    "cost": model.cost,
    "outs": model.logits,
    "train_op": model.train_op
}

# `state` is never reassigned inside the loop, so the recurrent-state part of
# the feed is identical on every step: build it once and copy it per step.
state_feed = {}
for layer, (c_state, h_state) in enumerate(model.initial_state):
    state_feed[c_state] = state[layer].c
    state_feed[h_state] = state[layer].h

# Train until a NaN loss appears (or the kernel is interrupted). Each outer
# iteration pulls one batch from the loader and runs one training step per
# class, visiting the classes in random order.
try:
    while True:
        batch = loader.next()
        indices = np.arange(loader.num_classes)
        np.random.shuffle(indices)
        losses = []
        accuracies = []
        for i in indices:
            # Every position of the sequence is labelled with the class index.
            target = remap.get(i, i)
            targets = np.full([conf.batch_size, conf.num_steps], target, np.int32)

            feed_dict = dict(state_feed)
            feed_dict[model.input_placeholder] = np.expand_dims(batch[i], 0)
            feed_dict[model.mask] = mask
            feed_dict[model.label_placeholder] = targets

            vals = sess.run(fetches, feed_dict)
            cost = vals["cost"]
            outs = vals["outs"]
            accuracies.append(acc(np.argmax(outs, 2), targets, conf.ignore_first))
            losses.append(cost)

        # One summary event carrying both per-batch averages.
        writer.add_summary(tf.Summary(value=[
            tf.Summary.Value(tag="Accuracy", simple_value=np.mean(accuracies)),
            tf.Summary.Value(tag="Loss", simple_value=np.mean(losses)),
        ]), step)
        step += 1
        # Stop as soon as any class produced a NaN loss, not only the last one visited.
        if np.isnan(losses).any():
            break
        #     display.clear_output(True)
        #     for i, (k, v) in enumerate(loader.cursors.items()):
        #         print("%s epoch est.: %.3f%%"%(loader.langs[i], 100*v/loader.line_count[k]))
finally:
    # The FileWriter buffers events; flush so the last summaries reach disk
    # whether the loop ended on NaN or was interrupted from the kernel.
    writer.flush()

In [None]:
# `from utils import *` binds the names exported by `utils`, not the module
# name itself, so import the module explicitly before the qualified call.
import utils

# NOTE(review): the list and the following string look like input / output node
# names and 'lang_net' like the export name -- confirm against utils.export_model.
utils.export_model(saver, model, sess, ['Model/input_plch'], 'Model/Reshape_1', 'lang_net')

In [22]:
# Sample C-like source text. It is not consumed in this cell (only `py` is encoded below).
# NOTE(review): later cells use `c` as a loop variable, which overwrites this string.
c = """
#define C
#include<stdio.h>
#include<iostream>

static void main(int argc, char*[] argv){
    do{
        int* c = (int*)malloc(sizeof(int)*10)
    }while(someshit)
}
"""
# Sample Python source text; this is the snippet that gets encoded and fed to the network.
py = """
feed_dict = {}
for i, (c, h) in enumerate(model.initial_state):
    feed_dict[c] = state[i].c
    feed_dict[h] = state[i].h
feed_dict[model.input_placeholder] = np.expand_dims(inp,0)
target = remap.get(loader.langtoi["python"],i)
targets = np.full([conf.batch_size,len(inp)], target, np.int32)
feed_dict[model.label_placeholder] = targets
feed_dict[model.mask] = mask[:,-len(inp):]
neurons = sess.run(rnn_out, feed_dict)
"""
# Convert the Python sample to the loader's numeric representation
# (the encoding itself is defined by `loader.to_numpy`, outside this notebook).
inp = loader.to_numpy(py)

In [30]:
# Fetch an existing tensor from the already-built graph by its full name.
# NOTE(review): "Model/concat:0" is presumably the concatenated RNN output -- confirm in model.py.
with tf.variable_scope("Model", reuse=True):
    rnn_out = tf.get_default_graph().get_tensor_by_name(name="Model/concat:0")

In [31]:
# Feed the stored recurrent state. The loop names are deliberately not `i` / `c`,
# so this cell neither overwrites the sample string `c` nor leaves a stale
# counter behind for later expressions to pick up by accident.
feed_dict = {}
for layer, (c_state, h_state) in enumerate(model.initial_state):
    feed_dict[c_state] = state[layer].c
    feed_dict[h_state] = state[layer].h
feed_dict[model.input_placeholder] = np.expand_dims(inp, 0)

# Label every position with the Python class index. The fallback of the remap
# lookup must be that same index; falling back to a loop counter would label
# the text with an unrelated class id.
python_class = loader.langtoi["python"]
target = remap.get(python_class, python_class)
targets = np.full([conf.batch_size, len(inp)], target, np.int32)
feed_dict[model.label_placeholder] = targets

# NOTE(review): this keeps the last len(inp) columns of the [batch_size, num_steps]
# training mask, so the width is wrong if `inp` is empty or longer than conf.num_steps.
feed_dict[model.mask] = mask[:, -len(inp):]

# Evaluate only the RNN output tensor; no training op is run here.
neurons = sess.run(rnn_out, feed_dict)

In [32]:
# Drop singleton axes, then swap the two remaining axes so that iterating over
# `neurons` yields one row per unit instead of one row per input position.
# NOTE(review): this overwrites `neurons` in place, so running the cell twice
# swaps the axes back -- re-run the previous cell first if you need to repeat it.
neurons = np.transpose(np.squeeze(neurons), (1, 0))

In [33]:
# Text to annotate and the "page" of neurons to show: page `cluster` covers
# neuron indices [cluster * cluster_size, (cluster + 1) * cluster_size).
text = py
cluster = 4
cluster_size = 15

first = cluster * cluster_size
for neuron_id, activation in enumerate(neurons[first:first + cluster_size], first):
    # `heatmap` comes from the project's helpers; its result is embedded as HTML below.
    markup = '<p>Neuron #%d</p>%s' % (neuron_id, heatmap(text, activation))
    display.display(display.HTML(markup))