In [None]:
pip install --user sklearn

In [1]:
import pickle
import csv
import numpy as np
import sklearn
import tensorflow as tf
from tensorflow.keras.layers import Input, Masking, Bidirectional as Bi, LSTM, Dense, Dropout
from tensorflow.keras.optimizers import SGD, RMSprop, Adagrad, Adadelta, Adam, Adamax, Nadam
from IPython.display import display, HTML
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV, KFold, StratifiedKFold
from random import sample
import os.path

# Load the pre-vectorized corpus. Each entry of `v` is a mapping that the cells below
# read through its 'word_vectors' and 'labels' keys.
# NOTE(review): pickle.load executes arbitrary code from the file; only load trusted files.
with open("vectorized.pickle", "rb") as f:
    v = pickle.load(f)

In [33]:
# Names of the 13 token classes.
# Presumably the position in this list is the integer class id used in the data — TODO confirm.
labels = [
    'none',
    'gazette',
    'law',
    'target fragment',
    'new fragment',
    'date',
    'mod type (deprecated)',
    'mod body',
    'mod type: addition',
    'mod type: deletion',
    'mod type: update',
    'mod type: date change',
    'mod type: renumbering',
]
def print_preds(doc, preds):
    """Display one document's tokens as HTML, highlighting every token not predicted as class 0.

    Args:
        doc: Index of the document in the token list stored in ``toks3.p``.
        preds: Sequence of per-token class ids; ``preds[i]`` belongs to token ``i``.
            Tokens with class 0 are emitted as plain text, all others get a
            coloured background chosen by class id.

    At most ``input_length`` tokens are rendered (the module-level sequence length),
    and never more than there are tokens or predictions.
    """
    import html  # local import: only needed to escape token text

    colors=['white','#5b0f00','#a61c00','#ff0000','#0000ff','#6aa84f','#b45f06','#e69138','#b45f06','#b45f06','#b45f06','#b45f06','#b45f06','#b45f06']
    # NOTE(review): pickle.load executes arbitrary code from the file; only load trusted files.
    with open('toks3.p','rb') as f:
        tokens = pickle.load(f)[doc]['tokens']

    limit = min(input_length, len(tokens), len(preds))
    pieces = []
    for i in range(limit):
        # Escape the token so characters such as '<' or '&' are shown literally
        # instead of being interpreted as markup.
        text = html.escape(tokens[i])
        if preds[i] == 0:
            pieces.append(text+' ')
        else:
            pieces.append('<span style="color:white; background-color:'+colors[preds[i]]+'">'+text+' </span>')
    # Join once at the end instead of growing a string inside the loop.
    display(HTML(''.join(pieces)))

In [3]:
# Settings shared by the data-preparation cells.
input_length, test_size, keep_all = (
    300,    # number of timesteps every document is padded / truncated to
    100,    # size of the held-out test set (used by the commented-out `sample` call)
    False,  # True: cut documents into input_length-sized pieces before padding
)

# When keep_all is set, every document is cut into consecutive pieces of at most
# input_length tokens, and each piece becomes a document of its own in `v`.
if keep_all:
    n = input_length
    sep_docs = []
    for doc in v:
        vector_chunks = [doc['word_vectors'][start:start + n]
                         for start in range(0, len(doc['word_vectors']), n)]
        label_chunks = [doc['labels'][start:start + n]
                        for start in range(0, len(doc['labels']), n)]
        # Pair the k-th vector chunk with the k-th label chunk.
        for k, vectors in enumerate(vector_chunks):
            sep_docs.append({'word_vectors': vectors, 'labels': label_chunks[k]})
    v = sep_docs

# Keep the per-document arrays in plain Python lists. Documents can differ in length,
# and wrapping such ragged data in np.array() raises ValueError on recent NumPy
# (earlier versions built an object array with a deprecation warning).
# pad_sequences accepts a list of sequences directly.
unpadded_x = [np.asarray(doc['word_vectors']) for doc in v]

# One-hot encode the integer labels: row t of each matrix has a single 1 in the
# column given by the label of token t.
num_classes = 13
unpadded_y = []
for doc in v:
    # dtype=int keeps the fancy indexing below valid even for an empty label list.
    label_ids = np.asarray(doc['labels'], dtype=int)
    one_hot = np.zeros((label_ids.size, num_classes))
    one_hot[np.arange(label_ids.size), label_ids] = 1
    unpadded_y.append(one_hot)

# Bring every document to exactly input_length timesteps; both padding and
# truncation happen at the end of the sequence.
pad_kwargs = dict(maxlen=input_length, dtype='float32', padding='post', truncating='post')

# Feature rows are filled with 0.0.
x = tf.keras.preprocessing.sequence.pad_sequences(unpadded_x, value=0.0, **pad_kwargs)
# Label rows are filled with 100.0 in every class column.
y = tf.keras.preprocessing.sequence.pad_sequences(unpadded_y, value=100.0, **pad_kwargs)

# The test indices are loaded from disk so that every run evaluates on the same documents.
# The commented-out line shows how such an index list can be drawn.
#test_index = sample(range(len(x)), test_size)
with open('test_index.pickle','rb') as f:
    test_index = pickle.load(f)

# A boolean mask replaces the per-row `i not in test_index` list lookup, and fancy
# indexing replaces the Python-level row copies; row order is unchanged.
test_positions = np.asarray(test_index, dtype=int)
is_test = np.zeros(len(x), dtype=bool)
is_test[test_positions] = True

x_test, y_test = x[test_positions], y[test_positions]
x_train_val, y_train_val = x[~is_test], y[~is_test]

# Shuffled 5-fold splitter with a fixed seed, shared by all search cells.
kf = KFold(n_splits=5, shuffle=True, random_state=10)

In [4]:
def create_model(optimizer=None, dropout_rate=0.1, metrics=None,
                 init_mode=tf.keras.initializers.GlorotUniform, weight_constraint=None, neurons=100):
    """Build and compile the sequence-labelling network.

    Architecture: Input of shape (input_length, 100) -> Masking -> two stacked
    bidirectional LSTMs that return full sequences -> Dropout -> Dense(13, softmax),
    compiled with categorical cross-entropy.

    Args:
        optimizer: Optimizer instance. ``None`` (default) creates a fresh ``Adam()``.
        dropout_rate: Rate used for the LSTM input dropout and for the Dropout layer.
        metrics: Metrics passed to ``compile``. ``None`` (default) creates a fresh
            ``['accuracy', Precision(), Recall()]`` list.
        init_mode: Initializer *class* (not an instance); it is instantiated once
            per LSTM layer for the kernel weights.
        weight_constraint: Kernel constraint applied to both LSTM layers, or ``None``.
        neurons: Number of units in each LSTM direction.

    Returns:
        The compiled ``tf.keras`` Sequential model.
    """
    # Default arguments are evaluated once, when the function is defined. Instances placed
    # in the signature would therefore be shared by every model built without explicit
    # arguments (e.g. one model per cross-validation fold), so they are created here.
    if optimizer is None:
        optimizer = Adam()
    if metrics is None:
        metrics = ['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()]

    model = tf.keras.models.Sequential([
        Input((input_length, 100), batch_size=None),
        Masking(),
        Bi(LSTM(neurons, dropout=dropout_rate, kernel_initializer=init_mode(), kernel_constraint=weight_constraint, return_sequences=True)),
        Bi(LSTM(neurons, dropout=dropout_rate, kernel_initializer=init_mode(), kernel_constraint=weight_constraint, return_sequences=True)),
        Dropout(rate=dropout_rate),
        Dense(13, activation='softmax')
    ])

    model.compile(optimizer=optimizer,
                  loss=tf.keras.losses.CategoricalCrossentropy(),
                  metrics=metrics)

    return model

In [5]:
# Candidate values explored by the search cells.

# --- fit parameters ---
epochs_values = [10, 50, 100]
batch_size_values = [None, 16, 32, 64, 128]

# --- optimizer ---
# Optimizer classes; each search cell instantiates them itself.
optimizer_values = [SGD, RMSprop, Adagrad, Adadelta, Adam, Adamax, Nadam]
momentum_values = [0.0, 0.2, 0.4, 0.6, 0.8, 0.9]
learning_rate_values = [0.001, 0.01, 0.1, 0.2, 0.3]

# --- architecture ---
# Initializers are listed as classes, constraints as ready-made instances (or None).
init_mode_values = [
    tf.keras.initializers.Ones,
    tf.keras.initializers.Zeros,
    tf.keras.initializers.GlorotNormal,
    tf.keras.initializers.GlorotUniform,
]
weight_constraint_values = [
    None,
    tf.keras.constraints.MinMaxNorm(),
    tf.keras.constraints.NonNeg(),
    tf.keras.constraints.UnitNorm(),
]
neurons_values = [50, 100, 150]
dropout_rate_values = [0.001, 0.01, 0.2, 0.5]
#metrics
def f1(precision, recall):
    """Return the F1 score, the harmonic mean of ``precision`` and ``recall``.

    Returns 0.0 when both inputs are zero: the harmonic mean is undefined there and
    the plain formula would divide by zero.
    """
    denominator = recall + precision
    if denominator == 0:
        return 0.0
    return 2*(recall * precision)/denominator
# Metric list in the same form `create_model` uses: accuracy plus precision and recall objects.
metrics = [
    'accuracy',
    tf.keras.metrics.Precision(),
    tf.keras.metrics.Recall(),
]

In [6]:
# Fixed settings read by the search cells (for every parameter they do not vary)
# and by the final training cell.

# Training
epochs = 50
batch_size = 32
optimizer = RMSprop  # optimizer class; call sites instantiate it as optimizer(...)

# Architecture
neurons = 100
dropout_rate = 0.01
init_mode = tf.keras.initializers.GlorotUniform  # initializer class, not an instance
weight_constraint = tf.keras.constraints.MinMaxNorm()

In [None]:
# fit
# Cross-validate every (batch size, epoch count) pair and store one pickle per batch size.
# The loop variables have their own names so the search does not overwrite the global
# `batch_size` / `epochs` that later cells read.
os.makedirs('./output', exist_ok=True)
for candidate_batch_size in batch_size_values:
    file_name = f'./output/batch_size_{candidate_batch_size}.pickle'
    if os.path.isfile(file_name):
        continue  # results for this batch size already exist
    # model.evaluate returns the loss followed by the compiled metrics; with create_model's
    # defaults that is accuracy, precision and recall, so the header needs four result columns.
    fit_results = [['batch_size', 'epochs', 'loss', 'accuracy', 'precision', 'recall']]
    for candidate_epochs in epochs_values:
        for train_index, val_index in kf.split(x_train_val, y_train_val):
            print(candidate_batch_size, candidate_epochs)
            x_train, x_val = x_train_val[train_index], x_train_val[val_index]
            y_train, y_val = y_train_val[train_index], y_train_val[val_index]
            model = create_model()
            model.fit(x_train, y_train, epochs=candidate_epochs, batch_size=candidate_batch_size)
            fit_results.append([candidate_batch_size, candidate_epochs] + model.evaluate(x_val,y_val))
    with open(file_name, 'wb') as f:
        pickle.dump(fit_results, f)

In [None]:
# optimizer
# Cross-validate each optimizer class with its default arguments.
# The loop variable has its own name so the search does not overwrite the global
# `optimizer` that later cells instantiate.
os.makedirs('./output', exist_ok=True)
for optimizer_class in optimizer_values:
    optimizer_name = optimizer_class.__name__
    # Use the bare class name: formatting the class itself yields "<class '...'>",
    # which is awkward (and on some systems invalid) as a file name.
    file_name = f'./output/optimizer_{optimizer_name}.pickle'
    # Results written under the old naming scheme still count as done.
    legacy_file_name = f'./output/optimizer_{optimizer_class}.pickle'
    if os.path.isfile(file_name) or os.path.isfile(legacy_file_name):
        continue
    # model.evaluate returns loss, accuracy, precision and recall for create_model's default metrics.
    fit_results = [['optimizer','loss','accuracy','precision','recall']]
    for train_index, val_index in kf.split(x_train_val, y_train_val):
        print(optimizer_name)
        x_train, x_val = x_train_val[train_index], x_train_val[val_index]
        y_train, y_val = y_train_val[train_index], y_train_val[val_index]
        model = create_model(optimizer=optimizer_class())
        model.fit(x_train, y_train, epochs=epochs, batch_size=batch_size)
        fit_results.append([optimizer_name] + model.evaluate(x_val,y_val))
    with open(file_name, 'wb') as f:
        pickle.dump(fit_results, f)

In [None]:
# architecture
# Cross-validate every (kernel initializer, kernel constraint) pair; one CSV per initializer.
# The loop variables have their own names so the search does not overwrite the global
# `init_mode` / `weight_constraint` that later cells read.
os.makedirs('./output', exist_ok=True)
for candidate_init in init_mode_values:
    init_name = candidate_init.__name__
    file_name = f'./output/init_mode_{init_name}.csv'
    # Results written under the old "<class '...'>" naming scheme still count as done.
    legacy_file_name = f'./output/init_mode_{str(candidate_init)}.csv'
    if os.path.isfile(file_name) or os.path.isfile(legacy_file_name):
        continue
    # model.evaluate returns loss, accuracy, precision and recall for create_model's default metrics.
    fit_results = [['init mode','weight constraint','loss','accuracy','precision','recall']]
    for candidate_constraint in weight_constraint_values:
        # Record the constraint's class name; str() of an instance would embed a memory address.
        constraint_name = 'None' if candidate_constraint is None else type(candidate_constraint).__name__
        for train_index, val_index in kf.split(x_train_val, y_train_val):
            print(init_name)
            x_train, x_val = x_train_val[train_index], x_train_val[val_index]
            y_train, y_val = y_train_val[train_index], y_train_val[val_index]
            model = create_model(optimizer=optimizer(),init_mode=candidate_init,weight_constraint=candidate_constraint)
            model.fit(x_train, y_train, epochs=epochs, batch_size=batch_size)
            fit_results.append([init_name, constraint_name] + model.evaluate(x_val,y_val))
    # newline='' is what the csv module requires for files it writes to.
    with open(file_name, 'w', newline='') as f:
        csv.writer(f).writerows(fit_results)

In [None]:
# architecture 2
# Cross-validate every (LSTM units, dropout rate) pair; one CSV per unit count.
# The loop variables have their own names so the search does not overwrite the global
# `neurons` / `dropout_rate`, which the final training cell reads.
os.makedirs('./output', exist_ok=True)
for candidate_neurons in neurons_values:
    file_name = f'./output/neurons_{candidate_neurons}.csv'
    if os.path.isfile(file_name):
        continue  # results for this unit count already exist
    fit_results = [['neurons','dropout','loss','accuracy','precision','recall']]
    for candidate_dropout in dropout_rate_values:
        for train_index, val_index in kf.split(x_train_val, y_train_val):
            print(candidate_neurons)
            x_train, x_val = x_train_val[train_index], x_train_val[val_index]
            y_train, y_val = y_train_val[train_index], y_train_val[val_index]
            model = create_model(optimizer=optimizer(),init_mode=init_mode,weight_constraint=weight_constraint,
                                 dropout_rate=candidate_dropout,neurons=candidate_neurons)
            model.fit(x_train, y_train, epochs=epochs, batch_size=batch_size)
            fit_results.append([str(candidate_neurons), str(candidate_dropout)] + model.evaluate(x_val,y_val))
    # newline='' is what the csv module requires for files it writes to.
    with open(file_name, 'w', newline='') as f:
        csv.writer(f).writerows(fit_results)

In [None]:
# optimizer params
# Cross-validate every (momentum, learning rate) pair; one CSV per momentum value.
# Both values are passed to the constructor of the globally selected `optimizer` class,
# so that class must accept `learning_rate` and `momentum` keyword arguments.
os.makedirs('./output', exist_ok=True)
for momentum in momentum_values:
    file_name = f'./output/momentum_{momentum}.csv'
    if os.path.isfile(file_name):
        continue  # results for this momentum already exist
    fit_results = [['momentum','learning_rate','loss','accuracy','precision','recall']]
    for learning_rate in learning_rate_values:
        for train_index, val_index in kf.split(x_train_val, y_train_val):
            print(momentum, learning_rate)
            x_train, x_val = x_train_val[train_index], x_train_val[val_index]
            y_train, y_val = y_train_val[train_index], y_train_val[val_index]
            model = create_model(optimizer=optimizer(learning_rate=learning_rate, momentum=momentum),
                                 init_mode=init_mode,weight_constraint=weight_constraint,
                                 dropout_rate=dropout_rate,neurons=neurons)
            model.fit(x_train, y_train, epochs=epochs, batch_size=batch_size)
            fit_results.append([str(momentum), str(learning_rate)] + model.evaluate(x_val,y_val))
    # newline='' is what the csv module requires for files it writes to;
    # the with-block closes the file even if writing fails.
    with open(file_name, 'w', newline='') as f:
        csv.writer(f).writerows(fit_results)

In [7]:
# Train the final model on the complete train+validation set.
# Every configured hyperparameter is passed explicitly, as the search cells do, so the
# model does not silently fall back to create_model's defaults for `init_mode` and `neurons`.
model = create_model(optimizer=optimizer(),
                     init_mode=init_mode,
                     weight_constraint=weight_constraint,
                     dropout_rate=dropout_rate,
                     neurons=neurons)
model.fit(x_train_val,y_train_val,epochs=epochs,batch_size=batch_size)

Train on 433 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x7f13c9c584a8>

In [None]:
# Loss and compiled metrics of the trained model on the held-out test documents.
model.evaluate(x=x_test, y=y_test)

In [None]:
# The original cell did not parse (unbalanced bracket) and tried to unpack the
# five-fold generator into two names.
# NOTE(review): assumed intent is to evaluate on the validation part of the first fold — confirm.
# `model` was fitted on all of x_train_val, so this score is computed on data the model has seen.
fold_train_index, fold_val_index = next(kf.split(x_train_val, y_train_val))
model.evaluate(x_train_val[fold_val_index], y_train_val[fold_val_index])

In [8]:
# Per test document, the class id with the highest predicted score at every timestep.
y_predictions = model.predict(x_test)
pred_labels = [list(doc_classes) for doc_classes in np.argmax(y_predictions, axis=-1)]

In [9]:
# Flat list of true class ids over all timesteps of all test documents.
# Padded label rows hold 100.0 in every column, so argmax returns 0 for them.
true_labels = [
    np.argmax(step)
    for j in range(len(y_predictions))
    for step in y_test[j]
]

In [10]:
# Flat list of predicted class ids, in the same document/timestep order as the predictions.
predicted_labels = [
    np.argmax(step)
    for j in range(len(y_predictions))
    for step in y_predictions[j]
]

In [None]:
# Confusion matrix over the 13 class ids 0..12, in that order.
confusion = sklearn.metrics.confusion_matrix(
    y_true=true_labels,
    y_pred=predicted_labels,
    labels=range(13),
)
print(confusion)

In [None]:
# Report all 13 classes, in the same order as the confusion matrix, and show their names
# from `labels` instead of bare ids. Without `labels=`, classes missing from the data are
# dropped from the report.
print(sklearn.metrics.classification_report(
    true_labels,
    predicted_labels,
    labels=list(range(13)),
    target_names=labels,
))

In [None]:
# Keep the report text in `a`. All 13 classes are listed by name (from `labels`) so the
# rows stay fixed even when a class does not occur in the test data.
a = sklearn.metrics.classification_report(
    true_labels,
    predicted_labels,
    labels=list(range(13)),
    target_names=labels,
)

In [11]:
# Tokenized documents; the cells below index each entry by its "tokens" key.
with open('toks3.p', mode='rb') as token_file:
    toks = pickle.load(token_file)

In [None]:
# Print every token whose label id is above 7, together with that id.
# (In `labels`, positions 8-12 are the specific "mod type: ..." entries.)
for doc_idx, doc in enumerate(v):
    for tok_idx, label_id in enumerate(doc["labels"]):
        if not label_id > 7:
            continue
        print(toks[doc_idx]["tokens"][tok_idx]," ",label_id)

In [34]:
# Render the highlighted predictions of each test document, in test_index order.
for position, doc_index in enumerate(test_index):
    print_preds(doc_index, pred_labels[position])

In [None]:
# Number of mislabelled tokens per test document, counting only real (non-padded) positions.
# `train_size` is not defined anywhere in this notebook, and the test documents are those
# listed in test_index rather than a contiguous range. Rows of x/y are built from `v` in
# order, so each document's length is looked up in `v` through its test_index entry.
errors = []
for position in range(len(pred_labels)):
    doc_index = test_index[position]
    n_tokens = min(input_length, len(v[doc_index]['labels']))
    true_ids = np.argmax(y_test[position][:n_tokens], axis=-1)
    predicted_ids = np.asarray(pred_labels[position][:n_tokens])
    errors.append(int(np.sum(predicted_ids != true_ids)))

In [None]:
print(errors)
# Position (within the test set) of the first document with the highest error count.
max(range(len(errors)), key=lambda position: errors[position])

In [None]:
# Convert the pickled result rows for batch size 128 into a CSV file.
with open('output/batch_size_128.pickle','rb') as f:
    p=pickle.load(f)
# newline='' is what the csv module requires for files it writes to; without it
# some platforms produce an extra blank line after every row.
with open('output/batch_size_128.csv','w', newline='') as f:
    c=csv.writer(f)
    c.writerows(p)

In [None]:
# Read the combined result table into `l` (a list of rows, every cell a string) and print it.
# newline='' is what the csv module requires for files it reads, so that newlines
# embedded in quoted fields are handled correctly.
with open('output/batch_size_epochs.csv','r', newline='') as f:
    l = list(csv.reader(f))
# The file is fully read at this point, so printing does not need the open handle.
for row in l:
    print(row)

In [None]:
# Fourth column of every row whose first column is the string '10'.
[row[3] for row in l if row[0] == '10']

In [None]:
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import pickle
import csv
import numpy as np
# One row of bars per group value z (matched against the first CSV column); bar heights
# come from the fourth column.
# NOTE(review): the meaning of columns 0 and 3 depends on how batch_size_epochs.csv was
# produced, which this notebook does not show — add axis labels once confirmed.
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
for z in [10,50,100]:
    # csv.reader yields strings; convert them so the bar heights are numeric.
    heights = [float(row[3]) for row in l if row[0]==str(z)]
    # Derive the x positions from the data instead of assuming exactly five bars.
    ax.bar(list(range(1, len(heights) + 1)), heights, zs=z, zdir='y')
plt.show()