In [1]:
import numpy as np

from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Input, Lambda, BatchNormalization
from keras.optimizers import RMSprop, Adam
from keras import backend as K
import pandas as pd

Using TensorFlow backend.


# Preparing data

In [2]:
# Load the tab-separated training set with pandas. An earlier np.loadtxt-based
# loader was dropped because NumPy is very slow at loading data from CSV.
data = pd.read_csv("./train.csv", sep="\t")

In [3]:
# Dimensions of the training frame, read from `data.shape` so that the full
# value array is not materialised just to count rows and columns, and so that
# a frame without rows does not raise IndexError.
# The last column holds the label, hence the "- 1" for the feature count.
number_of_features = data.shape[1] - 1
size_of_dataset = data.shape[0]

In [4]:
# Column layout of one row: [0, 565) first mention, [565, 1130) second
# mention, [1130, 1190) features shared by the pair, then the label.
common_features = data.values[:, 1130:1190]

# Each branch input is the mention's own features followed by the shared ones.
first_mention = np.concatenate((data.values[:, :565], common_features), axis=1)
second_mention = np.concatenate((data.values[:, 565:1130], common_features), axis=1)

# The label lives in the column right after the features (the last column).
labels = data.values[:, number_of_features]

In [5]:
# Call form of print: valid on both Python 2 and Python 3 for a single argument.
print(number_of_features)

1190


# Utility functions for siamese network

In [6]:
def euclidean_distance(vects):
    '''Euclidean distance between two batches of vectors.

    `vects` is a pair of backend tensors. The squared differences are summed
    along axis 1 (the reduced axis is kept), so the result has one value per
    row of the inputs.
    '''
    left, right = vects
    squared_sum = K.sum(K.square(left - right), axis=1, keepdims=True)
    # Clamp from below so the square root never sees a value under K.epsilon().
    return K.sqrt(K.maximum(squared_sum, K.epsilon()))

In [7]:
def eucl_dist_output_shape(shapes):
    '''Output shape of the distance layer: one value per sample.

    `shapes` is a pair of input shapes; the leading (batch) dimension of the
    first one is carried over and the second dimension is fixed to 1.
    '''
    first_shape, _ = shapes
    batch_dim = first_shape[0]
    return (batch_dim, 1)

In [8]:
def contrastive_loss(y_true, y_pred):
    '''Contrastive loss from Hadsell-et-al.'06
    http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf

    `y_pred` is the predicted distance. Where `y_true` is 1 the squared
    distance is penalised; where it is 0 the penalty is the squared amount by
    which the distance falls short of the margin. The mean over all terms is
    returned.
    '''
    margin = 1
    pull_together = y_true * K.square(y_pred)
    push_apart = (1 - y_true) * K.square(K.maximum(margin - y_pred, 0))
    return K.mean(pull_together + push_apart)

In [9]:
def create_base_network(input_dim):
    '''Build the feature extractor that both siamese branches share.

    Takes vectors of length `input_dim` and maps them through dense ReLU
    layers of 1000, 500 and 300 units; each of the first two is followed by
    dropout (rate 0.2) and batch normalization. Returns the Sequential model.
    '''
    layers = [
        Dense(1000, input_shape=(input_dim,), activation='relu'),
        Dropout(0.2),
        BatchNormalization(),

        Dense(500, activation='relu'),
        Dropout(0.2),
        BatchNormalization(),

        Dense(300, activation='relu'),
    ]
    return Sequential(layers)

In [23]:
def compute_accuracy(predictions, labels, threshold=0.001):
    '''Compute classification accuracy with a fixed threshold on distances.

    A pair is predicted positive when its distance is strictly below
    `threshold`; a pair is actually positive when its label is non-zero.
    The result is the fraction of pairs where the two agree.

    The previous implementation returned the mean label among the pairs
    predicted positive. That is the precision of the positive class, not
    accuracy, and it is NaN when no distance falls below the threshold.

    predictions -- array of distances, any shape (flattened internally)
    labels      -- array of 0/1 labels with one entry per prediction
    threshold   -- distance below which a pair counts as positive

    Raises ValueError when the number of predictions and labels differ.
    '''
    predicted_positive = np.asarray(predictions).ravel() < threshold
    actual_positive = np.asarray(labels).ravel().astype(bool)
    if predicted_positive.shape != actual_positive.shape:
        raise ValueError(
            'predictions and labels must have the same number of elements')
    # Counts true positives and true negatives alike.
    return np.mean(predicted_positive == actual_positive)

# Network definition

In [11]:
# Width of one branch input: 565 mention features plus 60 shared features.
input_dim = 625

# One base network instance serves both branches, so the two branches
# share the same weights.
base_network = create_base_network(input_dim)

input_a = Input(shape=(input_dim,))
input_b = Input(shape=(input_dim,))
processed_a, processed_b = base_network(input_a), base_network(input_b)

# The model output is the Euclidean distance between the two embeddings.
distance = Lambda(
    euclidean_distance, output_shape=eucl_dist_output_shape
)([processed_a, processed_b])

model = Model([input_a, input_b], distance)

In [12]:
# Print the layer-by-layer summary of the siamese model built above.
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_1 (InputLayer)             (None, 625)           0                                            
____________________________________________________________________________________________________
input_2 (InputLayer)             (None, 625)           0                                            
____________________________________________________________________________________________________
sequential_1 (Sequential)        (None, 300)           1282800     input_1[0][0]                    
                                                                   input_2[0][0]                    
____________________________________________________________________________________________________
lambda_1 (Lambda)                (None, 1)             0           sequential_1[1][0]      

# Training

In [13]:
# Optimise the contrastive loss on the predicted distances with Adam.
model.compile(optimizer='Adam', loss=contrastive_loss)

In [15]:
# Two passes over the training pairs; the two mention arrays feed the two branches.
model.fit(x=[first_mention, second_mention],
          y=labels,
          batch_size=256,
          epochs=2)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x113f20cd0>

In [14]:
# Write the model weights to an HDF5 file in the working directory.
model.save_weights("weights_siamese_model.h5")

# Evaluation on training set

In [16]:
# Predict distances for the training pairs and score them against the labels.
pred = model.predict([first_mention, second_mention], verbose=1)
tr_acc = compute_accuracy(pred, labels)
print('* Accuracy on training set: %0.2f%%' % (tr_acc * 100))



# Evaluation on test set

In [17]:
# NOTE: this rebinds `data`; from here on it refers to the test frame,
# not the training frame.
data = pd.read_csv("./test.csv", sep="\t")

In [18]:
# Same column layout as the training split: [0, 565) first mention,
# [565, 1130) second mention, [1130, 1190) features shared by the pair.
test_common_features = data.values[:, 1130:1190]

test_first_mention = np.concatenate(
    (data.values[:, :565], test_common_features), axis=1)
test_second_mention = np.concatenate(
    (data.values[:, 565:1130], test_common_features), axis=1)

# `number_of_features` is not recomputed in this cell.
# NOTE(review): assumes the test file has the same column count as the file
# it was derived from, so that this index is the last (label) column - confirm.
test_labels = data.values[:, number_of_features]

In [22]:
# Predict distances for the test pairs and score them against the test labels.
test_pred = model.predict([test_first_mention, test_second_mention], verbose=1)
test_acc = compute_accuracy(test_pred, test_labels)
print('* Accuracy on test set: %0.2f%%' % (test_acc * 100))



Let's compute the predicted distance for a single pair from the test set, say, the one at index 10.

In [31]:
# Length-1 slices keep the row axis, so each input stays two-dimensional
# (one row) instead of collapsing to a flat vector.
first = test_first_mention[10:11, :]
second = test_second_mention[10:11, :]
# Call form of print: valid on both Python 2 and Python 3 for a single argument.
print(model.predict([first, second]))

[[ 0.00031623]]


If a prediction is <b>below 0.001</b>, we consider the pair coreferent.