# Process after creating data frame of Entity Mention and their content

In [1]:
import json
import csv
import pandas as pd
import numpy as np
import pickle
import time
import os
import random

In [2]:
with open('train_data.pkl', 'rb') as f:
    train_data = pickle.load(f)
with open('testa_data.pkl', 'rb') as f:
    testa_data = pickle.load(f)

In [3]:
train_data = train_data.dropna(axis=0, how='any')
train_data.reset_index(drop = True, inplace = True)
testa_data = testa_data.dropna(axis=0, how='any')
testa_data.reset_index(drop = True, inplace = True)

In [4]:
train_data.head()

Unnamed: 0,Doc,Entity,Mention,Mention_Content,Content
0,train_doc_0,germany,German,lamb clearer March year animal measures veteri...,fiction Kurdish Charles dates Soviet computer ...
1,train_doc_0,united kingdom,British,lamb clearer March year animal measures veteri...,fiction Charles forum Big exponents Soviet com...
2,train_doc_0,brussels,BRUSSELS,lamb clearer March year animal measures veteri...,Empain Charles dates tensions COCOF Neo runnin...
3,train_doc_0,european commission,European Commission,lamb clearer March year animal measures veteri...,Graham Charles wars agriculture Steven Legisla...
4,train_doc_0,germany,Germany,lamb clearer March year animal measures veteri...,fiction Kurdish Charles dates Soviet computer ...


In [38]:
tr_content = train_data.drop_duplicates(subset=['Content'])
tr_mention_content = train_data.drop_duplicates(subset=['Mention_Content'])

ta_content = testa_data.drop_duplicates(subset=['Content'])
ta_mention_content = testa_data.drop_duplicates(subset=['Mention_Content'])

tr_content.reset_index(drop = True, inplace = True)
tr_mention_content.reset_index(drop = True, inplace = True)
ta_content.reset_index(drop = True, inplace = True)
ta_mention_content.reset_index(drop = True, inplace = True)

In [6]:
print(len(tr_content))
print(len(tr_mention_content))
print(len(ta_content))
print(len(ta_mention_content))

917
120
354
15


0:917       ====> tr_content
917:1037    ====> tr_mention_content
1037:1391   ====> ta_content
1391::      ====> ta_mention_content

In [7]:
918+120+354+55

1447

### Merging all data to compute DTM

In [8]:
total_content = tr_content.append(tr_mention_content.append(ta_content.append(ta_mention_content)))
total_content.reset_index(drop = True, inplace = True)
len(total_content)

1406

0:917
917::

0:354
354::

In [9]:
tr_total = tr_content.append(tr_mention_content)
ta_total = ta_content.append(ta_mention_content)

tr_total.reset_index(drop = True, inplace = True)
ta_total.reset_index(drop = True, inplace = True)

In [10]:
len(total_content.unique())

1274

## Document Term Matrix

In [11]:
from sklearn.feature_extraction.text import CountVectorizer
dtm = CountVectorizer()

In [12]:
dtm.fit(total_content.unique())

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [13]:
tr_dtm = dtm.transform(tr_total)
ta_dtm = dtm.transform(ta_total)

In [14]:
train_arr = tr_dtm.toarray()
testa_arr = ta_dtm.toarray()

In [15]:
train_arr[train_arr >= 1] = 1
testa_arr[testa_arr >= 1] = 1

In [16]:
import sys

In [17]:
sys.getsizeof(train_arr)/1024**3

0.7523979842662811

In [18]:
sys.getsizeof(testa_arr)/1024**3

0.2677289545536041

# Denoising AutoEncoder

In [19]:
# Adding Noise
noise_factor = 0.1
train_arr = train_arr + noise_factor * np.random.normal(loc=0.0, scale=0.1, size=train_arr.shape) 
testa_arr = testa_arr + noise_factor * np.random.normal(loc=0.0, scale=0.1, size=testa_arr.shape) 

train_arr = np.clip(train_arr, 0., 1.)
testa_arr = np.clip(testa_arr, 0., 1.)

In [20]:
train_arr

array([[  0.00000000e+00,   9.91973980e-01,   1.00503183e-02, ...,
          1.39061154e-02,   0.00000000e+00,   0.00000000e+00],
       [  0.00000000e+00,   9.97948873e-01,   6.91064858e-03, ...,
          0.00000000e+00,   0.00000000e+00,   5.48530479e-03],
       [  1.33514961e-02,   1.85409384e-02,   1.36085982e-02, ...,
          0.00000000e+00,   3.40956675e-03,   0.00000000e+00],
       ..., 
       [  0.00000000e+00,   0.00000000e+00,   4.10423879e-03, ...,
          4.53182046e-03,   0.00000000e+00,   0.00000000e+00],
       [  0.00000000e+00,   7.87830542e-03,   6.41996191e-03, ...,
          1.40533243e-03,   1.09635561e-02,   3.59323258e-03],
       [  1.10042030e-02,   0.00000000e+00,   0.00000000e+00, ...,
          0.00000000e+00,   3.84692562e-03,   1.24838182e-04]])

In [21]:
testa_arr

array([[ 0.        ,  0.        ,  0.00054155, ...,  0.0163283 ,
         0.        ,  0.0022602 ],
       [ 0.        ,  0.        ,  0.        , ...,  0.01324053,
         0.        ,  0.00728867],
       [ 0.00600464,  0.00592986,  0.        , ...,  0.00328166,
         0.00711421,  0.        ],
       ..., 
       [ 0.        ,  0.        ,  0.        , ...,  0.00375378,
         0.00038955,  0.        ],
       [ 0.        ,  0.01122103,  0.        , ...,  0.        ,
         0.00106757,  0.02867797],
       [ 0.00049259,  0.00409859,  0.0055428 , ...,  0.        ,
         0.        ,  0.01277573]])

In [22]:
np.sum(testa_arr, axis = 1)

array([  765.11037186,  2295.61207982,   947.08976077,   577.18995876,
        1037.88304852,   805.14020566,   698.16379757,  1073.24337335,
         959.01561117,   815.75937848,   463.48019079,  1240.11349975,
         611.83420086,   701.48752252,   454.48354934,  1101.03700694,
         947.26812291,   813.35625615,   516.3724354 ,   807.18285077,
         554.24138171,   774.19253743,  1668.60459551,   555.9664782 ,
         502.80457732,   456.3564497 ,   795.96154966,  2577.06694255,
         516.68015976,  3288.13107737,   457.8408791 ,  2614.23877245,
         822.15929707,  2030.02214985,   463.80405563,  1054.08191158,
        2617.17037126,   434.65676534,  2770.21168854,   816.70906948,
         454.19649867,  3179.79700867,   666.78171738,  2918.61525733,
         452.24652636,  2998.60552225,   503.66635702,   735.79079241,
        2911.0055981 ,   513.26471843,   460.15678324,  2883.84286746,
         588.45722723,  2086.74881312,   931.66556114,   426.61420155,
      

In [23]:
train_arr.shape[1]

97382

In [62]:
with open('train_arr.pkl', 'wb') as f:
    pickle.dump(train_arr, f, pickle.HIGHEST_PROTOCOL)

MemoryError: 

In [24]:
from keras.layers import Input, Dense
from keras.models import Model

Using TensorFlow backend.


In [25]:
encoding_dim = 1000
input_vec = Input(shape = (train_arr.shape[1],))
encoded = Dense(encoding_dim, activation='relu')(input_vec)
decoded = Dense(train_arr.shape[1], activation='sigmoid')(encoded)

In [26]:
encoder = Model(input_vec, encoded)
autoencoder = Model(input_vec, decoded)
autoencoder.compile(optimizer='adam', loss='mean_squared_error')

In [27]:
tic = time.time()
autoencoder.fit(train_arr, train_arr,
                epochs=3,
                batch_size=100,
                shuffle=True,
                validation_data=(testa_arr, testa_arr))
toc = time.time() - tic

Train on 1037 samples, validate on 369 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [28]:
encoded_vec = encoder.predict(testa_arr)

In [29]:
sys.getsizeof(encoded_vec)/1024**3

0.0013747364282608032

In [30]:
encoded_vec

array([[  6.26353455,   6.86382294,   5.29022074, ...,   4.38893366,
          7.0499382 ,   6.56959629],
       [ 19.11818695,  21.1339016 ,  15.24909496, ...,  13.75312424,
         21.03320122,  20.72510338],
       [  7.76189184,   8.73104286,   6.66372633, ...,   5.60051155,
          8.75747299,   8.28940868],
       ..., 
       [  3.33720279,   3.8166995 ,   2.79934525, ...,   2.21806502,
          3.74780965,   3.65763807],
       [  3.29046226,   3.77411675,   2.75431895, ...,   2.21018195,
          3.70887017,   3.61790252],
       [  4.49539328,   5.14643764,   3.79264235, ...,   3.02331448,
          5.04809618,   4.96924448]], dtype=float32)

In [105]:
np.max(encoded_vec, axis = 0)

array([  2.88412571e+01,   1.97690430e+01,   3.08160744e+01,
         2.98718281e+01,   2.97893028e+01,   2.11994514e+01,
         8.61658955e+00,   1.63698635e+01,   2.52479706e+01,
         4.30178118e+00,  -0.00000000e+00,   2.65682125e+01,
         3.04810123e+01,   2.31617928e+01,   1.10830154e+01,
         2.39199047e+01,   2.70216637e+01,   3.10274467e+01,
         2.37984409e+01,   2.10890942e+01,   2.79766712e+01,
        -0.00000000e+00,   7.14365816e+00,   2.30379181e+01,
         3.03055954e+01,   3.00929661e+01,   1.27706356e+01,
         1.96981106e+01,   2.94597492e+01,   2.60414982e+01,
        -0.00000000e+00,   2.48890027e-01,   2.61787033e+01,
         2.37781048e+01,   2.93826256e+01,   1.11065750e+01,
         2.77333031e+01,  -0.00000000e+00,  -0.00000000e+00,
         2.47140236e+01,   2.64092064e+01,   2.07880745e+01,
         2.49270821e+01,   3.11451168e+01,   2.18835411e+01,
         3.06272774e+01,  -0.00000000e+00,   2.61186600e+01,
         3.07172604e+01,

In [106]:
with open('encoded_vec_testa.pkl', 'wb') as f:
    pickle.dump(encoded_vec, f, pickle.HIGHEST_PROTOCOL)

In [56]:
with open('encoded_vec_ta.pkl', 'rb') as f:
    trial = pickle.load(f)

In [31]:
encoded_vec_tr = encoder.predict(train_arr)

In [108]:
with open('encoded_vec_train.pkl', 'wb') as f:
    pickle.dump(encoded_vec_tr, f, pickle.HIGHEST_PROTOCOL)

In [32]:
import gc
gc.collect()

3502

### Separating eoncoded vectors of article content (Mention) and wiki content (Entity)

In [None]:
0:917
917::

0:354
354::

In [33]:
en_tr_entity = encoded_vec_tr[0:917]
en_tr_mention = encoded_vec_tr[917::]

In [34]:
en_ta_entity = encoded_vec[0:354]
en_ta_mention = encoded_vec[354::]

In [115]:
encoded_vec_tr.shape

(1037, 1000)

In [71]:
tr_content_emb = tr_content[['Entity', 'Content']]
tr_content_emb['Entity_Encoding'] = np.nan

tr_content_emb = tr_content_emb.astype('object')
tr_content_emb.dtypes

for i in range(len(tr_content_emb)):
    tr_content_emb.iloc[i, 2] = en_tr_entity[i]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [72]:
tr_content_emb.head()

Unnamed: 0,Entity,Content,Entity_Encoding
0,germany,fiction Kurdish Charles dates Soviet computer ...,"[22.3534, 24.3593, 18.1434, 22.9279, 17.6045, ..."
1,united kingdom,fiction Charles forum Big exponents Soviet com...,"[21.9733, 23.9349, 17.5621, 22.553, 17.5677, 2..."
2,brussels,Empain Charles dates tensions COCOF Neo runnin...,"[17.0051, 19.2079, 13.7814, 17.3861, 13.5747, ..."
3,european commission,Graham Charles wars agriculture Steven Legisla...,"[8.56106, 9.60613, 7.19879, 8.99513, 6.61871, ..."
4,european union,Charles dates Mont tensions ECSC compositions ...,"[15.6312, 17.1679, 12.867, 16.2168, 12.2442, 1..."


In [73]:
tr_mention_content_emb = tr_mention_content[['Mention', 'Mention_Content']]
tr_mention_content_emb['Mention_Encoding'] = np.nan

tr_mention_content_emb = tr_mention_content_emb.astype('object')
tr_mention_content_emb.dtypes

for i in range(len(tr_mention_content_emb)):
    tr_mention_content_emb.iloc[i, 2] = en_tr_mention[i]

tr_mention_content_emb.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,Mention,Mention_Content,Mention_Encoding
0,German,lamb clearer March year animal measures veteri...,"[3.95313, 4.47875, 3.29411, 4.24217, 2.86743, ..."
1,Hendrix,Jimi Nottingham audience pearl sheet girlfrien...,"[3.24999, 3.76371, 2.69743, 3.50474, 2.31345, ..."
2,China,Reuters hostility Shen Vice People comments Fo...,"[3.38193, 3.77913, 2.79379, 3.57681, 2.35058, ..."
3,China,agency Association Straits executive Wednesday...,"[3.07787, 3.54289, 2.57833, 3.30985, 2.19912, ..."
4,German,growth year figure increase General registrati...,"[3.27218, 3.69715, 2.69447, 3.5139, 2.35307, 2..."


In [70]:
len(train_data)

1665

In [79]:
tr_embedding = train_data.merge(tr_content_emb[['Entity', 'Entity_Encoding']], how='left', on='Entity')
print(len(tr_embedding))
tr_embedding.head()

1665


Unnamed: 0,Doc,Entity,Mention,Mention_Content,Content,Entity_Encoding
0,train_doc_0,germany,German,lamb clearer March year animal measures veteri...,fiction Kurdish Charles dates Soviet computer ...,"[22.3534, 24.3593, 18.1434, 22.9279, 17.6045, ..."
1,train_doc_0,united kingdom,British,lamb clearer March year animal measures veteri...,fiction Charles forum Big exponents Soviet com...,"[21.9733, 23.9349, 17.5621, 22.553, 17.5677, 2..."
2,train_doc_0,brussels,BRUSSELS,lamb clearer March year animal measures veteri...,Empain Charles dates tensions COCOF Neo runnin...,"[17.0051, 19.2079, 13.7814, 17.3861, 13.5747, ..."
3,train_doc_0,european commission,European Commission,lamb clearer March year animal measures veteri...,Graham Charles wars agriculture Steven Legisla...,"[8.56106, 9.60613, 7.19879, 8.99513, 6.61871, ..."
4,train_doc_0,germany,Germany,lamb clearer March year animal measures veteri...,fiction Kurdish Charles dates Soviet computer ...,"[22.3534, 24.3593, 18.1434, 22.9279, 17.6045, ..."


In [83]:
tr_embedding = tr_embedding.merge(tr_mention_content_emb[['Mention_Content', 'Mention_Encoding']], 
                                  how='left', on='Mention_Content')
print(len(tr_embedding))
tr_embedding.head()

1665


Unnamed: 0,Doc,Entity,Mention,Mention_Content,Content,Entity_Encoding,Mention_Encoding
0,train_doc_0,germany,German,lamb clearer March year animal measures veteri...,fiction Kurdish Charles dates Soviet computer ...,"[22.3534, 24.3593, 18.1434, 22.9279, 17.6045, ...","[3.95313, 4.47875, 3.29411, 4.24217, 2.86743, ..."
1,train_doc_0,united kingdom,British,lamb clearer March year animal measures veteri...,fiction Charles forum Big exponents Soviet com...,"[21.9733, 23.9349, 17.5621, 22.553, 17.5677, 2...","[3.95313, 4.47875, 3.29411, 4.24217, 2.86743, ..."
2,train_doc_0,brussels,BRUSSELS,lamb clearer March year animal measures veteri...,Empain Charles dates tensions COCOF Neo runnin...,"[17.0051, 19.2079, 13.7814, 17.3861, 13.5747, ...","[3.95313, 4.47875, 3.29411, 4.24217, 2.86743, ..."
3,train_doc_0,european commission,European Commission,lamb clearer March year animal measures veteri...,Graham Charles wars agriculture Steven Legisla...,"[8.56106, 9.60613, 7.19879, 8.99513, 6.61871, ...","[3.95313, 4.47875, 3.29411, 4.24217, 2.86743, ..."
4,train_doc_0,germany,Germany,lamb clearer March year animal measures veteri...,fiction Kurdish Charles dates Soviet computer ...,"[22.3534, 24.3593, 18.1434, 22.9279, 17.6045, ...","[3.95313, 4.47875, 3.29411, 4.24217, 2.86743, ..."


In [94]:
#tr_embedding.to_csv('train_embedding.csv', index = False)

In [96]:
with open('train_embedding.pkl', 'wb') as f:
    pickle.dump(tr_embedding, f, pickle.HIGHEST_PROTOCOL)

In [97]:
#with open('train_embedding.pkl', 'rb') as f:
#    a = pickle.load(f)

In [99]:
# Test Data
ta_content_emb = ta_content[['Entity', 'Content']]
ta_content_emb['Entity_Encoding'] = np.nan

ta_content_emb = ta_content_emb.astype('object')
ta_content_emb.dtypes

for i in range(len(ta_content_emb)):
    ta_content_emb.iloc[i, 2] = en_ta_entity[i]

ta_mention_content_emb = ta_mention_content[['Mention', 'Mention_Content']]
ta_mention_content_emb['Mention_Encoding'] = np.nan

ta_mention_content_emb = ta_mention_content_emb.astype('object')
ta_mention_content_emb.dtypes

for i in range(len(ta_mention_content_emb)):
    ta_mention_content_emb.iloc[i, 2] = en_ta_mention[i]

ta_embedding = testa_data.merge(ta_content_emb[['Entity', 'Entity_Encoding']], how='left', on='Entity')
print(len(ta_embedding))
ta_embedding = ta_embedding.merge(ta_mention_content_emb[['Mention_Content', 'Mention_Encoding']], 
                                  how='left', on='Mention_Content')
print(len(ta_embedding))
ta_embedding.head()

510
510


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':


Unnamed: 0,Doc,Entity,Mention,Mention_Content,Content,Entity_Encoding,Mention_Encoding
0,testa_doc_0,leicestershire county cricket club,LEICESTERSHIRE,title championship Moody Tim Paul requirements...,Charles March title Englands reverse Twenty ye...,"[6.26353, 6.86382, 5.29022, 6.41278, 4.82623, ...","[3.63068, 4.18889, 3.12095, 3.91361, 2.65805, ..."
1,testa_doc_0,london,LONDON,title championship Moody Tim Paul requirements...,accents Historic Charles forum Hotspur mole Ha...,"[19.1182, 21.1339, 15.2491, 19.6437, 15.2486, ...","[3.63068, 4.18889, 3.12095, 3.91361, 2.65805, ..."
2,testa_doc_0,west indies cricket team,West Indian,title championship Moody Tim Paul requirements...,future Champions Lanka Charles March title Tob...,"[7.76189, 8.73104, 6.66373, 7.9307, 6.10454, 7...","[3.63068, 4.18889, 3.12095, 3.91361, 2.65805, ..."
3,testa_doc_0,phil simmons,Phil Simmons,title championship Moody Tim Paul requirements...,player Lanka Champions year tenure March Crick...,"[4.51972, 5.20816, 3.98515, 4.74074, 3.48611, ...","[3.63068, 4.18889, 3.12095, 3.91361, 2.65805, ..."
4,testa_doc_0,leicestershire county cricket club,Leicestershire,title championship Moody Tim Paul requirements...,Charles March title Englands reverse Twenty ye...,"[6.26353, 6.86382, 5.29022, 6.41278, 4.82623, ...","[3.63068, 4.18889, 3.12095, 3.91361, 2.65805, ..."


In [100]:
with open('testa_embedding.pkl', 'wb') as f:
    pickle.dump(ta_embedding, f, pickle.HIGHEST_PROTOCOL)

## Calculating Score

In [5]:
from sklearn.preprocessing import normalize

In [2]:
with open('train_embedding.pkl', 'rb') as f:
    tr_embedding = pickle.load(f)
with open('testa_embedding.pkl', 'rb') as f:
    ta_embedding = pickle.load(f)

In [9]:
#tr_embedding.iloc[0,5] / np.linalg.norm(tr_embedding.iloc[0,5])
#normalize(tr_embedding.iloc[0,5][:,np.newaxis], axis=0).ravel()

In [10]:
for i in range(len(tr_embedding)):
    tr_embedding.iloc[i,5] = normalize(tr_embedding.iloc[i,5][:,np.newaxis], axis=0).ravel()
    tr_embedding.iloc[i,6] = normalize(tr_embedding.iloc[i,6][:,np.newaxis], axis=0).ravel()

In [12]:
for i in range(len(ta_embedding)):
    ta_embedding.iloc[i,5] = normalize(ta_embedding.iloc[i,5][:,np.newaxis], axis=0).ravel()
    ta_embedding.iloc[i,6] = normalize(ta_embedding.iloc[i,6][:,np.newaxis], axis=0).ravel()

In [16]:
tr_embedding.head()

Unnamed: 0,Doc,Entity,Mention,Mention_Content,Content,Entity_Encoding,Mention_Encoding
0,train_doc_0,germany,German,lamb clearer March year animal measures veteri...,fiction Kurdish Charles dates Soviet computer ...,"[0.0370044, 0.0403251, 0.0300351, 0.0379554, 0...","[0.0363386, 0.0411703, 0.0302807, 0.0389956, 0..."
1,train_doc_0,united kingdom,British,lamb clearer March year animal measures veteri...,fiction Charles forum Big exponents Soviet com...,"[0.0368515, 0.0401413, 0.0294535, 0.0378237, 0...","[0.0363386, 0.0411703, 0.0302807, 0.0389956, 0..."
2,train_doc_0,brussels,BRUSSELS,lamb clearer March year animal measures veteri...,Empain Charles dates tensions COCOF Neo runnin...,"[0.036532, 0.0412644, 0.0296065, 0.0373505, 0....","[0.0363386, 0.0411703, 0.0302807, 0.0389956, 0..."
3,train_doc_0,european commission,European Commission,lamb clearer March year animal measures veteri...,Graham Charles wars agriculture Steven Legisla...,"[0.0363569, 0.0407951, 0.0305716, 0.0382003, 0...","[0.0363386, 0.0411703, 0.0302807, 0.0389956, 0..."
4,train_doc_0,germany,Germany,lamb clearer March year animal measures veteri...,fiction Kurdish Charles dates Soviet computer ...,"[0.0370044, 0.0403251, 0.0300351, 0.0379554, 0...","[0.0363386, 0.0411703, 0.0302807, 0.0389956, 0..."


In [13]:
trial_score = np.zeros((len(tr_embedding),))
for i in range(len(tr_embedding)):
    trial_score[i] = np.dot(tr_embedding.iloc[i,5], tr_embedding.iloc[0,6])

In [15]:
trial_score

array([ 0.99803829,  0.9979409 ,  0.99808621, ...,  0.99901426,
        0.99810135,  0.9982053 ])

In [14]:
np.argmax(trial_score)

8

In [17]:
tr_embedding.iloc[8]

Doc                                                       train_doc_0
Entity                                                 franz fischler
Mention                                                Franz Fischler
Mention_Content     lamb clearer March year animal measures veteri...
Content             doi Born agriculture Integration Food politici...
Entity_Encoding     [0.0361405, 0.0409552, 0.030396, 0.0386768, 0....
Mention_Encoding    [0.0363386, 0.0411703, 0.0302807, 0.0389956, 0...
Name: 8, dtype: object

## Adding One more Layer 

In [18]:
from keras.models import Sequential
from keras.layers import Concatenate, Merge

In [None]:
def dot_loss()

In [None]:
model1 = Sequential()
model1.add(Dense(units=200, activation='sigmoid', input_dim = 1000))

model2 = Sequential()
model2.add(Dense(units=200, activation='sigmoid', input_dim = 1000))

model = Sequential()
model.add(Merge([model1, model2], mode='dot'))

model.compile(loss = dot_loss, optimizer = 'adam')

In [None]:
model_final_concat = Concatenate(axis=-1)([model1, model2])
model_final_dense_1 = Dense(30, activation='softmax')(model_final_concat)

model = Model(inputs=[in1, in2], outputs=model_final_dense_1)

model.compile(loss='categorical_crossentropy', #continu together
              optimizer='adam',
              metrics=['accuracy'])

model.fit([X_train_one, X_train_two], Y_train,
          batch_size=32, nb_epoch=10, verbose=1)