In [22]:
import os
import pickle

import numpy as np
import pandas as pd
from keras.layers import Dense, Activation
from keras.models import Sequential 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from tqdm import tqdm

In [23]:
# Load the pre-processed sentence objects from disk.
# SECURITY NOTE: pickle.load can execute arbitrary code while unpickling;
# only load "processed_data.dat" when it comes from a trusted source.
# No empty-list placeholder is bound beforehand: if the load fails, later
# cells should fail loudly instead of silently iterating over nothing.
with open("processed_data.dat", "rb") as f:
    Sentences = pickle.load(f)

In [24]:
# Column schema of the chunk-level table: the two targets first
# (is_argument, arg_class), then the two categorical features.
attributes = ['is_argument', 'arg_class', 'd_rel', 'phrase_type']

# Start from an empty frame that carries exactly this schema.
dt = pd.DataFrame(columns=attributes)

In [25]:
# Flatten every chunk of every sentence into one row of the feature table.
# Values are collected column-wise in plain lists and turned into a frame
# once at the end.
all_data = {
    'is_argument' : [],
    'arg_class' : [],
    'd_rel' : [],
    'phrase_type' : [],
}
for sentence in tqdm(Sentences):
    for chunk in sentence.nodeList:
        if chunk.parentPB != '0':
            # parentPB other than '0' marks the chunk as an argument;
            # its role label is read from parentPBRelation.
            all_data['is_argument'].append(True)
            all_data['arg_class'].append(chunk.parentPBRelation)
        else:
            all_data['is_argument'].append(False)
            # np.nan (lower case): the np.NaN alias was removed in NumPy 2.0.
            all_data['arg_class'].append(np.nan)
        all_data['d_rel'].append(chunk.parentRelation)
        all_data['phrase_type'].append(chunk.type)

# Build the frame in one step instead of DataFrame.append (removed in
# pandas 2.0). Rebuilding from scratch also keeps the cell idempotent:
# re-running it no longer stacks a second copy of the rows onto `dt`.
dt = pd.DataFrame(all_data, columns=attributes)
dt

100%|██████████| 6796/6796 [00:00<00:00, 45598.87it/s]


Unnamed: 0,is_argument,arg_class,d_rel,phrase_type
0,False,,r6,NP
1,False,,k1,NP
2,False,,k7,NP
3,False,,root,VGF
4,False,,k2,CCP
...,...,...,...,...
101561,True,ARG2-LOC,k7p,NP
101562,False,,nmod,NP
101563,False,,r6,NP
101564,True,ARG1,k2,NP


### Argument Identification

In [26]:
# Argument identification: predict is_argument (column 0) from the
# categorical features in columns 2 onward (d_rel, phrase_type).
dataset = dt.values
X = dataset[:, 2:].astype(str)
y = dataset[:, 0].reshape(-1, 1)  # column vector, one label per row

# Hold out 20% of the rows for testing; fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)
print('Train', X_train.shape, y_train.shape)
print('Test', X_test.shape, y_test.shape)

Train (81252, 2) (81252, 1)
Test (20314, 2) (20314, 1)


In [27]:
# prepare input data
def prepare_inputs(X, X_train, X_test):
    """One-hot encode the train and test splits with a shared encoder.

    The encoder is fitted on ``X`` (the full data passed by the caller), so
    every category present in ``X`` gets a column in both encoded splits.

    Args:
        X: 2-D array of categorical values used to fit the encoder.
        X_train: 2-D array to encode as the training split.
        X_test: 2-D array to encode as the test split.

    Returns:
        Tuple ``(X_train_enc, X_test_enc)`` as produced by
        ``OneHotEncoder.transform`` (presumably SciPy sparse matrices with
        the encoder's default settings; confirm for the installed sklearn).
    """
    encoder = OneHotEncoder().fit(X)
    return encoder.transform(X_train), encoder.transform(X_test)
 
# prepare target
def prepare_targets(y_train, y_test):
    """Integer-encode class labels with an encoder fitted on the training labels.

    Args:
        y_train: training labels, either 1-D or an ``(n, 1)`` column vector.
        y_test: test labels in the same layout as ``y_train``.

    Returns:
        Tuple ``(y_train_enc, y_test_enc)`` of 1-D integer arrays.

    Note:
        The encoder only knows labels seen in ``y_train``; a label that
        appears only in ``y_test`` is expected to make ``transform`` raise
        (sklearn documents a ValueError; confirm for the installed version).
    """
    # LabelEncoder works on 1-D label arrays. Flatten column vectors here
    # instead of relying on sklearn's implicit conversion, which emits a
    # DataConversionWarning on every call.
    y_train_flat = np.ravel(y_train)
    y_test_flat = np.ravel(y_test)

    le = LabelEncoder()
    le.fit(y_train_flat)
    y_train_enc = le.transform(y_train_flat)
    y_test_enc = le.transform(y_test_flat)
    return y_train_enc, y_test_enc

# Encode the identification data: one-hot features, integer-coded targets.
X_train_enc, X_test_enc = prepare_inputs(X, X_train, X_test)
y_train_enc, y_test_enc = prepare_targets(y_train, y_test)

# Sanity-check the encoded shapes of both splits.
print('Train', X_train_enc.shape, y_train_enc.shape)
print('Test', X_test_enc.shape, y_test_enc.shape)

Train (81252, 88) (81252,)
Test (20314, 88) (20314,)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [29]:
# Argument-identification network: one hidden layer of 10 ReLU units
# (He-normal initialisation) followed by a single sigmoid output unit.
model = Sequential([
    Dense(
        10,
        input_dim=X_train_enc.shape[1],
        activation='relu',
        kernel_initializer='he_normal',
    ),
    Dense(1, activation='sigmoid'),
])

# Binary cross-entropy loss with the Adam optimiser; track accuracy.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Train on the encoded training split.
model.fit(X_train_enc, y_train_enc, epochs=20, batch_size=16, verbose=2)

# Score on the held-out split; evaluate returns (loss, accuracy) here.
_, accuracy = model.evaluate(X_test_enc, y_test_enc, verbose=0)
print('Accuracy: %.2f' % (accuracy*100))

Epoch 1/20
 - 6s - loss: 0.4156 - accuracy: 0.7688
Epoch 2/20
 - 6s - loss: 0.3990 - accuracy: 0.7791
Epoch 3/20
 - 5s - loss: 0.3986 - accuracy: 0.7795
Epoch 4/20
 - 5s - loss: 0.3984 - accuracy: 0.7793
Epoch 5/20
 - 5s - loss: 0.3981 - accuracy: 0.7799
Epoch 6/20
 - 5s - loss: 0.3979 - accuracy: 0.7799
Epoch 7/20
 - 5s - loss: 0.3979 - accuracy: 0.7797
Epoch 8/20
 - 5s - loss: 0.3979 - accuracy: 0.7798
Epoch 9/20
 - 5s - loss: 0.3977 - accuracy: 0.7800
Epoch 10/20
 - 5s - loss: 0.3977 - accuracy: 0.7795
Epoch 11/20
 - 6s - loss: 0.3978 - accuracy: 0.7795
Epoch 12/20
 - 5s - loss: 0.3976 - accuracy: 0.7797
Epoch 13/20
 - 6s - loss: 0.3976 - accuracy: 0.7801
Epoch 14/20
 - 5s - loss: 0.3976 - accuracy: 0.7800
Epoch 15/20
 - 5s - loss: 0.3976 - accuracy: 0.7799
Epoch 16/20
 - 5s - loss: 0.3976 - accuracy: 0.7796
Epoch 17/20
 - 5s - loss: 0.3975 - accuracy: 0.7800
Epoch 18/20
 - 5s - loss: 0.3974 - accuracy: 0.7798
Epoch 19/20
 - 5s - loss: 0.3975 - accuracy: 0.7804
Epoch 20/20
 - 5s - l

### Semantic Role Classifier

In [30]:
# Role classification only uses rows that carry a role label,
# i.e. rows whose arg_class is not null.
has_role = dt['arg_class'].notnull()
dt_arguments = dt[has_role]

# Features are columns 2 onward (d_rel, phrase_type); the target is
# column 1 (arg_class), kept as a column vector.
dataset = dt_arguments.values
X = dataset[:, 2:].astype(str)
y = dataset[:, 1].reshape(-1, 1)

# Hold out 20% of the rows for testing; fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)
print('Train', X_train.shape, y_train.shape)
print('Test', X_test.shape, y_test.shape)

Train (26360, 2) (26360, 1)
Test (6591, 2) (6591, 1)


In [31]:
# One-hot encode the features.
X_train_enc, X_test_enc = prepare_inputs(X, X_train, X_test)

# prepare_inputs is reused on the label column on purpose: the role labels
# are one-hot encoded as well, giving one target column per class.
y_train_enc, y_test_enc = prepare_inputs(y, y_train, y_test)

# Sanity-check the encoded shapes of both splits.
print('Train', X_train_enc.shape, y_train_enc.shape)
print('Test', X_test_enc.shape, y_test_enc.shape)

Train (26360, 70) (26360, 22)
Test (6591, 70) (6591, 22)


In [32]:
# Role-classification network: one hidden layer of 10 ReLU units
# (He-normal initialisation) and a softmax output with one unit per
# one-hot target column.
model2 = Sequential([
    Dense(
        10,
        input_dim=X_train_enc.shape[1],
        activation='relu',
        kernel_initializer='he_normal',
    ),
    Dense(y_train_enc.shape[1], activation='softmax'),
])

# Categorical cross-entropy loss with the Adam optimiser; track accuracy.
model2.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Train on the encoded training split.
model2.fit(X_train_enc, y_train_enc, epochs=30, batch_size=16, verbose=2)

# Score on the held-out split; evaluate returns (loss, accuracy) here.
_, accuracy = model2.evaluate(X_test_enc, y_test_enc, verbose=0)
print('Accuracy: %.2f' % (accuracy*100))

Epoch 1/30
 - 3s - loss: 1.2799 - accuracy: 0.6662
Epoch 2/30
 - 3s - loss: 0.6636 - accuracy: 0.8138
Epoch 3/30
 - 2s - loss: 0.6115 - accuracy: 0.8223
Epoch 4/30
 - 2s - loss: 0.5907 - accuracy: 0.8247
Epoch 5/30
 - 2s - loss: 0.5809 - accuracy: 0.8258
Epoch 6/30
 - 2s - loss: 0.5749 - accuracy: 0.8271
Epoch 7/30
 - 2s - loss: 0.5709 - accuracy: 0.8269
Epoch 8/30
 - 2s - loss: 0.5678 - accuracy: 0.8269
Epoch 9/30
 - 2s - loss: 0.5649 - accuracy: 0.8273
Epoch 10/30
 - 2s - loss: 0.5632 - accuracy: 0.8277
Epoch 11/30
 - 2s - loss: 0.5620 - accuracy: 0.8274
Epoch 12/30
 - 2s - loss: 0.5606 - accuracy: 0.8270
Epoch 13/30
 - 2s - loss: 0.5593 - accuracy: 0.8278
Epoch 14/30
 - 2s - loss: 0.5580 - accuracy: 0.8275
Epoch 15/30
 - 2s - loss: 0.5572 - accuracy: 0.8278
Epoch 16/30
 - 2s - loss: 0.5565 - accuracy: 0.8277
Epoch 17/30
 - 2s - loss: 0.5561 - accuracy: 0.8274
Epoch 18/30
 - 2s - loss: 0.5550 - accuracy: 0.8279
Epoch 19/30
 - 2s - loss: 0.5548 - accuracy: 0.8281
Epoch 20/30
 - 2s - l

In [35]:
# Create the output directory from Python rather than through a shell
# escape, so the cell does not depend on a POSIX `mkdir -p` being available.
# exist_ok=True makes re-running the cell harmless.
os.makedirs('saved_models', exist_ok=True)

# Persist both trained networks: the argument identifier and the role
# classifier. Paths are unchanged so existing loaders keep working.
model.save('saved_models/identification_model')
model2.save('saved_models/classification_model')