In [79]:
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from keras import backend as K
from keras.layers import Dense, Activation
from keras.models import Sequential
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from tqdm import tqdm


In [80]:
# Start from an empty list, then rebind the name to whatever object is stored
# in processed_data.dat (later cells iterate it and read `.nodeList` per item).
# NOTE(review): pickle.load can execute code embedded in the file -- only use
# it on data files you produced or trust.
Sentences = []
with open("processed_data.dat", "rb") as data_file:
    Sentences = pickle.load(data_file)

In [81]:
# Column layout of the feature table: two label columns (is_argument,
# arg_class) followed by three feature columns.
attributes = [
    'is_argument',
    'arg_class',
    'd_rel',
    'phrase_type',
    'predicate_pos',
]
# Empty frame carrying only the column names.
dt = pd.DataFrame(columns=attributes)

In [82]:
# Flatten every chunk of every sentence into column-wise lists, one list per
# column of the resulting `dt` frame.
all_data = {
    'is_argument': [],
    'arg_class': [],
    'd_rel': [],
    'phrase_type': [],
    'predicate_pos': [],
}
for sentence in tqdm(Sentences):
    for chunk in sentence.nodeList:
        # A parentPB of '0' is treated as "not an argument"; such chunks get a
        # missing arg_class so they can be dropped with notnull() later on.
        is_argument = chunk.parentPB != '0'
        all_data['is_argument'].append(is_argument)
        all_data['arg_class'].append(chunk.parentPBRelation if is_argument else np.nan)
        all_data['d_rel'].append(chunk.parentRelation)
        all_data['phrase_type'].append(chunk.type)
        all_data['predicate_pos'].append(chunk.parentPB)

# Build the frame in one shot from the collected columns instead of appending
# to the existing `dt`: DataFrame.append no longer exists in pandas 2.x, and
# appending to `dt` duplicated every row each time this cell was re-run.
dt = pd.DataFrame(all_data, columns=list(all_data))
dt

100%|██████████| 6796/6796 [00:00<00:00, 37985.12it/s]


Unnamed: 0,is_argument,arg_class,d_rel,phrase_type,predicate_pos
0,False,,ccof,NP,0
1,True,ARG0-GOL,k4a,CCP,VGF
2,False,,nmod,NP,0
3,False,,ccof,NP,0
4,True,ARGM-TMP,k7t,NP,VGF
...,...,...,...,...,...
101561,False,,pof,JJP,0
101562,False,,r6,VGNN,0
101563,False,,ccof,NP,0
101564,False,,k1s,JJP,0


### Argument Identification

In [83]:
# Argument identification: the target is column 0 (is_argument); the features
# are the columns after arg_class and before the last one, i.e. d_rel and
# phrase_type. The last column (predicate_pos) is left out of X here.
dataset = dt.values
y = dataset[:, 0].reshape(-1, 1)
X = dataset[:, 2:-1].astype(str)

# Fixed random_state keeps the 80/20 split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)
print(f"Train {X_train.shape} {y_train.shape}")
print(f"Test {X_test.shape} {y_test.shape}")

Train (81252, 2) (81252, 1)
Test (20314, 2) (20314, 1)


In [84]:
# prepare input data
def prepare_inputs(X, X_train, X_test):
    """One-hot encode the train and test splits with an encoder fitted on X.

    The encoder is fitted on `X` rather than on `X_train` alone, so every
    category that occurs anywhere in `X` gets a column in both outputs.

    Returns:
        Tuple (encoded X_train, encoded X_test), each as produced by
        OneHotEncoder.transform with the encoder's default settings.
    """
    encoder = OneHotEncoder().fit(X)
    return encoder.transform(X_train), encoder.transform(X_test)
 
# prepare target
def prepare_targets(y_train, y_test):
    """Integer-encode target labels with a LabelEncoder fitted on y_train.

    Both inputs are flattened to 1-D before use: this notebook passes column
    vectors of shape (n, 1), which LabelEncoder only accepts with a
    DataConversionWarning. The encoded values are the same either way.

    Returns:
        Tuple (encoded y_train, encoded y_test) of 1-D integer arrays.

    Raises:
        ValueError: from LabelEncoder.transform if y_test contains a label
            that never occurs in y_train.
    """
    y_train_flat = np.ravel(y_train)
    y_test_flat = np.ravel(y_test)
    le = LabelEncoder()
    le.fit(y_train_flat)
    return le.transform(y_train_flat), le.transform(y_test_flat)

# Encode the argument-identification split and report the encoded shapes.
X_train_enc, X_test_enc = prepare_inputs(X, X_train, X_test)
y_train_enc, y_test_enc = prepare_targets(y_train, y_test)
print(f"Train {X_train_enc.shape} {y_train_enc.shape}")
print(f"Test {X_test_enc.shape} {y_test_enc.shape}")

Train (81252, 88) (81252,)
Test (20314, 88) (20314,)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [None]:
# Binary classifier for argument identification: one hidden ReLU layer of 10
# units (He-normal init) feeding a single sigmoid output.
model = Sequential([
    Dense(10, input_dim=X_train_enc.shape[1], activation='relu', kernel_initializer='he_normal'),
    Dense(1, activation='sigmoid'),
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# Train on the encoded training split, then score once on the held-out split.
model.fit(X_train_enc, y_train_enc, epochs=15, batch_size=16, verbose=2)
_, accuracy = model.evaluate(X_test_enc, y_test_enc, verbose=0)
print(f'Accuracy: {accuracy * 100:.2f}')

Epoch 1/15
 - 6s - loss: 0.4203 - accuracy: 0.7652
Epoch 2/15
 - 5s - loss: 0.3992 - accuracy: 0.7784
Epoch 3/15
 - 5s - loss: 0.3989 - accuracy: 0.7784
Epoch 4/15
 - 5s - loss: 0.3985 - accuracy: 0.7789
Epoch 5/15
 - 5s - loss: 0.3984 - accuracy: 0.7791
Epoch 6/15
 - 5s - loss: 0.3985 - accuracy: 0.7793
Epoch 7/15
 - 5s - loss: 0.3982 - accuracy: 0.7792
Epoch 8/15
 - 5s - loss: 0.3981 - accuracy: 0.7796
Epoch 9/15
 - 5s - loss: 0.3980 - accuracy: 0.7794
Epoch 10/15
 - 5s - loss: 0.3979 - accuracy: 0.7795
Epoch 11/15
 - 5s - loss: 0.3980 - accuracy: 0.7794
Epoch 12/15


### Semantic Role Classifier

In [None]:
# Role classification only uses the rows that carry a role label.
dt_arguments = dt[dt['arg_class'].notna()]
dataset = dt_arguments.values
# Column 1 (arg_class) is the target; every column from index 2 onwards
# (d_rel, phrase_type, predicate_pos) is a feature.
y = dataset[:, 1].reshape(-1, 1)
X = dataset[:, 2:].astype(str)

# Fixed random_state keeps the 80/20 split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)
print(f"Train {X_train.shape} {y_train.shape}")
print(f"Test {X_test.shape} {y_test.shape}")

In [None]:
# One-hot encode the features and the role labels alike; prepare_inputs is
# reused for the labels so the target ends up with one column per role class.
X_train_enc, X_test_enc = prepare_inputs(X, X_train, X_test)
y_train_enc, y_test_enc = prepare_inputs(y, y_train, y_test)
print(f"Train {X_train_enc.shape} {y_train_enc.shape}")
print(f"Test {X_test_enc.shape} {y_test_enc.shape}")

In [None]:
def recall_m(y_true, y_pred):
    """Recall metric: rounded true positives over rounded actual positives.

    The element-wise product y_true * y_pred and y_true itself are clipped to
    [0, 1] and rounded before being summed; K.epsilon() is added to the
    denominator so an absence of positives does not divide by zero.
    """
    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    actual_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    return hits / (actual_positives + K.epsilon())

def precision_m(y_true, y_pred):
    """Precision metric: rounded true positives over rounded predicted positives.

    The element-wise product y_true * y_pred and y_pred itself are clipped to
    [0, 1] and rounded before being summed; K.epsilon() is added to the
    denominator so an absence of predicted positives does not divide by zero.
    """
    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    flagged_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    return hits / (flagged_positives + K.epsilon())

def f1_m(y_true, y_pred):
    """F1 metric: harmonic mean of precision_m and recall_m.

    K.epsilon() is added to the denominator so that zero precision and zero
    recall do not divide by zero.
    """
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    return 2 * ((p * r) / (p + r + K.epsilon()))

In [None]:
# Multi-class role classifier: one hidden ReLU layer of 10 units (He-normal
# init) and one softmax output per column of the one-hot encoded target.
model2 = Sequential([
    Dense(10, input_dim=X_train_enc.shape[1], activation='relu', kernel_initializer='he_normal'),
    Dense(y_train_enc.shape[1], activation='softmax'),
])
model2.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy', recall_m, f1_m, precision_m],
)
# Train on the encoded training split.
model2.fit(X_train_enc, y_train_enc, epochs=20, batch_size=16, verbose=2)

## Results

In [None]:
# Confusion Matrix

# predict_classes is not available in newer Keras/TensorFlow releases; the
# argmax over the softmax outputs yields the same class indices.
y_pred_enc = np.argmax(model2.predict(X_test_enc), axis=1)

# Recover the true class indices from the one-hot targets themselves, so they
# use exactly the column order the model was trained on. A separate
# LabelEncoder fitted on y_train alone can disagree with the one-hot encoder
# (which is fitted on all of y) and raises on labels missing from y_train.
# np.asarray(...).ravel() handles both dense arrays and scipy sparse matrices.
y_test_a = np.asarray(y_test_enc.argmax(axis=1)).ravel()

# The one-hot columns follow the sorted distinct labels of y, so this gives
# the tick labels in column order without hard-coding the number of classes.
labels = np.unique(np.ravel(y).astype(str))

# num_classes pins the matrix size to the label list even when the highest
# class index does not occur in the test split or in the predictions.
con_mat = tf.math.confusion_matrix(
    labels=y_test_a, predictions=y_pred_enc, num_classes=len(labels)
).numpy()

fig, ax = plt.subplots(figsize=(20, 15))
sns.set()
confusion_matrix = sns.heatmap(
    con_mat, annot=True, xticklabels=labels, yticklabels=labels, fmt='g', ax=ax
)
# Rows of tf.math.confusion_matrix are true labels, columns are predictions.
ax.set(title='Semantic role confusion matrix', xlabel='Predicted role', ylabel='True role')
print("Confusion Matrix:")


In [None]:
# evaluate() returns the loss followed by the metrics in the order they were
# given to compile(): accuracy, recall_m, f1_m, precision_m.
loss, accuracy, recall, f1_score, precision = model2.evaluate(
    X_test_enc, y_test_enc, verbose=0
)
print("Scores:")
print(
    f"Accuracy: {accuracy}\n"
    f"Precision: {precision}\n"
    f"Recall: {recall}\n"
    f"F1-score: {f1_score}\n"
    f"Loss: {loss}\n"
)

In [None]:
# Create the output directory from Python instead of `!mkdir -p`, which only
# works when the kernel can reach a POSIX shell. exist_ok makes re-runs safe.
os.makedirs('saved_models', exist_ok=True)
# Persist both trained models: argument identification and role classification.
model.save('saved_models/identification_model')
model2.save('saved_models/classification_model')