In [94]:
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from keras import Input, Model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing import sequence
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Embedding, Dense, Dropout, Reshape, Concatenate, BatchNormalization, TimeDistributed, Lambda, Activation, LSTM, Flatten, Convolution1D, GRU, MaxPooling1D
from keras.layers import Bidirectional, InputLayer
from keras.constraints import maxnorm
from keras.regularizers import l2
from keras.callbacks import ModelCheckpoint
import h5py
from sklearn.metrics import classification_report , precision_recall_fscore_support,precision_score,recall_score,f1_score

In [9]:
def _load_arg_features(split_name):
    """Load the pickled argument-feature object for one corpus split.

    Args:
        split_name: Middle part of the file name, e.g. ``'wsj_train'`` for
            ``../emnlp/features/arg_features_wsj_train.p``.

    Returns:
        Whatever object was pickled in that file.
    """
    # NOTE(review): pickle.load can execute arbitrary code -- only point this at trusted files.
    # The context manager closes the handle; the bare open() calls used before never did.
    with open('../emnlp/features/arg_features_{}.p'.format(split_name), 'rb') as handle:
        return pickle.load(handle)


wsj_train = _load_arg_features('wsj_train')
wsj_dev = _load_arg_features('wsj_dev')

brown = _load_arg_features('brown')

nyt_def = _load_arg_features('nyt_def')
nyt_med = _load_arg_features('nyt_med')
nyt_def_para = _load_arg_features('nyt_def_para')
nyt_med_para = _load_arg_features('nyt_med_para')

In [59]:
def _labels_as_int_array(publisher_dict):
    """Return the entries stored under ``publisher_dict['labels']`` cast to int, as a NumPy array."""
    return np.array(list(map(int, publisher_dict['labels'])))


# One integer label array per corpus split, in the order stored in the split's 'labels' entry.
wsj_train_labels = _labels_as_int_array(wsj_train)
wsj_dev_labels = _labels_as_int_array(wsj_dev)

brown_labels = _labels_as_int_array(brown)

nyt_def_labels = _labels_as_int_array(nyt_def)
nyt_med_labels = _labels_as_int_array(nyt_med)
nyt_def_para_labels = _labels_as_int_array(nyt_def_para)
nyt_med_para_labels = _labels_as_int_array(nyt_med_para)

In [31]:
def _load_layer4_embeddings(split_name):
    """Load the pickled layer-4 embeddings of one corpus split as a NumPy array.

    Args:
        split_name: Leading part of the file name, e.g. ``'wsj_train'`` for
            ``../emnlp/features/wsj_train_embeddings_layer4.p``.

    Returns:
        ``np.array`` built from the unpickled object.
    """
    # NOTE(review): pickle.load can execute arbitrary code -- only point this at trusted files.
    # The context manager closes the handle; the bare open() calls used before never did.
    with open('../emnlp/features/{}_embeddings_layer4.p'.format(split_name), 'rb') as handle:
        return np.array(pickle.load(handle))


wsj_train_emb = _load_layer4_embeddings('wsj_train')
wsj_dev_emb = _load_layer4_embeddings('wsj_dev')

brown_emb = _load_layer4_embeddings('brown')

nyt_def_emb = _load_layer4_embeddings('nyt_def')
nyt_med_emb = _load_layer4_embeddings('nyt_med')
nyt_def_para_emb = _load_layer4_embeddings('nyt_def_para')
nyt_med_para_emb = _load_layer4_embeddings('nyt_med_para')

In [57]:
# Per-article sequences of sentence-level predictions, ordered like each split's 'original_data'.
wsj_train_arg, wsj_dev_arg = (
    extract_features_and_reorder(split) for split in (wsj_train, wsj_dev)
)

nyt_def_arg, nyt_med_arg, nyt_def_para_arg, nyt_med_para_arg = (
    extract_features_and_reorder(split)
    for split in (nyt_def, nyt_med, nyt_def_para, nyt_med_para)
)

## BERT

In [32]:
# Embedding-only classifier: a single sigmoid unit applied directly to the
# precomputed embedding vector (input width is taken from the training embeddings).
model = Sequential()
model.add(InputLayer(input_shape=(wsj_train_emb[0].shape[0],)))
model.add(Dense(1,activity_regularizer=l2(0.0001)))
model.add(Activation('sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the cell output.
model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_2 (Dense)              (None, 1)                 769       
_________________________________________________________________
activation_2 (Activation)    (None, 1)                 0         
Total params: 769
Trainable params: 769
Non-trainable params: 0
_________________________________________________________________
None


In [34]:
# batch_size is otherwise first assigned in the RNN section further down, so a
# fresh top-to-bottom run would hit a NameError here; define it before use.
batch_size = 32

# Train the embedding-only classifier for one epoch, validating on the WSJ dev split.
model.fit(wsj_train_emb, wsj_train_labels,
          batch_size=batch_size,
          epochs=1,
          validation_data=(wsj_dev_emb, wsj_dev_labels))



<tensorflow.python.keras.callbacks.History at 0x7f81f1e80490>

In [35]:
# Report loss and accuracy of the embedding-only classifier on each NYT set.
for embeddings, labels, name in (
    (nyt_def_emb, nyt_def_labels, 'nyt_def'),
    (nyt_med_emb, nyt_med_labels, 'nyt_med'),
    (nyt_def_para_emb, nyt_def_para_labels, 'nyt_def_para'),
    (nyt_med_para_emb, nyt_med_para_labels, 'nyt_med_para'),
):
    print(name)
    score, acc = model.evaluate(embeddings, labels)
    print('Test score:', score)
    print('Test accuracy:', acc)

nyt_def
Test score: 0.646885335445404
Test accuracy: 0.7950000166893005
nyt_med
Test score: 0.7022445201873779
Test accuracy: 0.7875000238418579
nyt_def_para
Test score: 0.6548301577568054
Test accuracy: 0.7839999794960022
nyt_med_para
Test score: 0.7410150170326233
Test accuracy: 0.7524999976158142


## RNN

In [56]:
# RNN baseline: trains a SimpleRNN on the padded per-article sequences in
# wsj_train_arg and reports loss/accuracy on the WSJ dev split.
from __future__ import print_function

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding
from keras.layers import LSTM, SimpleRNN
from keras.datasets import imdb

# vocabulary size handed to the Embedding layer below (its first argument)
max_features = 3
# length every sequence is brought to by pad_sequences below (see the printed shapes)
maxlen = 80
# mini-batch size for fit/evaluate in this cell; later cells reuse this name
batch_size = 32

print('Loading data...')
# (x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
# NOTE: x_test / y_test hold the WSJ *dev* split, not a held-out test set.
x_train, y_train, x_test, y_test = wsj_train_arg, wsj_train_labels, wsj_dev_arg, wsj_dev_labels

print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')

print('Pad sequences (samples x time)')
# NOTE(review): pad_sequences presumably fills with 0 by default; if the input
# sequences also use 0 as a label, padding and that label share one embedding
# index -- confirm this is intended.
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

print('Build model...')
# Embedding -> SimpleRNN -> single sigmoid unit for binary classification.
model = Sequential()
model.add(Embedding(max_features, 128))
model.add(SimpleRNN(128, dropout=0.2)) #, recurrent_dropout=0.5
model.add(Dense(1, activation='sigmoid'))

# try using different optimizers and different optimizer configs
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

print('Train...')
# The dev split serves as validation data here and is evaluated again below.
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=5,
          validation_data=(x_test, y_test))

# model.save_weights("model_lstm_15ep.h5")

# "Test" in the messages below refers to the WSJ dev split bound to x_test/y_test above.
score, acc = model.evaluate(x_test, y_test, batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

Loading data...
3502 train sequences
1000 test sequences
Pad sequences (samples x time)
x_train shape: (3502, 80)
x_test shape: (1000, 80)
Build model...
Train...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test score: 0.2144295573234558
Test accuracy: 0.925000011920929


In [62]:
# Evaluate the 5-epoch SimpleRNN on each NYT set, padding exactly as in training.
for features, labels, name in (
    (nyt_def_arg, nyt_def_labels, 'nyt_def'),
    (nyt_med_arg, nyt_med_labels, 'nyt_med'),
    (nyt_def_para_arg, nyt_def_para_labels, 'nyt_def_para'),
    (nyt_med_para_arg, nyt_med_para_labels, 'nyt_med_para'),
):
    print(name)
    x_test = sequence.pad_sequences(features, maxlen=maxlen)
    score, acc = model.evaluate(x_test, labels, batch_size=batch_size)
    print('Test score:', score)
    print('Test accuracy:', acc)

nyt_def
Test score: 0.26058417558670044
Test accuracy: 0.9035000205039978
nyt_med
Test score: 0.3319821059703827
Test accuracy: 0.871999979019165
nyt_def_para
Test score: 0.8388977646827698
Test accuracy: 0.5920000076293945
nyt_med_para
Test score: 0.802855372428894
Test accuracy: 0.609000027179718


## RNN + BERT

In [111]:
# Two-branch classifier:
#   * a dense branch over the 768-dimensional embedding vector, and
#   * an Embedding + SimpleRNN branch over the padded per-article sequence,
# concatenated and fed to a single sigmoid output unit.
input_emb = Input(shape=(768,))
dense_1 = Dense(128, activation='relu', activity_regularizer=l2(0.0001))(input_emb)
dropout_1 = Dropout(0.5)(dense_1)

# maxlen / max_features are the values assigned in the RNN section above.
input_arg = Input(shape=(maxlen,))
model_arg = Embedding(max_features, 128)(input_arg)
model_arg = SimpleRNN(128, dropout=0.2)(model_arg)

# The lowercase `concatenate` helper is never imported in this notebook, so the
# call raised NameError on a fresh kernel; the Concatenate layer imported from
# keras.layers at the top performs the same merge.
merged = Concatenate()([dropout_1, model_arg])
dense_pred = Dense(1, activation='sigmoid')(merged)

model = Model(inputs=[input_emb, input_arg], outputs=dense_pred)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None; print() around it only added "None".
model.summary()

Model: "model_6"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_49 (InputLayer)           [(None, 768)]        0                                            
__________________________________________________________________________________________________
input_50 (InputLayer)           [(None, 80)]         0                                            
__________________________________________________________________________________________________
dense_22 (Dense)                (None, 128)          98432       input_49[0][0]                   
__________________________________________________________________________________________________
embedding_32 (Embedding)        (None, 80, 128)      384         input_50[0][0]                   
____________________________________________________________________________________________

In [112]:
# Pad the per-article sequences to the RNN input length for both WSJ splits.
x_train_arg, x_dev_arg = (
    sequence.pad_sequences(arg_sequences, maxlen=maxlen)
    for arg_sequences in (wsj_train_arg, wsj_dev_arg)
)

# Train both branches jointly for three epochs, validating on the WSJ dev split.
model.fit(
    [wsj_train_emb, x_train_arg],
    wsj_train_labels,
    batch_size=batch_size,
    epochs=3,
    validation_data=([wsj_dev_emb, x_dev_arg], wsj_dev_labels),
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7f81e9dd9550>

In [110]:
# Evaluate the combined embedding + RNN model on each NYT set.
for emb, arg, labels, name in (
    (nyt_def_emb, nyt_def_arg, nyt_def_labels, 'nyt_def'),
    (nyt_med_emb, nyt_med_arg, nyt_med_labels, 'nyt_med'),
    (nyt_def_para_emb, nyt_def_para_arg, nyt_def_para_labels, 'nyt_def_para'),
    (nyt_med_para_emb, nyt_med_para_arg, nyt_med_para_labels, 'nyt_med_para'),
):
    print(name)
    arg_seq = sequence.pad_sequences(arg, maxlen=maxlen)
    score, acc = model.evaluate([emb, arg_seq], labels, batch_size=batch_size)
    print('Test score:', score)
    print('Test accuracy:', acc)

nyt_def
Test score: 0.999777615070343
Test accuracy: 0.7854999899864197
nyt_med
Test score: 1.041422963142395
Test accuracy: 0.7785000205039978
nyt_def_para
Test score: 0.9296719431877136
Test accuracy: 0.7749999761581421
nyt_med_para
Test score: 1.0192060470581055
Test accuracy: 0.7444999814033508


## Functions

In [37]:
def artcile_split_and_pred_para(sentences, predictions):
    """Split parallel sentence/prediction lines into per-article lists.

    Args:
        sentences: Sequence of tab-separated lines; the text before the first
            tab is kept as the sentence. The exact line
            ``'ARTICLE_SPLIT_LINE\\t0\\n'`` marks the end of an article.
        predictions: Sequence of strings, one per entry of ``sentences``, each
            parseable as an int after stripping trailing whitespace. The
            prediction paired with a separator line is ignored.

    Returns:
        A list with one entry per article; each entry is a list of
        ``(sentence_text, int_prediction)`` tuples in input order. Two
        consecutive separator lines yield an empty article.

    Raises:
        AssertionError: If the two inputs differ in length.
    """
    assert len(sentences) == len(predictions)

    article, article_splits = [], []
    for sent, pred in zip(sentences, predictions):
        if sent == 'ARTICLE_SPLIT_LINE\t0\n':
            article_splits.append(article)
            article = []
        else:
            article.append((sent.split('\t')[0], int(pred.rstrip())))

    # Input that does not end with a separator line used to lose its last
    # article silently; flush whatever is still pending.
    if article:
        article_splits.append(article)

    return article_splits


def article_sent_labels_emb(article_sent_labels):
    """Return only the labels of a sequence of ``(sentence, label)`` pairs, in order."""
    return [label for _, label in article_sent_labels]

def _rows_to_array(rows):
    """Convert a list of label lists to a NumPy array, tolerating ragged rows.

    Equal-length rows go through ``np.array`` unchanged. Rows of differing
    length become a 1-D object array holding the original lists; recent NumPy
    versions raise ``ValueError`` if asked to infer that layout implicitly.
    """
    if len({len(row) for row in rows}) <= 1:
        return np.array(rows)
    ragged = np.empty(len(rows), dtype=object)
    for position, row in enumerate(rows):
        ragged[position] = row
    return ragged


def prepare_data(publisher_dict, pred='bert', mode='not_categorical'):
    """Build per-article prediction sequences and news/op labels for one publisher.

    Args:
        publisher_dict: Mapping providing ``'news_sent'`` and ``'op_sent'`` plus
            the matching ``'news_pred_<model>'`` / ``'op_pred_<model>'`` entries.
        pred: ``'bert'`` selects the ``*_pred_bert`` entries; any other value
            selects the ``*_pred_roberta`` entries.
        mode: Unused; kept so existing call sites keep working.

    Returns:
        ``(features, labels)`` as NumPy arrays. ``features`` holds one sequence
        of sentence-level predictions per article, all news articles first and
        then all op articles; ``labels`` is 0 for news and 1 for op.
    """
    model_key = 'bert' if pred == 'bert' else 'roberta'
    news_arg = artcile_split_and_pred_para(publisher_dict['news_sent'],
                                           publisher_dict['news_pred_' + model_key])
    op_arg = artcile_split_and_pred_para(publisher_dict['op_sent'],
                                         publisher_dict['op_pred_' + model_key])

    features = [article_sent_labels_emb(article) for article in news_arg]
    features.extend(article_sent_labels_emb(article) for article in op_arg)
    labels = [0] * len(news_arg) + [1] * len(op_arg)

    return _rows_to_array(features), np.array(labels)

In [52]:
def extract_features_and_reorder(publisher_dict, pred='bert'):
    '''Return per-article prediction sequences in the order of the original data.

    News and op articles are split separately with
    ``artcile_split_and_pred_para`` and then interleaved again by walking
    ``publisher_dict['original_data']``: a label of 1 takes the next op
    article, a label of 0 the next news article.

    Args:
        publisher_dict: Mapping providing ``'original_data'`` (an iterable of
            ``(article, label)`` pairs), ``'news_sent'``, ``'op_sent'`` and the
            matching ``'news_pred_<model>'`` / ``'op_pred_<model>'`` entries.
        pred: ``'bert'`` selects the ``*_pred_bert`` entries; any other value
            selects the ``*_pred_roberta`` entries.

    Returns:
        A list with one list of sentence-level predictions per article,
        aligned with ``original_data``.

    Raises:
        AssertionError: If the article counts do not add up, a split runs out
            of articles, or the first token of a news article differs from
            the first token of its first split sentence.
        ValueError: If a label in ``original_data`` is neither 0 nor 1.
    '''
    original_data = publisher_dict['original_data']

    # getting news/op articles and argumentative features
    model_key = 'bert' if pred == 'bert' else 'roberta'
    news_articles = artcile_split_and_pred_para(publisher_dict['news_sent'],
                                                publisher_dict['news_pred_' + model_key])
    op_articles = artcile_split_and_pred_para(publisher_dict['op_sent'],
                                              publisher_dict['op_pred_' + model_key])

    news_features = [article_sent_labels_emb(article) for article in news_articles]
    op_features = [article_sent_labels_emb(article) for article in op_articles]

    assert len(original_data) == len(news_articles) + len(op_articles)

    # merging the two lists into one following the ordering of the original data
    news_iter, op_iter, features = 0, 0, []
    for i, (article, label) in enumerate(original_data):
        label_id = int(label)

        if label_id == 1:
            # Check bounds before indexing; the old assert came after the
            # lookup and therefore could never fire.
            assert op_iter < len(op_features), 'ran out of op articles at line {}'.format(i)
            features.append(op_features[op_iter])
            if len(article) > 0:
                # Best-effort alignment check: a mismatch is reported, not raised.
                try:
                    assert op_articles[op_iter][0][0].split()[0] in article.split()[0]
                except Exception as e:
                    print(op_articles[op_iter][0][0].split())
                    print(article.split())
                    print(e)
            op_iter += 1

        elif label_id == 0:
            assert news_iter < len(news_features), 'ran out of news articles at line {}'.format(i)
            features.append(news_features[news_iter])
            if len(article) > 0:
                assert article.split()[0] == news_articles[news_iter][0][0].split()[0]
            news_iter += 1

        else:
            # Printing and breaking here returned a truncated list that no
            # longer lined up with the labels; fail loudly instead.
            raise ValueError('Wrong label {!r} at line {}'.format(label, i))

    return features