**Bidirectional LSTM with W2V and GloVe**


In [0]:
!unzip fnc-1.zip
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove*.zip
!unzip Models.zip


In [0]:
import os

os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

import tensorflow as tf

tf_config = tf.ConfigProto()
tf_config.gpu_options.allow_growth = True
sess = tf.Session(config=tf_config)

In [0]:
# Specify the folder locations
W2V_DIR = 'https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz'
DATA_DIR = 'fnc-1/'
MODEL_DIR = 'Models/'
Glove = 'glove.6B.200d.txt'


In [0]:
import gensim
import keras
import numpy as np
import pandas as pd
import pickle
import warnings
warnings.filterwarnings("ignore")
from keras.utils import np_utils


from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Input, Dense, LSTM, Embedding, Dropout, BatchNormalization, Activation, Bidirectional
from keras.preprocessing.text import text_to_word_sequence, Tokenizer
from keras.preprocessing.sequence import pad_sequences
from gensim.models import KeyedVectors



import matplotlib as mpl
%matplotlib inline
from matplotlib import pyplot as plt
from keras.utils import plot_model 
from IPython.display import Image
from gensim.scripts.glove2word2vec import glove2word2vec
from sklearn.preprocessing import LabelEncoder



np.random.seed(1003)

In [0]:
MAX_SENT_LEN = 170
MAX_VOCAB_SIZE = 400000
LSTM_DIM = 128
#EMBEDDING_DIM = 200
EMBEDDING_DIM = 300
BATCH_SIZE = 200
N_EPOCHS = 10

In [0]:
train_bodies = pd.read_csv(DATA_DIR+'train_bodies.csv')
train_stances = pd.read_csv(DATA_DIR+'train_stances.csv')

test_bodies = pd.read_csv(DATA_DIR+'test_bodies.csv')
test_stances_unlabeled = pd.read_csv(DATA_DIR+'test_stances_unlabeled.csv')

competetion_bodies = pd.read_csv(DATA_DIR+'competition_test_bodies.csv')
competetion_stances = pd.read_csv(DATA_DIR+'competition_test_stances.csv')

competetion_unlabeled = pd.read_csv(DATA_DIR+'competition_test_stances_unlabeled.csv')




In [0]:
train = train_stances.join(train_bodies.set_index('Body ID'), on='Body ID')
test = test_stances_unlabeled.join(test_bodies.set_index('Body ID'), on='Body ID')
comp = competetion_stances.join(competetion_bodies.set_index('Body ID'), on='Body ID')



In [0]:
train.replace('unrelated',1,True)
train.replace('agree',2,True)
train.replace('disagree',3,True)
train.replace('discuss',4,True)

comp.replace('unrelated',1,True)
comp.replace('agree',2,True)
comp.replace('disagree',3,True)
comp.replace('discuss',4,True)

In [0]:
word_seq_headline_train = [text_to_word_sequence(sent) for sent in train['Headline']]
word_seq_bodies_train = [text_to_word_sequence(sent) for sent in train['articleBody']]

word_seq_headline_test = [text_to_word_sequence(sent) for sent in test['Headline']]
word_seq_bodies_test = [text_to_word_sequence(sent) for sent in test['articleBody']]

word_seq_headline_comp = [text_to_word_sequence(sent) for sent in comp['Headline']]
word_seq_bodies_comp = [text_to_word_sequence(sent) for sent in comp['articleBody']]

In [0]:
word_seq = []
for i in range(len(word_seq_headline_train)):
  word_seq.append(word_seq_headline_train[i])
  
for i in range(len(word_seq_bodies_train)):
  word_seq.append(word_seq_bodies_train[i])

for i in range(len(word_seq_headline_test)):
  word_seq.append(word_seq_headline_test[i])

for i in range(len(word_seq_bodies_test)):
  word_seq.append(word_seq_bodies_test[i])
  
for i in range(len(word_seq_headline_comp)):
  word_seq.append(word_seq_headline_comp[i])
  
for i in range(len(word_seq_bodies_comp)):
  word_seq.append(word_seq_bodies_comp[i])
 

In [0]:
for i in range (len(word_seq_headline_comp)):
  word_seq_headline_comp[i].extend(word_seq_bodies_comp[i])



In [0]:
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
tokenizer.fit_on_texts([' '.join(seq[:MAX_SENT_LEN]) for seq in word_seq])


In [0]:
X_comp = tokenizer.texts_to_sequences([' '.join(seq[:MAX_SENT_LEN]) for seq in word_seq_headline_comp])
X_comp = pad_sequences(X_comp, maxlen=MAX_SENT_LEN, padding='post', truncating='post')
y_comp = comp['Stance']
y_comp = y_comp.values


In [0]:
encoder_comp = LabelEncoder()
encoder_comp.fit(y_comp)
encoded_comp = encoder_comp.transform(y_comp)
# convert integers to dummy variables (i.e. one hot encoded)
dummy_y_comp = np_utils.to_categorical(encoded_comp)

In [114]:
from keras import models
model1 = models.load_model(MODEL_DIR+'LSTM+W2V.h5')
y_pred1 = model1.predict(X_comp)
score,acc = model1.evaluate(X_comp, dummy_y_comp)
print ("LSTM with W2V")
print (score)





LSTM with W2V
1.2471345294789826


In [115]:
model2 = models.load_model(MODEL_DIR+'LSTM+Glove.h5')
y_pred2 = model2.predict(X_comp)
score,acc = model2.evaluate(X_comp, dummy_y_comp)
print ("LSTM with Glove")
print (score)



LSTM with Glove
0.853046915201202


In [116]:
model3 = models.load_model(MODEL_DIR+'BiLSTM+W2V.h5')
y_pred3 = model3.predict(X_comp)
score,acc = model3.evaluate(X_comp, dummy_y_comp)
print ("BiLSTM with W2V")
print (score)



BiLSTM with W2V
1.4881437730419544


In [146]:
model4 = models.load_model(MODEL_DIR+'BiLSTM+Glove.h5')
y_pred4 = model4.predict(X_comp)
score,acc = model4.evaluate(X_comp, dummy_y_comp)
4


BiLSTM with Glove
0.9865527658597997


In [118]:
model5 = models.load_model(MODEL_DIR+'BiLSTMActivation+W2V.h5')
y_pred5 = model5.predict(X_comp)
score,acc = model5.evaluate(X_comp, dummy_y_comp)
print ("BiLSTMActivtion with W2V")
print (score)

BiLSTMActivtion with W2V
0.8487603987693524


In [154]:
outputs = [np.argmax(p) for p in y_pred1]

for i in range(len(outputs)):
    if outputs[i] == 0: outputs[i] = "unrelated"
    if outputs[i] == 1: outputs[i] = "disagree"
    if outputs[i] == 2: outputs[i] = "agree"
    if outputs[i] == 3: outputs[i] = "discuss"
print (np.unique(outputs))

['agree' 'disagree' 'discuss' 'unrelated']


In [0]:
Predicted = {}
Predicted = pd.DataFrame({'Stance': outputs})
result = pd.conc,,at([competetion_unlabeled, Predicted], axis=1, sort=False)
result.to_csv('submission.csv', index=False, encoding='utf-8')

In [155]:
cs = pd.read_csv(DATA_DIR+'competition_test_stances.csv')
stance_true = cs['Stance'].values
from score import report_score
report_score(stance_true, outputs)


40.75313807531381
