In [0]:
!unzip fnc-1.zip
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove*.zip


In [0]:
import os

os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

import tensorflow as tf

tf_config = tf.ConfigProto()
tf_config.gpu_options.allow_growth = True
sess = tf.Session(config=tf_config)

In [3]:
import gensim
import keras
import numpy as np
import pandas as pd
import pickle
import warnings
warnings.filterwarnings("ignore")


from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Input, Dense, LSTM, Embedding, Dropout, BatchNormalization, Activation, Bidirectional
from keras.preprocessing.text import text_to_word_sequence, Tokenizer
from keras.preprocessing.sequence import pad_sequences


import matplotlib as mpl
%matplotlib inline
from matplotlib import pyplot as plt
from keras.utils import plot_model 
from IPython.display import Image
from gensim.scripts.glove2word2vec import glove2word2vec
from sklearn.preprocessing import LabelEncoder



np.random.seed(1003)

Using TensorFlow backend.


### Python 3.6.1

### Specify Hyperparameters

In [0]:
# Specify the folder locations
W2V_DIR = 'https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz'
DATA_DIR = 'fnc-1/'
Glove = 'glove.6B.200d.txt'

# These are some hyperparameters that can be tuned
MAX_SENT_LEN = 170
MAX_VOCAB_SIZE = 400000
LSTM_DIM = 128
EMBEDDING_DIM = 200
EMBEDDING_DIM = 300
BATCH_SIZE = 200
N_EPOCHS = 10


### Load Data

In [0]:
# Read the text files of positive and negative sentences
train_bodies = pd.read_csv(DATA_DIR+'train_bodies.csv')
train_stances = pd.read_csv(DATA_DIR+'train_stances.csv')

test_bodies = pd.read_csv(DATA_DIR+'test_bodies.csv')
test_stances_unlabeled = pd.read_csv(DATA_DIR+'test_stances_unlabeled.csv')

#competetion_bodies = pd.read_csv(DATA_DIR+'competition_test_bodies.csv')
#competetion_stances = pd.read_csv(DATA_DIR+'competition_test_stances.csv')

#competetion_unlabeled = pd.read_csv(DATA_DIR+'competition_test_stances_unlabeled.csv')








In [7]:
print('Number of Body sentences:', len(train_bodies['articleBody']))
print('Number of Headlines sentences:', len(train_stances['Headline']))

train = train_stances.join(train_bodies.set_index('Body ID'), on='Body ID')
test = test_stances_unlabeled.join(test_bodies.set_index('Body ID'), on='Body ID')
#comp = competetion_stances.join(competetion_bodies.set_index('Body ID'), on='Body ID')


Number of Body sentences: 1683
Number of Headlines sentences: 49972


In [0]:
train.replace('unrelated',1,True)
train.replace('agree',2,True)
train.replace('disagree',3,True)
train.replace('discuss',4,True)

#comp.replace('unrelated',1,True)
#comp.replace('agree',2,True)
#comp.replace('disagree',3,True)
#comp.replace('discuss',4,True)

In [9]:
# Pre-processing involves removal of puctuations and converting text to lower case
word_seq_headline_train = [text_to_word_sequence(sent) for sent in train['Headline']]
word_seq_bodies_train = [text_to_word_sequence(sent) for sent in train['articleBody']]

word_seq_headline_test = [text_to_word_sequence(sent) for sent in test['Headline']]
word_seq_bodies_test = [text_to_word_sequence(sent) for sent in test['articleBody']]

#word_seq_headline_comp = [text_to_word_sequence(sent) for sent in comp['Headline']]
#word_seq_bodies_comp = [text_to_word_sequence(sent) for sent in comp['articleBody']]

print('90th Percentile Sentence Length:', np.percentile([len(seq) for seq in word_seq_headline_train], 90))
print('90th Percentile Sentence Length:', np.percentile([len(seq) for seq in word_seq_bodies_train], 90))

print('90th Percentile Sentence Length:', np.percentile([len(seq) for seq in word_seq_headline_test], 90))
print('90th Percentile Sentence Length:', np.percentile([len(seq) for seq in word_seq_bodies_test], 90))

90th Percentile Sentence Length: 16.0
90th Percentile Sentence Length: 683.0
90th Percentile Sentence Length: 16.0
90th Percentile Sentence Length: 657.0


In [0]:
word_seq = []
for i in range(len(word_seq_headline_train)):
  word_seq.append(word_seq_headline_train[i])
  
for i in range(len(word_seq_bodies_train)):
  word_seq.append(word_seq_bodies_train[i])

for i in range(len(word_seq_headline_test)):
  word_seq.append(word_seq_headline_test[i])

for i in range(len(word_seq_bodies_test)):
  word_seq.append(word_seq_bodies_test[i])
'''  
for i in range(len(word_seq_headline_comp)):
  word_seq.append(word_seq_headline_comp[i])
  
for i in range(len(word_seq_bodies_comp)):
  word_seq.append(word_seq_bodies_comp[i])
'''


In [0]:
for i in range (len(word_seq_headline_train)):
  word_seq_headline_train[i].extend(word_seq_bodies_train[i])

  
for i in range (len(word_seq_headline_test)):
  word_seq_headline_test[i].extend(word_seq_bodies_test[i])
  

#for i in range (len(word_seq_headline_comp)):
 # word_seq_headline_comp[i].extend(word_seq_bodies_comp[i])




In [12]:
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
tokenizer.fit_on_texts([' '.join(seq[:MAX_SENT_LEN]) for seq in word_seq])

print("Number of words in vocabulary:", len(tokenizer.word_index))

Number of words in vocabulary: 22380


In [13]:
# Convert the sequence of words to sequnce of indices
X_train = tokenizer.texts_to_sequences([' '.join(seq[:MAX_SENT_LEN]) for seq in word_seq_headline_train])
X_train = pad_sequences(X_train, maxlen=MAX_SENT_LEN, padding='post', truncating='post')
y_train = train['Stance']
y_train = y_train.values

print (X_train.shape)
print (y_train.shape)


X_test = tokenizer.texts_to_sequences([' '.join(seq[:MAX_SENT_LEN]) for seq in word_seq_headline_comp])
X_test = pad_sequences(X_test, maxlen=MAX_SENT_LEN, padding='post', truncating='post')

#X_comp = tokenizer.texts_to_sequences([' '.join(seq[:MAX_SENT_LEN]) for seq in word_seq_headline_comp])
#X_comp = pad_sequences(X_comp, maxlen=MAX_SENT_LEN, padding='post', truncating='post')
#y_comp = comp['Stance']
#y_comp = y_comp.values



(49972, 170)
(49972,)


In [14]:
print (np.unique((y_train)))
print (np.unique((y_comp)))

[1 2 3 4]
[1 2 3 4]


In [0]:
from keras.utils import np_utils

encoder_train = LabelEncoder()
encoder_train.fit(y_train)
encoded_train = encoder_train.transform(y_train)
# convert integers to dummy variables (i.e. one hot encoded)
dummy_y_train = np_utils.to_categorical(encoded_train)

'''
encoder_comp = LabelEncoder()
encoder_comp.fit(y_comp)
encoded_comp = encoder_comp.transform(y_comp)
# convert integers to dummy variables (i.e. one hot encoded)
dummy_y_comp = np_utils.to_categorical(encoded_comp)
'''

In [16]:
X_train, X_val, y_train, y_val = train_test_split(X_train, dummy_y_train, random_state=10, test_size=0.1)
#X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, random_state=10, test_size=0.1)
print (np.unique((y_train)))
print (np.unique((y_val)))

[0. 1.]
[0. 1.]


In [0]:
embeddings = gensim.models.KeyedVectors.load_word2vec_format(W2V_DIR, binary=True)




In [0]:
glove2word2vec(glove_input_file=Glove, word2vec_output_file="gensim_glove_vectors.txt")
embeddings = gensim.models.KeyedVectors.load_word2vec_format("gensim_glove_vectors.txt", binary=False)


In [59]:
print('Number of words in this pre-trained w2v model:', len(embeddings.vocab))
print('Dimension of w2v:', embeddings.vector_size)

Number of words in this pre-trained w2v model: 400000
Dimension of w2v: 200


In [0]:
# Create an embedding matrix containing only the word's in our vocabulary
# If the word does not have a pre-trained embedding, then randomly initialize the embedding

embeddings_matrix = np.random.uniform(-0.05, 0.05, size=(len(tokenizer.word_index)+1, EMBEDDING_DIM)) # +1 is because the matrix indices start with 0

for word, i in tokenizer.word_index.items(): # i=0 is the embedding for the zero padding
    try:
        embeddings_vector = embeddings[word]
    except KeyError:
        embeddings_vector = None
    if embeddings_vector is not None:
        embeddings_matrix[i] = embeddings_vector
        
del embeddings

### Keras Sequential API

In [62]:
# Build a sequential model by stacking neural net units 
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index)+1,
                          output_dim=EMBEDDING_DIM,
                          weights = [embeddings_matrix], trainable=True, name='word_embedding_layer' 
                          ))
model.add(LSTM(LSTM_DIM, return_sequences=False, name='lstm_layer'))
#model.add(Bidirectional(LSTM(LSTM_DIM, return_sequences=False, name='Bidrectional_lstm_layer1')))
model.add(Dropout(rate=0.8, name='dropout_1')) # Can try varying dropout rates, in paper suggest 0.8
model.add(Dense(4, activation='softmax', name='output_layer'))


W0724 19:32:06.858913 140519608772480 nn_ops.py:4224] Large dropout rate: 0.8 (>0.5). In TensorFlow 2.x, dropout() uses dropout rate instead of keep_prob. Please ensure that this is intended.


In [63]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
word_embedding_layer (Embedd (None, None, 200)         4476200   
_________________________________________________________________
bidirectional_3 (Bidirection (None, 256)               336896    
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
output_layer (Dense)         (None, 4)                 1028      
Total params: 4,814,124
Trainable params: 4,814,124
Non-trainable params: 0
_________________________________________________________________


In [0]:
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [0]:
history = model.fit(X_train, y_train,
          batch_size=BATCH_SIZE,
          epochs=N_EPOCHS,
          validation_data=(X_val, y_val))



In [0]:
with open('LSTM+W2V', 'wb') as file_pi:
        pickle.dump(history.history, file_pi)

In [0]:
from keras.models import load_model

model.save('LSTM+W2V.h5')

**Bi-directional W2V+Glove with activation layer**



In [0]:
# Build a sequential model by stacking neural net units 
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index)+1,
                          output_dim=EMBEDDING_DIM,
                          weights = [embeddings_matrix], trainable=False, name='word_embedding_layer', 
                          mask_zero=True)) # trainable=True results in overfitting

#model.add(LSTM(LSTM_DIM, return_sequences=False, name='lstm_layer')) # Can try Bidirectional-LSTM
model.add(Bidirectional(LSTM(LSTM_DIM, return_sequences=False, name='Bidrectional_lstm_layer1')))

model.add(Dense(32, name='dense_1'))
# model.add(BatchNormalization(name='bn_1')) # BN did not really help with performance 
model.add(Dropout(rate=0.3, name='dropout_1')) # Can try varying dropout rates
model.add(Activation(activation='relu', name='activation_1'))

model.add(Dense(8, name='dense_2'))
# model.add(BatchNormalization(name='bn_2'))
model.add(Dropout(rate=0.3, name='dropout_2'))
model.add(Activation(activation='relu', name='activation_2'))

model.add(Dense(4, activation='softmax', name='output_layer'))
#model.add(Dense(1, activation='sigmoid', name='output_layer'))

model.summary()

In [0]:
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [0]:
history = model.fit(X_train, y_train,
          batch_size=BATCH_SIZE,
          epochs=1,
          validation_data=(X_val, y_val))

In [0]:
with open('Bi-LSTM+Glove+Activation', 'wb') as file_pi:
        pickle.dump(history.history, file_pi)

In [0]:
from keras.models import load_model

model.save('BiLSTMActivation+W2V.h5')