In [None]:
# Data source: clone the VinAIResearch COVID19Tweet repository; the TSV files read
# later in this notebook are loaded from the resulting COVID19Tweet/ directory.
! git clone https://github.com/VinAIResearch/COVID19Tweet.git

import numpy as np 
import pandas as pd
from keras.models import Sequential
from keras import layers
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.wrappers.scikit_learn import KerasClassifier
from keras.models import Model
from keras.callbacks import EarlyStopping,ModelCheckpoint,ReduceLROnPlateau
from tqdm import tqdm
import json
import os

# Loading hyperparameters from the args file


In [None]:
# All tunable hyperparameters live in a single JSON file.
args_file = 'cnn_args.json'

with open(args_file) as args_fp:
    args = json.load(args_fp)

# Note: 'data_dir' and 'pretrained_embeds_file' are intentionally not read here;
# the data paths and the embeddings file are set directly in later cells.

# --- text preprocessing ---
# how many words to keep in vocab
num_words = args['num_words']
# max length of a seq vector (for padding or cropping)
max_seq_len = args['max_seq_len']

# --- network architecture ---
filter_widths = args['filter_widths']
number_of_filters = args['number_of_filters']
dropout_prob = args['dropout_prob']
hidden_activation = args['hidden_activation']
n_classes = args['n_classes']

# --- optimisation ---
optimizer = args['optimizer']
BATCH_SIZE = args['BATCH_SIZE']
max_epochs = args['max_epochs']

# --- output location for the trained model ---
trained_model_dir = args['trained_model_dir']

# loading task data

In [None]:
# train.tsv is parsed with its first row as the header, whereas valid.tsv and the
# unlabeled test file get explicit column names.
# NOTE(review): with names=..., the first line of valid.tsv / the test file is
# treated as data -- confirm those files really have no header row.
train_df = pd.read_csv("COVID19Tweet/train.tsv", sep='\t')
val_df = pd.read_csv(
    "COVID19Tweet/valid.tsv",
    sep='\t',
    names=['Id', 'Text', 'Label'],
)
test_df = pd.read_csv(
    "COVID19Tweet/unlabeled_test_with_noise.tsv",
    sep='\t',
    names=['Id', 'Text'],
)

# Raw tweet texts and string labels as NumPy arrays (the test split has no labels).
train_sentences, train_labels = train_df.Text.values, train_df.Label.values
val_sentences, val_labels = val_df.Text.values, val_df.Label.values
test_sentences = test_df.Text.values

# Binary targets: 1 for 'INFORMATIVE', 0 for anything else.
y_train = np.array([int(label == 'INFORMATIVE') for label in train_labels])
y_val = np.array([int(label == 'INFORMATIVE') for label in val_labels])

# preparing text input for network using Keras tokenizer

In [None]:
# The vocabulary is fitted on the training tweets only.
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(train_sentences)

vocab_size = len(tokenizer.word_index) + 1  # Adding 1 because of reserved 0 index

# Encode each split to integer id sequences, then pad at the end ('post') to a
# fixed length of max_seq_len.
X_train, X_val, X_test = (
    pad_sequences(
        tokenizer.texts_to_sequences(texts),
        padding='post',
        maxlen=max_seq_len,
    )
    for texts in (train_sentences, val_sentences, test_sentences)
)

# loading pre-trained embeds as weights matrix


In [None]:
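# Run exactly ONE of the two embedding sections below (GloVe Twitter or W2V Twitter).
# Both define EMBEDDING_DIM and embedding_matrix, so running both keeps only the last one executed.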

## Glove Twitter Embeds

In [None]:
# -nc (no-clobber): skip the download when glove.twitter.27B.zip is already present,
# so re-running the notebook does not fetch the archive a second time.
! wget -nc http://nlp.stanford.edu/data/wordvecs/glove.twitter.27B.zip

--2020-08-09 09:40:23--  http://nlp.stanford.edu/data/wordvecs/glove.twitter.27B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/wordvecs/glove.twitter.27B.zip [following]
--2020-08-09 09:40:23--  https://nlp.stanford.edu/data/wordvecs/glove.twitter.27B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/wordvecs/glove.twitter.27B.zip [following]
--2020-08-09 09:40:23--  http://downloads.cs.stanford.edu/nlp/data/wordvecs/glove.twitter.27B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting respon

In [None]:
# -n: never overwrite files that were already extracted, so a re-run does not stop
# at an interactive "replace?" prompt.
!unzip -n glove.twitter.27B.zip

Archive:  glove.twitter.27B.zip
  inflating: glove.twitter.27B.100d.txt  
  inflating: glove.twitter.27B.200d.txt  
  inflating: glove.twitter.27B.25d.txt  
  inflating: glove.twitter.27B.50d.txt  


In [None]:
from tqdm import tqdm
pretrained_embeds_file = 'glove.twitter.27B.200d.txt'

# Build a dict mapping each token to its float32 vector.
# Each line of the file is "<token> <v1> <v2> ...", separated by single spaces.
embedding_vector = {}
# The context manager guarantees the file handle is closed (the handle was
# previously leaked), and the explicit encoding keeps parsing independent of the
# platform's default locale.
with open(pretrained_embeds_file, encoding='utf-8') as f:
    for line in tqdm(f):
        # Drop the trailing newline so the last coefficient is a clean number.
        value = line.rstrip('\n').split(' ')
        word = value[0]
        coef = np.array(value[1:], dtype='float32')
        embedding_vector[word] = coef

1193517it [01:02, 19007.90it/s]


In [None]:
# Vector size is read off the entry for the token 'test'.
EMBEDDING_DIM = embedding_vector['test'].shape[0]

# One row per tokenizer index; rows for tokens without a pretrained vector
# (and row 0) stay all-zero.
embedding_matrix = np.zeros((vocab_size, EMBEDDING_DIM))
for token, row in tqdm(tokenizer.word_index.items()):
    if token in embedding_vector:
        embedding_matrix[row] = embedding_vector[token]

100%|██████████| 22935/22935 [00:00<00:00, 537644.13it/s]


## W2V Twitter Embeds

In [None]:
from gensim.models.keyedvectors import KeyedVectors

# Load the binary word2vec Twitter vectors from the mounted Drive folder;
# undecodable bytes in the vocabulary are ignored rather than raising.
w2v_model = KeyedVectors.load_word2vec_format(
    'drive/My Drive/W-NUT COVID19/word2vec_twitter_tokens.bin',
    binary=True,
    unicode_errors='ignore',
)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [None]:
# Sanity check: display the dimensionality of the loaded W2V vectors,
# using the vector for the token 'test'.
w2v_model['test'].shape[0]

400

In [None]:
from tqdm import tqdm

# Vector size is read off the entry for the token 'test'.
EMBEDDING_DIM = w2v_model['test'].shape[0]

# One row per tokenizer index; tokens missing from the W2V vocabulary keep
# their all-zero row.
embedding_matrix = np.zeros((vocab_size, EMBEDDING_DIM))
for token, row in tqdm(tokenizer.word_index.items()):
    if token not in w2v_model:
        continue
    embedding_matrix[row] = w2v_model[token]

100%|██████████| 22935/22935 [00:00<00:00, 203461.43it/s]


# Kim's CNN implementation


In [None]:
# Input: a padded sequence of max_seq_len token ids.
input_text = layers.Input(shape=(max_seq_len,))

# Embedding layer initialised from the pretrained matrix and fine-tuned during
# training, followed by spatial dropout.
embedding_layer = layers.Embedding(
    vocab_size,
    EMBEDDING_DIM,
    weights=[embedding_matrix],
    trainable=True,
)(input_text)
text_embed = layers.SpatialDropout1D(dropout_prob)(embedding_layer)

# One convolution branch per filter width. With 'valid' padding and stride 1 the
# convolution output has max_seq_len - filter_length + 1 steps, so a pool of that
# size takes the maximum over the whole sequence.
conv_layers = []
for filter_length in filter_widths:
    feature_maps = layers.Conv1D(
        filters=number_of_filters,
        kernel_size=filter_length,
        padding='valid',
        strides=1,
        activation=hidden_activation,
    )(text_embed)
    pooled = layers.MaxPool1D(pool_size=max_seq_len - filter_length + 1)(feature_maps)
    conv_layers.append(layers.Flatten()(pooled))

# Concatenate the pooled features of all branches and regularise with dropout.
sentence_embed = layers.concatenate(inputs=conv_layers)
dropout = layers.Dropout(dropout_prob)(sentence_embed)

# Binary task: one sigmoid unit; otherwise a softmax over n_classes.
if n_classes == 2:
    out_units, out_activation, loss_name = 1, 'sigmoid', 'binary_crossentropy'
else:
    out_units, out_activation, loss_name = n_classes, 'softmax', 'categorical_crossentropy'

output = layers.Dense(out_units, activation=out_activation)(dropout)
model = Model(input_text, output)
model.compile(loss=loss_name, metrics=['acc'], optimizer=optimizer)

model.summary()

# Training

In [None]:
# Make sure the output directory exists before the checkpoint callback writes to it.
# exist_ok avoids the check-then-create race; an empty directory string means
# "current directory" and needs no creation.
if trained_model_dir:
    os.makedirs(trained_model_dir, exist_ok=True)

# os.path.join keeps the checkpoint inside trained_model_dir even when the
# configured directory has no trailing path separator.
checkpoint_path = os.path.join(trained_model_dir, "model.hdf5")

# Stop after 3 epochs without val_loss improvement, keep the best-val_loss
# checkpoint on disk, and cut the learning rate 10x after 2 stagnant epochs.
earlyStopping = EarlyStopping(monitor='val_loss', patience=3, verbose=0, mode='min')
mcp_save = ModelCheckpoint(checkpoint_path, save_best_only=True, monitor='val_loss', mode='min')
# `min_delta` replaces the deprecated `epsilon` argument of ReduceLROnPlateau.
reduce_lr_loss = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=2, verbose=1,
                                   min_delta=1e-4, mode='min')

history = model.fit(X_train, y_train,
                    epochs=max_epochs,
                    verbose=True,
                    validation_data=(X_val, y_val),
                    callbacks=[earlyStopping, mcp_save, reduce_lr_loss],
                    batch_size=BATCH_SIZE)

# NOTE(review): these scores come from the in-memory model as left by fit();
# the best-val_loss weights are the ones saved at checkpoint_path -- confirm
# which of the two should be reported.
loss, accuracy = model.evaluate(X_train, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
# This is the validation split (the test split is unlabeled), so label it as such.
loss, accuracy = model.evaluate(X_val, y_val, verbose=False)
print("Validation Accuracy:  {:.4f}".format(accuracy))


# eval model on val set

In [None]:
# Predicted probabilities for the validation set, one row per tweet.
cnn_probs = model.predict(X_val)
# Threshold the first output column at 0.5 to get hard 0/1 predictions.
# The comprehension must iterate over cnn_probs: iterating over cnn_preds (the
# name being defined) raises NameError on a fresh kernel.
# This thresholding assumes the single-unit sigmoid head used when n_classes == 2.
cnn_preds = [int(prob[0] > 0.5) for prob in cnn_probs]

In [None]:
## Only the class-1 F1-score is of interest in the report below.
from sklearn.metrics import classification_report

print(classification_report(y_true=y_val, y_pred=cnn_preds, digits=6))

# saving CNN prob outputs in CSV to use later in Ensemble 

In [None]:
# Single-column frame with the flattened validation probabilities.
# The column name records which embeddings were used: keep 'cnn_w2v_outputs' for
# the W2V matrix, or rename it to 'cnn_glove_outputs' when GloVe was used.
val_results = pd.DataFrame({'cnn_w2v_outputs': cnn_probs.flatten()})

In [None]:
# Persist the validation probabilities to val_probs.csv in the working directory
# (default to_csv settings, so the row index is written as the first column).
val_results.to_csv("val_probs.csv")