<a href="https://colab.research.google.com/github/ShenghanZhang/Blog-Back-Up/blob/master/Tweet%20Disaster.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive at /content/drive; the data, the BERT module and the
# helper code used by later cells are all referenced through paths under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Add the project folder on Drive to the module search path so that modules
# stored there can be imported (presumably where bert_tokenization.py lives
# — TODO confirm).
import sys
sys.path.append('/content/drive/My Drive/Colab Notebooks/Disaster Tweets')

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
from tqdm import tqdm
import tensorflow_hub as hub
import tensorflow as tf
import bert_tokenization as tokenization
import tensorflow.keras.backend as K
import gc
import os
from scipy.stats import spearmanr
from math import floor, ceil
from sklearn.metrics import precision_score, recall_score, f1_score

# Print floating-point arrays in fixed-point notation instead of scientific notation.
np.set_printoptions(suppress=True)


In [4]:
# --- Configuration ----------------------------------------------------------
# Folder holding the input CSVs. The trailing slash matters: file names are
# built from PATH by plain string concatenation.
PATH = '/content/drive/My Drive/Colab Notebooks/Disaster Tweets/inputs/'
# Local copy of the BERT module loaded through TF-Hub (the folder name suggests
# the uncased English base model — confirm against the downloaded assets).
BERT_PATH = '/content/drive/My Drive/Colab Notebooks/Disaster Tweets/bert_en_uncased_L-12_H-768_A-12'
# NOTE(review): the second positional argument is presumably `do_lower_case`
# (True for an uncased vocabulary) — confirm against bert_tokenization.
tokenizer = tokenization.FullTokenizer(BERT_PATH+'/assets/vocab.txt', True)
# Fixed length that every encoded example is padded to.
MAX_SEQUENCE_LENGTH = 512

# --- Load data --------------------------------------------------------------
df_train = pd.read_csv(PATH+'train_data_cleaning.csv')
df_test = pd.read_csv(PATH+'test_data_cleaning.csv')

# Quick sanity check of what was loaded.
print('train shape =', df_train.shape)
print('test shape =', df_test.shape)
print('train columns is {}'.format(df_train.columns))

train shape = (7613, 5)
test shape = (3263, 4)
train columns is Index(['id', 'keyword', 'location', 'text', 'target'], dtype='object')


In [5]:
def _get_masks(tokens, max_seq_length):
    """Mask for padding"""
    if len(tokens)>max_seq_length:
        raise IndexError("Token length more than max seq length!")
    return [1]*len(tokens) + [0] * (max_seq_length - len(tokens))

def _get_segments(tokens, max_seq_length):
    """Segments: 0 for the first sequence, 1 for the second"""
    if len(tokens)>max_seq_length:
        raise IndexError("Token length more than max seq length!")
    segments = []
    first_sep = True
    current_segment_id = 0
    for token in tokens:
        segments.append(current_segment_id)
        if token == "[SEP]":
            if first_sep:
                first_sep = False 
            else:
                current_segment_id = 1
    return segments + [0] * (max_seq_length - len(tokens))

def _get_ids(tokens, tokenizer, max_seq_length):
    """Token ids from Tokenizer vocab"""
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_ids = token_ids + [0] * (max_seq_length-len(token_ids))
    return input_ids


def _convert_to_bert_inputs(text, tokenizer, max_sequence_length):
    """Encode one text as the [ids, masks, segments] triple fed to BERT.

    The text is tokenized and wrapped as ``[CLS] ... [SEP]``. Texts that
    produce more than ``max_sequence_length - 2`` word pieces are truncated so
    the wrapped sequence still fits; previously one over-long text raised
    IndexError and aborted the encoding of the whole dataset.

    Args:
        text: the raw string to encode.
        tokenizer: object exposing ``tokenize`` and ``convert_tokens_to_ids``.
        max_sequence_length: length of each returned list.

    Returns:
        ``[input_ids, input_masks, input_segments]``, three lists of length
        ``max_sequence_length``.

    Raises:
        IndexError: if ``max_sequence_length`` is too small to hold even the
            two special tokens.
    """
    # Two positions are reserved for the [CLS] and [SEP] markers.
    word_piece_budget = max(max_sequence_length - 2, 0)
    word_pieces = tokenizer.tokenize(text)[:word_piece_budget]

    stoken = ["[CLS]"] + word_pieces + ["[SEP]"]

    input_ids = _get_ids(stoken, tokenizer, max_sequence_length)
    input_masks = _get_masks(stoken, max_sequence_length)
    input_segments = _get_segments(stoken, max_sequence_length)

    return [input_ids, input_masks, input_segments]

def compute_input_arrays(df, tokenizer, max_sequence_length):
    """Encode every row's ``text`` into the three BERT input arrays.

    Args:
        df: DataFrame with a ``text`` column.
        tokenizer: tokenizer forwarded to ``_convert_to_bert_inputs``.
        max_sequence_length: padded length of every encoded example.

    Returns:
        ``[input_ids, input_masks, input_segments]`` as int32 NumPy arrays,
        one row per row of ``df``, in the same order.
    """
    input_ids, input_masks, input_segments = [], [], []
    # Walk the column itself: `df.iloc[i].text` would materialise a full row
    # Series for every example just to read one field.
    for text in tqdm(df['text']):
        ids, masks, segments = _convert_to_bert_inputs(text, tokenizer, max_sequence_length)
        input_ids.append(ids)
        input_masks.append(masks)
        input_segments.append(segments)

    return [np.asarray(input_ids, dtype=np.int32),
            np.asarray(input_masks, dtype=np.int32),
            np.asarray(input_segments, dtype=np.int32)]


def compute_output_arrays(df):
    """Return the ``target`` column of ``df`` as a NumPy array."""
    labels = df.target
    return np.asarray(labels)

In [6]:
class CustomCallback(tf.keras.callbacks.Callback):
    """Keras callback that scores the validation set and predicts the test set
    after every epoch.

    After each epoch it stores the raw validation and test predictions and
    appends macro-averaged precision, recall and F1 computed on the rounded
    validation predictions.

    Attributes:
        valid_predictions: one raw validation prediction array per epoch.
        test_predictions: one raw test prediction array per epoch.
        val_precision_scores / val_recall_scores / val_f1_scores: one
            macro-averaged score per epoch.
    """

    def __init__(self, valid_data, test_data, batch_size=16, fold=None):
        """
        Args:
            valid_data: pair ``(inputs, targets)``; ``inputs`` is passed to
                ``model.predict`` and ``targets`` is used as ground truth.
            test_data: inputs passed to ``model.predict`` for test predictions.
            batch_size: batch size used for both predict calls.
            fold: stored on the instance; not otherwise used by this class.
        """
        # Let the Keras base class set up its own state (e.g. ``self.model``).
        super().__init__()

        self.valid_inputs = valid_data[0]
        self.valid_outputs = valid_data[1]
        self.test_inputs = test_data

        self.batch_size = batch_size
        self.fold = fold

        self.valid_predictions = []
        self.val_precision_scores = []
        self.val_recall_scores = []
        self.val_f1_scores = []

        self.test_predictions = []

    def on_epoch_end(self, epoch, logs=None):
        """Record predictions and validation metrics for the finished epoch.

        ``logs`` is accepted for Keras compatibility and is not read; it
        defaults to None rather than a shared mutable dict.
        """
        epoch_valid_pred = self.model.predict(
            self.valid_inputs, batch_size=self.batch_size)
        self.valid_predictions.append(epoch_valid_pred)

        # Round only this epoch's predictions to hard 0/1 labels; rounding the
        # whole history and taking the last entry gives the same array but
        # re-stacks every previous epoch each time.
        valid_pred_labels = np.round(epoch_valid_pred)

        self.val_precision_scores.append(precision_score(self.valid_outputs, valid_pred_labels, average='macro'))
        self.val_recall_scores.append(recall_score(self.valid_outputs, valid_pred_labels, average='macro'))
        self.val_f1_scores.append(f1_score(self.valid_outputs, valid_pred_labels, average='macro'))

        self.test_predictions.append(
            self.model.predict(self.test_inputs, batch_size=self.batch_size))

        print('\nEpoch: {} - Validation Precision: {:.6} - Validation Recall: {:.6} - Validation '
              'F1: {:.6}'.format(epoch, self.val_precision_scores[-1], self.val_recall_scores[-1],self.val_f1_scores[-1]))
        
        


In [7]:
def bert_model():
    """Build the (uncompiled) BERT binary classifier.

    Three int32 inputs of length MAX_SEQUENCE_LENGTH (word ids, masks,
    segments) go through the trainable TF-Hub layer loaded from BERT_PATH.
    The layer's second output is average-pooled over the sequence axis,
    passed through dropout (rate 0.2) and a single sigmoid unit.

    NOTE(review): the pooling layer is called without a mask, so padded
    positions may contribute to the average — confirm this is intended.

    Returns:
        A ``tf.keras.models.Model`` taking
        ``[input_word_ids, input_masks, input_segments]``.
    """
    def _token_input(layer_name):
        # One fixed-length int32 input tensor with the given layer name.
        return tf.keras.layers.Input(
            (MAX_SEQUENCE_LENGTH,), dtype=tf.int32, name=layer_name)

    model_inputs = [
        _token_input(layer_name)
        for layer_name in ('input_word_ids', 'input_masks', 'input_segments')
    ]

    encoder = hub.KerasLayer(BERT_PATH, trainable=True)
    # The encoder returns two outputs; only the second one is used here.
    _, sequence_output = encoder(model_inputs)

    pooled = tf.keras.layers.GlobalAveragePooling1D()(sequence_output)
    pooled = tf.keras.layers.Dropout(0.2)(pooled)
    probability = tf.keras.layers.Dense(
        1, activation="sigmoid", name="dense_output")(pooled)

    return tf.keras.models.Model(inputs=model_inputs, outputs=probability)

In [8]:
def train_and_predict(model, train_data, valid_data, test_data, 
                      learning_rate, epochs, batch_size, loss_function, fold):
    """Compile and fit ``model``, recording per-epoch predictions.

    Args:
        model: Keras model to train (compiled here).
        train_data: ``(inputs, targets)`` used for fitting.
        valid_data: ``(inputs, targets)`` handed to the callback.
        test_data: test inputs handed to the callback.
        learning_rate: learning rate for the Adam optimizer.
        epochs: number of training epochs.
        batch_size: batch size for fitting and for the callback's predictions.
        loss_function: loss passed to ``model.compile``.
        fold: fold identifier forwarded to the callback.

    Returns:
        The ``CustomCallback`` instance used during fitting, which holds the
        per-epoch validation/test predictions and validation scores.
    """
    model.compile(
        loss=loss_function,
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
    )

    epoch_recorder = CustomCallback(
        valid_data=(valid_data[0], valid_data[1]),
        test_data=test_data,
        batch_size=batch_size,
        fold=fold,
    )

    model.fit(
        train_data[0],
        train_data[1],
        epochs=epochs,
        batch_size=batch_size,
        callbacks=[epoch_recorder],
    )

    return epoch_recorder

In [9]:
# Encode both splits with the same tokenizer and sequence length
# (train first, then test).
inputs, test_inputs = [
    compute_input_arrays(frame, tokenizer, MAX_SEQUENCE_LENGTH)
    for frame in (df_train, df_test)
]
# Training labels, taken from the `target` column of the training frame.
outputs = compute_output_arrays(df_train)

100%|██████████| 7613/7613 [00:03<00:00, 2278.41it/s]
100%|██████████| 3263/3263 [00:01<00:00, 2331.41it/s]


In [10]:
# Materialise the (train_idx, valid_idx) pairs into a list. `.split()` returns
# a one-shot generator, so without `list(...)` iterating over `skf` a second
# time (e.g. re-running the training cell) would silently yield zero folds.
skf = list(
    StratifiedKFold(n_splits=5, random_state=116, shuffle=True)
    .split(X=df_train.text, y=df_train.target)
)

In [11]:
# Folds with an index below this threshold are skipped. With 5 splits this
# trains only folds 3 and 4 in this run. (The earlier note here said three of
# the five folds are run to stay under 2h, which did not match the condition.)
FIRST_FOLD_TO_RUN = 3

histories = []
for fold, (train_idx, valid_idx) in enumerate(skf):
    if fold < FIRST_FOLD_TO_RUN:
        continue

    # Start each fold from a fresh Keras session and a freshly built model.
    K.clear_session()
    model = bert_model()

    # `inputs` holds three arrays (ids, masks, segments); slice each by fold.
    train_inputs = [inputs[i][train_idx] for i in range(3)]
    train_outputs = outputs[train_idx]

    valid_inputs = [inputs[i][valid_idx] for i in range(3)]
    valid_outputs = outputs[valid_idx]

    # `history` is the callback returned by train_and_predict; it carries the
    # per-epoch validation and test predictions for this fold.
    history = train_and_predict(model,
                      train_data=(train_inputs, train_outputs),
                      valid_data=(valid_inputs, valid_outputs),
                      test_data=test_inputs,
                      learning_rate=3e-5, epochs=4, batch_size=8,
                      loss_function='binary_crossentropy', fold=fold)

    histories.append(history)

    # Persist the last epoch's predictions; validation rows are indexed by
    # their position in the training frame.
    pd.DataFrame(index = valid_idx, data = {'valid_pred': history.valid_predictions[-1].reshape(-1)}).to_csv(PATH+str(fold)+'train_pred.csv')
    pd.DataFrame(data = {'test_pred': history.test_predictions[-1].reshape(-1)}).to_csv(PATH+str(fold)+'test_pred.csv')

Epoch 1/4
Epoch: 0 - Validation Precision: 0.830542 - Validation Recall: 0.816847 - Validation F1: 0.821186
Epoch 2/4
Epoch: 1 - Validation Precision: 0.802086 - Validation Recall: 0.804058 - Validation F1: 0.802941
Epoch 3/4
Epoch: 2 - Validation Precision: 0.81688 - Validation Recall: 0.795262 - Validation F1: 0.800692
Epoch 4/4
Epoch: 3 - Validation Precision: 0.811058 - Validation Recall: 0.80055 - Validation F1: 0.804048
Epoch 1/4
Epoch: 0 - Validation Precision: 0.833582 - Validation Recall: 0.828461 - Validation F1: 0.83055
Epoch 2/4
Epoch: 1 - Validation Precision: 0.829757 - Validation Recall: 0.827476 - Validation F1: 0.828507
Epoch 3/4
Epoch: 2 - Validation Precision: 0.826857 - Validation Recall: 0.817391 - Validation F1: 0.820734
Epoch 4/4
Epoch: 3 - Validation Precision: 0.829458 - Validation Recall: 0.810752 - Validation F1: 0.815984
