In [None]:
# Environment setup: version-pinned TensorFlow 2.4.x packages, installed quietly (-q).
! pip install tf-models-official==2.4.0 -q
! pip install tensorflow-gpu==2.4.1 -q
! pip install tensorflow-text==2.4.1 -q
# spaCy English pipeline; it is loaded further down with spacy.load('en_core_web_sm').
! python -m spacy download en_core_web_sm -q
# dataprep is installed without a version pin; grep -v hides the "already satisfied" lines.
! pip install dataprep | grep -v 'already satisfied'

In [None]:
import pandas as pd
import numpy as np
np.set_printoptions(precision=4)

import tensorflow as tf
from tensorflow import keras

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt
from dataprep.eda import plot, plot_diff, plot_correlation, create_report
from dataprep.clean import clean_text

# Preprocessing and Modelling
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
import spacy
import tensorflow_text as text
import tensorflow_hub as hub
from tensorflow.keras.layers import Input, Dense, Flatten, Dropout, concatenate 
from tensorflow.keras import Model, regularizers 
from tensorflow.keras.metrics import BinaryAccuracy
from tensorflow.keras.losses import BinaryCrossentropy
from official.nlp.optimization import create_optimizer # AdamW optimizer
# Warning
import warnings
warnings.filterwarnings('ignore')

In [None]:
tf.__version__

In [None]:
# Random seeds
import random
import numpy as np
import tensorflow as tf

# Seed Python's, NumPy's and TensorFlow's global generators with the same value.
for seed_rng in (random.seed, np.random.seed, tf.random.set_seed):
    seed_rng(319)

<a id=0></a>
## <p style="background-color:lightblue; font-family:newtimeroman; font-size:120%; text-align:left; border-radius: 15px 50px;">Table of Content</p>
* [0. Introduction and updates](#0)
* [1. Loading Data 💎](#1)
* [2. EDA 📊](#2)
* [3. Data Preprocessing](#3)
* [4. Vectorization](#4)
    * [4.1 Common Vectorizer Usage](#4.1)
    * [4.2 Tf-Idf Term Weightings](#4.2)
* [5. Transfer Learning with Hugging Face](#5)
    * [5.1 Tokenization](#5.1)
    * [5.2 Defining a Model Architecture](#5.2)
    * [5.3 Training Classification Layer Weights](#5.3)
    * [5.4 Fine-tuning DistilBert and Training All Weights](#5.4)
* [6. Make a Submission](#6)
* [7. References](#7)

<a id=0></a>
<font size="+3" color="#5bc0de"><b>Introduction </b></font><br>
[Content](#0)

In this kernel, besides the general steps for working with text data (EDA and preprocessing), the modelling workflow can be divided into 2 main stages:
1. Defining a model architecture that concatenates a keyword branch with the BERT model.
2. Training the classification layer weights.

<a id=1.2 ></a>
<font size="+3" color="#5bc0de"><b>1.2. Update via Versions </b></font><br>
[Content](#0)

### Current Version
* Adding 1 hidden layer to the model to increase accuracy.


[Content](#0)

<a id='1'></a>
# <p style="background-color:skyblue; font-family:newtimeroman; font-size:150%; text-align:center; border-radius: 15px 50px;">1. Loading Data 💎</p>

Just load the dataset and global variables for colors and so on.

[Content](#0)

In [None]:
def _describe_split(frame, shape_template, memory_template):
    """Print the shape of `frame` and its memory usage in MB (bytes / 2**20)."""
    print(shape_template.format(frame.shape))
    print(memory_template.format(frame.memory_usage().sum()/2**20))


# Load the raw competition files.
train_full = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
test_full = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')

_describe_split(train_full, 'Training Set Shape = {}', 'Training Set Memory Usage = {:.2f}MB')
_describe_split(test_full, 'Test Set Shape = {}', 'Test Set Memory Usage = {:.2f}MB')

<a id='2'></a>
# <p style="background-color:skyblue; font-family:newtimeroman; font-size:150%; text-align:center; border-radius: 15px 50px;">2. EDA 📊</p>


[Content](#0)

In [None]:
plot(train_full)

In [None]:
create_report(train_full)

In [None]:
plot(train_full, 'text')

In [None]:
train_full.text

### The most common tweet length is in the range of 120 to 140 characters.

### Dataset is balanced

In [None]:
plot(train_full, "text", "target")

In [None]:
df1 = train_full.text[train_full.target == 0]
df2 = train_full.text[train_full.target == 1]
plot_diff([df1, df2])

<a id='3'></a>
# <p style="background-color:skyblue; font-family:newtimeroman; font-size:150%; text-align:center; border-radius: 15px 50px;">3. Data Pre-processing </p>

Now we are going to engineer the data to make it easier for the model to classify.

This section is very important to reduce the dimensions of the problem.




[Content](#0)

# Main techniques I used on this data
    * [3.1] Remove 157 duplicated rows
    * [3.2] Cleaning text
    * [3.3] Spelling Checker
    * [3.4] Remove Stemming
 #### Step 3.3 takes a lot of time (around 4000s out of 4536s in total). 
 #### So, I split the data preprocessing into [another kernel](https://www.kaggle.com/phanttan/disastertweet-prepareddata). 
 #### The prepared data is saved in a [new dataset](https://www.kaggle.com/phanttan/disastertweet-prepared2).
 #### I would really appreciate it if you use/upvote it.


In [None]:
# Read commited-dataset
df_train = pd.read_csv("/kaggle/input/disastertweet-prepared2/train_prepared.csv")
df_test = pd.read_csv("/kaggle/input/disastertweet-prepared2/test_prepared.csv")

In [None]:
# Only apply 'keyword' columns in full data, because other features cleaned in df_train/test
train_full = clean_text(train_full,'keyword')
test_full = clean_text(test_full, 'keyword')

In [None]:
# Adding cleaned data into df_train/test
# NOTE(review): assigning a column from another DataFrame aligns on the row index.
# This assumes df_train/df_test share the index of train_full/test_full — TODO confirm,
# since the prepared dataset is described above as having duplicated rows removed.
df_train['keyword'] = train_full['keyword']
df_test['keyword'] = test_full['keyword']

In [None]:
# Load Spacy Library
nlp_spacy = spacy.load('en_core_web_sm')
# Load the sentence encoder
sentence_enc = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')

In [None]:
def extract_keywords(text):
    """Pick the word of `text` whose embedding is closest to the whole text.

    Candidate words are the spaCy tokens tagged ADJ, NOUN or PROPN. The text and
    every candidate are embedded with `sentence_enc`, and the candidates are
    ranked by cosine similarity to the text embedding.

    Args:
        text: The raw text to extract a keyword from.

    Returns:
        A list holding the single best-matching candidate, or an empty list
        when the text contains no candidate word.
    """
    TOP_KEYWORD = -1
    # Create a list for keyword parts of speech
    pos_tag = ['ADJ', 'NOUN', 'PROPN']
    doc = nlp_spacy(text)
    potential_keywords = [token.text for token in doc if token.pos_ in pos_tag]

    # Nothing to rank: return early instead of handing an empty batch to the
    # encoder / cosine_similarity and relying on the caller to catch the error.
    if not potential_keywords:
        return []

    document_embed = sentence_enc([text])
    potential_embed = sentence_enc(potential_keywords)

    # Shape (1, n_candidates); argsort is ascending, so the last index is the
    # most similar candidate.
    similarities = cosine_similarity(document_embed, potential_embed)
    keyword = [potential_keywords[i] for i in similarities.argsort()[0][TOP_KEYWORD:]]

    return keyword

def keyword_filler(keyword, text):
    """Return `keyword`, or a keyword extracted from `text` when it is missing.

    Args:
        keyword: The existing keyword value; may be null (None/NaN).
        text: The text to extract a replacement keyword from.

    Returns:
        `keyword` unchanged when it is not null; otherwise the first result of
        `extract_keywords(text)`, or '' when extraction fails or finds nothing.
    """
    if pd.isnull(keyword):
        try:
            keyword = extract_keywords(text)[0]
        except Exception:
            # Best-effort fill: any extraction failure (including an empty
            # result) falls back to an empty keyword. Catching Exception rather
            # than using a bare except lets KeyboardInterrupt/SystemExit through.
            keyword = ''

    return keyword

In [None]:
# Fill missing keywords row by row with keyword_filler, then cast every value to str.
# NOTE(review): a one-column DataFrame built from a plain list is assigned to the column;
# presumably this relies on df_train/df_test having a default 0..n-1 index — TODO confirm.
df_train.keyword = pd.DataFrame(list(map(keyword_filler, df_train.keyword, df_train.text))).astype(str)
df_test.keyword = pd.DataFrame(list(map(keyword_filler, df_test.keyword, df_test.text))).astype(str)

# Sanity check: report whether any keyword is still null after the fill.
print('Null Training Keywords => ', df_train['keyword'].isnull().any())
print('Null Test Keywords => ', df_test['keyword'].isnull().any())

In [None]:
df_train

# Visualizing the Keyword Frequency

In [None]:
# Keyword counts among non-disaster tweets (target == 0).
keyword_non_disaster = (
    df_train.keyword[df_train.target==0]
    .value_counts()
    .reset_index()
)
# Horizontal bars: count on the x axis, keyword on the y axis, first ten rows only.
ax = sns.barplot(data=keyword_non_disaster.iloc[:10], x='keyword', y='index')
ax.set_title('Non-Disaster Keyword Frequency (0)')
ax.set_xlabel('Frequency')
ax.set_ylabel('Top 10 Keywords')
plt.show()

In [None]:
# Keyword counts among disaster tweets (target == 1).
keyword_disaster = df_train.keyword[df_train.target==1].value_counts().reset_index()
# Plot the disaster counts computed on the line above and title the chart to match.
sns.barplot(data=keyword_disaster[:10], x='keyword', y='index')
plt.title('Disaster Keyword Frequency (1)')
plt.xlabel('Frequency')
plt.ylabel('Top 10 Keywords')
plt.show()

In [None]:
# Split data: hold out 20% of the rows for validation, with a fixed random_state.
X_train, X_val, y_train, y_val = train_test_split(
    df_train[['text','keyword']],
    df_train['target'],
    test_size=0.2,
    random_state=42,
)
X_train.shape, X_val.shape

# Create TensorFlow Datasets

In [None]:
train_ds = tf.data.Dataset.from_tensor_slices((dict(X_train), y_train))
val_ds = tf.data.Dataset.from_tensor_slices((dict(X_val), y_val))
test_ds = tf.data.Dataset.from_tensor_slices(dict(df_test[['text','keyword']]))

In [None]:
AUTOTUNE = tf.data.experimental.AUTOTUNE

BUFFER_SIZE = 1000
BATCH_SIZE = 32
RANDOM_SEED = 319

def configure_dataset(dataset, shuffle=False, test=False):
    """Cache, optionally shuffle, batch and prefetch a tf.data pipeline.

    Args:
        dataset: The unbatched dataset to configure.
        shuffle: When true, shuffle with BUFFER_SIZE / RANDOM_SEED (reshuffled
            every iteration) and drop the final partial batch. Takes precedence
            over `test`.
        test: When true (and `shuffle` is false), keep the final partial batch.

    Returns:
        The configured dataset. With both flags false, the final partial batch
        is dropped and no shuffling is applied.
    """
    pipeline = dataset.cache()
    if shuffle:
        pipeline = pipeline.shuffle(BUFFER_SIZE,
                                    seed=RANDOM_SEED,
                                    reshuffle_each_iteration=True)

    # Only the un-shuffled test configuration keeps the last, smaller batch.
    keep_partial_batch = bool(test) and not shuffle
    pipeline = pipeline.batch(BATCH_SIZE, drop_remainder=not keep_partial_batch)
    return pipeline.prefetch(AUTOTUNE)

In [None]:
# Peek at one pass over the shuffled, batched training pipeline:
# collect the first `text` entry of every batch and display the first ten.
a3 = configure_dataset(train_ds, shuffle=True)
dict3 = [batch_features['text'][0] for batch_features, _batch_labels in a3]
dict3[:10]

In [None]:
# Configure the datasets
train_ds = configure_dataset(train_ds, shuffle=True)
val_ds = configure_dataset(val_ds)
test_ds = configure_dataset(test_ds, test=True)

In [None]:
# Free memory
del X_train, X_val, y_train, y_val, df_train, df_test, train_full, test_full

# Classifier Model

In [None]:
# Bidirectional Encoder Representations from Transformers (BERT).
bert_encoder_path = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4"
# Text preprocessing for BERT.
bert_preprocessor_path = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
# Token based text embedding trained on English Google News 200B corpus.
keyword_embedding_path = "https://tfhub.dev/google/nnlm-en-dim128-with-normalization/2"

In [None]:
bert_encoder = hub.KerasLayer(bert_encoder_path, trainable=True, name="BERT_Encoder")
bert_preprocessor = hub.KerasLayer(bert_preprocessor_path, name="BERT_Preprocessor")
nnlm_embed = hub.KerasLayer(keyword_embedding_path, name="NNLM_Embedding")

In [None]:
kernel_initializer = tf.keras.initializers.GlorotNormal(seed=319)
# Model function
def create_model():
    """Build the two-input classifier.

    The `text` input goes through `bert_preprocessor` and `bert_encoder`; the
    `keyword` input goes through `nnlm_embed` and three Dense(128)+Dropout(0.5)
    stages. Both branches are concatenated, passed through one more
    Dense(128)+Dropout(0.5) stage and a single sigmoid unit.

    Returns:
        A Keras Model named "BERT_Classifier" taking [text, keyword] string inputs.
    """
    def regularized_dense(layer_name):
        # 128-unit ReLU layer with the shared initializer and L2 weight penalty.
        return Dense(128,
                     activation='relu',
                     kernel_initializer=kernel_initializer,
                     kernel_regularizer=regularizers.l2(1e-4),
                     name=layer_name)

    def heavy_dropout(layer_name):
        return Dropout(0.5, seed=319, name=layer_name)

    # Text (BERT) branch: pooled output followed by a light dropout.
    text_input = Input(shape=(), dtype=tf.string, name="text")
    encoder_outputs = bert_encoder(bert_preprocessor(text_input))
    bert_branch = Dropout(0.1, seed=319, name="BERT_Dropout")(
        encoder_outputs["pooled_output"])

    # Keyword branch: embedding, flatten, then three dense/dropout stages.
    keyword_input = Input(shape=(), dtype=tf.string, name='keyword')
    keyword_branch = Flatten(name="Keyword_Flatten")(nnlm_embed(keyword_input))
    stage_names = (
        ("Keyword_Dense1", 'Keyword_dropout1'),
        ("Keyword_Dense2", 'Keyword_dropout2'),
        ("Keyword_Dense3", 'Keyword_dropout3'),
    )
    for dense_name, dropout_name in stage_names:
        keyword_branch = regularized_dense(dense_name)(keyword_branch)
        keyword_branch = heavy_dropout(dropout_name)(keyword_branch)

    # Merge the layers and classify
    merged = concatenate([bert_branch, keyword_branch], name="Concatenate")
    hidden = regularized_dense("Merged_Dense")(merged)
    hidden = heavy_dropout("Merged_Dropout")(hidden)
    clf = Dense(1,
                activation="sigmoid",
                kernel_initializer=kernel_initializer,
                name="Classifier")(hidden)
    return Model([text_input, keyword_input], clf, name="BERT_Classifier")

In [None]:
bert_classifier = create_model()
bert_classifier.summary()

In [None]:
keras.utils.plot_model(bert_classifier, 
                      show_shapes=False)

# AdamW Optimizer

In [None]:
EPOCHS = 3
LEARNING_RATE = 5e-5

# train_ds and val_ds were batched with drop_remainder=True, so their own
# cardinality already is the number of full batches per pass. Reading it
# directly avoids going through unbatch(), which may report an unknown
# cardinality (a negative sentinel) that would truncate to 0 steps.
STEPS_PER_EPOCH = int(train_ds.cardinality().numpy())
VAL_STEPS = int(val_ds.cardinality().numpy())
# Calculate the train and warmup steps for the optimizer
TRAIN_STEPS = STEPS_PER_EPOCH * EPOCHS
WARMUP_STEPS = int(TRAIN_STEPS * 0.1)  # warm up over the first 10% of training steps

adamw_optimizer = create_optimizer(
    init_lr=LEARNING_RATE,
    num_train_steps=TRAIN_STEPS,
    num_warmup_steps=WARMUP_STEPS,
    optimizer_type='adamw'
)

In [None]:
STEPS_PER_EPOCH, VAL_STEPS, TRAIN_STEPS, WARMUP_STEPS

## Training the model again

In [None]:
# The "Classifier" output layer uses a sigmoid activation, so the model emits
# probabilities, not logits. from_logits must therefore be False; with True the
# loss would apply a second sigmoid to the predictions.
bert_classifier.compile(loss=BinaryCrossentropy(from_logits=False),
                   optimizer=adamw_optimizer, 
                   metrics=[BinaryAccuracy(name="accuracy")]
                  )
history = bert_classifier.fit(train_ds, 
                         epochs=EPOCHS,
                         steps_per_epoch=STEPS_PER_EPOCH,
                         validation_data=val_ds,
                         validation_steps=VAL_STEPS
                        )

<a id=6 ></a>
# <p style="background-color:skyblue; font-family:newtimeroman; font-size:140%; text-align:left; border-radius: 20px 50px;">6. Make a Submission</p>

[Content](#0)

In [None]:
def submission(model, test):
    """Predict on `test` and write the rounded predictions to 'submission.csv'.

    Args:
        model: A fitted model exposing `predict`; its outputs are rounded to
            the nearest integer to form the 0/1 target.
        test: The input passed straight to `model.predict`.

    The `id` column is taken from the competition's sample submission file, so
    the predictions are assumed to be in the same row order — TODO confirm.
    """
    sample_sub = pd.read_csv('/kaggle/input/nlp-getting-started/sample_submission.csv')
    predictions = model.predict(test)
    # Round and flatten in one vectorised step. Calling int() on each
    # one-element row of an (N, 1) array is deprecated in recent NumPy.
    y_preds = np.rint(predictions).astype(int).ravel().tolist()
    sub = pd.DataFrame({'id': sample_sub['id'].values.tolist(), 'target': y_preds})
    sub.to_csv('submission.csv', index=False)

In [None]:
submission(bert_classifier, test_ds)

<a id=7 ></a>
# <p style="background-color:skyblue; font-family:newtimeroman; font-size:140%; text-align:left; border-radius: 20px 50px;">7. References</p>

[Content](#0)

[Hugging Face Transformers Fine-Tuning DistilBert for Binary Classification Tasks](https://towardsdatascience.com/hugging-face-transformers-fine-tuning-distilbert-for-binary-classification-tasks-490f1d192379)

[Keras functional API](https://keras.io/guides/functional_api/)

[Distil Bert](https://huggingface.co/transformers/model_doc/distilbert.html)

[Tensorflow Dataset](https://www.tensorflow.org/api_docs/python/tf/data/Dataset)

[BERT in TFHub](https://tfhub.dev/google/collections/bert)

[TensorFlow NLP Modelling Toolkit](https://github.com/tensorflow/models/tree/master/official/nlp)

[NLP With BERT from TensorFlow](https://www.tensorflow.org/text/tutorials/fine_tune_bert)

[NLP Optimization](https://github.com/tensorflow/models/blob/master/official/nlp/optimization.py)

# If you like this kernel, please upvote and tell me your thoughts. Thank you @@