In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras

# Visualization
!pip install dataprep | grep -v 'already satisfied'
from dataprep.eda import plot, plot_diff, plot_correlation, create_report

# Preprocessing and Modelling
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Keras tuner
!pip install -q -U keras-tuner
import keras_tuner as kt
# Warning
import warnings
warnings.filterwarnings('ignore')

<a id=0></a>
## <p style="background-color:lightblue; font-family:newtimeroman; font-size:120%; text-align:left; border-radius: 15px 50px;">Table of Contents</p>
* [0. Introduction and updates](#0)
* [1. Loading Data 💎](#1)
* [2. EDA 📊](#2)
* [3. Data Preprocessing](#3)
* [4. Vectorization](#4)
    * [4.1 Common Vectorizer Usage](#4.1)
    * [4.2 Tf-Idf Term Weightings](#4.2)
* [5. Transfer Learning with Hugging Face](#5)
    * [5.1 Tokenization](#5.1)
    * [5.2 Defining a Model Architecture](#5.2)
    * [5.3 Training Classification Layer Weights](#5.3)
    * [5.4 Fine-tuning DistilBert and Training All Weights](#5.4)
* [6. Make a Submission](#6)
* [7. References](#7)

<a id='1'></a>
# <p style="background-color:skyblue; font-family:newtimeroman; font-size:150%; text-align:center; border-radius: 15px 50px;">0. Introduction and update </p>
# Introduction: 
In this kernel, besides the general steps for working with text data (EDA and preprocessing), the modelling workflow is divided into 3 main stages:
1. Defining a Model Architecture.
2. Training Classification Layer Weights.
3. Fine-tuning DistilBert and Training All Weights.

# Update: 
Current Version
1. Use Keras-tuner to find the optimized learning rate for main model.

[Content](#0)

<a id='1'></a>
# <p style="background-color:skyblue; font-family:newtimeroman; font-size:150%; text-align:center; border-radius: 15px 50px;">1. Loading Data 💎</p>

Load the raw training and test sets and report their shape and memory usage.

[Content](#0)

In [None]:
# Read the raw competition files
train_full = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
test_full = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')

# Report dimensions and in-memory footprint (bytes converted to MB) of each split
print(f'Training Set Shape = {train_full.shape}')
print(f'Training Set Memory Usage = {train_full.memory_usage().sum() / 2**20:.2f}MB')

print(f'Test Set Shape = {test_full.shape}')
print(f'Test Set Memory Usage = {test_full.memory_usage().sum() / 2**20:.2f}MB')

<a id='2'></a>
# <p style="background-color:skyblue; font-family:newtimeroman; font-size:150%; text-align:center; border-radius: 15px 50px;">2. EDA 📊</p>


[Content](#0)

In [None]:
# dataprep overview plot of the whole raw training frame
plot(train_full)

In [None]:
# Full dataprep profiling report for the raw training frame
create_report(train_full)

In [None]:
# Univariate dataprep plot for the `text` column only
plot(train_full, 'text')

In [None]:
# Profiling report restricted to the tweet text.
# NOTE(review): a Series is passed here while the other call passes a DataFrame;
# confirm the installed dataprep version accepts a Series.
create_report(train_full.text)

### Tweets of 120 to 140 characters are the most common.

In [None]:
# Profiling report restricted to the label column (used to check class balance).
# NOTE(review): a Series is passed here; confirm dataprep accepts it.
create_report(train_full.target)

### Dataset is balanced

In [None]:
# Bivariate dataprep plot: `text` against `target`
plot(train_full, "text", "target")

In [None]:
# Separate the tweet texts by label value (0 vs 1) and compare the two groups
df1 = train_full.loc[train_full.target == 0, 'text']
df2 = train_full.loc[train_full.target == 1, 'text']
plot_diff([df1, df2])

<a id='3'></a>
# <p style="background-color:skyblue; font-family:newtimeroman; font-size:150%; text-align:center; border-radius: 15px 50px;">3. Data Pre-processing </p>

Now we are going to engineer the data to make it easier for the model to classify.

This section is very important to reduce the dimensions of the problem.




[Content](#0)

# Main techniques I used on this data
    * [3.1] Remove 92 duplicated rows
    * [3.2] Cleaning text
    * [3.3] Spelling Checker
    * [3.4] Remove Stemming
 #### Step 3.3 takes a lot of time (around 4000s out of 4536s in total). 
 #### So, I split the data preprocessing into [another kernel](https://www.kaggle.com/phanttan/disastertweet-prepareddata). 
 #### The prepared data is saved to a [new dataset](https://www.kaggle.com/phanttan/disastertweet-prepared2).
 #### I would really appreciate it if you use/upvote it.


In [None]:
# Free the raw frames; the rest of the notebook works on the prepared copies.
# NOTE: because of the `del`, re-running this cell alone raises NameError
# unless the raw-data loading cell is run again first.
del train_full, test_full

# Read the committed (already preprocessed) dataset
df_train = pd.read_csv("/kaggle/input/disastertweet-prepared2/train_prepared.csv")
df_test = pd.read_csv("/kaggle/input/disastertweet-prepared2/test_prepared.csv")

<a id=4 ></a>
# <p style="background-color:skyblue; font-family:newtimeroman; font-size:140%; text-align:left; border-radius: 20px 50px;">4. Vectorization</p>

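Three steps using the Bag-of-words (BOW) model:
1. Term frequency: count the occurrences of each word in a sentence
2. Inverse document frequency: down-weight words that appear in many documents, since they carry less information
3. L2 norm: normalize each resulting vector to unit length

Reference : https://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction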

[Content](#0)

<a id=4.1 ></a>
## <p style="background-color:skyblue; font-family:newtimeroman; font-size:140%; text-align:left; border-radius: 20px 50px;">4.1 Common Vectorizer Usage</p>
Reference: https://scikit-learn.org/stable/modules/feature_extraction.html#common-vectorizer-usage

[Content](#0)

In [None]:
# Instantiate the Vectorizer
# min_df=1 keeps every term that occurs at least once; the previous integer
# value 0 meant the same thing but is rejected by recent scikit-learn releases.
vect = CountVectorizer(stop_words='english', ngram_range=(1,2), min_df=1, max_df=0.9, max_features=100)
# Fit on the tweet texts. Passing the DataFrame itself would iterate over its
# column names, so the vocabulary would be built from the headers only.
# Missing texts are replaced by '' because the vectorizer rejects NaN documents.
df_dtm = vect.fit_transform(df_train.text.fillna(''))
# Show the dense count vector of the first tweet
df_dtm.toarray()[0]

<a id=4.2 ></a>
## <p style="background-color:skyblue; font-family:newtimeroman; font-size:140%; text-align:left; border-radius: 20px 50px;">4.2 TF-IDF</p>
Reference: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html#sklearn.feature_extraction.text.TfidfVectorizer

[Content](#0)

In [None]:
# min_df=1 keeps every term that occurs at least once; the previous integer
# value 0 meant the same thing but is rejected by recent scikit-learn releases.
tfidf_vect = TfidfVectorizer(stop_words='english', ngram_range=(1,2), min_df=1, max_df=0.98, max_features=100)
# Fit on the tweet texts. Passing the DataFrame itself would iterate over its
# column names, so the vocabulary would be built from the headers only.
# Missing texts are replaced by '' because the vectorizer rejects NaN documents.
df_ifidf= tfidf_vect.fit_transform(df_train.text.fillna(''))
# Show the dense TF-IDF vector of the first tweet
df_ifidf.toarray()[0]

<a id=5 ></a>
# <p style="background-color:skyblue; font-family:newtimeroman; font-size:140%; text-align:left; border-radius: 20px 50px;">5. Transfer Learning with Hugging Face</p>

[Content](#0)

BERT (*Bidirectional Encoder Representations from Transformers*) advanced the state of the art on several NLP benchmarks when it was published, pushing:

    - GLUE Score to 80.5%
    - MultiNLI accuracy to 86.7%
    - SQuAD v1.1 question answering Test F1 to 93.2
    - SQuAD v2.0 Test F1 to 83.1

In [None]:
# Mini-batch size used when batching the tf.data pipelines and when predicting
BATCH_SIZE=64

<a id=5.1 ></a>
# <p style="background-color:skyblue; font-family:newtimeroman; font-size:100%; text-align:left; border-radius: 20px 50px;">5.1 Tokenizing Text</p>

[Content](#0)

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer published with the "distilbert-base-uncased" checkpoint
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [None]:
# Find the longest sentence, measured in tokens rather than characters.
# max_len is used below as the tokenizer's max_length and as the model's input
# width, so it has to be a token count (special tokens included). A character
# count over-pads every sequence and is not a guaranteed upper bound either.
all_texts = pd.concat([df_train.text, df_test.text]).fillna('').astype(str).tolist()
# truncation=True caps each sequence at the tokenizer's maximum model length,
# so max_len can never exceed what the model accepts.
token_id_lists = tokenizer(all_texts, truncation=True)['input_ids']
max_len = max(len(token_ids) for token_ids in token_id_lists)
max_len

In [None]:
# Using for Fine_tuning: hold out 20% of the prepared training tweets for validation
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df_train.text,
    df_train.target,
    test_size=0.2,
    random_state=42,
)

# Use padding with max_len to get train/val/test with same dimension;
# the same tokenizer options are shared by all three splits.
tokenizer_kwargs = dict(truncation=True, max_length=max_len, padding="max_length", return_tensors='tf')
train_encodings = tokenizer(train_texts.tolist(), **tokenizer_kwargs)
val_encodings = tokenizer(val_texts.tolist(), **tokenizer_kwargs)
# Missing test texts are encoded as empty strings
test_encodings = tokenizer(df_test.text.fillna('').tolist(), **tokenizer_kwargs)

print(train_encodings)

In [None]:
# Using for Keras-tuner: encode the complete (unsplit) training text
train_encodings_keras = tokenizer(
    df_train.text.tolist(),
    truncation=True,
    max_length=max_len,
    padding="max_length",
    return_tensors="tf",
)
train_encodings_keras

In [None]:
# Inspect the token-id tensor of the full-training-set encoding
train_encodings_keras['input_ids']

In [None]:
# Encode Training Data: convert the TensorFlow tensors to NumPy arrays
X_train_ids = train_encodings['input_ids'].numpy()
# .numpy must be *called*; without the parentheses this stored the bound
# method instead of the attention-mask array.
X_train_attention = train_encodings['attention_mask'].numpy()
# Encode Validating Data
X_val_ids = val_encodings['input_ids'].numpy()
X_val_attention = val_encodings['attention_mask'].numpy()

In [None]:
# Training pipeline: (feature dict, label) pairs, shuffled, then batched
train_tf_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings), train_labels))
# The shuffle buffer must span the number of *examples*. len(train_encodings)
# is the number of keys in the encoding (input_ids, attention_mask), which
# gave a buffer of 2 and therefore practically no shuffling.
train_tf_dataset = train_tf_dataset.shuffle(len(train_labels)).batch(BATCH_SIZE)

# Validation pipeline: same structure, kept in order (no shuffling needed)
eval_tf_dataset = tf.data.Dataset.from_tensor_slices((dict(val_encodings), val_labels))
eval_tf_dataset = eval_tf_dataset.batch(BATCH_SIZE)

<a id=5.2 ></a>
# <p style="background-color:skyblue; font-family:newtimeroman; font-size:100%; text-align:left; border-radius: 20px 50px;">5.2 Define a model based on DistilBERT</p>

In this part, I try a lighter model than BERT: 

[DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108)

[Content](#0)


<img src="https://i.ibb.co/4tzyG1P/Bert-Classification.png" alt="Bert-Classification" border="0">

# Initialize the Base Model

In [None]:
from transformers import TFDistilBertModel, DistilBertConfig

# Dropout values handed to the DistilBERT configuration
BERT_DROPOUT = 0.2
BERT_ATT_DROPOUT = 0.2

# Configure DistilBERT's initialization; this config is reused for every
# DistilBERT instance loaded later in the notebook.
config = DistilBertConfig(
    dropout=BERT_DROPOUT,
    attention_dropout=BERT_ATT_DROPOUT,
    output_hidden_states=True,
)

# Add a Classification Head

In [None]:
# Model function
def create_model(transformer):
    """Stack a small dense classification head on a frozen transformer.

    Args:
        transformer: Model exposing ``.layers``. It is called with
            ``[input_ids, attention_mask]`` and element ``[0]`` of its output
            is used as the last hidden state.

    Returns:
        An uncompiled ``keras.Model`` with two int32 inputs named
        ``input_ids`` and ``attention_mask`` (each of width ``max_len``) and a
        single sigmoid output unit.

    Side effects:
        Marks every layer of ``transformer`` as non-trainable, prints the
        model summary and calls ``keras.utils.plot_model``.
    """
    # Freeze the transformer so that only the new head is trained
    for encoder_layer in transformer.layers:
        encoder_layer.trainable = False

    # The two inputs, named after the tokenizer's output keys
    token_ids = keras.Input(shape=(max_len,), dtype=tf.int32, name='input_ids')
    attention_mask = keras.Input(shape=(max_len,), dtype=tf.int32, name='attention_mask')

    # Element 0 of the transformer output is the hidden state of the last
    # layer: (batch_size, sequence_length, hidden_size).
    hidden_states = transformer([token_ids, attention_mask])[0]

    # Keep only position 0 of every sequence (the [CLS] token), which turns
    # the 3D hidden states into 2D features.
    cls_embedding = hidden_states[:, 0, :]

    # Classification head: Dense(256) -> Dropout -> Dense(64) -> sigmoid unit
    features = keras.layers.Dense(
        256,
        activation='relu',
        kernel_initializer=keras.initializers.GlorotUniform(seed=1),
        kernel_constraint=None,
        bias_initializer='zeros',
    )(cls_embedding)
    features = keras.layers.Dropout(0.2)(features)
    features = keras.layers.Dense(64, activation='relu')(features)
    probability = keras.layers.Dense(1, activation='sigmoid')(features)

    # Assemble the model, then show its structure
    model = keras.Model([token_ids, attention_mask], probability)
    model.summary()
    keras.utils.plot_model(model)

    return model

In [None]:
def distilBERT_NN_tuner(hp):
    """Model-building function for Keras Tuner.

    Args:
        hp: Hyperparameter object; ``hp.Choice`` is used to pick the
            ``learning_rate`` for the trial.

    Returns:
        A compiled model: a freshly loaded DistilBERT wrapped by
        ``create_model``, using Adam, binary cross-entropy and accuracy.
    """
    # Bare pre-trained DistilBERT (raw hidden states, no task-specific head)
    encoder = TFDistilBertModel.from_pretrained('distilbert-base-uncased', config=config)
    classifier = create_model(encoder)

    # Candidate learning rates, as recommended in the paper "BERT: Pre-training
    # of Deep Bidirectional Transformers for Language Understanding"
    learning_rate = hp.Choice('learning_rate', values=[5e-5, 4e-5, 3e-5, 2e-5])

    classifier.compile(
        keras.optimizers.Adam(learning_rate=learning_rate),
        loss="binary_crossentropy",
        metrics=['accuracy'],
    )
    return classifier

<a id=5.3 ></a>
# <p style="background-color:skyblue; font-family:newtimeroman; font-size:100%; text-align:left; border-radius: 20px 50px;">5.3 Training Classification Layer Weights</p>

[Content](#0)

# Using Keras-Tuner to find the best Learning-rate

### RandomSearch

In [None]:
# Random search over the learning-rate choices, ranked by validation accuracy
tuner = kt.RandomSearch(distilBERT_NN_tuner,
                objective='val_accuracy')
# Stop a trial after 4 epochs without improvement and keep its best weights
early_stop = keras.callbacks.EarlyStopping(monitor='val_accuracy',
                                           patience=4, 
                                           restore_best_weights=True)
# The datasets are already batched with BATCH_SIZE, so no batch_size is passed
# here: Keras documents that it must not be specified for dataset inputs.
tuner.search(train_tf_dataset,
                epochs=25,
                validation_data=eval_tf_dataset,
                callbacks = [early_stop],
                verbose=2)
best_hps=tuner.get_best_hyperparameters(num_trials=1)[0]
print(f"""The hyperparameter search is complete. The optimal learning rate for the optimizer is {best_hps.get('learning_rate')}.""")

# Running model with the best Learning Rate

In [None]:
# Train only the classification head, using the learning rate found by the tuner
DistilBERTmodel = TFDistilBertModel.from_pretrained('distilbert-base-uncased', config=config)
model = create_model(DistilBERTmodel)
# Compile the model. `learning_rate` is the supported argument name; the old
# `lr` alias is deprecated and rejected by newer Keras optimizers.
model.compile(keras.optimizers.Adam(learning_rate=best_hps.get('learning_rate')), 
              loss="binary_crossentropy",
              metrics=['accuracy'])
early_stop = keras.callbacks.EarlyStopping(monitor='val_accuracy',patience=4, restore_best_weights=True)

# The datasets are already batched with BATCH_SIZE, so batch_size is not passed
train_history1 = model.fit(train_tf_dataset,
                           epochs=25,
                           validation_data=eval_tf_dataset,
                           callbacks = [early_stop],
                           verbose=2)

<a id=5.4 ></a>
# <p style="background-color:skyblue; font-family:newtimeroman; font-size:100%; text-align:left; border-radius: 20px 50px;">5.4 Fine-tune DistilBERT model and Training all Weights</p>

    1. Unfreezing layer weights in DistilBERT model
    2. Using lower learning rate to prevent large update to pre-trained weights
    3. Recompile model again
[Content](#0)

### Unfreeze all layer weights in distilBERT and make available for training

In [None]:
# Unfreeze every DistilBERT layer. The Keras attribute is lowercase
# `trainable`; assigning `Trainable` only created an unrelated attribute and
# left the encoder frozen, so the "fine-tuning" stage trained the head again.
for layer in DistilBERTmodel.layers:
    layer.trainable = True

## Recompile model after unfreezing

A lower learning rate is chosen to prevent large updates to the pre-trained weights.

In [None]:
# Recompile so the change in trainable weights takes effect. `learning_rate`
# replaces the deprecated `lr` alias, which newer Keras optimizers reject.
model.compile(keras.optimizers.Adam(learning_rate=1e-5), 
              loss="binary_crossentropy",
              metrics=['accuracy'])

## Training the model again

In [None]:
# Fine-tuning run: shorter patience (2) than in the head-only stage
early_stop = keras.callbacks.EarlyStopping(monitor='val_accuracy',patience=2, restore_best_weights=True)
# The datasets are already batched with BATCH_SIZE, so batch_size is not passed
train_history2 = model.fit(train_tf_dataset,
                               epochs=25,
                               validation_data=eval_tf_dataset,
                               callbacks = [early_stop],
                               verbose=2 )

<a id=6 ></a>
# <p style="background-color:skyblue; font-family:newtimeroman; font-size:140%; text-align:left; border-radius: 20px 50px;">6. Make a Submission</p>

[Content](#0)

In [None]:
def submission(model, test,
               sample_path='/kaggle/input/nlp-getting-started/sample_submission.csv',
               output_path='submission.csv',
               batch_size=None):
    """Predict on the encoded test set and write a submission CSV.

    Args:
        model: Object with a Keras-style ``predict(x, batch_size=, verbose=)``.
        test: Encoded test inputs. If the object has a ``.data`` attribute
            (as the tokenizer output is used in this notebook) that mapping is
            fed to the model; otherwise ``test`` itself is used.
        sample_path: CSV whose ``id`` column supplies the submission ids.
        output_path: Destination of the generated CSV.
        batch_size: Prediction batch size; defaults to the notebook-level
            ``BATCH_SIZE`` when omitted.

    Raises:
        ValueError: If the number of predictions differs from the number of
            ids in the sample submission.
    """
    if batch_size is None:
        batch_size = BATCH_SIZE
    sample_sub = pd.read_csv(sample_path)
    inputs = getattr(test, 'data', test)
    predictions = model.predict(inputs, batch_size=batch_size, verbose=1)
    # Round the sigmoid outputs to 0/1 and flatten (n, 1) -> (n,) in one
    # vectorised step; calling int() on one-element arrays is deprecated in NumPy.
    y_preds = np.rint(predictions).astype(int).ravel()
    if len(y_preds) != len(sample_sub):
        raise ValueError(
            f'Got {len(y_preds)} predictions for {len(sample_sub)} submission ids'
        )
    sub = pd.DataFrame({'id': sample_sub['id'].values, 'target': y_preds})
    sub.to_csv(output_path, index=False)

In [None]:
# Predict on the encoded test tweets with the fine-tuned model and write submission.csv
submission(model, test_encodings)

In [None]:
# Read the file back to check what was written
pd.read_csv('submission.csv')

<a id=7 ></a>
# <p style="background-color:skyblue; font-family:newtimeroman; font-size:140%; text-align:left; border-radius: 20px 50px;">7. References</p>

[Content](#0)

[Hugging Face Transformers Fine-Tuning DistilBert for Binary Classification Tasks](https://towardsdatascience.com/hugging-face-transformers-fine-tuning-distilbert-for-binary-classification-tasks-490f1d192379)

[Keras Tuner](https://keras.io/keras_tuner)

[Distil Bert](https://huggingface.co/transformers/model_doc/distilbert.html)


# If you like this kernel, please upvote and tell me your thoughts. Thank you @@