In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras

# Visualization
!pip install dataprep | grep -v 'already satisfied'
from dataprep.eda import plot, plot_diff, create_report

#Preprocessing and Modelling
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

#Fine-tuning
!pip install -q -U keras-tuner
import keras_tuner as kt

# Warning
import warnings
warnings.filterwarnings('ignore')

<a id=0></a>
## <p style="background-color:lightblue; font-family:newtimeroman; font-size:120%; text-align:left; border-radius: 15px 50px;">Table of Content</p>
* [0. What was updated in the latest version?](#0)
* [1. Loading Data](#1)
* [2. EDA ](#2)
* [3. Data Preprocessing](#3)
* [4. Vectorization](#4)
    * [4.1 Common Vectorizer Usage](#4.1)
    * [4.2 TF-IDF Term Weighting](#4.2)

* [5. BERT model](#5)
    * [5.1 Preprocessing Data](#5.1)
    * [5.2 DistilBERT model with Fine-tuning using Keras](#5.2)
* [6. Make a Submission](#6)
* [7. References](#7)

<a id='0.1'></a>
# <p style="background-color:skyblue; font-family:newtimeroman; font-size:150%; text-align:center; border-radius: 15px 50px;">0. What was updated in the latest version?</p>

## Current Version

   1. Update the syntax for `pip install dataprep`
   
   2. Use the *val_accuracy* monitor in EarlyStopping because it gives better results.
   
   
# Older Versions

## Version 7

   1. Update References
   
   2. Use the data from [this dataset](https://www.kaggle.com/phanttan/disastertweet-prepared2) again

## Version 6

   1. Add 4e-5 to the learning_rate candidates for tuning
   
   2. Using data at [this dataset](https://www.kaggle.com/phanttan/disastertweets-prepared) 
   
## Version 5

   1. Find the maximum text length so that shorter inputs are fed to the model (from 256 -> 149)

[Content](#0)

<a id='1'></a>
# <p style="background-color:skyblue; font-family:newtimeroman; font-size:150%; text-align:center; border-radius: 15px 50px;">1. Loading Data 💎</p>

Load the raw competition training and test sets and report their shape and memory usage.

[Content](#0)

In [None]:
# Read the raw competition CSVs for the training and test splits.
train_full = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
test_full = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')

# For each split, print its shape and its memory usage as reported by
# DataFrame.memory_usage(), converted from bytes to MB (2**20 bytes).
for shape_template, memory_template, frame in (
    ('Training Set Shape = {}', 'Training Set Memory Usage = {:.2f}MB', train_full),
    ('Test Set Shape = {}', 'Test Set Memory Usage = {:.2f}MB', test_full),
):
    print(shape_template.format(frame.shape))
    print(memory_template.format(frame.memory_usage().sum()/2**20))

<a id='2'></a>
# <p style="background-color:skyblue; font-family:newtimeroman; font-size:150%; text-align:center; border-radius: 15px 50px;">2. EDA 📊</p>


[Content](#0)

In [None]:
# Visualise the whole raw training frame with dataprep's `plot`
# (no column argument is passed, so the full frame is handed over).
plot(train_full)

In [None]:
# Build dataprep's report for the raw training frame via `create_report`.
create_report(train_full)

Tweets with a length of 120 to 140 characters are the most common.

<a id='3'></a>
# <p style="background-color:skyblue; font-family:newtimeroman; font-size:150%; text-align:center; border-radius: 15px 50px;">3. Data Pre-processing </p>

Now we are going to engineer the data to make it easier for the model to classify.

This section is very important for reducing the dimensionality of the problem.


[Content](#0)

# Main techniques I used on this data

* [3.1] Remove 92 duplicated rows
* [3.2] Cleaning text
* [3.3] Spelling Checker
* [3.4] Remove Stemming

#### Step 3.3 takes a lot of time (around 4000s of the 4536s total).
#### So I split the data preprocessing into [another kernel](https://www.kaggle.com/phanttan/disastertweet-prepareddata).
#### The prepared data is saved in a [new dataset](https://www.kaggle.com/phanttan/disastertweet-prepared2).
#### I would really appreciate it if you use/upvote it.


In [None]:
# Load the already-preprocessed train/test splits from the attached
# `disastertweet-prepared2` Kaggle dataset instead of recomputing them here.
df_train = pd.read_csv('/kaggle/input/disastertweet-prepared2/train_prepared.csv')
df_test = pd.read_csv('/kaggle/input/disastertweet-prepared2/test_prepared.csv')

<a id=4 ></a>
# <p style="background-color:skyblue; font-family:newtimeroman; font-size:140%; text-align:left; border-radius: 20px 50px;">4. Vectorization</p>

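Three steps when using the Bag-of-words (BOW) model:
1. Term frequency: count the occurrences of each word in a sentence
2. Inverse document frequency: down-weight words that occur in many sentences, since they carry less information
3. L2 norm: scale each resulting vector to unit length

Reference: https://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction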

[Content](#0)

<a id=4.1 ></a>
## <p style="background-color:skyblue; font-family:newtimeroman; font-size:140%; text-align:left; border-radius: 20px 50px;">4.1 Common Vectorizer Usage</p>
Reference: https://scikit-learn.org/stable/modules/feature_extraction.html#common-vectorizer-usage

[Content](#0)

In [None]:
# Bag-of-words counts over the prepared tweet text.
#   stop_words='english' : drop scikit-learn's built-in English stop words
#   ngram_range=(1, 2)   : use unigrams and bigrams
#   min_df=1             : keep terms seen in at least one tweet (an integer
#                          min_df below 1 is rejected by recent scikit-learn)
#   max_df=0.9           : ignore terms present in more than 90% of the tweets
#   max_features=100     : keep only the 100 most frequent terms
vect = CountVectorizer(stop_words='english', ngram_range=(1, 2), min_df=1, max_df=0.9, max_features=100)

# Fit on the text column. Iterating a DataFrame yields its column labels, so
# passing `df_train` itself would vectorize the header names, not the tweets.
df_dtm = vect.fit_transform(df_train.text)

# Show the count vector of the first tweet; only that row is densified.
df_dtm[0].toarray()[0]

<a id=4.2 ></a>
## <p style="background-color:skyblue; font-family:newtimeroman; font-size:140%; text-align:left; border-radius: 20px 50px;">4.2 TF-IDF</p>
Reference: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html#sklearn.feature_extraction.text.TfidfVectorizer

[Content](#0)

In [None]:
# TF-IDF weights over the prepared tweet text, with the same vocabulary limits
# as a plain count model: English stop words removed, unigrams + bigrams,
# terms in more than 98% of tweets ignored, top 100 terms kept.
# min_df=1 keeps every term that occurs at all (an integer min_df below 1 is
# rejected by recent scikit-learn).
tfidf_vect = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), min_df=1, max_df=0.98, max_features=100)

# Fit on the text column. Iterating a DataFrame yields its column labels, so
# passing `df_train` itself would vectorize the header names, not the tweets.
df_ifidf = tfidf_vect.fit_transform(df_train.text)

# Show the TF-IDF vector of the first tweet; only that row is densified.
df_ifidf[0].toarray()[0]

<a id=5 ></a>
# <p style="background-color:skyblue; font-family:newtimeroman; font-size:100%; text-align:left; border-radius: 20px 50px;">5 BERT model</p>

[Content](#0)

BERT (*Bidirectional Encoder Representations from Transformers*) set new state-of-the-art results on several NLP benchmarks when it was published, pushing:

- GLUE score to 80.5%
- MultiNLI accuracy to 86.7%
- SQuAD v1.1 question answering Test F1 to 93.2
- SQuAD v2.0 Test F1 to 83.1

<a id=5.1 ></a>
# <p style="background-color:skyblue; font-family:newtimeroman; font-size:100%; text-align:left; border-radius: 20px 50px;">5.1 Preprocessing Data</p>

[Content](#0)

In [None]:
# Peek at the text column of the prepared test split.
df_test.text

In [None]:
# Length of the longest prepared tweet across the train and test splits.
# `len()` is applied to each text string, so this is a character count,
# not a token count. Falls back to 0 when there are no tweets.
all_texts = pd.concat([df_train.text, df_test.text])
max_len = max((len(tweet) for tweet in all_texts), default=0)
max_len

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

# Encoding options shared by both splits so train and test are processed
# identically. `max_len` comes from the character length of the longest tweet,
# so it is only an estimate of the token count: truncation=True guarantees
# every row ends up with exactly `max_len` ids. Without it, a tweet that
# tokenizes to more than `max_len` tokens would make the batch ragged and the
# conversion to a TensorFlow tensor would fail.
encode_kwargs = dict(max_length=max_len,
                     padding='max_length',
                     truncation=True,
                     return_tensors='tf')

train_x = tokenizer(df_train.text.tolist(), **encode_kwargs)
test_x = tokenizer(df_test.text.tolist(), **encode_kwargs)

# Labels for the training split.
train_y = df_train.target

In [None]:
# Display the encoded training batch returned by the tokenizer.
train_x

<a id=5.2 ></a>
# <p style="background-color:skyblue; font-family:newtimeroman; font-size:100%; text-align:left; border-radius: 20px 50px;">5.2 DistilBERT model with Fine-tuning using Keras </p>

[Content](#0)

The DistilBERT model was proposed in the blog post "Smaller, faster, cheaper, lighter: Introducing DistilBERT, a distilled version of BERT".

In [None]:
from transformers import TFAutoModelForSequenceClassification

def distilBERT_tuner(hp):
    """Build and compile a DistilBERT sequence classifier for Keras Tuner.

    Parameters
    ----------
    hp
        Hyperparameter container handed in by the tuner. Only ``hp.Choice``
        is used, to pick the ``'learning_rate'`` of the Adam optimizer.

    Returns
    -------
    The compiled classifier loaded from the ``'distilbert-base-uncased'``
    checkpoint.
    """
    classifier = TFAutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased')

    # Candidate rates follow the recommendation of the paper "BERT: Pre-training
    # of Deep Bidirectional Transformers for Language Understanding".
    chosen_rate = hp.Choice('learning_rate', values=[5e-5, 4e-5 , 3e-5, 2e-5])

    # The loss is configured with from_logits=True, i.e. it is fed the raw
    # model outputs together with integer class labels.
    logits_loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    accuracy_metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

    classifier.compile(
        optimizer=keras.optimizers.Adam(learning_rate=chosen_rate),
        loss=logits_loss,
        metrics=[accuracy_metric],
    )
    return classifier
    

## Instantiate the tuner and perform hypertuning

In [None]:
# Hyperband tuner that builds candidate models with `distilBERT_tuner` and
# ranks them by validation accuracy ('val_accuracy'), training any candidate
# for at most 4 epochs. Trial results are written under my_dir/DistilBERT_to_kt.
tuner = kt.Hyperband(distilBERT_tuner,
                    objective='val_accuracy',
                    max_epochs=4,
                    factor=3,
                    directory='my_dir',
                    project_name='DistilBERT_to_kt')

In [None]:
# Stop a run once validation accuracy has not improved for 2 consecutive epochs.
# The patience has to stay below the 4-epoch budget used for tuning in this
# notebook: with the previous value of 5 the callback could never trigger.
stop_early = keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=2)

In [None]:
# Run the hyperparameter search on the encoded training tweets, holding out
# 20% of them for validation.
# NOTE(review): only `input_ids` are passed; the tokenizer output is otherwise
# unused here, so if it carries an attention mask the model never sees it.
# Consider passing it consistently in search, fit and predict.
search_options = dict(epochs=4, validation_split=0.2, callbacks=[stop_early])
tuner.search(train_x['input_ids'], train_y, **search_options)

# Keep the single best hyperparameter set found by the search.
top_candidates = tuner.get_best_hyperparameters(num_trials=1)
best_hps = top_candidates[0]

print(f"""The hyperparameter search is complete. The optimal learning rate for the optimizer is {best_hps.get('learning_rate')}.""")

### Train the model

In [None]:
# Build the model with the optimal hyperparameters and train it on the data for 4 epochs
model = tuner.hypermodel.build(best_hps)
history = model.fit(train_x['input_ids'], 
                    train_y, 
                    epochs=4, 
                    validation_split=0.2)

val_acc_per_epoch = history.history['val_accuracy']
best_epoch = val_acc_per_epoch.index(max(val_acc_per_epoch)) + 1
print('Best epoch: %d' % (best_epoch,))

### Re-instantiate the hypermodel and train it with the optimal number of epochs from above

In [None]:
# Build a new model from the best hyperparameters; this is the model used
# for the submission further down.
hypermodel = tuner.hypermodel.build(best_hps)

# Retrain the model for `best_epoch` epochs (the epoch count picked in the
# previous training cell).
# NOTE(review): validation_split=0.2 still holds out 20% of the training data
# in this final fit — confirm that is intended.
hypermodel.fit(train_x['input_ids'], 
               train_y, 
               epochs=best_epoch, 
               validation_split=0.2)

<a id=6 ></a>
# <p style="background-color:skyblue; font-family:newtimeroman; font-size:140%; text-align:left; border-radius: 20px 50px;">6. Make a Submission</p>

[Content](#0)

In [None]:
def submission_transformer(model, test,
                           sample_path='/kaggle/input/nlp-getting-started/sample_submission.csv',
                           output_path='submission.csv'):
    """Predict labels with a transformer classifier and write a submission CSV.

    Parameters
    ----------
    model
        Object exposing ``predict``; it is called with ``test['input_ids']``.
    test
        Mapping that holds the encoded test inputs under ``'input_ids'``.
    sample_path : str
        CSV whose ``id`` column supplies the ids of the submission rows.
        Defaults to the competition's sample submission.
    output_path : str
        Where the submission is written. Defaults to ``'submission.csv'``.

    Raises
    ------
    ValueError
        If the number of predictions differs from the number of ids.
    """
    sample_sub = pd.read_csv(sample_path)
    predictions = model.predict(test['input_ids'])

    # predict() may return a mapping with a 'logits' entry or a tuple-like
    # whose first element holds the logits; accept both.
    logits = predictions['logits'] if isinstance(predictions, dict) else predictions[0]

    # One class index per row; assumes logits are (n_samples, n_classes).
    y_preds = np.argmax(np.asarray(logits), axis=-1)

    if len(y_preds) != len(sample_sub):
        raise ValueError(
            'Number of predictions ({}) does not match number of ids ({})'.format(
                len(y_preds), len(sample_sub)))

    sub = pd.DataFrame({'id': sample_sub['id'].to_numpy(), 'target': y_preds})
    sub.to_csv(output_path, index=False)

In [None]:
# Predict on the encoded test set with the retrained model and write the
# submission file.
submission_transformer(hypermodel, test_x)

In [None]:
# Read the written submission back to check its contents.
pd.read_csv('submission.csv')

<a id=7 ></a>
# <p style="background-color:skyblue; font-family:newtimeroman; font-size:140%; text-align:left; border-radius: 20px 50px;">7. References</p>
[Content](#0)

[Keras Tuner](https://keras.io/keras_tuner)

[Distil Bert](https://huggingface.co/transformers/model_doc/distilbert.html)

# If you like this kernel, please upvote and tell me your thoughts. Thank you!