This notebook is based on xhlulu's notebook (Disaster NLP: Keras BERT using TFHub, https://www.kaggle.com/xhlulu/disaster-nlp-keras-bert-using-tfhub), with an SVM as the classifier instead of the sigmoid output.
The idea is simple: fine-tune BERT, then pass the CLS embeddings to a separate classifier (the idea was described with an SVM; the code below uses an XGBoost classifier in that role).

### References
- Disaster NLP: Keras BERT using TFHub https://www.kaggle.com/xhlulu/disaster-nlp-keras-bert-using-tfhub
- Source for bert_encode function: https://www.kaggle.com/user123454321/bert-starter-inference
- All pre-trained BERT models from Tensorflow Hub: https://tfhub.dev/s?q=bert
- Deep Learning using Linear Support Vector Machines: http://deeplearning.net/wp-content/uploads/2013/03/dlsvm.pdf

In [None]:
!pip install bert-for-tf2
!pip install sentencepiece

In [None]:
### NATIVE PYTHON IMPORTS
import numpy as np
import pandas as pd

### TENSOR FLOW IMPORTS ###
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras import regularizers
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.utils import to_categorical
import tensorflow.keras.backend as K
import tensorflow_hub as hub
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

try:
    %tensorflow_version 2.x
except Exception:
    pass
import tensorflow as tf

import tensorflow_hub as hub

from tensorflow.keras import layers
import bert

In [None]:
# Load the labelled dataset into a DataFrame; later cells read its "text" and
# "hatespeech_comb" columns.
data =  pd.read_csv('../input/hatespeech/2019-05-28_portuguese_hate_speech_binary_classification.csv')

# Show the DataFrame as the cell output for a quick visual check.
data

In [None]:
# Converting to lower case
data["preprocessed_text"] = data["text"].str.lower()

# Check that the column was converted correctly
data["preprocessed_text"] 

In [None]:
import nltk
from nltk.corpus import stopwords
import string
from nltk import WordNetLemmatizer
from nltk import sent_tokenize, word_tokenize
import re
# strip digits
def remove_numbers(text):
    """Return *text* with every run of digits deleted."""
    digit_run = re.compile(r'\d+')
    return digit_run.sub('', text)

# strip punctuation
def remove_punctuation(text):
    """Return *text* without any character listed in ``string.punctuation``."""
    # Mapping a code point to None tells str.translate to drop that character.
    deletions = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(deletions)

# split into tokens
def tokenize(text):
    """Split *text* into tokens with NLTK's ``word_tokenize`` and return them."""
    return word_tokenize(text)

def preprocessing(text):
    """Clean one text and return it as a single space-separated string.

    Punctuation is stripped first, the result is tokenized, and the tokens
    are joined back together with single spaces. Digit removal
    (``remove_numbers``) is currently not applied.
    """
    without_punctuation = remove_punctuation(text)
    return ' '.join(tokenize(without_punctuation))

In [None]:
# Run the cleaning pipeline on the lower-cased "preprocessed_text" column
# built in the previous cell. Reading from the raw "text" column instead would
# silently discard the lower-casing, because the result overwrites
# "preprocessed_text".
pp_text_train = [
    preprocessing(text_data) for text_data in data['preprocessed_text']
]
data['preprocessed_text'] = pp_text_train  # store the cleaned text as a column
data['preprocessed_text']

 # Helper Functions

In [None]:
def bert_encode(texts, tokenizer, max_len=512):
    """Turn texts into the three fixed-length integer inputs fed to BERT.

    Each text is tokenized, cut to ``max_len - 2`` tokens, wrapped in
    ``[CLS]`` / ``[SEP]`` and right-padded with id 0 up to ``max_len``.

    Args:
        texts: iterable of strings to encode.
        tokenizer: object exposing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)``.
        max_len: length of every encoded row, special tokens included.

    Returns:
        Tuple ``(token_ids, masks, segment_ids)`` of NumPy arrays with one
        row per text. ``masks`` holds 1 over real tokens and 0 over padding;
        ``segment_ids`` is all zeros.
    """
    token_rows, mask_rows, segment_rows = [], [], []
    body_limit = max_len - 2  # leave room for [CLS] and [SEP]

    for raw_text in texts:
        pieces = tokenizer.tokenize(raw_text)[:body_limit]
        sequence = ["[CLS]", *pieces, "[SEP]"]
        used = len(sequence)
        padding = max_len - used

        ids = tokenizer.convert_tokens_to_ids(sequence)
        ids += [0] * padding

        token_rows.append(ids)
        mask_rows.append([1] * used + [0] * padding)
        segment_rows.append([0] * max_len)

    return np.array(token_rows), np.array(mask_rows), np.array(segment_rows)

In [None]:
def build_model(bert_layer, max_len=512):
    """Build and compile a binary classifier on top of a BERT Keras layer.

    Args:
        bert_layer: callable Keras layer that takes
            ``[input_word_ids, input_mask, segment_ids]`` and returns two
            outputs, the second being the per-token sequence output.
        max_len: length of each of the three integer input sequences.

    Returns:
        A compiled Keras ``Model`` with three int32 inputs of shape
        ``(max_len,)`` and a single sigmoid output, trained with binary
        cross-entropy and reporting accuracy.
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")

    # Only the second output (the per-token sequence output) is used.
    _, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
    # Keep the vector at position 0 of every sequence as the sentence feature.
    clf_output = sequence_output[:, 0, :]
    out = Dense(1, activation='sigmoid')(clf_output)

    model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
    # `learning_rate` is the supported keyword; the legacy `lr` alias is
    # deprecated and rejected by newer Keras releases.
    model.compile(Adam(learning_rate=1e-5), loss='binary_crossentropy', metrics=['accuracy'])

    return model

# Load and Preprocess
- Load BERT from the Tensorflow Hub
- Load CSV files containing training data
- Load tokenizer from the bert layer
- Encode the text into tokens, masks, and segment flags

In [None]:
%%time
# Load the pretrained BERT encoder from TensorFlow Hub as a Keras layer;
# trainable=True lets its weights be updated during fine-tuning.
# NOTE(review): the handle names an English uncased checkpoint while the
# dataset file name says Portuguese — confirm this is the intended model.
module_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/1"
bert_layer = hub.KerasLayer(module_url, trainable=True)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows as the test set; the fixed random_state makes the
# split repeatable across runs.
train, test, train_labels, test_labels = train_test_split(data.preprocessed_text.values, data.hatespeech_comb.values, test_size=0.10, random_state=42)

In [None]:
# Build the tokenizer from the vocabulary file and lower-casing flag stored
# inside the loaded BERT layer, so tokenization matches the checkpoint.
BertTokenizer = bert.bert_tokenization.FullTokenizer
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = BertTokenizer(vocab_file, do_lower_case)

In [None]:
# Encode both splits into (token ids, masks, segment ids); every row is
# truncated or padded to 160 positions.
train_input = bert_encode(train, tokenizer, max_len=160)
test_input = bert_encode(test, tokenizer, max_len=160)

# Model: Build, Fine-tune

In [None]:
# max_len must equal the length used when encoding the inputs (160).
model = build_model(bert_layer, max_len=160)
model.summary()

In [None]:
# Using 1 epoch to avoid overfitting on the small dataset
train_history = model.fit(
    train_input, train_labels,
    validation_split=0.2,  # fraction of the training data set aside for validation
    epochs=1,
    batch_size=16
)

# Persist the fine-tuned model to an HDF5 file.
model.save('model.h5')

In [None]:
# Feature extractor that outputs the embedding fed into the final Dense layer.
# The tensor is taken from the classification head's input instead of being
# looked up by the auto-generated layer name ('tf_op_layer_strided_slice'):
# that name changes between TensorFlow versions and gains a numeric suffix
# whenever the model is built more than once in the same session.
cls_layer_model = Model(model.input, outputs=model.layers[-1].input)

In [None]:
from tensorflow.keras.layers import Activation
from sklearn.model_selection import StratifiedShuffleSplit, train_test_split
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, precision_score, recall_score
from statistics import mean
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import precision_recall_fscore_support


# Ten stratified 90/10 shuffles of the training portion, seeded like the other
# splits in this notebook so the reported scores are repeatable.
sss = StratifiedShuffleSplit(n_splits=10, test_size=0.1, train_size=0.9, random_state=42)
f1 = []
accuracia = []
label = data['hatespeech_comb']
X_train, X_test, y_train, y_test = train_test_split(data.preprocessed_text.values, label, test_size=0.1, random_state = 42)

# The search space is identical for every fold, so it is built once.
param_grid = {
    'eta': [0, 0.3, 1],
    'gamma': [0.1, 1, 10],
    'nthread': [-1],
}

# NOTE(review): this split uses the same data, test_size and random_state as
# the split used to fine-tune BERT, so the held-out folds below appear to have
# been seen during fine-tuning; the CV scores are likely optimistic.
print("# Training CV ")
print()
for rodada, (train_index, test_index) in enumerate(sss.split(X_train, y_train), start=1):
    X_train_cv, X_test_cv = X_train[train_index], X_train[test_index]
    # y_train is a pandas Series that still carries the original (shuffled)
    # row labels, while the splitter yields positions. Plain [] indexing would
    # look the positions up as labels and pair texts with the wrong targets,
    # so positional .iloc is required. Remaining missing labels are treated as
    # class 0.
    y_train_cv = y_train.iloc[train_index].fillna(0.0)
    y_test_cv = y_train.iloc[test_index].fillna(0.0)

    # Replace the texts with the embeddings produced by the fine-tuned encoder.
    X_train_cv = cls_layer_model.predict(bert_encode(X_train_cv, tokenizer, max_len=160))
    X_test_cv = cls_layer_model.predict(bert_encode(X_test_cv, tokenizer, max_len=160))

    # Inner 10-fold grid search over the XGBoost hyper-parameters.
    modelXgb = xgb.XGBClassifier()
    grid_search_model = GridSearchCV(modelXgb, param_grid, scoring=['f1_micro', 'precision', 'recall'], refit='f1_micro', cv=10, return_train_score=True)
    grid_search_model.fit(X_train_cv, y_train_cv)
    print('Rodada: {} \n'.format(rodada))

    # Retrain on the whole fold with the best parameters found.
    best_params = grid_search_model.best_params_
    modelXgb = xgb.XGBClassifier(eta=best_params['eta'], gamma=best_params['gamma'], nthread=-1)
    modelXgb.fit(X_train_cv, y_train_cv)

    y_pred = modelXgb.predict(X_test_cv)
    result = f1_score(y_test_cv, y_pred.round(), average='micro')
    f1.append(result)
    print("# F1: ", result)
    acc = accuracy_score(y_test_cv, y_pred.round())
    accuracia.append(acc)
    print("# Accuracy: ", acc)
print()
print("# Mean Accuracy: ",mean(accuracia))
print("# CV f1 score:",mean(f1))

In [None]:
# Final evaluation: tune and fit XGBoost on the embeddings of the full training
# portion, then score once on the held-out 10%.
X_train, X_test, y_train, y_test = train_test_split(data.preprocessed_text.values, label, test_size=0.1, random_state = 42)

print("# Start test set")
X_train = cls_layer_model.predict(bert_encode(X_train, tokenizer, max_len=160))
# Missing labels are treated as class 0. This must be applied to y_train, the
# labels actually used for fitting below, not to y_train_cv, which is only a
# leftover from the last cross-validation fold.
y_train = y_train.fillna(0.0)

param_grid = {
    'eta': [0, 0.3, 1],
    'gamma': [0.1, 1, 10],
    'nthread': [-1],
}
modelXgb = xgb.XGBClassifier()
grid_search_model = GridSearchCV(modelXgb, param_grid, scoring=['f1_micro', 'precision', 'recall'], refit='f1_micro', cv=10, return_train_score=True)
grid_search_model.fit(X_train, y_train)

# Retrain on the whole training portion with the best parameters found.
best_params = grid_search_model.best_params_
modelXgb = xgb.XGBClassifier(eta=best_params['eta'], gamma=best_params['gamma'], nthread=-1)
modelXgb.fit(X_train, y_train)

X_test = cls_layer_model.predict(bert_encode(X_test, tokenizer, max_len=160))
y_test = y_test.fillna(0.0)
y_pred = modelXgb.predict(X_test)
print("test set f1 score:",f1_score(y_test, y_pred.round(),average = 'micro'))