In [21]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [22]:
from transformers import BertTokenizer, BertForSequenceClassification, BertTokenizerFast, TFBertModel

In [23]:
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

from sklearn.metrics import classification_report, confusion_matrix

#keras
import tensorflow as tf
from tensorflow import keras

In [24]:
data = pd.read_csv('train.csv')
data = data.drop('Unnamed: 0', axis=1)
data

Unnamed: 0,text_a,label
0,betewe buka twitter cuman ngetweet liat home b...,no
1,mas piyuuu mugo2 corona tuh mulut tersumpal ma...,no
2,e100ss gini buka informasi sejelas nya identit...,yes
3,neng solo wes ono terduga corona cobo neng ati...,no
4,midiahn nii akun gak takut takut nya isu coron...,no
...,...,...
21596,depok panas ga karuan kereta sampe pasming huj...,no
21597,oxfara arie kriting yg lebi goblo nya orang ke...,no
21598,virus corona menyaba depok cuci tangan makan n...,no
21599,mata sipit tinggal depok udah abis dah bahan c...,no


In [25]:
test = pd.read_csv('test.csv')
test

Unnamed: 0,text_a,label
0,jek dajal ga depok bang,no
1,detikcom untung depok masuk wilayah nya ridwan...,no
2,df dom jakarta depok yg gunain vc cabang nya c...,no
3,your2rl depok jkt,no
4,doakan indonesia selamat virus corona pkb depo...,yes
...,...,...
2795,ku tenang2 bae ku sih ya corona nya ga depok k...,no
2796,guru hati hati ya virus corona uda indonesia t...,yes
2797,4 terawan menyebut virus corona indonesia terd...,yes
2798,realffk buhari can t pronounce corona virus,no


In [26]:
tokenizer = BertTokenizerFast.from_pretrained('bert-large-uncased')

In [27]:
token_lengths = []

for txt in data['text_a'].values:
    tokens = tokenizer.encode(txt, max_length=128, truncation=True)
    token_lengths.append(len(tokens))
    
max_len=np.max(token_lengths)
print(f"MAX TOKENIZED SENTENCE LENGTH: {max_len}")

MAX TOKENIZED SENTENCE LENGTH: 128


In [28]:
token_lengths_test = []

for txt in test['text_a'].values:
    tokens = tokenizer.encode(txt, max_length=5000, truncation=True)
    token_lengths_test.append(len(tokens))
    
max_len=np.max(token_lengths_test)
print(f"MAX TOKENIZED SENTENCE LENGTH: {max_len}")

MAX TOKENIZED SENTENCE LENGTH: 913


In [29]:
MAX_LEN = 100

In [30]:
def tokenize(data,max_len=MAX_LEN) :
    input_ids = []
    attention_masks = []
    for i in range(len(data)):
        encoded = tokenizer.encode_plus(
            data[i],
            add_special_tokens=True,
            max_length=MAX_LEN,
            padding='max_length',
            return_attention_mask=True,
            truncation=True
        )
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])
    return np.array(input_ids),np.array(attention_masks)

In [31]:
data['label'] = data['label'].map({'yes':1, 'no':0})
test['label'] = test['label'].map({'yes':1, 'no':0})

In [32]:
test['label'].value_counts()

0    2093
1     707
Name: label, dtype: int64

In [33]:
X_train, X_valid, y_train, y_valid = train_test_split(data['text_a'].values, data['label'].values, test_size=0.1, stratify=data['label'].values, random_state=48)
X_test, y_test = test['text_a'].values, test['label'].values

In [34]:
ohe = preprocessing.OneHotEncoder()
y_train = ohe.fit_transform(np.array(y_train).reshape(-1, 1)).toarray()
y_valid = ohe.fit_transform(np.array(y_valid).reshape(-1, 1)).toarray()
y_test = ohe.fit_transform(np.array(y_test).reshape(-1, 1)).toarray()

In [35]:
train_input_ids, train_attention_masks = tokenize(X_train, MAX_LEN)
val_input_ids, val_attention_masks = tokenize(X_valid, MAX_LEN)
test_input_ids, test_attention_masks = tokenize(X_test, MAX_LEN)

In [36]:
# for x in range(len(train_input_ids)) : 
#   if(len(train_input_ids[x])) != 512 :
#     print("index = ", end="")
#     print(x) 
#     print(len(train_input_ids[x]))
train_input_ids.shape

(19440, 100)

In [37]:
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [38]:
def create_model(bert_model, max_len=MAX_LEN):
    
    ##params###
    opt = tf.keras.optimizers.Adam(learning_rate=5e-5, decay=5e-7)
    loss = tf.keras.losses.CategoricalCrossentropy()
    accuracy = tf.keras.metrics.CategoricalAccuracy()


    input_ids = tf.keras.Input(shape=(max_len,),dtype='int32')
    
    attention_masks = tf.keras.Input(shape=(max_len,),dtype='int32')
    
    embeddings = bert_model([input_ids,attention_masks])[1]
    
    output = tf.keras.layers.Dense(2, activation="softmax")(embeddings)
    
    model = tf.keras.models.Model(inputs = [input_ids,attention_masks], outputs = output)
    
    model.compile(opt, loss=loss, metrics=accuracy)
    
    
    return model

In [39]:
model = create_model(bert_model, MAX_LEN)
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_3 (InputLayer)           [(None, 100)]        0           []                               
                                                                                                  
 input_4 (InputLayer)           [(None, 100)]        0           []                               
                                                                                                  
 tf_bert_model_1 (TFBertModel)  TFBaseModelOutputWi  109482240   ['input_3[0][0]',                
                                thPoolingAndCrossAt               'input_4[0][0]']                
                                tentions(last_hidde                                               
                                n_state=(None, 100,                                         

In [40]:
history_bert = model.fit([train_input_ids,train_attention_masks], y_train, validation_data=([val_input_ids,val_attention_masks], y_valid), epochs=3, batch_size=64)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [41]:
result_bert = model.predict([test_input_ids,test_attention_masks])

In [42]:
y_pred_bert =  np.zeros_like(result_bert)
y_pred_bert[np.arange(len(y_pred_bert)), result_bert.argmax(1)] = 1

In [43]:
y_pred_bert  

array([[1., 0.],
       [1., 0.],
       [1., 0.],
       ...,
       [0., 1.],
       [1., 0.],
       [0., 1.]], dtype=float32)

In [44]:
print('\tClassification Report for BERT:\n\n',classification_report(y_test,y_pred_bert, target_names=['No', 'Yes']))

	Classification Report for BERT:

               precision    recall  f1-score   support

          No       0.90      0.94      0.92      2093
         Yes       0.79      0.69      0.74       707

   micro avg       0.88      0.88      0.88      2800
   macro avg       0.85      0.82      0.83      2800
weighted avg       0.87      0.88      0.87      2800
 samples avg       0.88      0.88      0.88      2800

