In [None]:
!pip install transformers

In [None]:
# Loading the pre-trained BERT model
###################################
# The word-embedding weight table of this model (model.embeddings.word_embeddings)
# is exported to disk below; it is the model's input embedding matrix, not an
# output of a forward pass.
#
# The imports live in this cell so it runs on a fresh kernel: the notebook-wide
# import cell comes later and never imports BertModel.
import numpy as np
from transformers import BertModel

model = BertModel.from_pretrained('bert-base-multilingual-uncased',
                                  output_hidden_states = True,
                                  )
# Detach the weight tensor from autograd before converting it to a NumPy array.
BERT_Embeddings = model.embeddings.word_embeddings.weight.detach().numpy()
# np.save appends ".npy", so this writes "./BERT_Multilingual Embeddings.npy"
# relative to whatever the working directory is when this cell runs.
np.save('./BERT_Multilingual Embeddings', BERT_Embeddings)
model.embeddings.word_embeddings.weight.shape

Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [1]:
# Mount Google Drive at /content/drive; every data, model and result path used
# further down is rooted under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
import warnings
warnings.filterwarnings( 'ignore' )
import gc
import os
import time
import numpy as np
import pandas as pd

from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.model_selection import StratifiedKFold, train_test_split
import keras
import tensorflow as tf
import keras.backend as K
from keras.models import load_model
from keras.preprocessing import text, sequence
# from keras.utils import to_categorical
from tensorflow.keras.utils import to_categorical


In [2]:
# Project folder on the mounted drive. It is kept in `path` for building file
# names and is also made the current working directory.
path = "/content/drive/MyDrive/Toxcity Urdu/"
os.chdir(path)

In [3]:
import All_RUT_Models
import RUT_Utils

In [4]:
# Hyper-parameters for this model, grouped by the stage that consumes them.

# --- input sequences and embedding layer ---
max_len = 200                       # maxlen used when padding the token sequences
embed_size = 768                    # handed to the model builder as embed_size
pre_trained_flag = True             # NOTE(review): not read by any visible cell
embed_trainable = True
emb_weights_init = 'glorot_normal'

# --- optimisation ---
optimizer = 'adam'
lr_rate = 1e-4                      # written onto model.optimizer.lr before fitting
batch = 128                         # batch_size for model.fit
nepochs = 100                       # upper bound on epochs for model.fit

# --- hardware ---
multi_gpu_flag = False
gpus = 1

# --- settings forwarded to the RUT_Utils.F1Metrics callback ---
patience = 10
decay = True
decay_rate = 0.5
decay_after = 3

In [5]:
# Input/output locations for this experiment.

# Exported BERT embedding table inside the project folder.
embeddingfile = path + 'BERT_Multilingual Embeddings.npy'


# NOTE(review): max_features is not read by any visible cell.
max_features = 10000000

# Used as the sub-folder name, checkpoint file name and results row label.
modelname = 'CNN_George_BERT'

# Root folder for everything this family of runs writes.
p = "/content/drive/MyDrive/Toxcity Urdu/DL Models BERT finetuned/"

modelpath = p + 'Models/' + modelname + '/'
resultpath = p + 'Results/'

# exist_ok=True makes the creation idempotent in a single call, without the
# separate exists() check that can race with another process creating the folder.
os.makedirs( modelpath, exist_ok=True )
os.makedirs( resultpath, exist_ok=True )

In [6]:
# Load the embedding table through the configured `embeddingfile` path rather
# than a "./" path, so the cell does not depend on the current working directory.
embedding_matrix = np.load( embeddingfile )
# Fail fast when the table width disagrees with the configured embedding size.
assert embedding_matrix.shape[1] == embed_size, (
    'embedding width %d does not match embed_size %d' % ( embedding_matrix.shape[1], embed_size )
)
embedding_matrix.shape

(105879, 768)

In [7]:
def _read_split( filename ):
    """Read one Excel split from the project folder.

    The 'clean' text column is coerced to str for every split, so a cell that
    Excel stored as a number or left empty cannot reach the tokenizer as a
    non-string value. Previously only the training and validation splits were
    coerced; the test split now gets the same treatment.
    """
    frame = pd.read_excel( path + filename )
    frame['clean'] = frame['clean'].apply( str )
    return frame


train_dataset = _read_split( "TrainingSet.xlsx" )
valid_dataset = _read_split( "ValidationSet.xlsx" )
test_dataset = _read_split( "Testing.xlsx" )
len(train_dataset), len(valid_dataset), len(test_dataset)

(51023, 5670, 14174)

In [8]:
import pickle

# Snippet that was used to save the tokenizer (kept for reference only):
# with open(path+'tokenizer.pickle', 'wb') as fh:
#     pickle.dump(tokenizer, fh, protocol=pickle.HIGHEST_PROTOCOL)

# Restore the previously saved tokenizer object from the project folder.
# NOTE(review): unpickling can execute arbitrary code, so only load a
# tokenizer.pickle that comes from a trusted source.
with open(path+'tokenizer.pickle', 'rb') as fh:
    tokenizer = pickle.load(fh)

In [9]:
# Train the CNN on the training split, restore the checkpoint written during
# training, pick a decision threshold on the validation split, then score the
# validation and test splits and keep the test predictions for later export.
warnings.filterwarnings( 'ignore' )
# NOTE(review): start_time is not read by any visible cell.
start_time = time.time()

# Metric accumulators (one entry is appended per run; this cell performs one run).
valaccuracy, valprecision, valrecall, valf1, valcm = [], [], [], [], []
testaccuracy, testprecision, testrecall, testf1, testcm = [], [], [], [], []
# Per-comment test-set records collected for the predictions file.
com_text, com_label, com_predicted, com_prob = [], [], [], []
com_indices = []

# Convert the 'clean' text of every split to integer sequences with the loaded tokenizer.
traincomments = tokenizer.texts_to_sequences( train_dataset[ 'clean' ].values )
validcomments = tokenizer.texts_to_sequences( valid_dataset[ 'clean' ].values )
testcomments = tokenizer.texts_to_sequences( test_dataset[ 'clean' ].values )

# pad the tokenized sequences
xtrain = sequence.pad_sequences( traincomments, maxlen=max_len )
xval = sequence.pad_sequences( validcomments, maxlen=max_len )
xtest = sequence.pad_sequences( testcomments, maxlen=max_len )

# Labels come from the 'Toxic' column of each split.
ytrain = train_dataset[ 'Toxic' ].values
yval = valid_dataset[ 'Toxic' ].values
ytest = test_dataset[ 'Toxic' ].values

# One-hot encode the labels into 2 classes.
ytrain = to_categorical( ytrain, 2 )
yval = to_categorical( yval, 2 )
ytest = to_categorical( ytest, 2 )

# show the shape of the embedding matrix (pre_trained_flag is not consulted here)
print(embedding_matrix.shape)
  
# define a model
model = All_RUT_Models.CNN_George( tokenizer=tokenizer, max_len=max_len, embed_size=embed_size,
                                embedding_matrix=embedding_matrix, embed_trainable=embed_trainable,
                                emb_weights_init=emb_weights_init, optimizer=optimizer,
                                multi_gpu_flag=multi_gpu_flag, gpus=gpus )

# Override the optimizer's learning rate with the configured value.
K.set_value( model.optimizer.lr, lr_rate )
v=( xval,yval )
# train the model with callbacks for early stopping
# NOTE(review): F1Metrics is a project helper; it is given the validation data and
# the .h5 path and presumably writes the best checkpoint there (the same path is
# loaded right after fit) - confirm in RUT_Utils.
f1metric = RUT_Utils.F1Metrics(v, modelpath + modelname  + '.h5', patience=patience, decay=decay, decay_rate=decay_rate, decay_after=decay_after, softmax=True )
hist = model.fit( xtrain, ytrain, batch_size=batch, validation_data=( xval,yval ), epochs=nepochs, verbose=0, callbacks=[ f1metric ] )

# load saved model
loaded_model = load_model( modelpath + modelname  + '.h5' )

# get predictions (probabilities) for validation and test sets respectively
# The one-hot labels are first collapsed back to class indices; from here on
# yval / ytest hold class indices, no longer one-hot rows.
yval = [ np.argmax(y, axis=None, out=None) for y in yval ]
ytest = [ np.argmax(y, axis=None, out=None) for y in ytest ]
# Only column 1 of the model output is kept, i.e. the score for class index 1.
valpredictions = loaded_model.predict( xval, verbose=0, batch_size=2048 )[ :, 1 ]
testpredictions = loaded_model.predict( xtest, verbose=0, batch_size=2048 )[ :, 1 ]

# optimizer threshold on validation set
# NOTE(review): the criterion used by optimize_threshold is not visible here - confirm in RUT_Utils.
threshold = RUT_Utils.optimize_threshold( yval, valpredictions )

# save accuracy, precision, recall, f1 and confusion matrices
# The threshold chosen on the validation split is applied unchanged to the test split.
vallabels = (valpredictions>=threshold).astype( 'int32' )
testlabels = (testpredictions>=threshold).astype( 'int32' )

valaccuracy.append( accuracy_score( yval, vallabels ) )
valprecision.append( precision_score( yval, vallabels ) )
valrecall.append( recall_score( yval, vallabels ) )
valf1.append( f1_score( yval, vallabels ) )
valcm.append( confusion_matrix( yval, vallabels ) )    

testaccuracy.append( accuracy_score( ytest, testlabels ) )
testprecision.append( precision_score( ytest, testlabels ) )
testrecall.append( recall_score( ytest, testlabels ) )
testf1.append( f1_score( ytest, testlabels ) )
testcm.append( confusion_matrix( ytest, testlabels ) )

# save for future analysis and ensemble
com_indices.extend( test_dataset.index.values )
com_text.extend( test_dataset[ 'clean' ] )
com_label.extend( test_dataset[ 'Toxic' ].values )
com_predicted.extend( testlabels.tolist() )
com_prob.extend( testpredictions.tolist() )

(105879, 768)
Epoch: 000 --MaxValF1: 0.78483607 --CurValF1: 0.78483607 --Patience: 00 --improved f1: 0.78483607
Epoch: 001 --MaxValF1: 0.86100000 --CurValF1: 0.86100000 --Patience: 00 --improved f1: 0.86100000
Epoch: 002 --MaxValF1: 0.87660415 --CurValF1: 0.87660415 --Patience: 00 --improved f1: 0.87660415
Epoch: 003 --MaxValF1: 0.88077118 --CurValF1: 0.88077118 --Patience: 00 --improved f1: 0.88077118
Epoch: 004 --MaxValF1: 0.88095238 --CurValF1: 0.88095238 --Patience: 00 --improved f1: 0.88095238
Epoch: 005 --MaxValF1: 0.88095238 --CurValF1: 0.87963892 --Patience: 00
Epoch: 006 --MaxValF1: 0.88095238 --CurValF1: 0.88059701 --Patience: 01
Epoch: 007 --MaxValF1: 0.88095238 --CurValF1: 0.87731256 --Patience: 02
Epoch: 008 --MaxValF1: 0.88095238 --CurValF1: 0.87821782 --Patience: 03
Epoch: 009 --MaxValF1: 0.88095238 --CurValF1: 0.87828627 --Patience: 04
Epoch: 010 --MaxValF1: 0.88095238 --CurValF1: 0.87855044 --Patience: 05
Epoch: 011 --MaxValF1: 0.88095238 --CurValF1: 0.87847731 --Patie

In [10]:
# Print every validation metric as: heading, then the per-run values (4 decimals)
# followed by their mean and standard deviation.
# The boolean marks sections that end with an extra blank line; the last one does not.
validation_report = (
    ( 'Validation Accuracy', valaccuracy, True ),
    ( 'Validation Precision', valprecision, True ),
    ( 'Validation Recall', valrecall, True ),
    ( 'Validation F1', valf1, False ),
)

for heading, scores, blank_after in validation_report:
    print( heading )
    rounded = [ '{:0.4f}'.format( s ) for s in scores ]
    trailer = ( '\n', ) if blank_after else ()
    print( rounded, np.mean( scores ), '+-', np.std( scores ), *trailer )

Validation Accuracy
['0.9577'] 0.9576719576719577 +- 0.0 

Validation Precision
['0.8961'] 0.8960645812310797 +- 0.0 

Validation Recall
['0.8663'] 0.8663414634146341 +- 0.0 

Validation F1
['0.8810'] 0.8809523809523809 +- 0.0


In [11]:
# A 180-degree rotation reverses both axes of each confusion matrix, so the
# last label is shown first in rows and columns.
for cm in valcm:
    print( np.rot90( cm, k=2 ), '\n' )

[[ 888  137]
 [ 103 4542]] 



In [12]:
# Print every test metric as: heading, then the per-run values (4 decimals)
# followed by their mean and standard deviation.
# The boolean marks sections that end with an extra blank line; the last one does not.
test_report = (
    ( 'Test Accuracy', testaccuracy, True ),
    ( 'Test Precision', testprecision, True ),
    ( 'Test Recall', testrecall, True ),
    ( 'Test F1', testf1, False ),
)

for heading, scores, blank_after in test_report:
    print( heading )
    rounded = [ '{:0.4f}'.format( s ) for s in scores ]
    trailer = ( '\n', ) if blank_after else ()
    print( rounded, np.mean( scores ), '+-', np.std( scores ), *trailer )

Test Accuracy
['0.9582'] 0.9582333850712572 +- 0.0 

Test Precision
['0.9146'] 0.914598233066891 +- 0.0 

Test Recall
['0.8482'] 0.8482247366367538 +- 0.0 

Test F1
['0.8802'] 0.880161943319838 +- 0.0


In [13]:
# A 180-degree rotation reverses both axes of each confusion matrix, so the
# last label is shown first in rows and columns.
for cm in testcm:
    print( np.rot90( cm, k=2 ), '\n' )

[[ 2174   389]
 [  203 11408]] 



In [14]:
def _mean_std_field( scores ):
    """Format scores as '<mean>+-<std>' using the first 7 / 6 characters of each number's str().

    The string truncation (not rounding) is kept on purpose so new rows look
    like rows already appended to the results file.
    NOTE(review): a value printed in scientific notation would lose its
    exponent under this truncation.
    """
    return str( np.mean( scores ) )[ :7 ] + '+-' + str( np.std( scores ) )[ :6 ]


# One CSV row: model name, then accuracy, precision, recall and F1 on the test set.
row = ','.join(
    [ modelname ]
    + [ _mean_std_field( scores ) for scores in ( testaccuracy, testprecision, testrecall, testf1 ) ]
)

# Append inside a context manager so the handle is closed even if the write fails.
with open( resultpath + 'ResultsMain.csv', mode='a' ) as file:
    file.write( row + '\n' )

In [15]:
# Collect the per-comment test records into one frame (column order matters
# for the CSV) and write it next to the other results, named after the model.
dfPredictions = pd.DataFrame( {
    'comment_indices': com_indices,
    'comment_text': com_text,          # comment text
    'comment_label': com_label,
    'comment_predicted': com_predicted,
    'comment_prob': com_prob,
} )
dfPredictions.to_csv( resultpath + modelname + '.csv', index=False )
dfPredictions.shape

(14174, 5)