In [1]:
# Mount Google Drive (Colab only) so that everything this notebook reads and
# writes under /content/drive/MyDrive is reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import warnings
warnings.filterwarnings( 'ignore' )
import gc
import os
import time
import numpy as np
import pandas as pd

from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.model_selection import StratifiedKFold, train_test_split

import tensorflow as tf
import keras.backend as K
from keras.models import load_model
from keras.preprocessing import text, sequence
from tensorflow.keras.utils import to_categorical


In [3]:
# Project root on the mounted Drive. It is also made the working directory
# (presumably so the helper modules stored there can be imported by name).
path = "/content/drive/MyDrive/Toxcity Urdu/"
os.chdir( path )

In [4]:
import All_RUT_Models
import RUT_Utils

In [5]:
# hyper parameters for this model

# -- input and embedding layer --
max_len = 200                        # sequences are padded to this length before training
embed_size = 300                     # width of one embedding vector
pre_trained_flag = True              # when True the vectors in `embeddingfile` are loaded
embed_trainable = True
emb_weights_init = 'glorot_normal'

# -- dropout and fully connected layers (forwarded to the model builder) --
spdrpt = 0.65
drpt = 0.65
fc_weights_init = 'glorot_uniform'
fc_act = 'elu'

# -- focal loss: alpha and gamma --
fcl_loss_alp = 0.25
fcl_loss_gam = 5

# -- optimisation --
optimizer = 'adam'
lr_rate = 0.0001                     # written onto the optimizer after the model is built
batch = 128
nepochs = 100

# -- settings handed to the validation-F1 callback --
patience = 10
decay = True
decay_rate = 0.5
decay_after = 3

# -- hardware --
multi_gpu_flag = 0
gpus = 1

In [6]:
# Pre-trained word vectors (text .vec format), stored in the project root.
embeddingfile = path + 'cc.ur.300.vec'

# Placeholder handed to the model builder when no pre-trained vectors are used;
# replaced by the real matrix when pre_trained_flag is set.
embedding_matrix = []
# Upper bound on the number of vocabulary rows kept in the embedding matrix.
max_features = 10000000

modelname = 'BGRU_P_ft'
# Output root for this family of experiments, derived from the project root so
# the location is spelled out in one place only.
p = path + "DL Models Fasttext finetuned/"

modelpath = p + 'Models/' + modelname + '/'
resultpath = p + 'Results/'

# exist_ok=True creates missing directories and is a no-op for existing ones,
# without the check-then-create race of a separate os.path.exists() test.
for output_dir in ( modelpath, resultpath ):
    os.makedirs( output_dir, exist_ok=True )

In [7]:
def get_coefs( word, *arr ):
    """Pair a token with its coefficients converted to a float32 array.

    word -- the token (first field of an embedding line)
    arr  -- the remaining fields; each is converted to float32

    Returns the tuple (word, numpy float32 array).
    """
    coefficients = np.asarray( arr, dtype='float32' )
    return word, coefficients

def get_vectors( tokenizer ):
    """Build the embedding matrix for the tokenizer's vocabulary.

    Row i holds the vector found in the global `embeddings_index` for the word
    whose index is i in `tokenizer.word_index`. Words without a vector, and
    row 0, stay all-zero. The matrix has
    min(max_features, len(word_index) + 1) rows and `embed_size` columns;
    words whose index is >= max_features are ignored.
    """
    vocabulary = tokenizer.word_index
    row_count = min( max_features, len( vocabulary ) + 1 )
    matrix = np.zeros( ( row_count, embed_size ) )

    for token, row in vocabulary.items():
        if row < max_features:
            vector = embeddings_index.get( token )
            if vector is not None:
                matrix[ row ] = vector

    # Ask the collector to reclaim garbage before returning the matrix.
    gc.collect()
    return matrix

# Load the pre-trained vectors into a {word: float32 vector} dictionary.
if pre_trained_flag:
    embeddings_index = {}
    # The context manager closes the (large) file even if a line fails to parse.
    with open( embeddingfile, encoding='utf-8' ) as vec_file:
        for line_no, line in enumerate( vec_file ):
            fields = line.rstrip().rsplit( ' ' )
            # fastText .vec files start with a "<vocab_size> <dim>" header.
            # Skip it so it is not stored as a word with a 1-element vector,
            # which would later broadcast across a whole embedding row.
            if line_no == 0 and len( fields ) == 2:
                continue
            word, vector = get_coefs( *fields )
            embeddings_index[ word ] = vector

In [8]:
# Read the three pre-split datasets from the project root.
train_df = pd.read_excel( path + "TrainingSet.xlsx" )
valid_df = pd.read_excel( path + "ValidationSet.xlsx" )
test_df = pd.read_excel( path + "Testing.xlsx" )

# Force the text column to str in every split (the test split was previously
# left as read), so a numeric or empty cell cannot break tokenisation.
for split_df in ( train_df, valid_df, test_df ):
    split_df[ 'clean' ] = split_df[ 'clean' ].apply( str )

len(train_df), len(valid_df), len(test_df)

(51023, 5670, 14174)

In [9]:
import pickle

# saving (disabled in this notebook; the tokenizer is read from disk below)
# with open(path+'tokenizer.pickle', 'wb') as handle:
#     pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

# loading
# NOTE: pickle.load can execute arbitrary code while unpickling, so only load
# a tokenizer.pickle that comes from a trusted source.
with open(path+'tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

In [10]:
# Silence library warnings for the rest of the run and record the start time.
warnings.filterwarnings( 'ignore' )
start_time = time.time()

# Metric accumulators for the validation and test sets; every name gets its
# own independent empty list.
valaccuracy, valprecision, valrecall, valf1, valcm = ( [] for _ in range( 5 ) )
testaccuracy, testprecision, testrecall, testf1, testcm = ( [] for _ in range( 5 ) )

# Per-comment records that are exported at the end of the notebook.
com_indices = []
com_text, com_label, com_predicted, com_prob = ( [] for _ in range( 4 ) )

# For every split: text -> token-id sequences -> padded matrix of width
# max_len, plus the 'Toxic' label column as the target.

# training split
traincomments = tokenizer.texts_to_sequences( train_df[ 'clean' ].values )
xtrain = sequence.pad_sequences( traincomments, maxlen=max_len )
ytrain = train_df[ 'Toxic' ].values

# validation split
valcomments = tokenizer.texts_to_sequences( valid_df[ 'clean' ].values )
xval = sequence.pad_sequences( valcomments, maxlen=max_len )
yval = valid_df[ 'Toxic' ].values

# test split
testcomments = tokenizer.texts_to_sequences( test_df[ 'clean' ].values )
xtest = sequence.pad_sequences( testcomments, maxlen=max_len )
ytest = test_df[ 'Toxic' ].values

# Build the embedding matrix from the pre-trained vectors when requested;
# otherwise the placeholder embedding_matrix defined earlier is passed through.
if pre_trained_flag == True:
    embedding_matrix = get_vectors( tokenizer=tokenizer)

# Define the model. All architecture, loss and optimizer settings come from the
# hyper-parameter cell and are forwarded unchanged to the project's builder.
# NOTE(review): BGRU_P lives in All_RUT_Models, which is not visible here;
# its layer layout and the way it compiles the model should be confirmed there.
model = All_RUT_Models.BGRU_P( tokenizer=tokenizer, max_len=max_len, embed_size=embed_size,
                              embedding_matrix=embedding_matrix, embed_trainable=embed_trainable,
                              spdrpt=spdrpt, drpt=drpt, emb_weights_init=emb_weights_init,
                              fc_weights_init=fc_weights_init, fc_act=fc_act, optimizer=optimizer,
                              fcl_loss_alp=fcl_loss_alp, fcl_loss_gam=fcl_loss_gam,
                              multi_gpu_flag=multi_gpu_flag, gpus=gpus )

# Overwrite the optimizer's learning rate with the configured value.
K.set_value( model.optimizer.lr, lr_rate )
# Validation data handed to the F1 callback (same arrays as validation_data below).
v=( xval,yval )
# Train with the project's F1 callback. It receives the validation data, the
# .h5 path that is loaded back after training, and the patience / decay settings.
# NOTE(review): presumably it saves the best model to that path and stops early;
# confirm in RUT_Utils.F1Metrics.
f1metric = RUT_Utils.F1Metrics(v, modelpath + modelname + '.h5', patience=patience, decay=decay, decay_rate=decay_rate, decay_after=decay_after, softmax=False )
# verbose=0 silences Keras' own progress output.
hist = model.fit( xtrain, ytrain, batch_size=batch, validation_data=( xval,yval ),
                  epochs=nepochs, verbose=0, callbacks=[ f1metric ] )

# Reload the model stored at the .h5 path that was given to the F1 callback.
# The focal loss has to be supplied again under the name 'focal_loss_fixed'
# so that Keras can rebuild the compiled model.
custt_obss = {
    'focal_loss_fixed': All_RUT_Models.focal_loss( alpha=fcl_loss_alp, gamma=fcl_loss_gam ),
}
loaded_model = load_model( modelpath + modelname + '.h5', custom_objects=custt_obss )

# Predicted probabilities, validation set first and test set second.
valpredictions, testpredictions = (
    loaded_model.predict( features, verbose=0, batch_size=2048 )
    for features in ( xval, xtest )
)

# The decision threshold is tuned on the validation set only ...
threshold = RUT_Utils.optimize_threshold( yval, valpredictions )

# ... and then applied to both sets to obtain hard 0/1 labels.
vallabels, testlabels = (
    ( probabilities >= threshold ).astype( 'int32' )
    for probabilities in ( valpredictions, testpredictions )
)

# Score the thresholded validation predictions and store one value per metric.
for sink, scorer in (
    ( valaccuracy, accuracy_score ),
    ( valprecision, precision_score ),
    ( valrecall, recall_score ),
    ( valf1, f1_score ),
    ( valcm, confusion_matrix ),
):
    sink.append( scorer( yval, vallabels ) )

# Same metrics, same order, for the held-out test set.
for sink, scorer in (
    ( testaccuracy, accuracy_score ),
    ( testprecision, precision_score ),
    ( testrecall, recall_score ),
    ( testf1, f1_score ),
    ( testcm, confusion_matrix ),
):
    sink.append( scorer( ytest, testlabels ) )

# Keep the per-comment test results (row index, text, gold label, predicted
# label, predicted probability) for later analysis and ensembling.
com_indices += test_df.index.tolist()
com_text += list( test_df[ 'clean' ] )
com_label += test_df[ 'Toxic' ].tolist()
com_predicted += testlabels.tolist()
com_prob += testpredictions.tolist()

Epoch: 000 --MaxValF1: 0.56717806 --CurValF1: 0.56717806 --Patience: 00 --improved f1: 0.56717806
Epoch: 001 --MaxValF1: 0.76602086 --CurValF1: 0.76602086 --Patience: 00 --improved f1: 0.76602086
Epoch: 002 --MaxValF1: 0.81711666 --CurValF1: 0.81711666 --Patience: 00 --improved f1: 0.81711666
Epoch: 003 --MaxValF1: 0.83611691 --CurValF1: 0.83611691 --Patience: 00 --improved f1: 0.83611691
Epoch: 004 --MaxValF1: 0.85074627 --CurValF1: 0.85074627 --Patience: 00 --improved f1: 0.85074627
Epoch: 005 --MaxValF1: 0.85642317 --CurValF1: 0.85642317 --Patience: 00 --improved f1: 0.85642317
Epoch: 006 --MaxValF1: 0.86686687 --CurValF1: 0.86686687 --Patience: 00 --improved f1: 0.86686687
Epoch: 007 --MaxValF1: 0.87058824 --CurValF1: 0.87058824 --Patience: 00 --improved f1: 0.87058824
Epoch: 008 --MaxValF1: 0.87569200 --CurValF1: 0.87569200 --Patience: 00 --improved f1: 0.87569200
Epoch: 009 --MaxValF1: 0.87995982 --CurValF1: 0.87995982 --Patience: 00 --improved f1: 0.87995982
Epoch: 010 --MaxValF

In [11]:
# Validation report: heading, then the per-run values (4 decimals) followed by
# their mean and standard deviation. Every entry except the last is followed
# by an extra blank line.
for heading, scores, trailer in (
    ( 'Validation Accuracy', valaccuracy, ( '\n', ) ),
    ( 'Validation Precision', valprecision, ( '\n', ) ),
    ( 'Validation Recall', valrecall, ( '\n', ) ),
    ( 'Validation F1', valf1, () ),
):
    print( heading )
    print( [ '{:0.4f}'.format( score ) for score in scores ], np.mean( scores ), '+-', np.std( scores ), *trailer )

Validation Accuracy
['0.9598'] 0.9597883597883597 +- 0.0 

Validation Precision
['0.8880'] 0.8880233690360273 +- 0.0 

Validation Recall
['0.8898'] 0.8897560975609756 +- 0.0 

Validation F1
['0.8889'] 0.8888888888888888 +- 0.0


In [12]:
# Print every validation confusion matrix rotated by 180 degrees, i.e. with
# both axes reversed so the last label's row and column come first.
for matrix in valcm:
    print( np.rot90( matrix, k=2 ), '\n' )

[[ 912  113]
 [ 115 4530]] 



In [13]:
# Test report: heading, then the per-run values (4 decimals) followed by their
# mean and standard deviation. Every entry except the last is followed by an
# extra blank line.
for heading, scores, trailer in (
    ( 'Test Accuracy', testaccuracy, ( '\n', ) ),
    ( 'Test Precision', testprecision, ( '\n', ) ),
    ( 'Test Recall', testrecall, ( '\n', ) ),
    ( 'Test F1', testf1, () ),
):
    print( heading )
    print( [ '{:0.4f}'.format( score ) for score in scores ], np.mean( scores ), '+-', np.std( scores ), *trailer )

Test Accuracy
['0.9594'] 0.9594327642161704 +- 0.0 

Test Precision
['0.9005'] 0.9004834810636584 +- 0.0 

Test Recall
['0.8720'] 0.8720249707374171 +- 0.0 

Test F1
['0.8860'] 0.8860257680872151 +- 0.0


In [14]:
# Print every test confusion matrix rotated by 180 degrees, i.e. with both
# axes reversed so the last label's row and column come first.
for matrix in testcm:
    print( np.rot90( matrix, k=2 ), '\n' )

[[ 2235   328]
 [  247 11364]] 



In [15]:
# Append one summary row for this model to the shared results file:
#   <modelname>,<acc>,<precision>,<recall>,<f1>
# where each metric is "<mean>+-<std>", the mean truncated to 7 characters and
# the std to 6 (same format as before).
summary_fields = [ modelname ]
for scores in ( testaccuracy, testprecision, testrecall, testf1 ):
    summary_fields.append( str( np.mean( scores ) )[:7] + '+-' + str( np.std( scores ) )[:6] )

# The context manager closes the file even if the write fails, and the row is
# written in a single call instead of field by field.
with open( resultpath + 'ResultsMain.csv', mode='a' ) as file:
    file.write( ','.join( summary_fields ) + '\n' )

In [16]:
# Assemble the per-comment test predictions column by column (order matters
# for the CSV layout) and write them next to the other results.
prediction_columns = {
    'comment_indices': com_indices,
    'comment_text': com_text,  # comment text
    'comment_label': com_label,
    'comment_predicted': com_predicted,
    'comment_prob': com_prob,
}

dfPredictions = pd.DataFrame(  )
for column_name, column_values in prediction_columns.items():
    dfPredictions[ column_name ] = column_values

dfPredictions.to_csv( resultpath + modelname + '.csv', index=False )
dfPredictions.shape

(14174, 5)