In [None]:
# Mount Google Drive in the Colab runtime so the project folder is reachable
# under /content/drive.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [None]:
import warnings
warnings.filterwarnings( 'ignore' )
import gc
import os
import time
import numpy as np
import pandas as pd

from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.model_selection import StratifiedKFold, train_test_split
import keras
import tensorflow as tf
import keras.backend as K
from keras.models import load_model
from keras.preprocessing import text, sequence
# from keras.utils import to_categorical
from tensorflow.keras.utils import to_categorical


In [None]:
# Directory holding the dataset spreadsheets (trailing slash is relied on by
# the string concatenations that build file names from it).
path = "/content/drive/MyDrive/Roman-Urdu-Toxic-Comments-master/Data/"

# Work from the project root — presumably so the project-local modules
# (All_RUT_Models, RUT_Utils) can be imported from the working directory.
os.chdir('/content/drive/MyDrive/Roman-Urdu-Toxic-Comments-master/')

In [None]:
import All_RUT_Models
import RUT_Utils

In [None]:
# hyper parameters for this model

# -- input / embedding layer --
# max_len: length every token sequence is padded/truncated to.
# embed_size: initial embedding width; it is re-derived from the loaded
# embedding matrix further down the notebook.
max_len, embed_size = 200, 300
pre_trained_flag = embed_trainable = True
emb_weights_init = 'glorot_normal'

# -- optimisation --
optimizer, lr_rate = 'adam', 0.001
batch, nepochs = 128, 100

# -- hardware --
multi_gpu_flag, gpus = False, 1

# -- early stopping / learning-rate decay --
# These are handed to the F1 callback; their exact semantics live in
# RUT_Utils.F1Metrics (not visible here).
patience = 10
decay, decay_rate, decay_after = True, 0.5, 3

In [None]:
# Pre-computed embedding matrix, stored as an .npz archive.
embeddingfile = '/content/drive/MyDrive/Roman-Urdu-Toxic-Comments-master/emb/embeddings_matrix.npy.npz'

# Placeholder; replaced by the real matrix once the embeddings are loaded.
embedding_matrix = []
# NOTE(review): not referenced by any visible cell — confirm whether it is still needed.
max_features = 10000000

# Used both as the checkpoint sub-directory / file stem and as the row label
# in the results files.
modelname = 'CNN_George_ft'

p = "/content/drive/MyDrive/Roman-Urdu-Toxic-Comments-master/DL Model Custom Embeddings/"

# Output locations: Models/<modelname>/ for the saved model, Results/ for
# metrics and per-comment predictions.
modelpath = p + 'Models/' + modelname + '/'
resultpath = p + 'Results/'

# exist_ok=True creates the directories when missing and is a no-op when they
# already exist, with no gap between an existence check and the creation.
os.makedirs( modelpath, exist_ok=True )
os.makedirs( resultpath, exist_ok=True )

In [None]:
# Same location as `path`; kept because other cells may refer to it.
PTH = "/content/drive/MyDrive/Roman-Urdu-Toxic-Comments-master/Data/"

# Load the three fixed splits.
train_dataset = pd.read_excel(path+"TrainingSet.xlsx")
valid_dataset = pd.read_excel(path+"ValidationSet.xlsx")
test_dataset = pd.read_excel(path+"Testing.xlsx")

# Force the text column to str in every split, the test split included: all
# three are tokenized the same way later, so a non-string cell (e.g. an empty
# cell read as NaN, or a purely numeric comment) must not reach the tokenizer
# in any of them.
for _split in (train_dataset, valid_dataset, test_dataset):
    _split['clean'] = _split['clean'].apply(str)

# Cell output: number of rows per split.
len(train_dataset), len(valid_dataset), len(test_dataset)

(51023, 5670, 14174)

In [None]:
import pickle

# Restore the fitted tokenizer that was saved alongside the embeddings.
# NOTE(review): pickle.load can execute arbitrary code from the file it reads —
# only load a tokenizer pickle that comes from a trusted source.
tokenizer_file = '/content/drive/MyDrive/Roman-Urdu-Toxic-Comments-master/emb/custom_tokenizer.pickle'
with open(tokenizer_file, 'rb') as handle:
    tokenizer = pickle.load(handle)

In [None]:
# np.load on an .npz archive returns an NpzFile that keeps the underlying file
# open until it is closed. Reading the array inside a with-block releases the
# handle as soon as the matrix is in memory.
# NOTE(review): allow_pickle=True permits unpickling objects stored in the
# archive — only use it on a trusted file, and drop it if the array is purely numeric.
with np.load(embeddingfile, allow_pickle=True) as embed:
    embedding_matrix = embed['arr_0']   # first (unnamed) array saved in the archive
emb_matrix = embedding_matrix

# +1 on top of the tokenizer's vocabulary — presumably for the reserved
# index 0 used by padding; confirm against the tokenizer.
VOCAB_LEN = len(tokenizer.word_index)+1
# Take the embedding width from the matrix itself rather than the configured value.
embed_size = embedding_matrix.shape[1]
VOCAB_LEN, embed_size

(51755, 300)

In [None]:
# Train the CNN on the fixed training split, choose a decision threshold on the
# validation split, then record validation/test metrics and the per-comment
# test predictions. Every name assigned here is read by the reporting cells
# further down the notebook.
warnings.filterwarnings( 'ignore' )
start_time = time.time()

# Metric accumulators: one value is appended to each list per training run.
valaccuracy, valprecision, valrecall, valf1, valcm = [], [], [], [], []
testaccuracy, testprecision, testrecall, testf1, testcm = [], [], [], [], []
# Per-comment test outputs (text, gold label, predicted label, probability).
com_text, com_label, com_predicted, com_prob = [], [], [], []
com_indices = []

# convert each comment into a sequence of token ids with the loaded tokenizer
traincomments = tokenizer.texts_to_sequences( train_dataset[ 'clean' ].values )
validcomments = tokenizer.texts_to_sequences( valid_dataset[ 'clean' ].values )
testcomments = tokenizer.texts_to_sequences( test_dataset[ 'clean' ].values )

# pad the tokenized sequences
xtrain = sequence.pad_sequences( traincomments, maxlen=max_len )
xval = sequence.pad_sequences( validcomments, maxlen=max_len )
xtest = sequence.pad_sequences( testcomments, maxlen=max_len )

# binary labels taken from the 'Toxic' column
ytrain = train_dataset[ 'Toxic' ].values
yval = valid_dataset[ 'Toxic' ].values
ytest = test_dataset[ 'Toxic' ].values

# one-hot encode into 2 columns (column 1 corresponds to Toxic == 1)
ytrain = to_categorical( ytrain, 2 )
yval = to_categorical( yval, 2 )
ytest = to_categorical( ytest, 2 )

# sanity check: show the shape of the embedding matrix handed to the model
# (pre_trained_flag is not consulted here)
print(embedding_matrix.shape)
  
# define a model
model = All_RUT_Models.CNN_George( tokenizer=tokenizer, max_len=max_len, embed_size=embed_size,
                                embedding_matrix=embedding_matrix, embed_trainable=embed_trainable,
                                emb_weights_init=emb_weights_init, optimizer=optimizer,
                                multi_gpu_flag=multi_gpu_flag, gpus=gpus )

# override the optimizer's learning rate with the configured value
K.set_value( model.optimizer.lr, lr_rate )
v=( xval,yval )
# train the model with callbacks for early stopping
# NOTE(review): F1Metrics is given the .h5 path that is loaded right after
# training, so it presumably checkpoints the best-validation-F1 model there —
# confirm in RUT_Utils.
f1metric = RUT_Utils.F1Metrics(v, modelpath + modelname  + '.h5', patience=patience, decay=decay, decay_rate=decay_rate, decay_after=decay_after, softmax=True )
hist = model.fit( xtrain, ytrain, batch_size=batch, validation_data=( xval,yval ), epochs=nepochs, verbose=0, callbacks=[ f1metric ] )

# load saved model
loaded_model = load_model( modelpath + modelname  + '.h5' )

# collapse the one-hot labels back to class indices (yval / ytest are rebound
# from 2-column arrays to lists of ints), then get the predicted probability
# of class 1 for the validation and test sets respectively
yval = [ np.argmax(y, axis=None, out=None) for y in yval ]
ytest = [ np.argmax(y, axis=None, out=None) for y in ytest ]
valpredictions = loaded_model.predict( xval, verbose=0, batch_size=256 )[ :, 1 ]
testpredictions = loaded_model.predict( xtest, verbose=0, batch_size=256 )[ :, 1 ]

# optimize the decision threshold on the validation set only; the same
# threshold is then applied unchanged to the test set
threshold = RUT_Utils.optimize_threshold( yval, valpredictions )

# binarise probabilities with the chosen threshold, then
# save accuracy, precision, recall, f1 and confusion matrices
vallabels = (valpredictions>=threshold).astype( 'int32' )
testlabels = (testpredictions>=threshold).astype( 'int32' )

valaccuracy.append( accuracy_score( yval, vallabels ) )
valprecision.append( precision_score( yval, vallabels ) )
valrecall.append( recall_score( yval, vallabels ) )
valf1.append( f1_score( yval, vallabels ) )
valcm.append( confusion_matrix( yval, vallabels ) )    

testaccuracy.append( accuracy_score( ytest, testlabels ) )
testprecision.append( precision_score( ytest, testlabels ) )
testrecall.append( recall_score( ytest, testlabels ) )
testf1.append( f1_score( ytest, testlabels ) )
testcm.append( confusion_matrix( ytest, testlabels ) )

# save for future analysis and ensemble
com_indices.extend( test_dataset.index.values )
com_text.extend( test_dataset[ 'clean' ] )
com_label.extend( test_dataset[ 'Toxic' ].values )
com_predicted.extend( testlabels.tolist() )
com_prob.extend( testpredictions.tolist() )

(51755, 300)
Epoch: 000 --MaxValF1: 0.77586207 --CurValF1: 0.77586207 --Patience: 00 --improved f1: 0.77586207
Epoch: 001 --MaxValF1: 0.85079365 --CurValF1: 0.85079365 --Patience: 00 --improved f1: 0.85079365
Epoch: 002 --MaxValF1: 0.85891918 --CurValF1: 0.85891918 --Patience: 00 --improved f1: 0.85891918
Epoch: 003 --MaxValF1: 0.86634146 --CurValF1: 0.86634146 --Patience: 00 --improved f1: 0.86634146
Epoch: 004 --MaxValF1: 0.86634146 --CurValF1: 0.86327345 --Patience: 00
Epoch: 005 --MaxValF1: 0.86634146 --CurValF1: 0.86610254 --Patience: 01
Epoch: 006 --MaxValF1: 0.86634146 --CurValF1: 0.86352357 --Patience: 02
Epoch: 007 --MaxValF1: 0.86634146 --CurValF1: 0.86296485 --Patience: 03
Epoch: 008 --MaxValF1: 0.86634146 --CurValF1: 0.85993821 --Patience: 04
Epoch: 009 --MaxValF1: 0.86634146 --CurValF1: 0.86052496 --Patience: 05
Epoch: 010 --MaxValF1: 0.86634146 --CurValF1: 0.86111111 --Patience: 06
Epoch: 011 --MaxValF1: 0.86634146 --CurValF1: 0.86177215 --Patience: 07


In [None]:
# Validation-set metrics. Each list holds one value per training run; the line
# shows the per-run values (4 decimals) followed by mean +- std across runs.
# The trailing '\n' argument leaves a blank line between sections.
print( 'Validation Accuracy' )   # was `xprint`, an undefined name that raised NameError
print( [ '{:0.4f}'.format( x ) for x in valaccuracy ], np.mean( valaccuracy ), '+-', np.std( valaccuracy ), '\n' )

print( 'Validation Precision' )
print( [ '{:0.4f}'.format( x ) for x in valprecision ], np.mean( valprecision ), '+-', np.std( valprecision ), '\n' )

print( 'Validation Recall' )
print( [ '{:0.4f}'.format( x ) for x in valrecall ], np.mean( valrecall ), '+-', np.std( valrecall ), '\n' )

print( 'Validation F1' )
print( [ '{:0.4f}'.format( x ) for x in valf1 ], np.mean( valf1 ), '+-', np.std( valf1 ) )

In [None]:
# scikit-learn's confusion_matrix orders rows/columns by ascending label, so
# class 0 comes first; a 180-degree rotation puts class 1 (toxic) in the
# top-left corner instead.
for cm in valcm:
    print( np.rot90( cm, k=2 ), '\n' )

In [None]:
# Test-set metrics: per-run values (4 decimals) followed by mean +- std.
# Each entry carries the extra positional arguments for its summary line: a
# trailing '\n' for every section except the last, which reproduces the blank
# separator lines between sections.
for _title, _scores, _tail in (
    ( 'Test Accuracy', testaccuracy, ( '\n', ) ),
    ( 'Test Precision', testprecision, ( '\n', ) ),
    ( 'Test Recall', testrecall, ( '\n', ) ),
    ( 'Test F1', testf1, () ),
):
    print( _title )
    print( [ f'{x:0.4f}' for x in _scores ], np.mean( _scores ), '+-', np.std( _scores ), *_tail )

In [None]:
# scikit-learn's confusion_matrix orders rows/columns by ascending label, so
# class 0 comes first; a 180-degree rotation puts class 1 (toxic) in the
# top-left corner instead.
for cm in testcm:
    print( np.rot90( cm, k=2 ), '\n' )

In [None]:
# Append one summary row for this model to the shared results file:
#   modelname,accuracy,precision,recall,f1
# Each metric is written as "<mean>+-<std>". The [:7] / [:6] slices truncate
# the float's text representation (they do not round).
result_row = [ modelname ]
for _scores in ( testaccuracy, testprecision, testrecall, testf1 ):
    result_row.append( str(np.mean( _scores ))[:7] + '+-' + str(np.std( _scores ))[:6] )

# The with-block guarantees the handle is closed even if the write raises;
# mode='a' keeps rows written by earlier runs / other models.
with open( resultpath + 'ResultsMain.csv', mode='a' ) as file:
    file.write( ','.join( result_row ) + '\n' )

In [None]:
# Collect the per-comment test outputs into one table and persist it next to
# the other results so it can be reused for error analysis or ensembling.
# Columns are added in this order: index, text, gold label, predicted label,
# predicted probability.
dfPredictions = pd.DataFrame().assign(
    comment_indices=com_indices,
    comment_text=com_text,  # comment text
    comment_label=com_label,
    comment_predicted=com_predicted,
    comment_prob=com_prob,
)
dfPredictions.to_csv( resultpath + modelname + '.csv', index=False )
dfPredictions.shape