In [6]:
# Mount Google Drive at /content/drive; every path used later in this notebook
# lives under that mount point. Requires the Colab runtime (google.colab).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [1]:
import warnings
# Suppress every Python warning for the session; installed before the remaining
# imports below so warnings raised while importing them are hidden as well.
warnings.filterwarnings( 'ignore' )
import gc
import os
import time
import numpy as np
import pandas as pd

from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.model_selection import StratifiedKFold, train_test_split
import keras
import tensorflow as tf
import keras.backend as K
from keras.models import load_model
from keras.preprocessing import text, sequence
# from keras.utils import to_categorical
from tensorflow.keras.utils import to_categorical


In [2]:
# Project root on the mounted Drive. The data, tokenizer and embedding files are
# all addressed relative to `path`; it is also made the working directory
# (presumably so the project-local modules imported next can be found - confirm).
path = "/content/drive/MyDrive/Toxcity Urdu/"
os.chdir( path )

In [3]:
import All_RUT_Models
import RUT_Utils

In [4]:
# Hyper-parameters for this run, grouped by the stage that consumes them.

# -- input / embedding layer --
max_len = 200                       # `maxlen` used when padding the token sequences
embed_size = 300                    # number of columns of the embedding matrix
pre_trained_flag = True             # when True the vectors in `embeddingfile` are loaded
embed_trainable = False             # forwarded to the model builder
emb_weights_init = 'glorot_normal'  # forwarded to the model builder

# -- optimisation --
optimizer = 'adam'                  # forwarded to the model builder
lr_rate = 0.001                     # written onto model.optimizer.lr before fitting
batch = 256                         # batch_size for model.fit
nepochs = 100                       # epochs for model.fit

# -- early stopping / learning-rate decay (all forwarded to RUT_Utils.F1Metrics) --
patience = 10
decay = True
decay_rate = 0.5
decay_after = 3

# -- hardware (forwarded to the model builder) --
multi_gpu_flag = False
gpus = 1

In [5]:
# Text-format word vectors read when pre_trained_flag is set
# (file name looks like fastText's Common Crawl Urdu release - confirm).
embeddingfile = path + 'cc.ur.300.vec'

# Placeholder handed to the model builder; replaced by the output of
# get_vectors() when pre-trained vectors are in use.
embedding_matrix = []
# Upper bound on vocabulary size, used by both the tokenizer (num_words) and
# get_vectors() (row cap of the embedding matrix).
max_features = 10000000

modelname = 'CNN_George_ft'

# Root folder for everything this notebook writes (checkpoints and result CSVs).
p = "/content/drive/MyDrive/Toxcity Urdu/DL Models Pretrained/"

modelpath = p + 'Models/' + modelname + '/'
resultpath = p + 'Results/'

# exist_ok=True creates the folders on first run and is a no-op afterwards,
# without the check-then-create race of os.path.exists() + os.makedirs().
os.makedirs( modelpath, exist_ok=True )
os.makedirs( resultpath, exist_ok=True )

In [6]:
def get_coefs( word, *arr ):
    """Pair a token with its vector.

    Parameters
    ----------
    word : the token (first field of an embedding-file line).
    *arr : the remaining fields; each must be convertible to float32.

    Returns
    -------
    tuple of (word, 1-D float32 numpy array built from ``arr``).
    """
    vector = np.asarray( arr, dtype='float32' )
    return ( word, vector )

def get_vectors( tokenizer ):
    """Build the embedding matrix for the tokenizer's vocabulary.

    Reads the module-level ``max_features``, ``embed_size`` and
    ``embeddings_index``.

    Parameters
    ----------
    tokenizer : object exposing ``word_index`` (token -> integer id).

    Returns
    -------
    numpy array of shape (min(max_features, len(word_index) + 1), embed_size).
    Row ``i`` holds the vector found in ``embeddings_index`` for the token
    with id ``i``; rows for tokens without a vector stay all-zero.
    """
    vocab = tokenizer.word_index
    row_count = min( max_features, len( vocab ) + 1 )
    matrix = np.zeros( ( row_count, embed_size ) )
    for token, row in vocab.items():
        # ids at or beyond the cap have no row in the matrix
        if row < max_features:
            vector = embeddings_index.get( token )
            if vector is not None:
                matrix[row] = vector
    gc.collect()
    return matrix

if pre_trained_flag:
    # token -> float32 vector, one entry per usable line of the embedding file.
    embeddings_index = {}
    # `with` guarantees the (large) file is closed even if parsing fails part-way.
    with open( embeddingfile, encoding='utf-8' ) as vec_file:
        for line in vec_file:
            word, vector = get_coefs( *line.rstrip().rsplit(' ') )
            # Keep only full-width vectors. Text-format .vec files usually start
            # with a "<count> <dim>" header line; storing it (or any blank or
            # malformed line) would leave a short vector that numpy silently
            # broadcasts across an entire embedding row in get_vectors().
            if vector.shape[0] != embed_size:
                continue
            embeddings_index[word] = vector

In [7]:
# Load the training sheet, force the 'clean' text column to str, and give the
# frame a fresh 0..n-1 index (later cells rely on label == position).
train_dataset = (
    pd.read_excel( path + "Training.xlsx" )
      .assign( clean=lambda frame: frame['clean'].map( str ) )
      .reset_index( drop=True )
)
train_dataset.head(3)

Unnamed: 0.1,Unnamed: 0,Comment,Toxic,Urdu,clean,length
0,41334,Ye Dil Bhulaata Nahe Hai Mohabbatain UskiPadi ...,0,یہ دل بحولااتا ںہیں ہے محبتیں وسکیپادی ہوئی ت...,یہ دل بحولااتا ںہیں ہے محبتیں وسکیپادی ہوئی تھ...,86
1,46464,Mumbai ke aik RTI karkun ki qanoon haq aag_hi ...,0,ممبئی کے ایک آر ٹی اے کارکن کی قانون حق آگہی ...,ممبئی کے ایک آر ٹی اے کارکن کی قانون حق آگہی ک...,48
2,50086,un nishistoun mein teen nashistain schedule ca...,0,ان نشستوں میں تِین نشستیں شیڈول کاسٹ کے لیے م...,ان نشستوں میں تین نشستیں شیڈول کاسٹ کے لیے مخص...,13


In [8]:
# Load the held-out test sheet. 'clean' is forced to str exactly as is done for
# the training frame, so a blank or purely numeric cell (which pandas reads as
# NaN or a number) cannot reach the tokenizer as a non-string value.
test_dataset = pd.read_excel(path + "Testing.xlsx")
test_dataset['clean'] = test_dataset['clean'].apply(str)
test_dataset.head(3)

Unnamed: 0.1,Unnamed: 0,Comment,Toxic,Urdu,clean,length
0,49468,Shahar ko saaf suthra bananay mein Naujawanon ...,0,شہر کو صاف ستھرا بنانے میں نوجوانوں کی شراکت,شہر کو صاف ستھرا بنانے میں نوجوانوں کی شراکت,9
1,8547,Hahaha hahaha hahaha pagl ka bcha,1,ہاہاہا ہاہاہا ہاہاہا پاگل کا بچہ,ہاہاہا ہاہاہا ہاہاہا پاگل کا بچہ,6
2,678,"Exactly yaha kabi gunehghar ko saza nhi huti, ...",1,ایگزیکٹلی یہاں کبھی گنہگار کو سزا نہیں ہوتی ،...,ایگزیکٹلی یہاں کبھی گنہگار کو سزا نہیں ہوتی ان...,13


In [9]:
# Training rows whose 'clean' text repeats an earlier row (first occurrence is not flagged).
train_dataset.loc[ train_dataset.duplicated( subset='clean' ) ]

Unnamed: 0.1,Unnamed: 0,Comment,Toxic,Urdu,clean,length


In [10]:
# Test rows whose 'clean' text repeats an earlier row (first occurrence is not flagged).
test_dataset.loc[ test_dataset.duplicated( subset='clean' ) ]

Unnamed: 0.1,Unnamed: 0,Comment,Toxic,Urdu,clean,length


In [11]:
warnings.filterwarnings( 'ignore' )
start_time = time.time()

# One accumulator list per reported quantity; each gets one entry per trained model.
valaccuracy, valprecision, valrecall, valf1, valcm = ( [] for _ in range( 5 ) )
testaccuracy, testprecision, testrecall, testf1, testcm = ( [] for _ in range( 5 ) )
# Per-comment test outputs kept for later analysis / ensembling.
com_text, com_label, com_predicted, com_prob = ( [] for _ in range( 4 ) )
com_indices = []

# Keras tokenizer, capped at max_features words. It is fitted on every row of
# train_dataset - the train/validation split is only made in a later cell.
tokenizer = text.Tokenizer( num_words=max_features )
tokenizer.fit_on_texts( train_dataset[ 'clean' ].values )

In [12]:
import pickle

# Persist the fitted tokenizer next to the data so the same word -> id mapping
# can be reused without refitting.
with open(path+'tokenizer.pickle', 'wb') as handle:
    handle.write( pickle.dumps( tokenizer, protocol=pickle.HIGHEST_PROTOCOL ) )

# To restore it later (only unpickle files you created yourself - pickle.load
# executes arbitrary code from the file):
# with open(path+'tokenizer.pickle', 'rb') as handle:
#     tokenizer = pickle.load(handle)

In [13]:
# Hold out 10% of the training rows for validation, stratified on the 'Toxic'
# label; random_state is fixed so the split is repeatable.
train_index, val_index = train_test_split(
    train_dataset.index.values,
    test_size=0.10,
    random_state=0,
    stratify=train_dataset[ 'Toxic' ].values,
)
print(f"Training set size: {len(train_index)}, Validation set size: {len(val_index)}")

Training set size: 51023, Validation set size: 5670


In [14]:
# Write each split to its own workbook. The split arrays hold index labels, which
# equal row positions here because the frame was re-indexed right after loading.
for split_rows, split_file in ( ( train_index, "TrainingSet.xlsx" ), ( val_index, "ValidationSet.xlsx" ) ):
    train_dataset.iloc[ split_rows ].to_excel( path + split_file )

In [15]:
# Integer-encode the 'clean' text of every split with the fitted tokenizer.
traincomments = tokenizer.texts_to_sequences( train_dataset[ 'clean' ].iloc[ train_index ].values )
validcomments = tokenizer.texts_to_sequences( train_dataset[ 'clean' ].iloc[ val_index ].values )
testcomments = tokenizer.texts_to_sequences( test_dataset[ 'clean' ].values )

# Bring every sequence to exactly max_len ids.
xtrain = sequence.pad_sequences( traincomments, maxlen=max_len )
xval = sequence.pad_sequences( validcomments, maxlen=max_len )
xtest = sequence.pad_sequences( testcomments, maxlen=max_len )

# One-hot encode the binary 'Toxic' label into two columns.
ytrain = to_categorical( train_dataset[ 'Toxic' ].iloc[ train_index ].values, num_classes=2 )
yval = to_categorical( train_dataset[ 'Toxic' ].iloc[ val_index ].values, num_classes=2 )
ytest = to_categorical( test_dataset[ 'Toxic' ].values, num_classes=2 )

# Build the embedding matrix from the pre-trained vectors when the flag is set;
# otherwise embedding_matrix keeps whatever value it was given in an earlier cell.
if pre_trained_flag == True:
  embedding_matrix = get_vectors( tokenizer=tokenizer)

# Build the CNN from the project module. All arguments are forwarded as-is; how
# they are interpreted is defined in All_RUT_Models.CNN_George (not visible here).
model = All_RUT_Models.CNN_George( tokenizer=tokenizer, max_len=max_len, embed_size=embed_size,
                                embedding_matrix=embedding_matrix, embed_trainable=embed_trainable,
                                emb_weights_init=emb_weights_init, optimizer=optimizer,
                                multi_gpu_flag=multi_gpu_flag, gpus=gpus )

# Overwrite the learning rate of the model's optimizer with lr_rate.
K.set_value( model.optimizer.lr, lr_rate )
# Validation inputs/targets handed to the callback (yval is still one-hot here).
v=( xval,yval )
# Project callback that receives the validation data, the checkpoint path and the
# patience / decay settings. NOTE(review): presumably it scores validation F1 each
# epoch, saves the best model to the .h5 path and stops early - confirm in RUT_Utils.
f1metric = RUT_Utils.F1Metrics(v, modelpath + modelname  + '.h5', patience=patience, decay=decay, decay_rate=decay_rate, decay_after=decay_after, softmax=True )
# verbose=0 silences Keras' own progress output; the per-epoch lines in the cell
# output are not printed by this code directly (presumably by the callback).
hist = model.fit( xtrain, ytrain, batch_size=batch, validation_data=( xval,yval ), epochs=nepochs, verbose=0, callbacks=[ f1metric ] )

# Reload the model stored at the checkpoint path that was given to the training
# callback (presumably the best epoch - confirm in RUT_Utils.F1Metrics).
loaded_model = load_model( modelpath + modelname + '.h5' )

# Collapse the one-hot targets back to class ids for the sklearn metrics.
yval = [ np.argmax( one_hot ) for one_hot in yval ]
ytest = [ np.argmax( one_hot ) for one_hot in ytest ]

# Column 1 of the two-column model output is used as the score for class 1.
valpredictions = loaded_model.predict( xval, batch_size=2048, verbose=0 )[ :, 1 ]
testpredictions = loaded_model.predict( xtest, batch_size=2048, verbose=0 )[ :, 1 ]

# The decision threshold is chosen on the validation split only ...
threshold = RUT_Utils.optimize_threshold( yval, valpredictions )

# ... and then applied unchanged to both splits to obtain hard 0/1 labels.
vallabels = ( valpredictions >= threshold ).astype( 'int32' )
testlabels = ( testpredictions >= threshold ).astype( 'int32' )

# Scorers in the same order as the accumulator lists they feed; each one is
# called as scorer( true_labels, predicted_labels ).
scorers = ( accuracy_score, precision_score, recall_score, f1_score, confusion_matrix )

val_sinks = ( valaccuracy, valprecision, valrecall, valf1, valcm )
for sink, scorer in zip( val_sinks, scorers ):
    sink.append( scorer( yval, vallabels ) )

test_sinks = ( testaccuracy, testprecision, testrecall, testf1, testcm )
for sink, scorer in zip( test_sinks, scorers ):
    sink.append( scorer( ytest, testlabels ) )

# Keep the per-comment test outputs (row id, text, gold label, predicted label,
# class-1 score) for later error analysis and ensembling.
com_indices += list( test_dataset.index.values )
com_text += list( test_dataset[ 'clean' ] )
com_label += list( test_dataset[ 'Toxic' ].values )
com_predicted += testlabels.tolist()
com_prob += testpredictions.tolist()

Epoch: 000 --MaxValF1: 0.77513896 --CurValF1: 0.77513896 --Patience: 00 --improved f1: 0.77513896
Epoch: 001 --MaxValF1: 0.80396040 --CurValF1: 0.80396040 --Patience: 00 --improved f1: 0.80396040
Epoch: 002 --MaxValF1: 0.80991736 --CurValF1: 0.80991736 --Patience: 00 --improved f1: 0.80991736
Epoch: 003 --MaxValF1: 0.81240911 --CurValF1: 0.81240911 --Patience: 00 --improved f1: 0.81240911
Epoch: 004 --MaxValF1: 0.81240911 --CurValF1: 0.81086232 --Patience: 00
Epoch: 005 --MaxValF1: 0.81240911 --CurValF1: 0.81159420 --Patience: 01
Epoch: 006 --MaxValF1: 0.81622177 --CurValF1: 0.81622177 --Patience: 02 --improved f1: 0.81622177
Epoch: 007 --MaxValF1: 0.82043344 --CurValF1: 0.82043344 --Patience: 00 --improved f1: 0.82043344
Epoch: 008 --MaxValF1: 0.82043344 --CurValF1: 0.81926514 --Patience: 00
Epoch: 009 --MaxValF1: 0.82043344 --CurValF1: 0.81886603 --Patience: 01
Epoch: 010 --MaxValF1: 0.82043344 --CurValF1: 0.81981982 --Patience: 02
Epoch: 011 --MaxValF1: 0.82582583 --CurValF1: 0.8258

In [16]:
# For each validation metric print: heading, the per-run values (4 decimals),
# then mean +- standard deviation. A blank separator line follows every metric
# except the last one.
val_report = (
    ( 'Validation Accuracy', valaccuracy ),
    ( 'Validation Precision', valprecision ),
    ( 'Validation Recall', valrecall ),
    ( 'Validation F1', valf1 ),
)
for position, ( heading, scores ) in enumerate( val_report ):
    print( heading )
    fields = [ [ '{:0.4f}'.format( x ) for x in scores ], np.mean( scores ), '+-', np.std( scores ) ]
    if position < len( val_report ) - 1:
        fields.append( '\n' )
    print( *fields )

Validation Accuracy
['0.9386'] 0.9386243386243386 +- 0.0 

Validation Precision
['0.8451'] 0.8450560652395515 +- 0.0 

Validation Recall
['0.8088'] 0.8087804878048781 +- 0.0 

Validation F1
['0.8265'] 0.8265204386839483 +- 0.0


In [17]:
# Print each validation confusion matrix turned by 180 degrees, so the class-1
# row/column comes first: [[TP, FN], [FP, TN]].
for matrix in valcm:
    print( np.rot90( matrix, 2 ), '\n' )

[[ 829  196]
 [ 152 4493]] 



In [18]:
# For each test metric print: heading, the per-run values (4 decimals), then
# mean +- standard deviation. A blank separator line follows every metric
# except the last one.
test_report = (
    ( 'Test Accuracy', testaccuracy ),
    ( 'Test Precision', testprecision ),
    ( 'Test Recall', testrecall ),
    ( 'Test F1', testf1 ),
)
for position, ( heading, scores ) in enumerate( test_report ):
    print( heading )
    fields = [ [ '{:0.4f}'.format( x ) for x in scores ], np.mean( scores ), '+-', np.std( scores ) ]
    if position < len( test_report ) - 1:
        fields.append( '\n' )
    print( *fields )

Test Accuracy
['0.9405'] 0.9404543530407788 +- 0.0 

Test Precision
['0.8729'] 0.8728850325379609 +- 0.0 

Test Recall
['0.7850'] 0.7850175575497463 +- 0.0 

Test F1
['0.8266'] 0.8266228430566968 +- 0.0


In [19]:
# Print each test confusion matrix turned by 180 degrees, so the class-1
# row/column comes first: [[TP, FN], [FP, TN]].
for matrix in testcm:
    print( np.rot90( matrix, 2 ), '\n' )

[[ 2012   551]
 [  293 11318]] 



In [20]:
# Append one summary row for this model to the shared results file:
#   modelname,<accuracy>,<precision>,<recall>,<f1>
# Each metric field is "<mean>+-<std>", where the mean is cut to the first 7
# characters of its text form and the std to the first 6 (truncated, not rounded).
summary_fields = [ modelname ]
for scores in ( testaccuracy, testprecision, testrecall, testf1 ):
    summary_fields.append( str( np.mean( scores ) )[:7] + '+-' + str( np.std( scores ) )[:6] )

# The row is assembled first and written in a single call inside `with`, so the
# file handle is always closed and a failure cannot leave a half-written row.
with open( resultpath + 'ResultsMain.csv', mode='a' ) as file:
    file.write( ','.join( summary_fields ) + '\n' )

In [21]:
# One row per test comment: row id, text, gold label, predicted label and
# class-1 score. Saved as <modelname>.csv in the results folder.
dfPredictions = pd.DataFrame( {
    'comment_indices': com_indices,
    'comment_text': com_text,
    'comment_label': com_label,
    'comment_predicted': com_predicted,
    'comment_prob': com_prob,
} )
dfPredictions.to_csv( resultpath + modelname + '.csv', index=False )
dfPredictions.shape

(14174, 5)