In [2]:
# Mount Google Drive into the Colab filesystem; the project and data paths used
# later in this notebook all live under this mount point.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount( drive_mount_point )

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [1]:
import warnings
warnings.filterwarnings( 'ignore' )
import os
import time
import numpy as np
import pandas as pd

from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.model_selection import StratifiedKFold, train_test_split

from sklearn.feature_extraction.text import TfidfVectorizer

import pickle

In [3]:
# Make the project folder the working directory
# (presumably so the project modules imported below can be found — confirm).
project_root = '/content/drive/MyDrive/Roman-Urdu-Toxic-Comments-master/'
os.chdir( project_root )

In [4]:
# Folder holding the spreadsheets; file names are appended to it when loading data.
path = "/content/drive/MyDrive/Roman-Urdu-Toxic-Comments-master/Data/"

In [5]:
import All_RUT_Models
import RUT_Utils

In [6]:
# No hyperparameters are required for this model.

In [6]:
# Short tag for this model; it names the model sub-directory and the files
# written for this run (pickled model, per-comment predictions CSV).
modelname = 'NB'

# Root folder for the TF-IDF experiment artifacts.
# NOTE(review): the folder name begins with two spaces ("  ML Models Tf-Idf").
# It is kept verbatim because it is a runtime path — confirm it matches the Drive folder.
p="/content/drive/MyDrive/Roman-Urdu-Toxic-Comments-master/  ML Models Tf-Idf/"

modelpath = p + 'Models/' + modelname + '/'
resultpath = p + 'Results/'

# exist_ok=True creates any missing directories and does nothing when they are
# already present, avoiding the gap between a separate exists() check and the create.
for directory in ( modelpath, resultpath ):
    os.makedirs( directory, exist_ok=True )

In [7]:
# Load the training split. Every `clean` entry is passed through str() so the
# column holds only Python strings before it reaches the vectorizer.
train_dataset = pd.read_excel( path + "Training.xlsx" )
train_dataset[ 'clean' ] = train_dataset[ 'clean' ].map( str )
train_dataset.head( 3 )

Unnamed: 0.1,Unnamed: 0,Comment,Toxic,Urdu,clean,length
0,41334,Ye Dil Bhulaata Nahe Hai Mohabbatain UskiPadi ...,0,یہ دل بحولااتا ںہیں ہے محبتیں وسکیپادی ہوئی ت...,یہ دل بحولااتا ںہیں ہے محبتیں وسکیپادی ہوئی تھ...,86
1,46464,Mumbai ke aik RTI karkun ki qanoon haq aag_hi ...,0,ممبئی کے ایک آر ٹی اے کارکن کی قانون حق آگہی ...,ممبئی کے ایک آر ٹی اے کارکن کی قانون حق آگہی ک...,48
2,50086,un nishistoun mein teen nashistain schedule ca...,0,ان نشستوں میں تِین نشستیں شیڈول کاسٹ کے لیے م...,ان نشستوں میں تین نشستیں شیڈول کاسٹ کے لیے مخص...,13


In [8]:
# Load the held-out test split.
test_dataset = pd.read_excel( path + "Testing.xlsx" )
# Coerce `clean` to str exactly as is done for the training frame, so both splits
# hand the vectorizer plain strings even if a spreadsheet cell was read as a
# non-string value (e.g. a number or an empty cell).
test_dataset[ 'clean' ] = test_dataset[ 'clean' ].apply( str )
test_dataset.head( 3 )

Unnamed: 0.1,Unnamed: 0,Comment,Toxic,Urdu,clean,length
0,49468,Shahar ko saaf suthra bananay mein Naujawanon ...,0,شہر کو صاف ستھرا بنانے میں نوجوانوں کی شراکت,شہر کو صاف ستھرا بنانے میں نوجوانوں کی شراکت,9
1,8547,Hahaha hahaha hahaha pagl ka bcha,1,ہاہاہا ہاہاہا ہاہاہا پاگل کا بچہ,ہاہاہا ہاہاہا ہاہاہا پاگل کا بچہ,6
2,678,"Exactly yaha kabi gunehghar ko saza nhi huti, ...",1,ایگزیکٹلی یہاں کبھی گنہگار کو سزا نہیں ہوتی ،...,ایگزیکٹلی یہاں کبھی گنہگار کو سزا نہیں ہوتی ان...,13


In [9]:
# Show training rows whose `clean` text repeats an earlier row
# (an empty table means there are no duplicates).
train_clean_is_repeat = train_dataset[ 'clean' ].duplicated()
train_dataset.loc[ train_clean_is_repeat ]

Unnamed: 0.1,Unnamed: 0,Comment,Toxic,Urdu,clean,length


In [10]:
# Show test rows whose `clean` text repeats an earlier row
# (an empty table means there are no duplicates).
test_clean_is_repeat = test_dataset[ 'clean' ].duplicated()
test_dataset.loc[ test_clean_is_repeat ]

Unnamed: 0.1,Unnamed: 0,Comment,Toxic,Urdu,clean,length


In [11]:
# Vectorize the comments with TF-IDF, fit the model on one train/validation/test
# split, persist the fitted artifacts, tune the decision threshold on the
# validation rows, and score both the validation and test sets.
warnings.filterwarnings( 'ignore' )
start_time = time.time()

# Metric accumulators stay lists (one entry each from this single run) because the
# reporting cells format them element-wise and call np.mean / np.std on them.
valaccuracy, valprecision, valrecall, valf1, valcm = [], [], [], [], []
testaccuracy, testprecision, testrecall, testf1, testcm = [], [], [], [], []
com_text, com_label, com_predicted, com_prob = [], [], [], []
com_indices = []

# tf.idf vectorization: vocabulary and idf weights are learned from the training
# frame only. fit_transform yields the same matrix as fit followed by transform
# on the same data, without processing the training text twice.
# NOTE(review): the vectorizer is fitted before the validation rows are split off,
# so validation comments contribute to the idf statistics — confirm this is intended.
vectorizer = TfidfVectorizer(  )
xtrain = vectorizer.fit_transform( train_dataset[ 'clean' ].values )
xtest = vectorizer.transform( test_dataset[ 'clean' ].values )
ytrain = train_dataset[ 'Toxic' ].values
ytest = test_dataset[ 'Toxic' ].values

# Hold out 10% of the training rows for threshold tuning; the fixed random_state
# keeps the split repeatable across runs.
# NOTE(review): the split is not stratified on the label — confirm that is acceptable.
xtrain, xval, ytrain, yval = train_test_split( xtrain, ytrain, test_size=0.10, random_state=0)
print(f"Length of Dataset \nTrain: {xtrain.shape[0]}, Valid: {xval.shape[0]}, Test: {xtest.shape[0]}")

# define a model (built by the project module; only fit / predict_proba are used here)
model = All_RUT_Models.NB_Model(  )

# train the model
model.fit( xtrain, ytrain )

# save the model
with open( modelpath + modelname  + '.pkl', 'wb' ) as f:
    pickle.dump( model, f )

# save the fitted vectorizer next to the model: the model consumes TF-IDF features,
# so it cannot score raw text in another session without this exact vocabulary/idf.
with open( modelpath + modelname  + '_vectorizer.pkl', 'wb' ) as f:
    pickle.dump( vectorizer, f )

# load saved model (this reads back the file written a few lines above, so the
# pickle originates from this session; do not point this at untrusted files)
with open( modelpath + modelname  + '.pkl', 'rb' ) as f:
    model = pickle.load( f )

# get predictions (probabilities) for validation and test sets respectively;
# column 1 of predict_proba is taken as the score
# (presumably the probability of the Toxic=1 class — confirm against NB_Model).
valpredictions = model.predict_proba( xval )[ :, 1 ]
testpredictions = model.predict_proba( xtest )[ :, 1 ]

# optimizer threshold on validation set (project helper; its criterion is not visible here)
threshold = RUT_Utils.optimize_threshold( yval, valpredictions )

# turn scores into 0/1 labels: a score at or above the threshold counts as positive
vallabels = (valpredictions>=threshold).astype( 'int32' )
testlabels = (testpredictions>=threshold).astype( 'int32' )

# save accuracy, precision, recall, f1 and confusion matrices
valaccuracy.append( accuracy_score( yval, vallabels ) )
valprecision.append( precision_score( yval, vallabels ) )
valrecall.append( recall_score( yval, vallabels ) )
valf1.append( f1_score( yval, vallabels ) )
valcm.append( confusion_matrix( yval, vallabels ) )

testaccuracy.append( accuracy_score( ytest, testlabels ) )
testprecision.append( precision_score( ytest, testlabels ) )
testrecall.append( recall_score( ytest, testlabels ) )
testf1.append( f1_score( ytest, testlabels ) )
testcm.append( confusion_matrix( ytest, testlabels ) )

# save for future analysis and ensemble
com_indices.extend( test_dataset.index.values )
com_text.extend( test_dataset[ 'clean' ] )
com_label.extend( test_dataset[ 'Toxic' ].tolist() )
com_predicted.extend( testlabels )
com_prob.extend( testpredictions )

In [12]:
# Print each validation metric as: heading, then the per-run values (4 decimals),
# their mean, '+-', and their standard deviation. Every line except the last one
# also gets a trailing '\n' argument, which leaves a blank line between sections.
val_report = [
    ( 'Validation Accuracy', valaccuracy, ( '\n', ) ),
    ( 'Validation Precision', valprecision, ( '\n', ) ),
    ( 'Validation Recall', valrecall, ( '\n', ) ),
    ( 'Validation F1', valf1, () ),
]
for title, scores, tail in val_report:
    print( title )
    print( [ '{:0.4f}'.format( s ) for s in scores ], np.mean( scores ), '+-', np.std( scores ), *tail )

In [13]:
# Print every validation confusion matrix turned by 180 degrees (two quarter
# turns), which reverses both the row order and the column order.
for c in valcm:
    flipped = np.rot90( c, 2 )
    print( flipped, '\n' )

In [14]:
# Print each test metric as: heading, then the per-run values (4 decimals),
# their mean, '+-', and their standard deviation. Every line except the last one
# also gets a trailing '\n' argument, which leaves a blank line between sections.
test_report = [
    ( 'Test Accuracy', testaccuracy, ( '\n', ) ),
    ( 'Test Precision', testprecision, ( '\n', ) ),
    ( 'Test Recall', testrecall, ( '\n', ) ),
    ( 'Test F1', testf1, () ),
]
for title, scores, tail in test_report:
    print( title )
    print( [ '{:0.4f}'.format( s ) for s in scores ], np.mean( scores ), '+-', np.std( scores ), *tail )

Test Accuracy
['0.9419'] 0.9418653873289121 +- 0.0 

Test Precision
['0.8577'] 0.8576717400246812 +- 0.0 

Test Recall
['0.8135'] 0.813499804916114 +- 0.0 

Test F1
['0.8350'] 0.8350020024028835 +- 0.0


In [15]:
# Print every test confusion matrix turned by 180 degrees (two quarter turns),
# which reverses both the row order and the column order.
for c in testcm:
    flipped = np.rot90( c, 2 )
    print( flipped, '\n' )

[[ 2085   478]
 [  346 11265]] 



In [16]:
# Write a one-line CSV summary of the test metrics: model name followed by
# "mean+-std" for accuracy, precision, recall and F1.
# The row is assembled before the file is opened, so a failure while computing
# a metric cannot leave a truncated file behind.
summary_cells = [ modelname ]
for scores in ( testaccuracy, testprecision, testrecall, testf1 ):
    # The string forms are cut to 7 / 6 characters (truncated, not rounded).
    summary_cells.append( str(np.mean( scores ))[:7] + '+-' + str(np.std( scores ))[:6] )

# The `with` block closes the file even if the write raises.
# NOTE(review): mode='w' replaces any existing ResultsMain.csv on every run —
# confirm other runs are not expected to add their rows to the same file.
with open( resultpath + 'ResultsMain.csv', mode='w' ) as file:
    file.write( ','.join( summary_cells ) + '\n' )

In [17]:
# Gather the per-comment test outputs collected during evaluation into a single
# table, write it to "<modelname>.csv" in the results folder, and show its shape.
dfPredictions = pd.DataFrame( {
    'comment_indices': com_indices,
    'comment_text': com_text,  # comment text
    'comment_label': com_label,
    'comment_predicted': com_predicted,
    'comment_prob': com_prob,
} )
dfPredictions.to_csv( resultpath + modelname + '.csv', index=False )
dfPredictions.shape

(14174, 5)