In [None]:
#import Library
import pickle as pkl
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV
import tensorflow as tf

#Data read

In [None]:
# Mount Google Drive at /content/gdrive; the pickled inputs loaded by this
# notebook are read from paths under that mount point.
# NOTE(review): google.colab is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Base folders of the extracted features on the mounted Google Drive.
# Change these two constants if the project folder moves.
FEATURE_DIR = '/content/gdrive/My Drive/Abusive_text_detection/Feature_extraction'
DEEP_FEATURE_DIR = FEATURE_DIR + '/Extracted_feature_deep_learning'


def load_pickle(path):
  """Return the object unpickled from the file at ``path``.

  NOTE(security): unpickling can execute arbitrary code, so only load
  files that were produced by this project.
  """
  with open(path, 'rb') as f:
    return pkl.load(f)


# Padded training sequences
training_padded = load_pickle(DEEP_FEATURE_DIR + '/training_padded.pkl')

# Training labels (the variable keeps its original name `train_level`)
train_level = load_pickle(FEATURE_DIR + '/Label/train_label.pkl')

# Pre-computed embedding matrix (file name: glove_twitter_25)
embedding_matrix = load_pickle(DEEP_FEATURE_DIR + '/Embedding_matrix/embedding_matrix_using_glove_twitter_25_deep_learning.pkl')

# Vocabulary size and maximum sequence length, stored together in one pickle
vocab_size, max_length = load_pickle(DEEP_FEATURE_DIR + '/vocab_size_and_max_length.pkl')

#Deep learning Parameter Tuning

#LSTM

In [None]:
#For LSTM
def create_model(optimizer='adam', hidden_layer=3, hidden_node=30, dropout=0.1, recurrent_dropout=0.1):
  """Build and compile a stacked-LSTM binary classifier.

  Architecture: frozen Embedding (initialised from ``embedding_matrix``)
  -> ``hidden_layer`` LSTM layers -> Dense(hidden_node, relu) -> Dropout
  -> Dense(1, sigmoid).

  Reads the module-level names ``vocab_size``, ``embedding_matrix`` and
  ``max_length``.

  Args:
    optimizer: Optimizer name or instance passed to ``model.compile``.
    hidden_layer: Number of stacked LSTM layers; must be at least 1.
    hidden_node: Units in every LSTM layer and in the hidden Dense layer.
    dropout: Dropout rate for the LSTM inputs and the Dropout layer.
    recurrent_dropout: Dropout rate for the LSTM recurrent state.

  Returns:
    The compiled ``tf.keras.Sequential`` model.

  Raises:
    ValueError: If ``hidden_layer`` is smaller than 1. Without a recurrent
      layer the Dense head would be applied to the un-reduced sequence.
  """
  if hidden_layer < 1:
    raise ValueError("hidden_layer must be >= 1, got %r" % (hidden_layer,))

  model = tf.keras.Sequential()
  model.add(tf.keras.layers.Embedding(vocab_size, embedding_matrix.shape[1], weights=[embedding_matrix], input_length=max_length, trainable=False))

  last_index = hidden_layer - 1
  for i in range(hidden_layer):
    # Every layer except the last must emit the full sequence so the next
    # recurrent layer receives 3-D input; the last one returns only its
    # final state.
    model.add(tf.keras.layers.LSTM(hidden_node, dropout=dropout, recurrent_dropout=recurrent_dropout, return_sequences=(i != last_index)))

  model.add(tf.keras.layers.Dense(hidden_node, activation='relu'))
  model.add(tf.keras.layers.Dropout(dropout))
  model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

  model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
  # Printed on every build, i.e. once per fit when used inside a search.
  model.summary()
  return model

In [None]:
# Wrap the model builder in the scikit-learn classifier interface so that
# RandomizedSearchCV can drive it (verbose=0 intentionally left disabled).
LSTM_model = tf.keras.wrappers.scikit_learn.KerasClassifier(
  build_fn=create_model,
)

In [None]:
#define the Random search parameters
# The full grid spans 4*5*8*5*5*5*5 = 100000 combinations; RandomizedSearchCV
# only samples n_iter of them.
param_grid = dict(
  batch_size=[16, 32, 64, 128],
  epochs=[1, 2, 3, 4, 5],
  optimizer=['SGD', 'RMSprop', 'Adagrad', 'Adadelta', 'Adam', 'Adamax', 'Nadam', 'Ftrl'],
  hidden_layer=[1, 2, 3, 4, 5],
  hidden_node=[16, 32, 64, 128, 256],
  dropout=[0.1, 0.2, 0.3, 0.4, 0.5],
  recurrent_dropout=[0.1, 0.2, 0.3, 0.4, 0.5],
)
LSTM_cv = RandomizedSearchCV(
  estimator=LSTM_model,
  param_distributions=param_grid,
  n_iter=10,        # scikit-learn's default, stated explicitly
  random_state=42,  # fixed seed so the same candidates are sampled on re-run
  n_jobs=-1,
  verbose=1,
)  #cv = none means -> cv = 5
# NOTE: random_state only fixes which candidates are drawn; Keras weight
# initialisation is not seeded here.
random_result = LSTM_cv.fit(training_padded, train_level)

Fitting 5 folds for each of 100000 candidates, totalling 500000 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 500000 out of 500000 | elapsed: 2084.9min finished


In [None]:
# Report the best hyper-parameter combination found by the search
best_params = random_result.best_params_
print(f"Best parameters are : {best_params}")

Best parameters are : {'batch_size': 32, 'epochs': 4, 'optimizer': 'Adam', 'hidden_layer': 2, 'hidden_node': 64, 'dropout': 0.1, 'recurrent_dropout': 0.1}


#GRU

In [None]:
#For GRU
def create_model(optimizer='adam', hidden_layer=3, hidden_node=30, dropout=0.1, recurrent_dropout=0.1):
  """Build and compile a stacked-GRU binary classifier.

  Architecture: frozen Embedding (initialised from ``embedding_matrix``)
  -> ``hidden_layer`` GRU layers -> Dense(hidden_node, relu) -> Dropout
  -> Dense(1, sigmoid).

  Reads the module-level names ``vocab_size``, ``embedding_matrix`` and
  ``max_length``.

  NOTE: this definition rebinds the name ``create_model`` that the LSTM
  section of this notebook also defines, so the cells must be run top to
  bottom.

  Args:
    optimizer: Optimizer name or instance passed to ``model.compile``.
    hidden_layer: Number of stacked GRU layers; must be at least 1.
    hidden_node: Units in every GRU layer and in the hidden Dense layer.
    dropout: Dropout rate for the GRU inputs and the Dropout layer.
    recurrent_dropout: Dropout rate for the GRU recurrent state.

  Returns:
    The compiled ``tf.keras.Sequential`` model.

  Raises:
    ValueError: If ``hidden_layer`` is smaller than 1. Without a recurrent
      layer the Dense head would be applied to the un-reduced sequence.
  """
  if hidden_layer < 1:
    raise ValueError("hidden_layer must be >= 1, got %r" % (hidden_layer,))

  model = tf.keras.Sequential()
  model.add(tf.keras.layers.Embedding(vocab_size, embedding_matrix.shape[1], weights=[embedding_matrix], input_length=max_length, trainable=False))

  last_index = hidden_layer - 1
  for i in range(hidden_layer):
    # Every layer except the last must emit the full sequence so the next
    # recurrent layer receives 3-D input; the last one returns only its
    # final state.
    model.add(tf.keras.layers.GRU(hidden_node, dropout=dropout, recurrent_dropout=recurrent_dropout, return_sequences=(i != last_index)))

  model.add(tf.keras.layers.Dense(hidden_node, activation='relu'))
  model.add(tf.keras.layers.Dropout(dropout))
  model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

  model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
  # Printed on every build, i.e. once per fit when used inside a search.
  model.summary()
  return model

In [None]:
# Wrap the model builder in the scikit-learn classifier interface so that
# RandomizedSearchCV can drive it (verbose=0 intentionally left disabled).
GRU_model = tf.keras.wrappers.scikit_learn.KerasClassifier(
  build_fn=create_model,
)

In [None]:
#define the random search parameters
# The full grid spans 4*5*8*5*5*5*5 = 100000 combinations; RandomizedSearchCV
# only samples n_iter of them.
param_grid = dict(
  batch_size=[16, 32, 64, 128],
  epochs=[1, 2, 3, 4, 5],
  optimizer=['SGD', 'RMSprop', 'Adagrad', 'Adadelta', 'Adam', 'Adamax', 'Nadam', 'Ftrl'],
  hidden_layer=[1, 2, 3, 4, 5],
  hidden_node=[16, 32, 64, 128, 256],
  dropout=[0.1, 0.2, 0.3, 0.4, 0.5],
  recurrent_dropout=[0.1, 0.2, 0.3, 0.4, 0.5],
)
GRU_cv = RandomizedSearchCV(
  estimator=GRU_model,
  param_distributions=param_grid,
  n_iter=10,        # scikit-learn's default, stated explicitly
  random_state=42,  # fixed seed so the same candidates are sampled on re-run
  n_jobs=-1,
  verbose=1,
)  #cv = none means -> cv = 5
# NOTE: random_state only fixes which candidates are drawn; Keras weight
# initialisation is not seeded here.
random_result = GRU_cv.fit(training_padded, train_level)

Fitting 5 folds for each of 100000 candidates, totalling 500000 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 500000 out of 500000 | elapsed: 2084.9min finished


In [None]:
# Report the best hyper-parameter combination found by the search
best_params = random_result.best_params_
print(f"Best parameters are : {best_params}")

Best parameters are : {'batch_size': 32, 'epochs': 4, 'optimizer': 'Adam', 'hidden_layer': 2, 'hidden_node': 64, 'dropout': 0.1, 'recurrent_dropout': 0.1}
