In [None]:
from sklearn.metrics import confusion_matrix,classification_report
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.corpus import stopwords
from tensorflow.keras import regularizers
import numpy as np
from nltk.stem import SnowballStemmer
import re
import tensorflow as tf
import pickle
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
nltk.download('stopwords')
from nltk.corpus import stopwords

Mounted at /content/drive


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Load the training split; the code below expects 'text' and 'label' columns.
df = pd.read_csv('/Data/train_data.csv')

# Encode the string labels as integers: hate -> 0, nothate -> 1.
# The explicit cast guarantees a numeric column for Keras/scikit-learn instead
# of relying on `replace` implicitly downcasting an object column (deprecated
# in recent pandas). It also fails early, with a ValueError, if an unexpected
# label string is present.
df['label'] = df['label'].replace({"hate": 0, "nothate": 1}).astype('int64')

In [None]:
def preprocess_text(df):
  """Return a cleaned version of ``df['text']`` without modifying ``df``.

  Each text is lower-cased, every character outside a-z is replaced by a
  space, the result is split on whitespace, NLTK English stop words are
  dropped, the remaining tokens are Snowball-stemmed and finally re-joined
  with single spaces.

  Args:
    df: DataFrame with a 'text' column.

  Returns:
    A new pandas Series (same index as ``df``) holding the cleaned strings.
    Missing values are returned as empty strings.
  """
  stop_words = set(stopwords.words('english'))
  stemmer = SnowballStemmer('english')

  def clean(text):
    # Same pipeline as before, applied in one pass per row.
    tokens = re.sub('[^a-zA-Z]', ' ', text.lower()).split()
    return ' '.join(stemmer.stem(token) for token in tokens if token not in stop_words)

  # fillna/astype keep re.sub from raising TypeError on NaN or non-string cells.
  return df['text'].fillna('').astype(str).apply(clean)

# Number of character positions kept per text by the one-hot encoding cells:
# longer texts are truncated, shorter ones are left zero-padded.
max_length = 1000

In [None]:
df['text'] = preprocess_text(df)

# Character vocabulary for the one-hot encoding; a character's position in this
# string is its column index in the encoded tensor.
# NOTE(review): preprocess_text replaces everything outside a-z with spaces, so
# only the letter and space columns can ever be set. The vocabulary is kept
# unchanged because it fixes the model's input width.
char_set = 'abcdefghijklmnopqrstuvwxyz0123456789,;.!?:’"/\\|_@#$%^&*~‘+-=<>()[]{} \n'
char_to_index = {char: index for index, char in enumerate(char_set)}

# One-hot encode into a single preallocated float32 tensor of shape
# (n_samples, max_length, len(char_set)). This avoids building a Python list of
# float64 matrices and copying it with np.array, which costs several GB at
# peak for a few thousand samples.
X = np.zeros((len(df), max_length, len(char_set)), dtype=np.float32)
for row, sentence in enumerate(df['text']):
    # Texts longer than max_length are truncated; shorter ones stay zero-padded.
    for position, char in enumerate(sentence[:max_length]):
        column = char_to_index.get(char)  # dict lookup instead of substring scan
        if column is not None:
            X[row, position, column] = 1

In [None]:
# Sanity check: full tensor shape, then the number of character positions
# per sample (the second axis).
print(X.shape)
X.shape[1]

(3004, 1000, 70)


1000

In [None]:
# Keep only the best model seen during training, judged by validation accuracy.
checkpoint_filepath = '/Models/CharacterLevel/characterlevel_3000.h5'
checkpoint_settings = dict(
    monitor='val_accuracy',  # metric compared between epochs
    mode='max',              # higher accuracy is better
    save_best_only=True,     # overwrite the file only on improvement
    verbose=1,               # log every save decision
)
checkpoint_callback = ModelCheckpoint(filepath=checkpoint_filepath, **checkpoint_settings)

## Model Summary



```plaintext
Input shape: (1000, 70)

Layer (type)                     Output Shape         Param #     Connected to
==================================================================================================
input_1 (InputLayer)             [(None, 1000, 70)]   0
__________________________________________________________________________________________________
conv1d (Conv1D)                  (None, 996, 128)     44928       input_1[0][0]
__________________________________________________________________________________________________
max_pooling1d (MaxPooling1D)     (None, 249, 128)     0           conv1d[0][0]
__________________________________________________________________________________________________
conv1d_1 (Conv1D)                (None, 245, 256)     164096      max_pooling1d[0][0]
__________________________________________________________________________________________________
max_pooling1d_1 (MaxPooling1D)   (None, 61, 256)      0           conv1d_1[0][0]
__________________________________________________________________________________________________
conv1d_2 (Conv1D)                (None, 57, 512)      655872      max_pooling1d_1[0][0]
__________________________________________________________________________________________________
max_pooling1d_2 (MaxPooling1D)   (None, 14, 512)      0           conv1d_2[0][0]
__________________________________________________________________________________________________
flatten (Flatten)                (None, 7168)         0           max_pooling1d_2[0][0]
__________________________________________________________________________________________________
dense (Dense)                    (None, 512)          3670528     flatten[0][0]
__________________________________________________________________________________________________
dropout (Dropout)                (None, 512)          0           dense[0][0]
__________________________________________________________________________________________________
dense_1 (Dense)                  (None, 256)          131328      dropout[0][0]
__________________________________________________________________________________________________
dropout_1 (Dropout)              (None, 256)          0           dense_1[0][0]
__________________________________________________________________________________________________
dense_2 (Dense)                  (None, 1)            257         dropout_1[0][0]
==================================================================================================
Total params: 4,667,009
Trainable params: 4,667,009
Non-trainable params: 0
```


In [None]:
# Seed NumPy and TensorFlow so weight initialisation, dropout and batch
# shuffling are repeatable from one run to the next.
np.random.seed(42)
tf.random.set_seed(42)

# Take the per-sample shape (positions, vocabulary size) from the encoded data
# so the model stays in sync with max_length and char_set.
input_shape = X.shape[1:]

# Character-level CNN: three Conv1D + max-pooling stages, then two
# L2-regularised dense layers with dropout and a sigmoid output unit.
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=input_shape),
    tf.keras.layers.Conv1D(128, 5, activation='relu'),
    tf.keras.layers.MaxPooling1D(pool_size=4),
    tf.keras.layers.Conv1D(256, 5, activation='relu'),
    tf.keras.layers.MaxPooling1D(pool_size=4),
    tf.keras.layers.Conv1D(512, 5, activation='relu'),
    tf.keras.layers.MaxPooling1D(pool_size=4),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(512, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01)),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(256, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01)),
    tf.keras.layers.Dropout(0.5),
    # Single probability; with the label encoding used here, 1 means "nothate".
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# NOTE(review): validation_split holds out the LAST 10% of rows without
# shuffling; if the CSV is ordered (e.g. by label) the validation set will be
# unrepresentative. Confirm the row order or shuffle before fitting.
# The History object is kept in a variable so its repr is not the cell output.
history = model.fit(
    X,
    df['label'],
    validation_split=0.1,
    epochs=100,
    batch_size=32,
    callbacks=[checkpoint_callback],
)

Epoch 1/100
Epoch 1: val_accuracy improved from -inf to 0.68771, saving model to /content/drive/MyDrive/FYP-DATA-AND-RESOURCES/Models_and_data/CharacterLevel/characterlevel_3000.h5
Epoch 2/100
Epoch 2: val_accuracy improved from 0.68771 to 0.71429, saving model to /content/drive/MyDrive/FYP-DATA-AND-RESOURCES/Models_and_data/CharacterLevel/characterlevel_3000.h5
Epoch 3/100
Epoch 3: val_accuracy improved from 0.71429 to 0.73754, saving model to /content/drive/MyDrive/FYP-DATA-AND-RESOURCES/Models_and_data/CharacterLevel/characterlevel_3000.h5
Epoch 4/100
Epoch 4: val_accuracy improved from 0.73754 to 0.78738, saving model to /content/drive/MyDrive/FYP-DATA-AND-RESOURCES/Models_and_data/CharacterLevel/characterlevel_3000.h5
Epoch 5/100
Epoch 5: val_accuracy did not improve from 0.78738
Epoch 6/100
Epoch 6: val_accuracy did not improve from 0.78738
Epoch 7/100
Epoch 7: val_accuracy improved from 0.78738 to 0.79402, saving model to /content/drive/MyDrive/FYP-DATA-AND-RESOURCES/Models_and_

<keras.callbacks.History at 0x7efee73e6b30>

In [None]:
# Load the held-out test split and apply the same label encoding and text
# cleaning as for the training data.
df2 = pd.read_csv('/Data/test_data.csv')
# Explicit cast: do not rely on `replace` implicitly downcasting an object
# column to integers (deprecated in recent pandas).
df2['label'] = df2['label'].replace({"hate": 0, "nothate": 1}).astype('int64')
df2['text'] = preprocess_text(df2)

# Must stay identical to the vocabulary used for training: a character's
# position in this string is its column index in the encoded tensor.
# NOTE(review): this cell repeats the training-set encoding logic; consider
# factoring both into one shared helper.
char_set = 'abcdefghijklmnopqrstuvwxyz0123456789,;.!?:’"/\\|_@#$%^&*~‘+-=<>()[]{} \n'
char_to_index = {char: index for index, char in enumerate(char_set)}

# One-hot encode into a single preallocated float32 tensor of shape
# (n_samples, max_length, len(char_set)) instead of a list of float64 matrices
# that np.array then has to copy.
X2 = np.zeros((len(df2), max_length, len(char_set)), dtype=np.float32)
for row, sentence in enumerate(df2['text']):
    # Texts longer than max_length are truncated; shorter ones stay zero-padded.
    for position, char in enumerate(sentence[:max_length]):
        column = char_to_index.get(char)  # dict lookup instead of substring scan
        if column is not None:
            X2[row, position, column] = 1

In [None]:
# Evaluate the best checkpoint (by validation accuracy) on the test set.
loaded_model = load_model('/Models/CharacterLevel/characterlevel_3000.h5')
pred = loaded_model.predict(X2)
loss, acc = loaded_model.evaluate(X2, df2['label'], batch_size=32)

print('Test loss:', loss)
print('Test accuracy:', acc, end='\n\n')

# Threshold the sigmoid outputs at 0.5 in one vectorised step
# (0 = hate, 1 = nothate); shape and dtype of `pred` are preserved.
pred = (pred >= 0.5).astype(pred.dtype)

# Fix the label order and name the classes so both reports stay readable and
# stable even if one class happens to be absent from the predictions.
cm = confusion_matrix(df2['label'], pred, labels=[0, 1])
print(cm)
cr = classification_report(df2['label'], pred, labels=[0, 1], target_names=['hate', 'nothate'])
print(cr)

Test loss: 0.5187922716140747
Test accuracy: 0.8982036113739014

[[172  17]
 [ 17 128]]
              precision    recall  f1-score   support

           0       0.91      0.91      0.91       189
           1       0.88      0.88      0.88       145

    accuracy                           0.90       334
   macro avg       0.90      0.90      0.90       334
weighted avg       0.90      0.90      0.90       334

