<a href="https://colab.research.google.com/github/Tariquzzaman-faisal/hatespeech/blob/main/lstm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive/ so files stored there can be read by path.
drive.mount('/content/drive/')

Mounted at /content/drive/


# Reading the generated embeddings and preparing the train, validation, and test datasets

In [None]:
!pip install numpy



In [None]:
import csv
import numpy as np

# Location of the precomputed embeddings on the mounted Google Drive.
csv_file_path = "/content/drive/MyDrive/Research/Shared Task/Violence Inciting Text Detection (VITD) Bangla/embedded2.csv"

# Parse the CSV as floats, skipping the header row. genfromtxt does not raise
# on blank or non-numeric cells; it silently stores them as NaN.
data = np.genfromtxt(csv_file_path, delimiter=',', skip_header=1)

# Fail loudly on NaN: a NaN label has no defined integer value, so the
# `astype(int)` below would otherwise produce silently corrupted classes.
if np.isnan(data).any():
    raise ValueError(f"{csv_file_path} contains missing or non-numeric values")

# Every column except the last is an embedding feature; the last is the class label.
bertEmbeddings = data[:, :-1]
labels = data[:, -1].astype(int)

In [None]:
# Preview both arrays in one call; NumPy abbreviates long arrays with "...".
print('bertEmbeddings: {}\nlabels: {}'.format(bertEmbeddings, labels))

bertEmbeddings: [[-13.00299263  -0.73461741  -0.57270575 ...  -1.18679011   4.70789433
  -13.14398575]
 [-12.77867699  -2.68920493  -3.35125685 ...  -2.41241527   5.1453104
  -12.67943573]
 [-14.3321743   -7.83530235  -3.7488277  ...  -1.73290098  -3.04261637
  -14.16944313]
 ...
 [-15.28562164  -2.84474206  -2.16434884 ...   0.17359121   4.78863382
   -0.67929214]
 [-12.23878384  -3.43063712  -7.23090363 ...   0.35684556   1.14516771
  -12.33890343]
 [-13.61747742  -4.03414202  -3.02905583 ...  -2.24742937  -0.34948486
  -13.41543579]]
labels: [0 1 3 ... 0 0 1]


In [None]:
from keras.utils import to_categorical

# Fix the one-hot width at 4 to match the 4-unit softmax output layer; without
# num_classes the width is inferred from the largest label that happens to be present.
onehot_labels = to_categorical(labels, num_classes=4)

In [None]:
# Preview the one-hot encoded label matrix.
print('Onehot labels: {}'.format(onehot_labels))

Onehot labels: [[1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [0. 0. 0. 1.]
 ...
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 1. 0. 0.]]


In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the samples for validation + test. Stratify on the integer
# labels so each split keeps the class proportions of the full dataset.
X_train, X_rem, y_train, y_rem = train_test_split(
    bertEmbeddings,
    onehot_labels,
    test_size=0.3,
    random_state=42,
    stratify=labels,
)

In [None]:
# Halve the held-out samples into test and validation sets, stratifying on the
# class index recovered from the one-hot rows so both halves keep the same class mix.
X_test, X_val, y_test, y_val = train_test_split(
    X_rem,
    y_rem,
    test_size=0.5,
    random_state=40,
    stratify=np.argmax(y_rem, axis=1),
)

In [None]:
# Report the shape of every split. Adjacent f-strings are used instead of
# backslash continuations, which leaked the source indentation into the output.
print(
    f'X_train: {X_train.shape}\n'
    f'y_train: {y_train.shape}\n'
    f'X_val: {X_val.shape}\n'
    f'y_val: {y_val.shape}\n'
    f'X_test: {X_test.shape}\n'
    f'y_test: {y_test.shape}'
)

X_train: (3988, 128)
        y_train: (3988, 4)
        X_val: (855, 128)
        y_val: (855, 4)
        X_test: (855, 128)
        y_test: (855, 4)


# Passing the embeddings to a bidirectional LSTM model

In [None]:
import numpy as np
from keras.models import Sequential
from keras.layers import Bidirectional, LSTM, Dense, BatchNormalization, Dropout

In [None]:
# Each sample is declared to the network as 128 timesteps with a single feature.
input_shape = (128, 1)

# Two stacked bidirectional LSTM stages, each followed by batch normalisation
# and 20% dropout, ending in a 4-way softmax classifier.
model = Sequential([
    # First stage returns the full sequence so the second LSTM can consume it.
    Bidirectional(LSTM(64, return_sequences=True), input_shape=input_shape),
    BatchNormalization(),
    Dropout(0.2),
    # Second stage returns only its final output.
    Bidirectional(LSTM(32)),
    BatchNormalization(),
    Dropout(0.2),
    Dense(4, activation='softmax'),
])

In [None]:
# Configure training: Adam optimizer, categorical cross-entropy for the
# one-hot targets, and accuracy as the reported metric.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bidirectional_23 (Bidirecti  (None, 128, 128)         33792     
 onal)                                                           
                                                                 
 batch_normalization_20 (Bat  (None, 128, 128)         512       
 chNormalization)                                                
                                                                 
 dropout_20 (Dropout)        (None, 128, 128)          0         
                                                                 
 bidirectional_24 (Bidirecti  (None, 64)               41216     
 onal)                                                           
                                                                 
 batch_normalization_21 (Bat  (None, 64)               256       
 chNormalization)                                     

In [None]:
# Train for 10 epochs with mini-batches of 32, passing the validation split so
# Keras reports validation metrics during training. Keep the returned History
# so the per-epoch metrics stay available instead of only showing the
# object's repr as cell output.
history = model.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=10,
    validation_data=(X_val, y_val),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7e103732f9a0>

In [None]:
# Score the trained model on the validation split and report both metrics
# to four decimal places.
loss, accuracy = model.evaluate(X_val, y_val)
print("Validation Loss: {:.4f}, Validation Accuracy: {:.4f}".format(loss, accuracy))

Validation Loss: 1.3233, Validation Accuracy: 0.3977


In [None]:
from sklearn.metrics import classification_report

# Predict class probabilities for the held-out test set.
y_test_pred = model.predict(X_test)

# Collapse the probability rows and the one-hot targets to integer class labels.
y_test_pred_labels = np.argmax(y_test_pred, axis=1)
y_test_true_labels = np.argmax(y_test, axis=1)

# Build the per-class report. zero_division=0 keeps the 0.00 scores for classes
# the model never predicts but suppresses the UndefinedMetricWarning they trigger.
report = classification_report(
    y_test_true_labels,
    y_test_pred_labels,
    zero_division=0,
)
print("Classification Report (Test Data):")
print(report)


Classification Report (Test Data):
              precision    recall  f1-score   support

           0       0.37      0.96      0.53       306
           1       0.00      0.00      0.00       126
           2       0.00      0.00      0.00       150
           3       0.44      0.09      0.15       273

    accuracy                           0.37       855
   macro avg       0.20      0.26      0.17       855
weighted avg       0.27      0.37      0.24       855



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
