<a href="https://colab.research.google.com/github/Tariquzzaman-faisal/hatespeech/blob/main/lstm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [59]:
from google.colab import drive

drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


# Reading the generated encodings and preparing the train and test dataset

In [60]:
!pip install numpy



In [61]:
import csv
import numpy as np

# Specify the CSV file path
csv_file_path = "/content/drive/MyDrive/Research/Shared Task/Violence Inciting Text Detection (VITD) Bangla/embedded2.csv"

# Read the data from the CSV file
data = np.genfromtxt(csv_file_path, delimiter=',', skip_header=1)

# Split the data into `bertEmbeddings` and `labels` arrays
bertEmbeddings = data[:, :-1]
labels = data[:, -1].astype(int)

In [62]:
print(f'bertEmbeddings: {bertEmbeddings}')
print(f'labels: {labels}')

bertEmbeddings: [[-13.00299454  -0.73462087  -0.57271749 ...   5.38898468  -2.41615009
   -0.43181288]
 [-12.77867985  -2.68920445  -3.35125947 ...   0.24795489  -0.66592824
   -1.92575634]
 [-14.33217907  -7.8353014   -3.74883032 ...  -0.76926965  -1.4627775
   -2.04114509]
 ...
 [-15.2856226   -2.84475231  -2.1643548  ...   1.90715456   0.3952359
   -0.55821282]
 [-12.2387886   -3.43063974  -7.23090553 ...   6.47149897   1.32374465
   -1.50549579]
 [-13.61747932  -4.03414297  -3.02906322 ...  -5.82230282 -13.61741447
    2.46557331]]
labels: [0 1 3 ... 0 0 1]


In [63]:
from keras.utils import to_categorical

onehot_labels = to_categorical(labels)

In [64]:
print(f'Onehot labels: {onehot_labels}')

Onehot labels: [[1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [0. 0. 0. 1.]
 ...
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 1. 0. 0.]]


In [65]:
from sklearn.model_selection import train_test_split
X_train, X_rem, y_train, y_rem = train_test_split(bertEmbeddings, onehot_labels, test_size=0.3, random_state=42)

In [66]:
X_test, X_val, y_test, y_val = train_test_split(X_rem, y_rem, test_size=0.5, random_state=40)

In [67]:
print(f'X_train: {X_train.shape}\n\
        y_train: {y_train.shape}\n\
        X_val: {X_val.shape}\n\
        y_val: {y_val.shape}\n\
        X_test: {X_test.shape}\n\
        y_test: {y_test.shape}')

X_train: (3988, 30)
        y_train: (3988, 4)
        X_val: (855, 30)
        y_val: (855, 4)
        X_test: (855, 30)
        y_test: (855, 4)


# passing embeddings to lstm model

In [68]:
import numpy as np
from keras.models import Sequential
from keras.layers import Bidirectional, LSTM, Dense, BatchNormalization, Dropout

In [69]:
input_shape = (30, 1)
model = Sequential()
model.add(Bidirectional(LSTM(64, return_sequences=True), input_shape=input_shape))
model.add(BatchNormalization())
model.add(Dropout(0.2))

model.add(Bidirectional(LSTM(32)))
model.add(BatchNormalization())
model.add(Dropout(0.2))

model.add(Dense(4, activation='softmax'))

In [70]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [71]:
model.summary()

Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bidirectional_7 (Bidirectio  (None, 30, 128)          33792     
 nal)                                                            
                                                                 
 batch_normalization_4 (Batc  (None, 30, 128)          512       
 hNormalization)                                                 
                                                                 
 dropout_4 (Dropout)         (None, 30, 128)           0         
                                                                 
 bidirectional_8 (Bidirectio  (None, 64)               41216     
 nal)                                                            
                                                                 
 batch_normalization_5 (Batc  (None, 64)               256       
 hNormalization)                                      

In [72]:
model.fit(X_train, y_train, batch_size=32, epochs=10, validation_data=(X_val, y_val))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7d558346d6f0>

In [73]:
loss, accuracy = model.evaluate(X_val, y_val)
print(f"Validation Loss: {loss:.4f}, Validation Accuracy: {accuracy:.4f}")

Validation Loss: 1.2983, Validation Accuracy: 0.4105


In [74]:
from sklearn.metrics import classification_report

# Step 7: Get predictions from the model
y_test_pred = model.predict(X_test)

# Convert one-hot encoded predictions to class labels
y_test_pred_labels = np.argmax(y_test_pred, axis=1)
y_test_true_labels = np.argmax(y_test, axis=1)

# Step 8: Get the classification report for the test data
report = classification_report(y_test_true_labels, y_test_pred_labels)
print("Classification Report (Test Data):")
print(report)


Classification Report (Test Data):
              precision    recall  f1-score   support

           0       0.42      0.75      0.54       306
           1       0.17      0.01      0.02       126
           2       0.25      0.07      0.11       150
           3       0.44      0.41      0.43       273

    accuracy                           0.42       855
   macro avg       0.32      0.31      0.27       855
weighted avg       0.36      0.42      0.35       855

