<a href="https://colab.research.google.com/github/Tariquzzaman-faisal/hatespeech/blob/main/lstm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive

drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


# Reading the generated encodings and preparing the train and test dataset

In [2]:
!pip install numpy



In [3]:
import csv
import numpy as np

# Specify the CSV file path
csv_file_path = "/content/drive/MyDrive/Research/Shared Task/Violence Inciting Text Detection (VITD) Bangla/embedded2.csv"

# Read the data from the CSV file
data = np.genfromtxt(csv_file_path, delimiter=',', skip_header=1)

# Split the data into `bertEmbeddings` and `labels` arrays
bertEmbeddings = data[:, :-1]
labels = data[:, -1].astype(int)

In [4]:
print(f'bertEmbeddings: {bertEmbeddings}')
print(f'labels: {labels}')

bertEmbeddings: [[-13.00299454  -0.73462087  -0.57271749 ...   5.38898468  -2.41615009
   -0.43181288]
 [-12.77867985  -2.68920445  -3.35125947 ...   0.24795489  -0.66592824
   -1.92575634]
 [-14.33217907  -7.8353014   -3.74883032 ...  -0.76926965  -1.4627775
   -2.04114509]
 ...
 [-15.2856226   -2.84475231  -2.1643548  ...   1.90715456   0.3952359
   -0.55821282]
 [-12.2387886   -3.43063974  -7.23090553 ...   6.47149897   1.32374465
   -1.50549579]
 [-13.61747932  -4.03414297  -3.02906322 ...  -5.82230282 -13.61741447
    2.46557331]]
labels: [0 1 3 ... 0 0 1]


In [5]:
from keras.utils import to_categorical

onehot_labels = to_categorical(labels)

In [6]:
print(f'Onehot labels: {onehot_labels}')

Onehot labels: [[1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [0. 0. 0. 1.]
 ...
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 1. 0. 0.]]


In [24]:
from sklearn.model_selection import train_test_split
X_train, X_rem, y_train, y_rem = train_test_split(bertEmbeddings, onehot_labels, test_size=0.3, random_state=42)

In [27]:
X_test, X_val, y_test, y_val = train_test_split(X_rem, y_rem, test_size=0.5, random_state=40)

In [28]:
print(f'X_train: {X_train.shape}\n\
        y_train: {y_train.shape}\n\
        X_val: {X_val.shape}\n\
        y_val: {y_val.shape}\n\
        X_test: {X_test.shape}\n\
        y_test: {y_test.shape}')

X_train: (3988, 30)
        y_train: (3988, 4)
        X_val: (855, 30)
        y_val: (855, 4)
        X_test: (855, 30)
        y_test: (855, 4)


# passing embeddings to lstm model

In [9]:
import numpy as np
from keras.models import Sequential
from keras.layers import LSTM, Dense


In [10]:
input_shape = (30, 1)  # Add a new axis to the input shape

model = Sequential()
model.add(LSTM(64, input_shape=input_shape))  # You can adjust the number of LSTM units (e.g., 64) based on your data and complexity requirements
model.add(Dense(4, activation='softmax'))  # Output layer with 4 units, using softmax activation for multi-class classification

In [11]:
# Step 3: Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [29]:
# Assuming you have already loaded your data as numpy arrays (X_train, y_train, X_val, y_val)
model.fit(X_train, y_train, batch_size=64, epochs=20, validation_data=(X_val, y_val))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7d55b71b38b0>

In [30]:
loss, accuracy = model.evaluate(X_val, y_val)
print(f"Validation Loss: {loss:.4f}, Validation Accuracy: {accuracy:.4f}")

Validation Loss: 1.3144, Validation Accuracy: 0.4047


In [31]:
from sklearn.metrics import classification_report

# Step 7: Get predictions from the model
y_test_pred = model.predict(X_test)

# Convert one-hot encoded predictions to class labels
y_test_pred_labels = np.argmax(y_test_pred, axis=1)
y_test_true_labels = np.argmax(y_test, axis=1)

# Step 8: Get the classification report for the test data
report = classification_report(y_test_true_labels, y_test_pred_labels)
print("Classification Report (Test Data):")
print(report)


Classification Report (Test Data):
              precision    recall  f1-score   support

           0       0.42      0.78      0.55       306
           1       0.33      0.02      0.03       126
           2       0.23      0.05      0.08       150
           3       0.48      0.44      0.46       273

    accuracy                           0.43       855
   macro avg       0.37      0.32      0.28       855
weighted avg       0.39      0.43      0.36       855

