Soft-voting ensemble (averaged class probabilities) of LSTM, bidirectional LSTM, SimpleRNN, and feed-forward (FFNN) models

In [25]:
# Import required libraries (standard library first, then third-party)
import re
import warnings

import nltk  # needed for nltk.download below; importing nltk.corpus alone does not bind the name `nltk`
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, SimpleRNN, Dense, Dropout, Flatten
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# Download stopwords (no-op when the corpus is already present locally)
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Silence library warnings to keep the notebook output readable.
# NOTE(review): this also hides deprecation warnings from Keras/scikit-learn.
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [26]:
# Read the labelled incidents CSV (path is relative to the working directory)
incidents_csv_path = 'incidents_labelled.csv'
train_data = pd.read_csv(incidents_csv_path)

# Preprocessing Function
# Holds the NLTK English stopword set once it has been loaded, so that applying
# preprocess_text to a whole column does not rebuild the set for every row.
_ENGLISH_STOP_WORDS_CACHE = {}


def preprocess_text(text, stop_words=None):
    """Normalize a raw title into a space-separated string of content words.

    Steps: lowercase, drop digit runs, drop every character that is neither a
    word character nor whitespace, then drop stopwords. Whitespace runs are
    collapsed because the tokens are re-joined with single spaces.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (e.g. the NaN pandas uses for a missing
        cell, or None) is treated as empty text instead of raising.
    stop_words : collection of str, optional
        Words to remove. Defaults to NLTK's English stopword list, which is
        loaded on first use and then reused.

    Returns
    -------
    str
        The cleaned text; '' when nothing survives the filtering.
    """
    if not isinstance(text, str):
        # Missing titles arrive as float NaN / None and have no .lower().
        return ''

    if stop_words is None:
        if 'english' not in _ENGLISH_STOP_WORDS_CACHE:
            _ENGLISH_STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
        stop_words = _ENGLISH_STOP_WORDS_CACHE['english']

    text = text.lower()
    text = re.sub(r'\d+', '', text)  # Remove digits
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    # str.split() already splits on any whitespace run, so no separate
    # whitespace-collapsing pass is needed before filtering.
    return ' '.join(word for word in text.split() if word not in stop_words)

# Clean every title and keep the result alongside the raw column
train_data['cleaned_title'] = train_data['title'].apply(preprocess_text)
cleaned_titles = train_data['cleaned_title'].values

# Fit the vocabulary on the cleaned titles (capped via num_words=5000)
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(cleaned_titles)

# Map each title to its integer sequence, then pad to a fixed length of 100
# so every sample has the same input shape
title_sequences = tokenizer.texts_to_sequences(cleaned_titles)
X = pad_sequences(title_sequences, maxlen=100)

In [28]:
# Define model-building functions for each model type
def _build_text_classifier(hidden_layers, output_dim, vocab_size=5001, embedding_dim=128, max_len=100):
    """Assemble and compile Embedding -> hidden_layers -> softmax classifier.

    Shared by all public builders so the embedding settings, the output head
    and the compile options are defined in exactly one place.

    Parameters
    ----------
    hidden_layers : list of keras layers
        Layers placed between the embedding and the softmax output.
    output_dim : int
        Number of target classes (width of the softmax layer).
    vocab_size : int
        Embedding input_dim; must be larger than the biggest token index fed
        to the model (the preprocessing cell uses Tokenizer(num_words=5000)).
    embedding_dim : int
        Size of each embedding vector.
    max_len : int
        Padded sequence length (the preprocessing cell pads to 100).

    Returns
    -------
    A compiled Sequential model trained with categorical cross-entropy, so it
    expects one-hot encoded targets.
    """
    # NOTE(review): `input_length` is kept for parity with the original code;
    # newer Keras releases may warn that it is deprecated — confirm against the
    # installed version before removing it.
    layers = [Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_len)]
    layers.extend(hidden_layers)
    layers.append(Dense(output_dim, activation='softmax'))

    model = Sequential(layers)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model


def build_lstm_model(output_dim, vocab_size=5001, embedding_dim=128, max_len=100, units=128):
    """Return a compiled single-layer LSTM classifier with `output_dim` classes."""
    recurrent = LSTM(units, dropout=0.2, recurrent_dropout=0.2)
    return _build_text_classifier([recurrent], output_dim, vocab_size, embedding_dim, max_len)


def build_bilstm_model(output_dim, vocab_size=5001, embedding_dim=128, max_len=100, units=128):
    """Return a compiled bidirectional-LSTM classifier with `output_dim` classes."""
    recurrent = Bidirectional(LSTM(units, dropout=0.2, recurrent_dropout=0.2))
    return _build_text_classifier([recurrent], output_dim, vocab_size, embedding_dim, max_len)


def build_rnn_model(output_dim, vocab_size=5001, embedding_dim=128, max_len=100, units=128):
    """Return a compiled SimpleRNN classifier with `output_dim` classes."""
    recurrent = SimpleRNN(units, dropout=0.2, recurrent_dropout=0.2)
    return _build_text_classifier([recurrent], output_dim, vocab_size, embedding_dim, max_len)


def build_ffnn_model(output_dim, vocab_size=5001, embedding_dim=128, max_len=100, units=128):
    """Return a compiled feed-forward classifier with `output_dim` classes.

    The embedded sequence is flattened and passed through one ReLU layer of
    `units` neurons followed by 50% dropout.
    """
    dense_stack = [
        Flatten(),
        Dense(units, activation='relu'),
        Dropout(0.5),
    ]
    return _build_text_classifier(dense_stack, output_dim, vocab_size, embedding_dim, max_len)

In [29]:
# Encode hazard-category as one-hot targets
lb_hazard_category = LabelBinarizer()
y_hazard_category = lb_hazard_category.fit_transform(train_data['hazard-category'])
n_hazard_category_classes = len(lb_hazard_category.classes_)
# LabelBinarizer returns a single column for targets with fewer than three
# classes, which would not match the softmax head or the argmax decoding below.
assert y_hazard_category.shape[1] == n_hazard_category_classes, \
    "hazard-category needs one one-hot column per class"

# Split data (stratified on the one-hot rows)
X_train, X_val, y_train_hazard_category, y_val_hazard_category = train_test_split(
    X, y_hazard_category, test_size=0.2, random_state=42, stratify=y_hazard_category
)

# Train each model for hazard-category. The trained models and their validation
# probabilities are kept in per-target dicts so later cells that reuse the
# generic names (lstm_model, ...) do not discard them.
hazard_category_models = {}
hazard_category_val_preds = {}
for model_name, build_model in (
    ('lstm', build_lstm_model),
    ('bilstm', build_bilstm_model),
    ('rnn', build_rnn_model),
    ('ffnn', build_ffnn_model),
):
    candidate = build_model(n_hazard_category_classes)
    candidate.fit(
        X_train, y_train_hazard_category,
        epochs=5,
        validation_data=(X_val, y_val_hazard_category),
        # restore_best_weights: predict with the weights from the epoch with
        # the lowest val_loss instead of the last epoch (the logged LSTM run
        # had its lowest val_loss at epoch 3 of 5).
        # NOTE(review): older Keras releases only restore when training is
        # actually stopped early — confirm against the installed version.
        callbacks=[EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)],
    )
    hazard_category_models[model_name] = candidate
    hazard_category_val_preds[model_name] = candidate.predict(X_val)

# Keep the generic per-model names that other cells may reference
lstm_model, bilstm_model, rnn_model, ffnn_model = (
    hazard_category_models[name] for name in ('lstm', 'bilstm', 'rnn', 'ffnn')
)
lstm_preds, bilstm_preds, rnn_preds, ffnn_preds = (
    hazard_category_val_preds[name] for name in ('lstm', 'bilstm', 'rnn', 'ffnn')
)

# Ensemble predictions: average the four probability matrices (soft voting)
ensemble_preds = (lstm_preds + bilstm_preds + rnn_preds + ffnn_preds) / 4
ensemble_pred_classes = np.argmax(ensemble_preds, axis=1)

# Classification report. Passing every class index explicitly keeps
# target_names aligned even if a class is absent from both the validation
# labels and the predictions; zero_division=0 reports 0 for such classes.
classification_rep_hazard_category = classification_report(
    np.argmax(y_val_hazard_category, axis=1),
    ensemble_pred_classes,
    labels=np.arange(n_hazard_category_classes),
    target_names=lb_hazard_category.classes_,
    zero_division=0
)
print("Classification Report for Hazard-Category:")
print(classification_rep_hazard_category)


Epoch 1/5
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 251ms/step - accuracy: 0.4701 - loss: 1.6340 - val_accuracy: 0.6708 - val_loss: 0.9478
Epoch 2/5
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 193ms/step - accuracy: 0.7440 - loss: 0.8187 - val_accuracy: 0.7277 - val_loss: 0.8772
Epoch 3/5
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 176ms/step - accuracy: 0.8375 - loss: 0.5307 - val_accuracy: 0.7536 - val_loss: 0.8181
Epoch 4/5
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 176ms/step - accuracy: 0.9089 - loss: 0.3206 - val_accuracy: 0.7485 - val_loss: 0.9127
Epoch 5/5
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 179ms/step - accuracy: 0.9238 - loss: 0.2518 - val_accuracy: 0.7561 - val_loss: 0.9759
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 44ms/step
Epoch 1/5
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 329ms/step - accuracy: 0.4473

In [30]:
# Encode product-category as one-hot targets
lb_product_category = LabelBinarizer()
y_product_category = lb_product_category.fit_transform(train_data['product-category'])
n_product_category_classes = len(lb_product_category.classes_)
# LabelBinarizer returns a single column for targets with fewer than three
# classes, which would not match the softmax head or the argmax decoding below.
assert y_product_category.shape[1] == n_product_category_classes, \
    "product-category needs one one-hot column per class"

# Split data (stratified on the one-hot rows)
X_train, X_val, y_train_product_category, y_val_product_category = train_test_split(
    X, y_product_category, test_size=0.2, random_state=42, stratify=y_product_category
)

# Train each model for product-category. The trained models and their
# validation probabilities are kept in per-target dicts so other cells that
# reuse the generic names (lstm_model, ...) do not discard them.
product_category_models = {}
product_category_val_preds = {}
for model_name, build_model in (
    ('lstm', build_lstm_model),
    ('bilstm', build_bilstm_model),
    ('rnn', build_rnn_model),
    ('ffnn', build_ffnn_model),
):
    candidate = build_model(n_product_category_classes)
    candidate.fit(
        X_train, y_train_product_category,
        epochs=5,
        validation_data=(X_val, y_val_product_category),
        # restore_best_weights: predict with the weights from the epoch with
        # the lowest val_loss instead of the last epoch (the logged LSTM run
        # had its lowest val_loss at epoch 4 of 5).
        # NOTE(review): older Keras releases only restore when training is
        # actually stopped early — confirm against the installed version.
        callbacks=[EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)],
    )
    product_category_models[model_name] = candidate
    product_category_val_preds[model_name] = candidate.predict(X_val)

# Keep the generic per-model names that other cells may reference
lstm_model, bilstm_model, rnn_model, ffnn_model = (
    product_category_models[name] for name in ('lstm', 'bilstm', 'rnn', 'ffnn')
)
lstm_preds, bilstm_preds, rnn_preds, ffnn_preds = (
    product_category_val_preds[name] for name in ('lstm', 'bilstm', 'rnn', 'ffnn')
)

# Ensemble predictions: average the four probability matrices (soft voting)
ensemble_preds = (lstm_preds + bilstm_preds + rnn_preds + ffnn_preds) / 4
ensemble_pred_classes = np.argmax(ensemble_preds, axis=1)

# Classification report. Passing every class index explicitly keeps
# target_names aligned even if a class is absent from both the validation
# labels and the predictions; zero_division=0 reports 0 for such classes.
classification_rep_product_category = classification_report(
    np.argmax(y_val_product_category, axis=1),
    ensemble_pred_classes,
    labels=np.arange(n_product_category_classes),
    target_names=lb_product_category.classes_,
    zero_division=0
)
print("Classification Report for Product-Category:")
print(classification_rep_product_category)


Epoch 1/5
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 188ms/step - accuracy: 0.2630 - loss: 2.5735 - val_accuracy: 0.3525 - val_loss: 2.1413
Epoch 2/5
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 185ms/step - accuracy: 0.4395 - loss: 1.9407 - val_accuracy: 0.5480 - val_loss: 1.5745
Epoch 3/5
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 187ms/step - accuracy: 0.6369 - loss: 1.2563 - val_accuracy: 0.6341 - val_loss: 1.3120
Epoch 4/5
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 212ms/step - accuracy: 0.7697 - loss: 0.8268 - val_accuracy: 0.6650 - val_loss: 1.2584
Epoch 5/5
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 188ms/step - accuracy: 0.8401 - loss: 0.5737 - val_accuracy: 0.6717 - val_loss: 1.2762
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 44ms/step
Epoch 1/5
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 338ms/step - accuracy: 0.2783

In [32]:
# Encode hazard as one-hot targets
lb_hazard = LabelBinarizer()
y_hazard = lb_hazard.fit_transform(train_data['hazard'])
n_hazard_classes = len(lb_hazard.classes_)
# LabelBinarizer returns a single column for targets with fewer than three
# classes, which would not match the softmax head or the argmax decoding below.
assert y_hazard.shape[1] == n_hazard_classes, "hazard needs one one-hot column per class"

# Split data (not stratified, unlike the category targets)
X_train, X_val, y_train_hazard, y_val_hazard = train_test_split(
    X, y_hazard, test_size=0.2, random_state=42)

# Train each model for hazard. The trained models and their validation
# probabilities are kept in per-target dicts so other cells that reuse the
# generic names (lstm_model, ...) do not discard them.
hazard_models = {}
hazard_val_preds = {}
for model_name, build_model in (
    ('lstm', build_lstm_model),
    ('bilstm', build_bilstm_model),
    ('rnn', build_rnn_model),
    ('ffnn', build_ffnn_model),
):
    candidate = build_model(n_hazard_classes)
    candidate.fit(
        X_train, y_train_hazard,
        epochs=5,
        validation_data=(X_val, y_val_hazard),
        # restore_best_weights: predict with the weights from the epoch with
        # the lowest val_loss instead of whatever the last epoch produced.
        # NOTE(review): older Keras releases only restore when training is
        # actually stopped early — confirm against the installed version.
        callbacks=[EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)],
    )
    hazard_models[model_name] = candidate
    hazard_val_preds[model_name] = candidate.predict(X_val)

# Keep the generic per-model names that other cells may reference
lstm_model, bilstm_model, rnn_model, ffnn_model = (
    hazard_models[name] for name in ('lstm', 'bilstm', 'rnn', 'ffnn')
)
lstm_preds, bilstm_preds, rnn_preds, ffnn_preds = (
    hazard_val_preds[name] for name in ('lstm', 'bilstm', 'rnn', 'ffnn')
)

# Ensemble predictions: average the four probability matrices (soft voting)
ensemble_preds = (lstm_preds + bilstm_preds + rnn_preds + ffnn_preds) / 4
ensemble_pred_classes = np.argmax(ensemble_preds, axis=1)

# Classification report, restricted to the classes that occur in the
# validation labels so that labels and target_names stay aligned
y_val_hazard_classes = np.argmax(y_val_hazard, axis=1)
unique_labels_hazard = np.unique(y_val_hazard_classes)

# Generate classification report for hazard
classification_rep_hazard = classification_report(
    y_val_hazard_classes,
    ensemble_pred_classes,
    labels=unique_labels_hazard,  # Ensures labels match validation data
    target_names=[lb_hazard.classes_[i] for i in unique_labels_hazard],
    zero_division=0  # classes never predicted score 0 without a warning
)
print("Classification Report for Hazard:")
print(classification_rep_hazard)



Epoch 1/5
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 191ms/step - accuracy: 0.1322 - loss: 4.1450 - val_accuracy: 0.2665 - val_loss: 3.3509
Epoch 2/5
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 182ms/step - accuracy: 0.3035 - loss: 3.1168 - val_accuracy: 0.3968 - val_loss: 2.9276
Epoch 3/5
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 183ms/step - accuracy: 0.4399 - loss: 2.5712 - val_accuracy: 0.4160 - val_loss: 2.7112
Epoch 4/5
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 184ms/step - accuracy: 0.5156 - loss: 2.1938 - val_accuracy: 0.4586 - val_loss: 2.5923
Epoch 5/5
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 184ms/step - accuracy: 0.5870 - loss: 1.8737 - val_accuracy: 0.4720 - val_loss: 2.5373
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 43ms/step
Epoch 1/5
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 340ms/step - accuracy: 0.1385

In [33]:
# Encode product as one-hot targets
lb_product = LabelBinarizer()
y_product = lb_product.fit_transform(train_data['product'])
n_product_classes = len(lb_product.classes_)
# LabelBinarizer returns a single column for targets with fewer than three
# classes, which would not match the softmax head or the argmax decoding below.
assert y_product.shape[1] == n_product_classes, "product needs one one-hot column per class"

# Split data (not stratified, unlike the category targets)
X_train, X_val, y_train_product, y_val_product = train_test_split(
    X, y_product, test_size=0.2, random_state=42)

# Train each model for product. The trained models and their validation
# probabilities are kept in per-target dicts so other cells that reuse the
# generic names (lstm_model, ...) do not discard them.
product_models = {}
product_val_preds = {}
for model_name, build_model in (
    ('lstm', build_lstm_model),
    ('bilstm', build_bilstm_model),
    ('rnn', build_rnn_model),
    ('ffnn', build_ffnn_model),
):
    candidate = build_model(n_product_classes)
    candidate.fit(
        X_train, y_train_product,
        epochs=5,
        validation_data=(X_val, y_val_product),
        # restore_best_weights: predict with the weights from the epoch with
        # the lowest val_loss instead of whatever the last epoch produced.
        # NOTE(review): older Keras releases only restore when training is
        # actually stopped early — confirm against the installed version.
        callbacks=[EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)],
    )
    product_models[model_name] = candidate
    product_val_preds[model_name] = candidate.predict(X_val)

# Keep the generic per-model names that other cells may reference
lstm_model, bilstm_model, rnn_model, ffnn_model = (
    product_models[name] for name in ('lstm', 'bilstm', 'rnn', 'ffnn')
)
lstm_preds, bilstm_preds, rnn_preds, ffnn_preds = (
    product_val_preds[name] for name in ('lstm', 'bilstm', 'rnn', 'ffnn')
)

# Ensemble predictions: average the four probability matrices (soft voting)
ensemble_preds = (lstm_preds + bilstm_preds + rnn_preds + ffnn_preds) / 4
ensemble_pred_classes = np.argmax(ensemble_preds, axis=1)

# Classification report for product with matching labels: restricted to the
# classes that occur in the validation labels so labels and target_names align
y_val_product_classes = np.argmax(y_val_product, axis=1)
unique_labels_product = np.unique(y_val_product_classes)

# Generate classification report for product
classification_rep_product = classification_report(
    y_val_product_classes,
    ensemble_pred_classes,
    labels=unique_labels_product,  # Ensures labels match validation data
    target_names=[lb_product.classes_[i] for i in unique_labels_product],
    zero_division=0  # classes never predicted score 0 without a warning
)
print("Classification Report for Product:")
print(classification_rep_product)



Epoch 1/5
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 190ms/step - accuracy: 0.0255 - loss: 6.6859 - val_accuracy: 0.0334 - val_loss: 6.3831
Epoch 2/5
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 189ms/step - accuracy: 0.0339 - loss: 6.1247 - val_accuracy: 0.0343 - val_loss: 6.2692
Epoch 3/5
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 185ms/step - accuracy: 0.0535 - loss: 5.8337 - val_accuracy: 0.0602 - val_loss: 6.1591
Epoch 4/5
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 185ms/step - accuracy: 0.0658 - loss: 5.5364 - val_accuracy: 0.0785 - val_loss: 5.9930
Epoch 5/5
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 185ms/step - accuracy: 0.1029 - loss: 5.0670 - val_accuracy: 0.1170 - val_loss: 5.8209
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 57ms/step
Epoch 1/5
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 370ms/step - accuracy: 0.0240