In [3]:
# Import required libraries
import re

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
# Download the NLTK stopword corpus; this must run before stopwords.words() is read below
nltk.download('stopwords')
# English stopwords held as a set so membership tests during text cleaning are fast
stop_words = set(stopwords.words('english'))
# NOTE(review): LabelEncoder is not used in the cells visible here — confirm it is still needed
from sklearn.preprocessing import LabelEncoder
import warnings
# NOTE(review): this silences every warning for the rest of the session, including
# library deprecation notices — consider narrowing the filter to specific categories
warnings.filterwarnings('ignore')



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [11]:
# Load the labelled incidents dataset (path is relative to the working directory).
# Later cells read these columns from it: 'title' as the model input text, and
# 'hazard-category', 'product-category', 'hazard', 'product' as label columns.
train_data = pd.read_csv('incidents_labelled.csv')

# Preprocessing Function
def preprocess_text(text, stopword_set=None):
    """Normalise one raw title into a space-separated string of content words.

    Steps, in order: lowercase, strip digits, strip punctuation, collapse
    whitespace, drop stopwords.

    Args:
        text: Raw title. Any non-string value (e.g. None, or the float NaN that
            pandas uses for a missing cell) is treated as an empty title
            instead of raising AttributeError on ``.lower()``.
        stopword_set: Optional collection of lowercase words to remove.
            When omitted, the module-level ``stop_words`` set built in the
            import cell is used.

    Returns:
        The cleaned text; an empty string when no words remain.
    """
    if not isinstance(text, str):
        return ''
    if stopword_set is None:
        # Reuse the set built once in the import cell rather than re-reading
        # the NLTK corpus and rebuilding the set for every row.
        stopword_set = stop_words
    text = text.lower()
    text = re.sub(r'\d+', '', text)  # Remove digits
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\s+', ' ', text)  # Collapse runs of whitespace
    return ' '.join(word for word in text.split() if word not in stopword_set)

# Clean every incident title and store the result in a new column
train_data['cleaned_title'] = train_data['title'].apply(preprocess_text)
cleaned_titles = train_data['cleaned_title'].values

# Fit the vocabulary on the cleaned titles (vocabulary limited to the top 5000 words)
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(cleaned_titles)

# Encode each title as a list of integer word indices
title_sequences = tokenizer.texts_to_sequences(cleaned_titles)

# Pad the sequences to a uniform length of 100 so they form one 2-D input array
X = pad_sequences(title_sequences, maxlen=100)

In [12]:
# Hazard-category classifier: one-hot labels -> BiLSTM model -> validation report.

# One-hot encode the hazard-category labels (one column per class)
lb_hazard_category = LabelBinarizer()
y_hazard_category = lb_hazard_category.fit_transform(train_data['hazard-category'])

# Hold out 20% for validation; stratify so both splits keep the class proportions
X_train, X_val, y_train_hazard_category, y_val_hazard_category = train_test_split(
    X, y_hazard_category, test_size=0.2, random_state=42, stratify=y_hazard_category
)

# Model for Hazard Category.
# input_dim=5001 covers every index produced by Tokenizer(num_words=5000) plus padding index 0.
# The deprecated `input_length` argument is omitted; the sequence length is taken from the data.
model_hazard_category = Sequential()
model_hazard_category.add(Embedding(input_dim=5001, output_dim=128))

# Stacked Bidirectional LSTM layers; the first returns the full sequence for the second to consume
model_hazard_category.add(Bidirectional(LSTM(128, return_sequences=True, dropout=0.2, recurrent_dropout=0.2)))
model_hazard_category.add(Bidirectional(LSTM(128, dropout=0.2, recurrent_dropout=0.2)))

# Output layer: one softmax unit per class
model_hazard_category.add(Dense(len(lb_hazard_category.classes_), activation='softmax'))

# Compile model
model_hazard_category.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train with early stopping. restore_best_weights=True rolls the model back to the
# lowest-val_loss epoch when training is stopped early, so the report below is not
# computed from the weights of the last epoch, after val_loss had already worsened.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, verbose=1, restore_best_weights=True)
hazard_category_train = model_hazard_category.fit(
    X_train, y_train_hazard_category,
    epochs=10,
    batch_size=64,
    validation_data=(X_val, y_val_hazard_category),
    callbacks=[early_stopping],
    verbose=1
)

# Predicted class index = position of the highest softmax probability
y_pred_hazard_category = model_hazard_category.predict(X_val)
y_pred_hazard_category_classes = np.argmax(y_pred_hazard_category, axis=1)

# True class indices, decoded once from the one-hot validation labels
y_true_hazard_category_classes = np.argmax(y_val_hazard_category, axis=1)

# Report only the classes that actually occur in the validation split.
# zero_division=0 makes the score for never-predicted classes explicit.
unique_labels_hazard_category = np.unique(y_true_hazard_category_classes)
classification_rep_hazard_category = classification_report(
    y_true_hazard_category_classes,
    y_pred_hazard_category_classes,
    labels=unique_labels_hazard_category,
    target_names=[lb_hazard_category.classes_[i] for i in unique_labels_hazard_category],
    zero_division=0
)
print("Classification Report for Hazard-Category:")
print(classification_rep_hazard_category)


Epoch 1/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m65s[0m 790ms/step - accuracy: 0.3930 - loss: 1.6750 - val_accuracy: 0.6500 - val_loss: 1.0369
Epoch 2/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 621ms/step - accuracy: 0.6997 - loss: 0.9352 - val_accuracy: 0.7101 - val_loss: 0.8649
Epoch 3/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 651ms/step - accuracy: 0.8154 - loss: 0.5923 - val_accuracy: 0.7469 - val_loss: 0.8393
Epoch 4/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 618ms/step - accuracy: 0.8777 - loss: 0.3997 - val_accuracy: 0.7452 - val_loss: 0.9067
Epoch 5/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 640ms/step - accuracy: 0.9024 - loss: 0.3264 - val_accuracy: 0.7552 - val_loss: 1.0138
Epoch 6/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 625ms/step - accuracy: 0.9265 - loss: 0.2432 - val_accuracy: 0.7385 - val_loss: 1.0384
Epoch 6: early stoppin

In [13]:
# Product-category classifier: one-hot labels -> BiLSTM model -> validation report.

# One-hot encode the product-category labels (one column per class)
lb_product_category = LabelBinarizer()
y_product_category = lb_product_category.fit_transform(train_data['product-category'])

# Hold out 20% for validation; stratify so both splits keep the class proportions
X_train, X_val, y_train_product_category, y_val_product_category = train_test_split(
    X, y_product_category, test_size=0.2, random_state=42, stratify=y_product_category
)

# Model for Product Category.
# input_dim=5001 covers every index produced by Tokenizer(num_words=5000) plus padding index 0.
# The deprecated `input_length` argument is omitted; the sequence length is taken from the data.
model_product_category = Sequential()
model_product_category.add(Embedding(input_dim=5001, output_dim=128))

# Stacked Bidirectional LSTM layers; the first returns the full sequence for the second to consume
model_product_category.add(Bidirectional(LSTM(128, return_sequences=True, dropout=0.2, recurrent_dropout=0.2)))
model_product_category.add(Bidirectional(LSTM(128, dropout=0.2, recurrent_dropout=0.2)))

# Output layer: one softmax unit per class
model_product_category.add(Dense(len(lb_product_category.classes_), activation='softmax'))

# Compile model
model_product_category.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train with early stopping. restore_best_weights=True rolls the model back to the
# lowest-val_loss epoch when training is stopped early, so the report below is not
# computed from the weights of the last epoch, after val_loss had already worsened.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, verbose=1, restore_best_weights=True)
product_category_train = model_product_category.fit(
    X_train, y_train_product_category,
    epochs=10,
    batch_size=64,
    validation_data=(X_val, y_val_product_category),
    callbacks=[early_stopping],
    verbose=1
)

# Predicted class index = position of the highest softmax probability
y_pred_product_category = model_product_category.predict(X_val)
y_pred_product_category_classes = np.argmax(y_pred_product_category, axis=1)

# True class indices, decoded once from the one-hot validation labels
y_true_product_category_classes = np.argmax(y_val_product_category, axis=1)

# Report only the classes that actually occur in the validation split.
# zero_division=0 makes the score for never-predicted classes explicit.
unique_labels_product_category = np.unique(y_true_product_category_classes)
classification_rep_product_category = classification_report(
    y_true_product_category_classes,
    y_pred_product_category_classes,
    labels=unique_labels_product_category,
    target_names=[lb_product_category.classes_[i] for i in unique_labels_product_category],
    zero_division=0
)
print("Classification Report for Product-Category:")
print(classification_rep_product_category)


Epoch 1/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 624ms/step - accuracy: 0.2667 - loss: 2.5697 - val_accuracy: 0.3534 - val_loss: 2.1676
Epoch 2/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 639ms/step - accuracy: 0.4037 - loss: 2.0011 - val_accuracy: 0.5104 - val_loss: 1.6697
Epoch 3/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 612ms/step - accuracy: 0.5877 - loss: 1.3757 - val_accuracy: 0.5906 - val_loss: 1.4197
Epoch 4/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 610ms/step - accuracy: 0.7141 - loss: 0.9753 - val_accuracy: 0.6266 - val_loss: 1.3631
Epoch 5/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 633ms/step - accuracy: 0.7977 - loss: 0.7030 - val_accuracy: 0.6424 - val_loss: 1.4070
Epoch 6/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 610ms/step - accuracy: 0.8328 - loss: 0.5804 - val_accuracy: 0.6433 - val_loss: 1.4082
Epoch 7/10
[1m75/75[

In [15]:
# Converting hazard to binary format
lb_hazard = LabelBinarizer()
y_hazard = lb_hazard.fit_transform(train_data['hazard'])

# Split data
X_train, X_val, y_train_hazard, y_val_hazard = train_test_split(
    X, y_hazard, test_size=0.2, random_state=42)

# Model for Hazard
model_hazard = Sequential()
model_hazard.add(Embedding(input_dim=5001, output_dim=128, input_length=100))

# Stacked Bidirectional LSTM Layers
model_hazard.add(Bidirectional(LSTM(128, return_sequences=True, dropout=0.2, recurrent_dropout=0.2)))
model_hazard.add(Bidirectional(LSTM(128, dropout=0.2, recurrent_dropout=0.2)))

# Output Layer for multi-class classification
model_hazard.add(Dense(len(lb_hazard.classes_), activation='softmax'))

# Compile model
model_hazard.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model with Early Stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=3, verbose=1)
hazard_train = model_hazard.fit(
    X_train, y_train_hazard,
    epochs=10,
    batch_size=64,
    validation_data=(X_val, y_val_hazard),
    callbacks=[early_stopping],
    verbose=1
)

# Prediction and classification report for hazard
y_pred_hazard = model_hazard.predict(X_val)
y_pred_hazard_classes = np.argmax(y_pred_hazard, axis=1)

# Generate classification report
unique_labels_hazard = np.unique(np.argmax(y_val_hazard, axis=1))
classification_rep_hazard = classification_report(
    np.argmax(y_val_hazard, axis=1),
    y_pred_hazard_classes,
    labels=unique_labels_hazard,
    target_names=[lb_hazard.classes_[i] for i in unique_labels_hazard]
)
print("Classification Report for Hazard:")
print(classification_rep_hazard)


Epoch 1/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 625ms/step - accuracy: 0.1252 - loss: 4.1389 - val_accuracy: 0.2097 - val_loss: 3.5676
Epoch 2/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 633ms/step - accuracy: 0.2141 - loss: 3.4363 - val_accuracy: 0.2556 - val_loss: 3.2451
Epoch 3/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 615ms/step - accuracy: 0.2815 - loss: 2.9524 - val_accuracy: 0.3124 - val_loss: 3.0123
Epoch 4/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 621ms/step - accuracy: 0.3797 - loss: 2.6312 - val_accuracy: 0.3425 - val_loss: 2.9363
Epoch 5/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 629ms/step - accuracy: 0.4402 - loss: 2.3584 - val_accuracy: 0.3801 - val_loss: 2.8581
Epoch 6/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 612ms/step - accuracy: 0.4980 - loss: 2.1132 - val_accuracy: 0.3851 - val_loss: 2.8872
Epoch 7/10
[1m75/75[

In [16]:
# Converting product to binary format
lb_product = LabelBinarizer()
y_product = lb_product.fit_transform(train_data['product'])

# Split data
X_train, X_val, y_train_product, y_val_product = train_test_split(
    X, y_product, test_size=0.2, random_state=42)

# Model for Product
model_product = Sequential()
model_product.add(Embedding(input_dim=5001, output_dim=128, input_length=100))

# Stacked Bidirectional LSTM Layers
model_product.add(Bidirectional(LSTM(128, return_sequences=True, dropout=0.2, recurrent_dropout=0.2)))
model_product.add(Bidirectional(LSTM(128, dropout=0.2, recurrent_dropout=0.2)))

# Output Layer for multi-class classification
model_product.add(Dense(len(lb_product.classes_), activation='softmax'))

# Compile model
model_product.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model with Early Stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=3, verbose=1)
product_train = model_product.fit(
    X_train, y_train_product,
    epochs=10,
    batch_size=64,
    validation_data=(X_val, y_val_product),
    callbacks=[early_stopping],
    verbose=1
)

# Prediction and classification report for product
y_pred_product = model_product.predict(X_val)
y_pred_product_classes = np.argmax(y_pred_product, axis=1)

# Generate classification report
unique_labels_product = np.unique(np.argmax(y_val_product, axis=1))
classification_rep_product = classification_report(
    np.argmax(y_val_product, axis=1),
    y_pred_product_classes,
    labels=unique_labels_product,
    target_names=[lb_product.classes_[i] for i in unique_labels_product]
)
print("Classification Report for Product:")
print(classification_rep_product)


Epoch 1/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 636ms/step - accuracy: 0.0211 - loss: 6.7128 - val_accuracy: 0.0334 - val_loss: 6.3922
Epoch 2/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 627ms/step - accuracy: 0.0357 - loss: 6.1198 - val_accuracy: 0.0334 - val_loss: 6.4388
Epoch 3/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 610ms/step - accuracy: 0.0335 - loss: 6.1212 - val_accuracy: 0.0501 - val_loss: 6.2912
Epoch 4/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 633ms/step - accuracy: 0.0431 - loss: 5.9072 - val_accuracy: 0.0593 - val_loss: 6.1962
Epoch 5/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 615ms/step - accuracy: 0.0596 - loss: 5.6090 - val_accuracy: 0.0660 - val_loss: 6.1238
Epoch 6/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 634ms/step - accuracy: 0.0784 - loss: 5.3369 - val_accuracy: 0.0835 - val_loss: 5.9503
Epoch 7/10
[1m75/75[