In [3]:
# Import required libraries
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import classification_report
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
# Download the NLTK 'stopwords' corpus, then build a module-level set of the
# English stop words (a set gives constant-time membership checks).
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [4]:
# Read the labelled incidents CSV into a DataFrame; the path is relative to
# the notebook's working directory.
DATASET_CSV = 'incidents_labelled.csv'
train_data = pd.read_csv(DATASET_CSV)

# Preprocessing Function
def preprocess_text(text, stop_words=None):
    """Normalise a raw title into a space-separated string of content words.

    Steps, in order: lowercase, remove digit runs, remove punctuation (any
    character that is neither a word character nor whitespace), then split on
    whitespace and drop stop words.

    Args:
        text: Raw title. ``None`` and NaN (how pandas represents a missing
            cell) yield ``''``; any other non-string value is converted with
            ``str()`` before cleaning.
        stop_words: Optional collection of lowercase words to remove. When
            omitted, the NLTK English stop-word list is used; it is loaded on
            the first call and cached on the function rather than being
            rebuilt for every row.

    Returns:
        The cleaned text, or ``''`` when nothing survives the filtering.
    """
    # Missing values: None, or a float NaN (the only value unequal to itself).
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    if stop_words is None:
        # Lazily build the default set once; later calls reuse the cached copy.
        stop_words = getattr(preprocess_text, '_english_stop_words', None)
        if stop_words is None:
            stop_words = frozenset(stopwords.words('english'))
            preprocess_text._english_stop_words = stop_words

    text = text.lower()
    text = re.sub(r'\d+', '', text)  # Remove digits
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    # split() already collapses any run of whitespace, so no separate
    # whitespace-normalisation pass is needed before re-joining.
    return ' '.join(word for word in text.split() if word not in stop_words)

# Vocabulary cap handed to the tokenizer and the fixed length every sequence
# is padded/truncated to.
VOCAB_SIZE = 5000
SEQUENCE_LENGTH = 100

# Store the cleaned titles in their own column, next to the raw 'title'
train_data['cleaned_title'] = train_data['title'].apply(preprocess_text)
cleaned_titles = train_data['cleaned_title'].values

# Build the word index from the cleaned titles
tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(cleaned_titles)

# Map each title to its integer sequence, then pad so all rows share one length
X = pad_sequences(tokenizer.texts_to_sequences(cleaned_titles), maxlen=SEQUENCE_LENGTH)

In [5]:
# Encode hazard-category as an indicator matrix (one column per class).
# NOTE: this relies on there being more than two classes; with exactly two,
# LabelBinarizer returns a single column and the argmax/softmax code below
# would no longer line up with lb_hazard.classes_.
lb_hazard = LabelBinarizer()
y_hazard = lb_hazard.fit_transform(train_data['hazard-category'])

# Hold out 20% for validation, stratified on the label, with a fixed seed
X_train, X_val, y_train_hazard, y_val_hazard = train_test_split(
    X, y_hazard, test_size=0.2, random_state=42, stratify=y_hazard
)

# Stacked-LSTM classifier
model = Sequential()

# Embedding layer: token ids -> 128-dimensional vectors
model.add(Embedding(input_dim=5001, output_dim=128, input_length=100))

# The first LSTM returns the full sequence so the second LSTM can consume it
model.add(LSTM(128, return_sequences=True, dropout=0.2, recurrent_dropout=0.2))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))

# Softmax output layer, one unit per hazard-category class
model.add(Dense(len(lb_hazard.classes_), activation='softmax'))

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Stop once val_loss has failed to improve for 3 epochs, and roll the model
# back to the weights of the best epoch. Without restore_best_weights the
# model keeps the weights of the last epoch run (up to `patience` epochs past
# the best one), and the report below would score those overfit weights.
early_stopping = EarlyStopping(
    monitor='val_loss', patience=3, restore_best_weights=True, verbose=1
)

# Train the model
hazard_train = model.fit(
    X_train, y_train_hazard,
    epochs=10,
    batch_size=64,
    validation_data=(X_val, y_val_hazard),
    callbacks=[early_stopping],
    verbose=1
)

# Predict on the validation set and convert probabilities / indicator rows
# back to class indices
y_pred_hazard = model.predict(X_val)
y_pred_hazard_classes = np.argmax(y_pred_hazard, axis=1)
y_val_hazard_classes = np.argmax(y_val_hazard, axis=1)

# Per-class precision / recall / F1 for hazard-category
classification_rep_hazard = classification_report(
    y_val_hazard_classes, y_pred_hazard_classes, target_names=lb_hazard.classes_
)
print("Classification Report for Hazard-Category:")
print(classification_rep_hazard)


Epoch 1/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 354ms/step - accuracy: 0.3256 - loss: 1.7796 - val_accuracy: 0.5990 - val_loss: 1.1747
Epoch 2/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 344ms/step - accuracy: 0.6219 - loss: 1.1211 - val_accuracy: 0.6759 - val_loss: 0.9765
Epoch 3/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 534ms/step - accuracy: 0.7215 - loss: 0.8659 - val_accuracy: 0.6992 - val_loss: 0.9431
Epoch 4/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 336ms/step - accuracy: 0.7975 - loss: 0.6360 - val_accuracy: 0.7201 - val_loss: 0.9122
Epoch 5/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 340ms/step - accuracy: 0.8588 - loss: 0.4482 - val_accuracy: 0.7327 - val_loss: 0.9581
Epoch 6/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 345ms/step - accuracy: 0.8872 - loss: 0.3561 - val_accuracy: 0.7402 - val_loss: 1.0114
Epoch 7/10
[1m75/75[

In [6]:
# Encode product-category as an indicator matrix (one column per class).
# NOTE: this relies on there being more than two classes; with exactly two,
# LabelBinarizer returns a single column and the argmax/softmax code below
# would no longer line up with lb_product_category.classes_.
lb_product_category = LabelBinarizer()
y_product_category = lb_product_category.fit_transform(train_data['product-category'])

# Hold out 20% for validation, stratified on the label, with a fixed seed
X_train, X_val, y_train_product_category, y_val_product_category = train_test_split(
    X, y_product_category, test_size=0.2, random_state=42, stratify=y_product_category
)

# Model for Product Category: embedding -> LSTM (sequence out) -> LSTM -> softmax
model_product_category = Sequential()
model_product_category.add(Embedding(input_dim=5001, output_dim=128, input_length=100))
model_product_category.add(LSTM(128, return_sequences=True, dropout=0.2, recurrent_dropout=0.2))
model_product_category.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model_product_category.add(Dense(len(lb_product_category.classes_), activation='softmax'))

# Compile model
model_product_category.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Stop once val_loss has failed to improve for 3 epochs, and roll the model
# back to the weights of the best epoch. Without restore_best_weights the
# model keeps the weights of the last epoch run (up to `patience` epochs past
# the best one), and the report below would score those overfit weights.
early_stopping = EarlyStopping(
    monitor='val_loss', patience=3, restore_best_weights=True, verbose=1
)

# Train the model
product_category_train = model_product_category.fit(
    X_train, y_train_product_category,
    epochs=10,
    batch_size=64,
    validation_data=(X_val, y_val_product_category),
    callbacks=[early_stopping],
    verbose=1
)

# Predict on the validation set and convert probabilities / indicator rows
# back to class indices
y_pred_product_category = model_product_category.predict(X_val)
y_pred_product_category_classes = np.argmax(y_pred_product_category, axis=1)
y_val_product_category_classes = np.argmax(y_val_product_category, axis=1)

# Per-class precision / recall / F1 for product-category
classification_rep_product_category = classification_report(
    y_val_product_category_classes,
    y_pred_product_category_classes,
    target_names=lb_product_category.classes_
)
print("Classification Report for Product-Category:")
print(classification_rep_product_category)


Epoch 1/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 355ms/step - accuracy: 0.2671 - loss: 2.6250 - val_accuracy: 0.3266 - val_loss: 2.3275
Epoch 2/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 342ms/step - accuracy: 0.3419 - loss: 2.2221 - val_accuracy: 0.4286 - val_loss: 1.9385
Epoch 3/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 342ms/step - accuracy: 0.4853 - loss: 1.7460 - val_accuracy: 0.5196 - val_loss: 1.6227
Epoch 4/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 327ms/step - accuracy: 0.5927 - loss: 1.3108 - val_accuracy: 0.5631 - val_loss: 1.5170
Epoch 5/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 339ms/step - accuracy: 0.6990 - loss: 0.9936 - val_accuracy: 0.5848 - val_loss: 1.4426
Epoch 6/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 348ms/step - accuracy: 0.7864 - loss: 0.7496 - val_accuracy: 0.6207 - val_loss: 1.4577
Epoch 7/10
[1m75/75[

In [9]:
# Encode the fine-grained 'hazard' label as an indicator matrix (one column
# per class).
# NOTE: this rebinds lb_hazard, y_hazard, y_train_hazard, y_val_hazard,
# hazard_train, y_pred_hazard* and classification_rep_hazard, which the
# hazard-category cell also assigns; after this cell they refer to 'hazard'.
lb_hazard = LabelBinarizer()
y_hazard = lb_hazard.fit_transform(train_data['hazard'])

# Hold out 20% for validation with a fixed seed (this split is not stratified)
X_train, X_val, y_train_hazard, y_val_hazard = train_test_split(
    X, y_hazard, test_size=0.2, random_state=42)

# Model for Hazard: embedding -> LSTM (sequence out) -> LSTM -> softmax
model_hazard = Sequential()
model_hazard.add(Embedding(input_dim=5001, output_dim=128, input_length=100))
model_hazard.add(LSTM(128, return_sequences=True, dropout=0.2, recurrent_dropout=0.2))
model_hazard.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model_hazard.add(Dense(len(lb_hazard.classes_), activation='softmax'))

# Compile model
model_hazard.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Stop once val_loss has failed to improve for 3 epochs, and roll the model
# back to the weights of the best epoch. Without restore_best_weights the
# model keeps the weights of the last epoch run (up to `patience` epochs past
# the best one), and the report below would score those weights instead.
early_stopping = EarlyStopping(
    monitor='val_loss', patience=3, restore_best_weights=True, verbose=1
)

# Train the model
hazard_train = model_hazard.fit(
    X_train, y_train_hazard,
    epochs=10,
    batch_size=64,
    validation_data=(X_val, y_val_hazard),
    callbacks=[early_stopping],
    verbose=1
)

# Predict on the validation set and convert probabilities / indicator rows
# back to class indices
y_pred_hazard = model_hazard.predict(X_val)
y_pred_hazard_classes = np.argmax(y_pred_hazard, axis=1)
y_val_hazard_classes = np.argmax(y_val_hazard, axis=1)

# Restrict the report to classes that occur in the validation labels so that
# `labels` and `target_names` always have the same length
unique_labels = np.unique(y_val_hazard_classes)

# Per-class precision / recall / F1 for hazard
classification_rep_hazard = classification_report(
    y_val_hazard_classes,
    y_pred_hazard_classes,
    labels=unique_labels,  # Ensures labels match validation data
    target_names=[lb_hazard.classes_[i] for i in unique_labels]
)
print("Classification Report for Hazard:")
print(classification_rep_hazard)


Epoch 1/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 341ms/step - accuracy: 0.1095 - loss: 4.2566 - val_accuracy: 0.1228 - val_loss: 3.7369
Epoch 2/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 341ms/step - accuracy: 0.1390 - loss: 3.6721 - val_accuracy: 0.2063 - val_loss: 3.5156
Epoch 3/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 340ms/step - accuracy: 0.2237 - loss: 3.3175 - val_accuracy: 0.2080 - val_loss: 3.2635
Epoch 4/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 333ms/step - accuracy: 0.2669 - loss: 2.9995 - val_accuracy: 0.2749 - val_loss: 3.1430
Epoch 5/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 342ms/step - accuracy: 0.3398 - loss: 2.8072 - val_accuracy: 0.3208 - val_loss: 3.0309
Epoch 6/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 342ms/step - accuracy: 0.4004 - loss: 2.5456 - val_accuracy: 0.3475 - val_loss: 2.9509
Epoch 7/10
[1m75/75[

In [10]:
# Encode the fine-grained 'product' label as an indicator matrix (one column
# per class)
lb_product = LabelBinarizer()
y_product = lb_product.fit_transform(train_data['product'])

# Hold out 20% for validation with a fixed seed (this split is not stratified)
X_train, X_val, y_train_product, y_val_product = train_test_split(
    X, y_product, test_size=0.2, random_state=42)

# Model for Product: embedding -> LSTM (sequence out) -> LSTM -> softmax
model_product = Sequential()
model_product.add(Embedding(input_dim=5001, output_dim=128, input_length=100))
model_product.add(LSTM(128, return_sequences=True, dropout=0.2, recurrent_dropout=0.2))
model_product.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model_product.add(Dense(len(lb_product.classes_), activation='softmax'))

# Compile model
model_product.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Stop once val_loss has failed to improve for 3 epochs, and roll the model
# back to the weights of the best epoch. Without restore_best_weights the
# model keeps the weights of the last epoch run (up to `patience` epochs past
# the best one), and the report below would score those weights instead.
early_stopping = EarlyStopping(
    monitor='val_loss', patience=3, restore_best_weights=True, verbose=1
)

# Train the model
product_train = model_product.fit(
    X_train, y_train_product,
    epochs=10,
    batch_size=64,
    validation_data=(X_val, y_val_product),
    callbacks=[early_stopping],
    verbose=1
)

# Predict on the validation set and convert probabilities / indicator rows
# back to class indices
y_pred_product = model_product.predict(X_val)
y_pred_product_classes = np.argmax(y_pred_product, axis=1)
y_val_product_classes = np.argmax(y_val_product, axis=1)

# Restrict the report to classes that occur in the validation labels so that
# `labels` and `target_names` always have the same length
unique_labels_product = np.unique(y_val_product_classes)

# Per-class precision / recall / F1 for product
classification_rep_product = classification_report(
    y_val_product_classes,
    y_pred_product_classes,
    labels=unique_labels_product,  # Ensures labels match validation data
    target_names=[lb_product.classes_[i] for i in unique_labels_product]
)
print("Classification Report for Product:")
print(classification_rep_product)


Epoch 1/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 341ms/step - accuracy: 0.0275 - loss: 6.7535 - val_accuracy: 0.0334 - val_loss: 6.3725
Epoch 2/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 335ms/step - accuracy: 0.0298 - loss: 6.1815 - val_accuracy: 0.0334 - val_loss: 6.4251
Epoch 3/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 339ms/step - accuracy: 0.0324 - loss: 6.1465 - val_accuracy: 0.0334 - val_loss: 6.4532
Epoch 4/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 340ms/step - accuracy: 0.0337 - loss: 6.1009 - val_accuracy: 0.0326 - val_loss: 6.3198
Epoch 5/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 326ms/step - accuracy: 0.0352 - loss: 5.9239 - val_accuracy: 0.0359 - val_loss: 6.3527
Epoch 6/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 331ms/step - accuracy: 0.0423 - loss: 5.7565 - val_accuracy: 0.0485 - val_loss: 6.2785
Epoch 7/10
[1m75/75[