In [33]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.utils.multiclass import unique_labels
from sklearn.preprocessing import LabelBinarizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D, Flatten, Dropout, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report
import numpy as np
import nltk
from nltk.corpus import stopwords
# Fetch the NLTK stopword corpus, then materialize the English word list
# as a set that later cells can use for membership tests.
nltk.download('stopwords')
english_stopword_list = stopwords.words('english')
stop_words = set(english_stopword_list)
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')


# Read the labelled incidents CSV into the DataFrame used by the cells below
incidents_csv = 'incidents_labelled.csv'
train_data = pd.read_csv(incidents_csv)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [34]:
# Preprocessing Function
def preprocess_text(text, stopword_set=None):
    """Normalize a raw title into a single-space-separated string of kept words.

    Steps, in order: lowercase, remove digit runs, remove every character
    that is neither a word character nor whitespace, split on whitespace,
    and drop any token found in the stopword collection.

    Args:
        text: Raw title. A value that is not a ``str`` (for example ``None``
            or the float NaN that marks a missing cell) yields ``''`` instead
            of raising ``AttributeError`` on ``.lower()``.
        stopword_set: Optional collection of lowercase words to drop. When
            omitted, the module-level ``stop_words`` set built in the setup
            cell is reused, so the stopword list is not rebuilt for every
            row when this function is used with ``Series.apply``.

    Returns:
        The cleaned text, with the remaining tokens joined by single spaces.
    """
    if not isinstance(text, str):
        return ''
    if stopword_set is None:
        stopword_set = stop_words

    text = text.lower()
    text = re.sub(r'\d+', '', text)  # Remove digits
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    # split() with no argument already collapses runs of whitespace, so no
    # separate whitespace-normalization pass is needed before re-joining.
    return ' '.join(word for word in text.split() if word not in stopword_set)

# Clean every incident title and store the result next to the raw text
train_data['cleaned_title'] = train_data['title'].apply(preprocess_text)
cleaned_titles = train_data['cleaned_title'].values

# Fit the word index on the cleaned titles (vocabulary limited via num_words=5000)
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(cleaned_titles)

# Encode each title as a sequence of integer word ids, then bring every
# sequence to a uniform length of 100 so the model gets fixed-width input
title_sequences = tokenizer.texts_to_sequences(cleaned_titles)
X = pad_sequences(title_sequences, maxlen=100)



# Bidirectional LSTM

In [38]:

# Converting hazard-category to binary format using LabelBinarizer (for multi-class classification)
lb_hazard = LabelBinarizer()
y_hazard = lb_hazard.fit_transform(train_data['hazard-category'])

# Split into training and validation sets (80/20, stratified on the label)
X_train, X_val, y_train_hazard, y_val_hazard = train_test_split(X, y_hazard, test_size=0.2, random_state=42, stratify=y_hazard)

# Simple Bidirectional LSTM model
model = Sequential()

# Embedding layer: token ids -> 128-dimensional vectors
model.add(Embedding(input_dim=5001, output_dim=128, input_length=100))

# Bidirectional LSTM layer with input and recurrent dropout
model.add(Bidirectional(LSTM(128, dropout=0.2, recurrent_dropout=0.2)))

# Output layer: one softmax unit per hazard-category class
model.add(Dense(len(lb_hazard.classes_), activation='softmax'))

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Early stopping to avoid overfitting. restore_best_weights=True rolls the
# model back to the epoch with the lowest val_loss; without it the report
# below would be computed with the weights of the last (already degrading)
# epoch rather than the best one.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True, verbose=1)

# Train the Bidirectional LSTM model
hazard_train = model.fit(
    X_train, y_train_hazard,
    epochs=10,
    batch_size=64,
    validation_data=(X_val, y_val_hazard),
    callbacks=[early_stopping],
    verbose=1
)

# Predict on validation set for hazard-category
y_pred_hazard = model.predict(X_val)
y_pred_hazard_classes = np.argmax(y_pred_hazard, axis=1)

# Convert the binarized validation labels back to class indices
y_val_hazard_classes = np.argmax(y_val_hazard, axis=1)

# Generate classification report for hazard-category
classification_rep_hazard = classification_report(y_val_hazard_classes, y_pred_hazard_classes, target_names=lb_hazard.classes_)
print("Classification Report for Hazard-Category:")
print(classification_rep_hazard)


Epoch 1/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 340ms/step - accuracy: 0.4244 - loss: 1.7137 - val_accuracy: 0.6316 - val_loss: 1.0305
Epoch 2/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 365ms/step - accuracy: 0.7013 - loss: 0.9319 - val_accuracy: 0.7201 - val_loss: 0.8673
Epoch 3/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 346ms/step - accuracy: 0.8043 - loss: 0.6397 - val_accuracy: 0.7519 - val_loss: 0.8191
Epoch 4/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 351ms/step - accuracy: 0.8640 - loss: 0.4580 - val_accuracy: 0.7469 - val_loss: 0.8607
Epoch 5/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 342ms/step - accuracy: 0.9039 - loss: 0.3338 - val_accuracy: 0.7544 - val_loss: 0.9222
Epoch 6/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 341ms/step - accuracy: 0.9161 - loss: 0.2719 - val_accuracy: 0.7435 - val_loss: 1.0004
Epoch 6: early stoppin

In [39]:

# Converting product-category to binary format using LabelBinarizer (for multi-class classification)
lb_product = LabelBinarizer()
y_product = lb_product.fit_transform(train_data['product-category'])

# Split into training and validation sets (80/20, stratified on the label)
X_train, X_val, y_train_product, y_val_product = train_test_split(X, y_product, test_size=0.2, random_state=42, stratify=y_product)

# Simple Bidirectional LSTM model
model = Sequential()

# Embedding layer: token ids -> 128-dimensional vectors
model.add(Embedding(input_dim=5001, output_dim=128, input_length=100))

# Bidirectional LSTM layer with input and recurrent dropout
model.add(Bidirectional(LSTM(128, dropout=0.2, recurrent_dropout=0.2)))

# Output layer: one softmax unit per product-category class
model.add(Dense(len(lb_product.classes_), activation='softmax'))

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Early stopping to avoid overfitting. restore_best_weights=True rolls the
# model back to the epoch with the lowest val_loss; without it the report
# below would be computed with the weights of the last epoch rather than
# the best one.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True, verbose=1)

# Train the Bidirectional LSTM model
product_train = model.fit(
    X_train, y_train_product,
    epochs=10,
    batch_size=64,
    validation_data=(X_val, y_val_product),
    callbacks=[early_stopping],
    verbose=1
)

# Predict on validation set for product-category
y_pred_product = model.predict(X_val)
y_pred_product_classes = np.argmax(y_pred_product, axis=1)

# Convert the binarized validation labels back to class indices
y_val_product_classes = np.argmax(y_val_product, axis=1)

# Generate classification report for product-category
classification_rep_product = classification_report(y_val_product_classes, y_pred_product_classes, target_names=lb_product.classes_)
print("Classification Report for Product-Category:")
print(classification_rep_product)


Epoch 1/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 345ms/step - accuracy: 0.2611 - loss: 2.6199 - val_accuracy: 0.3358 - val_loss: 2.2458
Epoch 2/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 337ms/step - accuracy: 0.3651 - loss: 2.1105 - val_accuracy: 0.4695 - val_loss: 1.7765
Epoch 3/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 361ms/step - accuracy: 0.5546 - loss: 1.5236 - val_accuracy: 0.6007 - val_loss: 1.4145
Epoch 4/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 336ms/step - accuracy: 0.7248 - loss: 0.9986 - val_accuracy: 0.6541 - val_loss: 1.2825
Epoch 5/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 337ms/step - accuracy: 0.8028 - loss: 0.7268 - val_accuracy: 0.6558 - val_loss: 1.2918
Epoch 6/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 343ms/step - accuracy: 0.8489 - loss: 0.5692 - val_accuracy: 0.6591 - val_loss: 1.3297
Epoch 7/10
[1m75/75[

In [42]:

# Converting the 'hazard' column to binary format using LabelBinarizer (for multi-class classification)
lb_hazard = LabelBinarizer()
y_hazard = lb_hazard.fit_transform(train_data['hazard'])

# Split into training and validation sets (80/20, not stratified)
X_train, X_val, y_train_hazard, y_val_hazard = train_test_split(X, y_hazard, test_size=0.2, random_state=42)

# Simple Bidirectional LSTM model
model = Sequential()

# Embedding layer: token ids -> 128-dimensional vectors
model.add(Embedding(input_dim=5001, output_dim=128, input_length=100))

# Bidirectional LSTM layer with input and recurrent dropout
model.add(Bidirectional(LSTM(128, dropout=0.2, recurrent_dropout=0.2)))

# Output layer: one softmax unit per hazard class
model.add(Dense(len(lb_hazard.classes_), activation='softmax'))

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Early stopping to avoid overfitting. restore_best_weights=True rolls the
# model back to the epoch with the lowest val_loss; without it the report
# below would be computed with the weights of the last epoch rather than
# the best one.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True, verbose=1)

# Train the Bidirectional LSTM model
hazard_train = model.fit(
    X_train, y_train_hazard,
    epochs=10,
    batch_size=64,
    validation_data=(X_val, y_val_hazard),
    callbacks=[early_stopping],
    verbose=1
)

# Predict on validation set for hazard
y_pred_hazard = model.predict(X_val)
y_pred_hazard_classes = np.argmax(y_pred_hazard, axis=1)

# Convert the binarized validation labels back to class indices. This name
# was previously used below without being defined in this cell, which raises
# NameError on a fresh kernel.
y_val_hazard_classes = np.argmax(y_val_hazard, axis=1)

# Get the classes that actually occur in the true labels or the predictions
unique_classes = unique_labels(y_val_hazard_classes, y_pred_hazard_classes)

# Generate classification report with the correct labels for hazard
classification_rep_hazard = classification_report(
    y_val_hazard_classes,
    y_pred_hazard_classes,
    target_names=[lb_hazard.classes_[i] for i in unique_classes],  # Use only the unique classes
    labels=unique_classes  # Specify the unique classes
)

print("Classification Report for Hazard:")
print(classification_rep_hazard)


Epoch 1/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 349ms/step - accuracy: 0.1267 - loss: 4.2014 - val_accuracy: 0.2398 - val_loss: 3.5959
Epoch 2/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 353ms/step - accuracy: 0.2765 - loss: 3.2902 - val_accuracy: 0.3266 - val_loss: 2.9805
Epoch 3/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 340ms/step - accuracy: 0.3815 - loss: 2.7201 - val_accuracy: 0.3985 - val_loss: 2.7249
Epoch 4/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 362ms/step - accuracy: 0.4987 - loss: 2.2445 - val_accuracy: 0.4403 - val_loss: 2.5966
Epoch 5/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 340ms/step - accuracy: 0.5627 - loss: 1.9583 - val_accuracy: 0.4653 - val_loss: 2.5420
Epoch 6/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 338ms/step - accuracy: 0.6249 - loss: 1.6576 - val_accuracy: 0.4804 - val_loss: 2.4746
Epoch 7/10
[1m75/75[

In [45]:

# Converting the 'product' column to binary format using LabelBinarizer (for multi-class classification)
lb_product = LabelBinarizer()
y_product = lb_product.fit_transform(train_data['product'])

# Split into training and validation sets (80/20, not stratified)
X_train, X_val, y_train_product, y_val_product = train_test_split(X, y_product, test_size=0.2, random_state=42)

# Simple Bidirectional LSTM model
model = Sequential()

# Embedding layer: token ids -> 128-dimensional vectors
model.add(Embedding(input_dim=5001, output_dim=128, input_length=100))

# Bidirectional LSTM layer with input and recurrent dropout
model.add(Bidirectional(LSTM(128, dropout=0.2, recurrent_dropout=0.2)))

# Output layer: one softmax unit per product class
model.add(Dense(len(lb_product.classes_), activation='softmax'))

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Early stopping to avoid overfitting. restore_best_weights=True rolls the
# model back to the epoch with the lowest val_loss; without it the report
# below would be computed with the weights of the last epoch rather than
# the best one.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True, verbose=1)

# Train the Bidirectional LSTM model
product_train = model.fit(
    X_train, y_train_product,
    epochs=10,
    batch_size=64,
    validation_data=(X_val, y_val_product),
    callbacks=[early_stopping],
    verbose=1
)

# Predict on validation set for product
y_pred_product = model.predict(X_val)
y_pred_product_classes = np.argmax(y_pred_product, axis=1)

# Convert the binarized validation labels back to class indices
y_val_product_classes = np.argmax(y_val_product, axis=1)

# Get the classes that actually occur in the true labels or the predictions
unique_classes = unique_labels(y_val_product_classes, y_pred_product_classes)

# Generate classification report with the correct labels for product
classification_rep_product = classification_report(
    y_val_product_classes,
    y_pred_product_classes,
    target_names=[lb_product.classes_[i] for i in unique_classes],  # Use only the unique classes
    labels=unique_classes  # Specify the unique classes
)

print("Classification Report for Product:")
print(classification_rep_product)


Epoch 1/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 346ms/step - accuracy: 0.0256 - loss: 6.7015 - val_accuracy: 0.0334 - val_loss: 6.3887
Epoch 2/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 337ms/step - accuracy: 0.0317 - loss: 6.1820 - val_accuracy: 0.0393 - val_loss: 6.3898
Epoch 3/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 358ms/step - accuracy: 0.0329 - loss: 6.0922 - val_accuracy: 0.0677 - val_loss: 6.2004
Epoch 4/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 351ms/step - accuracy: 0.0652 - loss: 5.6983 - val_accuracy: 0.0936 - val_loss: 5.9690
Epoch 5/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 340ms/step - accuracy: 0.0955 - loss: 5.2776 - val_accuracy: 0.1228 - val_loss: 5.7604
Epoch 6/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 340ms/step - accuracy: 0.1500 - loss: 4.8588 - val_accuracy: 0.1395 - val_loss: 5.6448
Epoch 7/10
[1m75/75[