In [36]:
import pandas as pd
import re
import nltk
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import accuracy_score, classification_report
from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

nltk.download('stopwords')

# Load the dataset
train_data = pd.read_csv('incidents_labelled.csv')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [37]:


# Preprocessing function
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\d+', '', text)  # Remove digits
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    stop_words = set(stopwords.words('english'))
    text = ' '.join([word for word in text.split() if word not in stop_words])
    return text

# Apply text preprocessing to the title column
train_data['cleaned_title'] = train_data['title'].apply(preprocess_text)

# Tokenize the cleaned_title column
tokenizer = Tokenizer(num_words=5000)  # Limit vocabulary to top 5000 words
tokenizer.fit_on_texts(train_data['cleaned_title'].values)

# Convert texts to sequences of integers
X = tokenizer.texts_to_sequences(train_data['cleaned_title'].values)

# Padding sequences to ensure uniform input length
X = pad_sequences(X, maxlen=100)

# Converting hazard-category to binary format using LabelBinarizer (for multi-class classification)
lb_hazard = LabelBinarizer()
y_hazard = lb_hazard.fit_transform(train_data['hazard-category'])

# Split into training and validation sets
X_train, X_val, y_train_hazard, y_val_hazard = train_test_split(X, y_hazard, test_size=0.2, random_state=42, stratify=y_hazard)

# Define the LSTM model
class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers=1, bidirectional=False, dropout=0.5):
        super(LSTMClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, bidirectional=bidirectional, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_dim * (2 if bidirectional else 1), output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        embedded = self.dropout(self.embedding(x))
        lstm_out, _ = self.lstm(embedded)
        hidden = lstm_out[:, -1, :]  # Take the last hidden state
        output = self.fc(hidden)
        return output

# Set parameters
vocab_size = 5000  # Should match tokenizer's num_words
embedding_dim = 128
hidden_dim = 64
output_dim = y_hazard.shape[1]  # Number of classes
n_layers = 2
bidirectional = True
dropout = 0.5
batch_size = 32
num_epochs = 50
learning_rate = 0.001
patience = 5  # Early stopping patience

# Initialize model, loss, and optimizer
model = LSTMClassifier(vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout)
criterion = nn.BCEWithLogitsLoss()  # Suitable for multi-label classification
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Prepare data loaders
train_data = TensorDataset(torch.tensor(X_train, dtype=torch.long), torch.tensor(y_train_hazard, dtype=torch.float))
val_data = TensorDataset(torch.tensor(X_val, dtype=torch.long), torch.tensor(y_val_hazard, dtype=torch.float))

train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=False)

# Early stopping variables
best_val_loss = np.inf
epochs_no_improve = 0

# Training loop with early stopping
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{num_epochs}, Training Loss: {avg_train_loss:.4f}")

    # Validation loop
    model.eval()
    val_loss = 0
    val_preds = []
    val_labels = []
    with torch.no_grad():
        for inputs, labels in val_loader:
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            preds = torch.sigmoid(outputs) > 0.5
            val_preds.extend(preds.cpu().numpy())
            val_labels.extend(labels.cpu().numpy())

    avg_val_loss = val_loss / len(val_loader)
    val_accuracy = accuracy_score(val_labels, val_preds)
    print(f"Validation Loss: {avg_val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")

    # Check early stopping condition
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        epochs_no_improve = 0
        torch.save(model.state_dict(), 'best_model.pt')  # Save the best model
    else:
        epochs_no_improve += 1
        if epochs_no_improve == patience:
            print("Early stopping triggered")
            break

# Load the best model
model.load_state_dict(torch.load('best_model.pt'))

# Final evaluation with classification report
model.eval()
all_preds = []
all_labels = []
with torch.no_grad():
    for inputs, labels in val_loader:
        outputs = model(inputs)
        preds = torch.sigmoid(outputs) > 0.5
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Classification report
print("Final Classification Report:")
print(classification_report(np.array(all_labels), np.array(all_preds), target_names=lb_hazard.classes_))


Epoch 1/50, Training Loss: 0.2927
Validation Loss: 0.1896, Validation Accuracy: 0.4302
Epoch 2/50, Training Loss: 0.1844
Validation Loss: 0.1590, Validation Accuracy: 0.5322
Epoch 3/50, Training Loss: 0.1647
Validation Loss: 0.1497, Validation Accuracy: 0.5781
Epoch 4/50, Training Loss: 0.1530
Validation Loss: 0.1454, Validation Accuracy: 0.6032
Epoch 5/50, Training Loss: 0.1450
Validation Loss: 0.1402, Validation Accuracy: 0.6048
Epoch 6/50, Training Loss: 0.1378
Validation Loss: 0.1382, Validation Accuracy: 0.6199
Epoch 7/50, Training Loss: 0.1321
Validation Loss: 0.1360, Validation Accuracy: 0.6316
Epoch 8/50, Training Loss: 0.1260
Validation Loss: 0.1345, Validation Accuracy: 0.6383
Epoch 9/50, Training Loss: 0.1212
Validation Loss: 0.1326, Validation Accuracy: 0.6516
Epoch 10/50, Training Loss: 0.1165
Validation Loss: 0.1317, Validation Accuracy: 0.6583
Epoch 11/50, Training Loss: 0.1128
Validation Loss: 0.1349, Validation Accuracy: 0.6717
Epoch 12/50, Training Loss: 0.1084
Valida