# Libraries

In [51]:
%%capture
!pip install datasets

In [66]:
import torch
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.preprocessing import MultiLabelBinarizer
import torch.nn as nn
import torch.optim as optim
import re
# Import nltk before using it
import nltk
from nltk.corpus import stopwords
# Download stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')


# Step 1: Load your datasets
train_data = pd.read_csv('incidents_labelled.csv')  # Training dataset with labels
test_data = pd.read_csv('incidents_val.csv')  # Test dataset without labels

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# All Classes

In [53]:
# Step 2: Preprocessing Function - Text Cleaning
def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\d+', '', text)  # Remove digits
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    stop_words = set(stopwords.words('english'))
    text = ' '.join([word for word in text.split() if word not in stop_words])  # Remove stopwords
    return text

# Apply text preprocessing
train_data['cleaned_text'] = train_data['text'].apply(preprocess_text)
test_data['cleaned_text'] = test_data['text'].apply(preprocess_text)



In [54]:

# Step 3: Label Binarization (MultiLabelBinarizer for multi-label classification)
labels = ['hazard-category', 'product-category', 'hazard', 'product']
mlb = MultiLabelBinarizer()

# Apply MultiLabelBinarizer to the labels in the training data
y_train = mlb.fit_transform(train_data[labels].values)

# Step 4: Split the data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(
    train_data['cleaned_text'], y_train, test_size=0.2, random_state=42
)

# Step 5: Vectorize text using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # Adjust max_features if needed

# Fit TF-IDF on training data and transform both training and validation sets
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_val_tfidf = tfidf_vectorizer.transform(X_val)


In [55]:

# Step 6: Create PyTorch Dataset Class for Multi-Label Data
class TextDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X.toarray(), dtype=torch.float32)  # Convert sparse matrix to dense array
        self.y = torch.tensor(y, dtype=torch.float32)  # Ensure labels are float

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

# Create PyTorch Dataset and DataLoader for training and validation
train_dataset = TextDataset(X_train_tfidf, y_train)
val_dataset = TextDataset(X_val_tfidf, y_val)

train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=32)


In [56]:
# Step 7: Define FFNN Model
class FFNN(nn.Module):
    def __init__(self, input_size, output_size):
        super(FFNN, self).__init__()
        self.fc1 = nn.Linear(input_size, 128)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(128, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return self.sigmoid(x)

# Step 8: Define LSTM Model
class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(LSTM, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        h_0 = torch.zeros(1, x.size(0), 128).to(x.device)
        c_0 = torch.zeros(1, x.size(0), 128).to(x.device)
        out, _ = self.lstm(x.unsqueeze(1), (h_0, c_0))
        out = self.fc(out[:, -1, :])
        return self.sigmoid(out)

# Step 9: Define RNN Model
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        h_0 = torch.zeros(1, x.size(0), 128).to(x.device)
        out, _ = self.rnn(x.unsqueeze(1), h_0)
        out = self.fc(out[:, -1, :])
        return self.sigmoid(out)


In [57]:
# Step 10: Initialize models
input_size = X_train_tfidf.shape[1]
output_size = y_train.shape[1]

model_ffnn = FFNN(input_size, output_size).to('cpu')
model_lstm = LSTM(input_size, 128, output_size).to('cpu')
model_rnn = RNN(input_size, 128, output_size).to('cpu')

# Step 11: Training Setup
optimizer_ffnn = optim.Adam(model_ffnn.parameters(), lr=0.001)
optimizer_lstm = optim.Adam(model_lstm.parameters(), lr=0.001)
optimizer_rnn = optim.Adam(model_rnn.parameters(), lr=0.001)

loss_fn = nn.BCELoss()


In [58]:
# Step 12: Training Loop with Accuracy
def train_model(model, optimizer):
    model.train()
    for epoch in range(5):  # Train for 5 epochs
        running_loss = 0.0
        correct_predictions = 0
        total_predictions = 0

        for batch_X, batch_y in train_dataloader:
            optimizer.zero_grad()
            output = model(batch_X)
            loss = loss_fn(output, batch_y)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

            # Calculate accuracy
            preds = (output > 0.5).float()  # Convert to binary predictions
            correct_predictions += (preds == batch_y).sum().item()
            total_predictions += np.prod(batch_y.size())

        epoch_loss = running_loss / len(train_dataloader)
        epoch_accuracy = correct_predictions / total_predictions

        print(f"Epoch {epoch+1}, Loss: {epoch_loss}, Accuracy: {epoch_accuracy * 100:.2f}%")


In [64]:
from sklearn.metrics import classification_report, accuracy_score

# Step 13: Combined Evaluation for All Labels
def evaluate_model_combined(model, label_names):
    model.eval()
    all_preds = []
    all_labels = []

    # Disable gradient calculation for evaluation
    with torch.no_grad():
        for batch_X, batch_y in val_dataloader:
            output = model(batch_X)
            preds = (output > 0.5).cpu().numpy()  # Binary classification (multi-label)
            all_preds.append(preds)
            all_labels.append(batch_y.cpu().numpy())

    # Stack all predictions and labels for the evaluation
    all_preds = np.vstack(all_preds)
    all_labels = np.vstack(all_labels)

    # Evaluate each label separately
    for i, label_name in enumerate(label_names):
        print(f"Classification report for {label_name}:")
        print(classification_report(all_labels[:, i], all_preds[:, i], target_names=[f"{label_name} class 0", f"{label_name} class 1"]))

    # Combined classification report for all labels
    print("Combined classification report for all labels:")
    print(classification_report(all_labels, all_preds))

    # Compute overall accuracy for multi-label classification
    accuracy = accuracy_score(all_labels, all_preds)
    print(f"Overall accuracy: {accuracy * 100:.2f}%")

# Example labels (should match your dataset's labels)
label_names = ['hazard-category', 'product-category', 'hazard', 'product']


In [60]:
# Step 14: Generate Predictions on Test Set
def predict_on_test(model):
    X_test_tfidf = tfidf_vectorizer.transform(test_data['cleaned_text'])
    model.eval()
    test_predictions = []
    with torch.no_grad():
        for batch_X in DataLoader(torch.tensor(X_test_tfidf.toarray(), dtype=torch.float32), batch_size=32):
            output = model(batch_X)
            preds = (output > 0.5).cpu().numpy()
            test_predictions.append(preds)

    test_predictions = np.vstack(test_predictions)
    predicted_labels = mlb.inverse_transform(test_predictions)
    test_data['predicted_labels'] = predicted_labels
    test_data.to_csv('test_predictions.csv', index=False)


# FFNN

In [61]:
# Train FFNN
print("Training FFNN:")
train_model(model_ffnn, optimizer_ffnn)

Training FFNN:
Epoch 1, Loss: 0.18681446527441342, Accuracy: 96.66%
Epoch 2, Loss: 0.0154550690886875, Accuracy: 99.70%
Epoch 3, Loss: 0.014023678017159303, Accuracy: 99.70%
Epoch 4, Loss: 0.013442544527351856, Accuracy: 99.71%
Epoch 5, Loss: 0.01276042057822148, Accuracy: 99.73%


In [67]:
# Evaluate FFNN
print("Evaluating FFNN:")
evaluate_model(model_ffnn)

Evaluating FFNN:
Classification report for hazard-category:
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      1194
         1.0       0.00      0.00      0.00         3

    accuracy                           1.00      1197
   macro avg       0.50      0.50      0.50      1197
weighted avg       0.99      1.00      1.00      1197

Classification report for product-category:
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      1196
         1.0       0.00      0.00      0.00         1

    accuracy                           1.00      1197
   macro avg       0.50      0.50      0.50      1197
weighted avg       1.00      1.00      1.00      1197

Classification report for hazard:
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      1197

    accuracy                           1.00      1197
   macro avg       1.00      1.00      1.00 

# LSTM

In [68]:
# Train LSTM
print("Training LSTM:")
train_model(model_lstm, optimizer_lstm)


Training LSTM:
Epoch 1, Loss: 0.23747163527955611, Accuracy: 96.46%
Epoch 2, Loss: 0.018571319822221995, Accuracy: 99.70%
Epoch 3, Loss: 0.014982426458348831, Accuracy: 99.70%
Epoch 4, Loss: 0.014161314809074005, Accuracy: 99.70%
Epoch 5, Loss: 0.013833714922269185, Accuracy: 99.70%


In [69]:
# Evaluate LSTM
print("Evaluating LSTM:")
evaluate_model(model_lstm)

Evaluating LSTM:
Classification report for hazard-category:
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      1194
         1.0       0.00      0.00      0.00         3

    accuracy                           1.00      1197
   macro avg       0.50      0.50      0.50      1197
weighted avg       0.99      1.00      1.00      1197

Classification report for product-category:
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      1196
         1.0       0.00      0.00      0.00         1

    accuracy                           1.00      1197
   macro avg       0.50      0.50      0.50      1197
weighted avg       1.00      1.00      1.00      1197

Classification report for hazard:
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      1197

    accuracy                           1.00      1197
   macro avg       1.00      1.00      1.00 

# RNN

In [70]:
# Train RNN
print("Training RNN:")
train_model(model_rnn, optimizer_rnn)

Training RNN:
Epoch 1, Loss: 0.15474116284400224, Accuracy: 97.86%
Epoch 2, Loss: 0.01549387101083994, Accuracy: 99.70%
Epoch 3, Loss: 0.01398799628019333, Accuracy: 99.70%
Epoch 4, Loss: 0.013597080080459515, Accuracy: 99.70%
Epoch 5, Loss: 0.013418644741177559, Accuracy: 99.70%


In [71]:
# Evaluate RNN
print("Evaluating RNN:")
evaluate_model(model_rnn)

Evaluating RNN:
Classification report for hazard-category:
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      1194
         1.0       0.00      0.00      0.00         3

    accuracy                           1.00      1197
   macro avg       0.50      0.50      0.50      1197
weighted avg       0.99      1.00      1.00      1197

Classification report for product-category:
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      1196
         1.0       0.00      0.00      0.00         1

    accuracy                           1.00      1197
   macro avg       0.50      0.50      0.50      1197
weighted avg       1.00      1.00      1.00      1197

Classification report for hazard:
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      1197

    accuracy                           1.00      1197
   macro avg       1.00      1.00      1.00  

# Predict on Test Set

In [None]:
# Predict on test set
#print("Predicting with FFNN:")
#predict_on_test(model_ffnn)