In [14]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.utils.multiclass import unique_labels
from sklearn.preprocessing import LabelBinarizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D, Flatten, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report
import numpy as np
import nltk
from nltk.corpus import stopwords
# Download stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')


# Load the dataset
train_data = pd.read_csv('incidents_labelled.csv')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [15]:
# Preprocessing Function
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\d+', '', text)  # Remove digits
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    stop_words = set(stopwords.words('english'))
    text = ' '.join([word for word in text.split() if word not in stop_words])
    return text

# Apply text preprocessing to the title column
train_data['cleaned_title'] = train_data['title'].apply(preprocess_text)

# Tokenize the cleaned_title column
tokenizer = Tokenizer(num_words=5000)  # Limit vocabulary to top 5000 words
tokenizer.fit_on_texts(train_data['cleaned_title'].values)

# Convert texts to sequences of integers
X = tokenizer.texts_to_sequences(train_data['cleaned_title'].values)

# Padding sequences to ensure uniform input length
X = pad_sequences(X, maxlen=100)



# LSTM

In [28]:

# Converting hazard-category to binary format using LabelBinarizer (for multi-class classification)
lb_hazard = LabelBinarizer()
y_hazard = lb_hazard.fit_transform(train_data['hazard-category'])

# Split into training and validation sets
X_train, X_val, y_train_hazard, y_val_hazard = train_test_split(X, y_hazard, test_size=0.2, random_state=42, stratify=y_hazard)

# RNN model using LSTM
model = Sequential()
model.add(Embedding(input_dim=5001, output_dim=128, input_length=100))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(len(lb_hazard.classes_), activation='softmax'))  # Softmax for multi-class classification as it determines
# probability distribution over multiple classes, and we want to select the most probable class.

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Early stopping to avoid overfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=3, verbose=1)

# Train the RNN model
hazard_train = model.fit(
    X_train, y_train_hazard,
    epochs=10,
    batch_size=64,
    validation_data=(X_val, y_val_hazard),
    callbacks=[early_stopping],
    verbose=1
)

# Predict on validation set for hazard-category
y_pred_hazard = model.predict(X_val)
y_pred_hazard_classes = np.argmax(y_pred_hazard, axis=1)

# Generate classification report for hazard-category
classification_rep_hazard = classification_report(np.argmax(y_val_hazard, axis=1), y_pred_hazard_classes, target_names=lb_hazard.classes_)
print("Classification Report for Hazard-Category:")
print(classification_rep_hazard)

Epoch 1/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 221ms/step - accuracy: 0.4260 - loss: 1.7517 - val_accuracy: 0.6391 - val_loss: 1.0860
Epoch 2/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 189ms/step - accuracy: 0.6716 - loss: 1.0249 - val_accuracy: 0.7059 - val_loss: 0.8683
Epoch 3/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 198ms/step - accuracy: 0.7918 - loss: 0.6843 - val_accuracy: 0.7460 - val_loss: 0.7997
Epoch 4/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 190ms/step - accuracy: 0.8367 - loss: 0.5262 - val_accuracy: 0.7552 - val_loss: 0.8015
Epoch 5/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 188ms/step - accuracy: 0.8913 - loss: 0.3876 - val_accuracy: 0.7510 - val_loss: 0.8771
Epoch 6/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 191ms/step - accuracy: 0.9076 - loss: 0.3003 - val_accuracy: 0.7544 - val_loss: 0.9099
Epoch 6: early stoppin

In [25]:
# Convert product-category to binary format using LabelBinarizer
lb_product = LabelBinarizer()
y_product = lb_product.fit_transform(train_data['product-category'])

# Stratified split
X_train, X_val, y_train_product, y_val_product = train_test_split(X, y_product, test_size=0.2, random_state=42, stratify=y_product)

# RNN model
model = Sequential()
model.add(Embedding(input_dim=5001, output_dim=128, input_length=100))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(len(lb_product.classes_), activation='softmax'))  # Softmax for multi-class classification

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train model
product_train = model.fit(
    X_train, y_train_product,
    epochs=10,
    batch_size=64,
    validation_data=(X_val, y_val_product),
    callbacks=[early_stopping],
    verbose=1
)

# Predict on validation set for product-category
y_pred_product = model.predict(X_val)
y_pred_product_classes = np.argmax(y_pred_product, axis=1)

# Ensure that we include all unique labels from y_val and predictions
unique_classes = unique_labels(np.argmax(y_val_product, axis=1), y_pred_product_classes)

# Generate classification report with the correct labels
classification_rep_product = classification_report(
    np.argmax(y_val_product, axis=1),
    y_pred_product_classes,
    target_names=[lb_product.classes_[i] for i in unique_classes],  # Use only the unique classes
    labels=unique_classes
)

print("Classification Report for Product-Category:")
print(classification_rep_product)


Epoch 1/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 328ms/step - accuracy: 0.2658 - loss: 2.6209 - val_accuracy: 0.2824 - val_loss: 2.3308
Epoch 2/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 190ms/step - accuracy: 0.3202 - loss: 2.2261 - val_accuracy: 0.4194 - val_loss: 2.0090
Epoch 3/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 209ms/step - accuracy: 0.4465 - loss: 1.8768 - val_accuracy: 0.5021 - val_loss: 1.6883
Epoch 3: early stopping
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 46ms/step
Classification Report for Product-Category:
                                                   precision    recall  f1-score   support

                              alcoholic beverages       0.00      0.00      0.00        15
                      cereals and bakery products       0.61      0.50      0.55       156
     cocoa and cocoa preparations, coffee and tea       0.30      0.43      0.35        49
    

In [26]:
# Converting hazard (multi-class target) to binary format using LabelBinarizer
lb_hazard = LabelBinarizer()
y_hazard = lb_hazard.fit_transform(train_data['hazard'])  # Change to 'hazard' column

# Split into training and validation sets
X_train, X_val, y_train_hazard, y_val_hazard = train_test_split(X, y_hazard, test_size=0.2, random_state=42)

# RNN model using LSTM
model = Sequential()
model.add(Embedding(input_dim=5001, output_dim=128, input_length=100))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(len(lb_hazard.classes_), activation='softmax'))  # Softmax for multi-class classification

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Early stopping to avoid overfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=3, verbose=1)

# Train the RNN model
hazard_train = model.fit(
    X_train, y_train_hazard,
    epochs=10,
    batch_size=64,
    validation_data=(X_val, y_val_hazard),
    callbacks=[early_stopping],
    verbose=1
)

# Predict on validation set for hazard
y_pred_hazard = model.predict(X_val)
y_pred_hazard_classes = np.argmax(y_pred_hazard, axis=1)

# Convert back to original label format if needed (from binary to categorical labels)
y_val_hazard_classes = np.argmax(y_val_hazard, axis=1)  # True class labels

unique_classes = unique_labels(y_val_hazard_classes, y_pred_hazard_classes)

# Generate classification report with the correct labels for hazard
classification_rep_hazard = classification_report(
    y_val_hazard_classes,
    y_pred_hazard_classes,
    target_names=[lb_hazard.classes_[i] for i in unique_classes],  # Use only the unique classes
    labels=unique_classes  # Specify the unique classes
)

print("Classification Report for Hazard:")
print(classification_rep_hazard)

Epoch 1/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 244ms/step - accuracy: 0.1169 - loss: 4.3749 - val_accuracy: 0.2464 - val_loss: 3.6494
Epoch 2/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 274ms/step - accuracy: 0.2462 - loss: 3.4637 - val_accuracy: 0.2590 - val_loss: 3.2488
Epoch 3/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 190ms/step - accuracy: 0.3024 - loss: 3.0622 - val_accuracy: 0.3525 - val_loss: 2.9905
Epoch 4/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 220ms/step - accuracy: 0.4009 - loss: 2.7072 - val_accuracy: 0.4110 - val_loss: 2.8131
Epoch 5/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 187ms/step - accuracy: 0.4725 - loss: 2.3926 - val_accuracy: 0.4211 - val_loss: 2.6737
Epoch 6/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 228ms/step - accuracy: 0.5209 - loss: 2.1670 - val_accuracy: 0.4486 - val_loss: 2.6134
Epoch 7/10
[1m75/75[

In [27]:
# Converting product (multi-class target) to binary format using LabelBinarizer
lb_product = LabelBinarizer()
y_product = lb_product.fit_transform(train_data['product'])  # Change to 'product' column

# Split into training and validation sets
X_train, X_val, y_train_product, y_val_product = train_test_split(X, y_product, test_size=0.2, random_state=42)

# RNN model using LSTM
model = Sequential()
model.add(Embedding(input_dim=5001, output_dim=128, input_length=100))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(len(lb_product.classes_), activation='softmax'))  # Softmax for multi-class classification

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Early stopping to avoid overfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=3, verbose=1)

# Train the RNN model
product_train = model.fit(
    X_train, y_train_product,
    epochs=10,
    batch_size=64,
    validation_data=(X_val, y_val_product),
    callbacks=[early_stopping],
    verbose=1
)

# Predict on validation set for product
y_pred_product = model.predict(X_val)
y_pred_product_classes = np.argmax(y_pred_product, axis=1)

# Convert true validation labels to their original categorical form
y_val_product_classes = np.argmax(y_val_product, axis=1)  # True class labels

# Generate classification report with the correct labels for product
unique_classes = unique_labels(y_val_product_classes, y_pred_product_classes)

classification_rep_product = classification_report(
    y_val_product_classes,
    y_pred_product_classes,
    target_names=[lb_product.classes_[i] for i in unique_classes],  # Use only the unique classes
    labels=unique_classes  # Specify the unique classes
)

print("Classification Report for Product:")
print(classification_rep_product)

Epoch 1/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 242ms/step - accuracy: 0.0177 - loss: 6.7592 - val_accuracy: 0.0334 - val_loss: 6.3820
Epoch 2/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 200ms/step - accuracy: 0.0312 - loss: 6.1871 - val_accuracy: 0.0334 - val_loss: 6.4293
Epoch 3/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 213ms/step - accuracy: 0.0383 - loss: 6.1216 - val_accuracy: 0.0384 - val_loss: 6.4210
Epoch 4/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 189ms/step - accuracy: 0.0451 - loss: 6.0569 - val_accuracy: 0.0526 - val_loss: 6.2860
Epoch 5/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 218ms/step - accuracy: 0.0558 - loss: 5.8601 - val_accuracy: 0.0635 - val_loss: 6.1401
Epoch 6/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 187ms/step - accuracy: 0.0641 - loss: 5.5981 - val_accuracy: 0.0643 - val_loss: 6.0805
Epoch 7/10
[1m75/75[

# FFNN

In [29]:
# Converting hazard-category to binary format using LabelBinarizer (for multi-class classification)
lb_hazard = LabelBinarizer()
y_hazard = lb_hazard.fit_transform(train_data['hazard-category'])

# Split into training and validation sets
X_train, X_val, y_train_hazard, y_val_hazard = train_test_split(X, y_hazard, test_size=0.2, random_state=42, stratify=y_hazard)

# FFNN model
model = Sequential()
model.add(Embedding(input_dim=5001, output_dim=128, input_length=100))  # Embedding layer remains
model.add(Flatten())  # Flatten the output from the embedding layer to feed into dense layers
model.add(Dense(128, activation='relu'))  # First fully connected layer
model.add(Dropout(0.5))  # Dropout to prevent overfitting
model.add(Dense(64, activation='relu'))  # Second fully connected layer
model.add(Dropout(0.5))  # Another Dropout for regularization
model.add(Dense(len(lb_hazard.classes_), activation='softmax'))  # Output layer for multi-class classification

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Early stopping to avoid overfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=3, verbose=1)

# Train the FFNN model
hazard_train = model.fit(
    X_train, y_train_hazard,
    epochs=10,
    batch_size=64,
    validation_data=(X_val, y_val_hazard),
    callbacks=[early_stopping],
    verbose=1
)

# Predict on validation set for hazard-category
y_pred_hazard = model.predict(X_val)
y_pred_hazard_classes = np.argmax(y_pred_hazard, axis=1)

# Generate classification report for hazard-category
classification_rep_hazard = classification_report(np.argmax(y_val_hazard, axis=1), y_pred_hazard_classes, target_names=lb_hazard.classes_)
print("Classification Report for Hazard-Category:")
print(classification_rep_hazard)



Epoch 1/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 37ms/step - accuracy: 0.2899 - loss: 1.9498 - val_accuracy: 0.5856 - val_loss: 1.2428
Epoch 2/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.5915 - loss: 1.2574 - val_accuracy: 0.6717 - val_loss: 0.9960
Epoch 3/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.7164 - loss: 0.8944 - val_accuracy: 0.7185 - val_loss: 0.8350
Epoch 4/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8169 - loss: 0.5842 - val_accuracy: 0.7310 - val_loss: 0.8661
Epoch 5/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8763 - loss: 0.4198 - val_accuracy: 0.7586 - val_loss: 0.8631
Epoch 6/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9204 - loss: 0.2688 - val_accuracy: 0.7527 - val_loss: 1.0064
Epoch 6: early stopping
[1m38/38[0m 

In [30]:
# Convert product-category to binary format using LabelBinarizer
lb_product = LabelBinarizer()
y_product = lb_product.fit_transform(train_data['product-category'])

# Stratified split
X_train, X_val, y_train_product, y_val_product = train_test_split(X, y_product, test_size=0.2, random_state=42, stratify=y_product)

# FFNN model
model = Sequential()
model.add(Embedding(input_dim=5001, output_dim=128, input_length=100))  # Embedding layer
model.add(Flatten())  # Flatten the output from the embedding layer to feed into dense layers
model.add(Dense(128, activation='relu'))  # First fully connected layer
model.add(Dropout(0.5))  # Dropout for regularization
model.add(Dense(64, activation='relu'))  # Second fully connected layer
model.add(Dropout(0.5))  # Dropout for regularization
model.add(Dense(len(lb_product.classes_), activation='softmax'))  # Softmax for multi-class classification

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Early stopping to avoid overfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=3, verbose=1)

# Train the FFNN model
product_train = model.fit(
    X_train, y_train_product,
    epochs=10,
    batch_size=64,
    validation_data=(X_val, y_val_product),
    callbacks=[early_stopping],
    verbose=1
)

# Predict on validation set for product-category
y_pred_product = model.predict(X_val)
y_pred_product_classes = np.argmax(y_pred_product, axis=1)

# Ensure that we include all unique labels from y_val and predictions
unique_classes = unique_labels(np.argmax(y_val_product, axis=1), y_pred_product_classes)

# Generate classification report with the correct labels
classification_rep_product = classification_report(
    np.argmax(y_val_product, axis=1),
    y_pred_product_classes,
    target_names=[lb_product.classes_[i] for i in unique_classes],  # Use only the unique classes
    labels=unique_classes
)

print("Classification Report for Product-Category:")
print(classification_rep_product)


Epoch 1/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 27ms/step - accuracy: 0.1644 - loss: 2.8565 - val_accuracy: 0.2824 - val_loss: 2.3837
Epoch 2/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.2605 - loss: 2.4943 - val_accuracy: 0.3241 - val_loss: 2.2080
Epoch 3/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.3181 - loss: 2.1714 - val_accuracy: 0.3743 - val_loss: 2.0486
Epoch 4/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.3628 - loss: 1.9804 - val_accuracy: 0.4336 - val_loss: 1.8893
Epoch 5/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.4531 - loss: 1.7166 - val_accuracy: 0.5079 - val_loss: 1.6973
Epoch 6/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.5639 - loss: 1.3886 - val_accuracy: 0.5631 - val_loss: 1.5439
Epoch 7/10
[1m75/75[0m [32m━━━━━━━━━

In [31]:
# Converting hazard (multi-class target) to binary format using LabelBinarizer
lb_hazard = LabelBinarizer()
y_hazard = lb_hazard.fit_transform(train_data['hazard'])  # Change to 'hazard' column

# Split into training and validation sets
X_train, X_val, y_train_hazard, y_val_hazard = train_test_split(X, y_hazard, test_size=0.2, random_state=42)

# FFNN model
model = Sequential()
model.add(Embedding(input_dim=5001, output_dim=128, input_length=100))  # Embedding layer remains
model.add(Flatten())  # Flatten the output from the embedding layer
model.add(Dense(128, activation='relu'))  # First fully connected layer
model.add(Dropout(0.5))  # Dropout to prevent overfitting
model.add(Dense(64, activation='relu'))  # Second fully connected layer
model.add(Dropout(0.5))  # Dropout for regularization
model.add(Dense(len(lb_hazard.classes_), activation='softmax'))  # Softmax for multi-class classification

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Early stopping to avoid overfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=3, verbose=1)

# Train the FFNN model
hazard_train = model.fit(
    X_train, y_train_hazard,
    epochs=10,
    batch_size=64,
    validation_data=(X_val, y_val_hazard),
    callbacks=[early_stopping],
    verbose=1
)

# Predict on validation set for hazard
y_pred_hazard = model.predict(X_val)
y_pred_hazard_classes = np.argmax(y_pred_hazard, axis=1)

# Convert back to original label format if needed (from binary to categorical labels)
y_val_hazard_classes = np.argmax(y_val_hazard, axis=1)  # True class labels

# Get unique classes present in predictions and true labels
unique_classes = unique_labels(y_val_hazard_classes, y_pred_hazard_classes)

# Generate classification report with the correct labels for hazard
classification_rep_hazard = classification_report(
    y_val_hazard_classes,
    y_pred_hazard_classes,
    target_names=[lb_hazard.classes_[i] for i in unique_classes],  # Use only the unique classes
    labels=unique_classes  # Specify the unique classes
)

print("Classification Report for Hazard:")
print(classification_rep_hazard)


Epoch 1/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 33ms/step - accuracy: 0.0558 - loss: 4.7326 - val_accuracy: 0.1637 - val_loss: 3.7254
Epoch 2/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.1377 - loss: 3.7570 - val_accuracy: 0.2356 - val_loss: 3.3375
Epoch 3/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.2070 - loss: 3.3871 - val_accuracy: 0.2840 - val_loss: 3.0946
Epoch 4/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.2753 - loss: 2.9909 - val_accuracy: 0.3175 - val_loss: 2.9524
Epoch 5/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.3413 - loss: 2.6908 - val_accuracy: 0.3709 - val_loss: 2.8211
Epoch 6/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.4058 - loss: 2.4818 - val_accuracy: 0.4110 - val_loss: 2.7642
Epoch 7/10
[1m75/75[0m [32m━━━━━━━━━

In [32]:
# Converting product (multi-class target) to binary format using LabelBinarizer
lb_product = LabelBinarizer()
y_product = lb_product.fit_transform(train_data['product'])  # Change to 'product' column

# Split into training and validation sets
X_train, X_val, y_train_product, y_val_product = train_test_split(X, y_product, test_size=0.2, random_state=42)

# FFNN model
model = Sequential()
model.add(Embedding(input_dim=5001, output_dim=128, input_length=100))  # Embedding layer
model.add(Flatten())  # Flatten the output from the embedding layer
model.add(Dense(128, activation='relu'))  # First fully connected layer
model.add(Dropout(0.5))  # Dropout to prevent overfitting
model.add(Dense(64, activation='relu'))  # Second fully connected layer
model.add(Dropout(0.5))  # Dropout to prevent overfitting
model.add(Dense(len(lb_product.classes_), activation='softmax'))  # Softmax for multi-class classification

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Early stopping to avoid overfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=3, verbose=1)

# Train the FFNN model
product_train = model.fit(
    X_train, y_train_product,
    epochs=10,
    batch_size=64,
    validation_data=(X_val, y_val_product),
    callbacks=[early_stopping],
    verbose=1
)

# Predict on validation set for product
y_pred_product = model.predict(X_val)
y_pred_product_classes = np.argmax(y_pred_product, axis=1)

# Convert true validation labels to their original categorical form
y_val_product_classes = np.argmax(y_val_product, axis=1)  # True class labels

# Generate classification report with the correct labels for product
unique_classes = unique_labels(y_val_product_classes, y_pred_product_classes)

classification_rep_product = classification_report(
    y_val_product_classes,
    y_pred_product_classes,
    target_names=[lb_product.classes_[i] for i in unique_classes],  # Use only the unique classes
    labels=unique_classes  # Specify the unique classes
)

print("Classification Report for Product:")
print(classification_rep_product)


Epoch 1/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 48ms/step - accuracy: 0.0112 - loss: 6.8997 - val_accuracy: 0.0334 - val_loss: 6.3195
Epoch 2/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5ms/step - accuracy: 0.0244 - loss: 6.3435 - val_accuracy: 0.0334 - val_loss: 6.2296
Epoch 3/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.0342 - loss: 6.0733 - val_accuracy: 0.0334 - val_loss: 6.1786
Epoch 4/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.0309 - loss: 5.9097 - val_accuracy: 0.0334 - val_loss: 6.1814
Epoch 5/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.0271 - loss: 5.7782 - val_accuracy: 0.0376 - val_loss: 6.2162
Epoch 6/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.0384 - loss: 5.6688 - val_accuracy: 0.0468 - val_loss: 6.2497
Epoch 6: early stopping
[1m38/38[0m 