In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from transformers import BertTokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras import layers, models


# Load the raw messages; the code below reads the 'subject', 'message' and 'label' columns.
df = pd.read_csv("messages.csv")

# Build one text field per row from subject + message.
# Missing values are filled *before* the string cast: once NaN has gone through
# astype(str) it is the literal text "nan", which a trailing fillna('') can no
# longer remove, so it would end up in the text handed to the tokenizer.
df['text'] = df['subject'].fillna('').astype(str) + " " + df['message'].fillna('').astype(str)

# Target vector as a NumPy array, row-aligned with df['text'].
labels = df['label'].values


In [None]:

# WordPiece tokenizer for the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Encode every row of df['text'] in a single call, truncating at 512 tokens.
texts = df['text'].tolist()
inputs = tokenizer(
    texts,
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors='tf',
)

# Only the token ids are kept from the tokenizer output; they are converted
# from a TF tensor to a NumPy array so they can be split and column-indexed.
input_ids = inputs['input_ids'].numpy()

# 80/20 train/validation split with a fixed random_state.
train_inputs, val_inputs, train_labels, val_labels = train_test_split(
    input_ids,
    labels,
    test_size=0.2,
    random_state=42,
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:

# Grey Wolf Optimizer (GWO) search settings.
wolves = 10                         # number of candidate solutions in the pack
dimensions = train_inputs.shape[1]  # one search dimension per column of train_inputs
iterations = 50                     # number of update rounds

# Seed NumPy's global RNG so the random search below (and therefore the set of
# selected columns) is the same on every Restart & Run All. 42 matches the
# random_state already used for the train/validation split.
np.random.seed(42)

# Initial pack: one row per wolf, values drawn uniformly from [0, 1).
solutions = np.random.rand(wolves, dimensions)

def fitness_function(solution):
    """Score a candidate by the number of its entries that exceed 0.5.

    Parameters
    ----------
    solution : numpy.ndarray
        Candidate position vector; an entry above 0.5 marks that position
        as selected.

    Returns
    -------
    int
        Count of selected positions (0 when nothing is selected).
    """
    # NOTE(review): the score depends only on `solution` itself; no data or
    # labels are consulted, so a higher score just means more positions kept.
    chosen = np.nonzero(solution > 0.5)[0]
    return chosen.size

# Main search loop: each round ranks the pack, then moves every wolf relative
# to the current best wolf (alpha).
for _ in range(iterations):  # `_` rather than `iter`, which shadowed the built-in
    fitness_values = np.array([fitness_function(sol) for sol in solutions])
    sorted_indices = np.argsort(fitness_values)[::-1]
    # Fancy indexing returns a copy of the three best rows, so the leaders stay
    # fixed while `solutions` is overwritten in the inner loop.
    alpha, beta, delta = solutions[sorted_indices[:3]]
    # NOTE(review): beta and delta are ranked but never used by the update
    # below, which follows alpha only - confirm whether that is intended.

    for i in range(wolves):
        A = 2 * np.random.rand(dimensions) - 1   # per-dimension step factor in [-1, 1)
        C = 2 * np.random.rand(dimensions)       # per-dimension leader weight in [0, 2)
        D_alpha = abs(C * alpha - solutions[i])  # distance to the (weighted) leader
        X_new = alpha - A * D_alpha
        solutions[i] = np.clip(X_new, 0, 1)      # keep positions inside [0, 1]

# Re-score the *final* positions before choosing the winner. The scores left
# over from the loop were computed before the last position update, so using
# them would pick a row by a fitness value it no longer has.
fitness_values = np.array([fitness_function(sol) for sol in solutions])
best_solution = solutions[np.argmax(fitness_values)]
selected_features = np.nonzero(best_solution > 0.5)[0]

# Fail early with a clear message instead of passing zero-width inputs on.
if len(selected_features) == 0:
    raise ValueError("GWO selected no features; cannot build model inputs from an empty selection.")

# Keep only the selected columns of the token-id matrices.
train_inputs_selected = train_inputs[:, selected_features]
val_inputs_selected = val_inputs[:, selected_features]

print(f"Selected {len(selected_features)} important features using GWO.")



Selected 63 important features using GWO.


In [None]:

# Pad (at the end) both splits to the number of selected columns, using the
# same settings for train and validation.
train_inputs_padded, val_inputs_padded = (
    pad_sequences(split_inputs, padding='post', maxlen=len(selected_features))
    for split_inputs in (train_inputs_selected, val_inputs_selected)
)


In [None]:

def build_lstm_model(input_dim, vocab_size=None):
    """Build and compile a two-layer LSTM binary classifier over token ids.

    Architecture: Embedding(768) -> LSTM(128, sequences) -> LSTM(128)
    -> Dense(64, relu) -> Dropout(0.3) -> Dense(1, sigmoid).

    Parameters
    ----------
    input_dim : int
        Number of token ids per example (width of the input matrix).
    vocab_size : int, optional
        Size of the embedding vocabulary. Defaults to the vocabulary size of
        the notebook-level ``tokenizer`` so existing calls keep working.

    Returns
    -------
    tensorflow.keras.Model
        Model compiled with Adam, binary cross-entropy and an accuracy metric.
    """
    if vocab_size is None:
        # Fall back to the tokenizer created earlier in the notebook.
        vocab_size = tokenizer.vocab_size

    input_ids = layers.Input(shape=(input_dim,), dtype=tf.int32, name='input_ids')

    # `input_length` is intentionally not passed: the sequence length is already
    # fixed by the Input layer, and Keras 3 deprecates that argument.
    embedding = layers.Embedding(input_dim=vocab_size, output_dim=768)(input_ids)

    # First LSTM returns the full sequence so the second LSTM can consume it;
    # the second returns only its final state.
    x = layers.LSTM(128, return_sequences=True)(embedding)
    x = layers.LSTM(128)(x)

    x = layers.Dense(64, activation='relu')(x)
    x = layers.Dropout(0.3)(x)

    # Single sigmoid unit: probability of the positive class.
    output = layers.Dense(1, activation='sigmoid')(x)

    model = models.Model(inputs=input_ids, outputs=output)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    return model

# One model input per selected column; print the layer overview.
model = build_lstm_model(input_dim=len(selected_features))
model.summary()




In [None]:

# Train for 10 epochs in batches of 16, monitoring the held-out split after
# every epoch.
history = model.fit(
    x=train_inputs_padded,
    y=train_labels,
    batch_size=16,
    epochs=10,
    validation_data=(val_inputs_padded, val_labels),
)

# Final loss/accuracy on the same validation split that was monitored above.
evaluation = model.evaluate(val_inputs_padded, val_labels)
loss, accuracy = evaluation
print(f"Validation Accuracy: {accuracy*100:.2f}%")


Epoch 1/10
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m88s[0m 572ms/step - accuracy: 0.8311 - loss: 0.4809 - val_accuracy: 0.8342 - val_loss: 0.4224
Epoch 2/10
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m144s[0m 590ms/step - accuracy: 0.8858 - loss: 0.3210 - val_accuracy: 0.9292 - val_loss: 0.2349
Epoch 3/10
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m150s[0m 647ms/step - accuracy: 0.9636 - loss: 0.1273 - val_accuracy: 0.9206 - val_loss: 0.2121
Epoch 4/10
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m135s[0m 598ms/step - accuracy: 0.9797 - loss: 0.0570 - val_accuracy: 0.9741 - val_loss: 0.0812
Epoch 5/10
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 589ms/step - accuracy: 0.9940 - loss: 0.0267 - val_accuracy: 0.9275 - val_loss: 0.3018
Epoch 6/10
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m143s[0m 595ms/step - accuracy: 0.9911 - loss: 0.0282 - val_accuracy: 0.9568 - val_loss: 0.1280
Epoch

In [None]:
# `accuracy` is a fraction in [0, 1]; scale it to a percentage before printing
# it next to a '%' sign (previously this printed e.g. "0.967...%").
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 0.9671847820281982%


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Predicted probabilities for the validation split, thresholded at 0.5 to get
# hard 0/1 class predictions.
y_pred_prob = model.predict(val_inputs_padded)
y_pred = (y_pred_prob > 0.5).astype(int)

# Score the hard predictions against the true labels with each metric in turn.
# Note: `accuracy` is rebound here, replacing the value from model.evaluate.
accuracy, precision, recall, f1 = (
    metric(val_labels, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# Report every metric as a percentage with two decimals.
print(f"Validation Accuracy: {accuracy * 100:.2f}%")
print(f"Precision: {precision * 100:.2f}%")
print(f"Recall: {recall * 100:.2f}%")
print(f"F1-score: {f1 * 100:.2f}%")

[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 107ms/step
Validation Accuracy: 96.72%
Precision: 92.86%
Recall: 90.43%
F1-score: 91.63%
