In [80]:
import pandas as pd
import pickle
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import keras_tuner as kt


In [20]:
# Set up TensorFlow to use CPU
physical_devices = tf.config.list_physical_devices('CPU')
tf.config.set_visible_devices(physical_devices, 'CPU')

In [30]:
df = pd.read_csv('Cleaned_Data.csv')



In [36]:
# Assuming 'text' and 'labels' columns in the dataset
texts = df['text'].values
labels = df['target'].values  # Adjust to your target column


In [38]:
with open('bert_embeddings.pkl', 'rb') as f:
    bert_embeddings = pickle.load(f)

# Check the embeddings shape to confirm they match your dataset
print(f"Embeddings shape: {bert_embeddings.shape}")
print(f"Labels shape: {labels.shape}")

Embeddings shape: (1600000, 768)
Labels shape: (1600000,)


In [40]:
assert len(bert_embeddings) == len(df), "Embeddings and dataset length mismatch!"

In [86]:
# 3. Ensure labels are in a numpy array format
y = labels  # Target labels
x = bert_embeddings  # Features from BERT embeddings

In [88]:
x_train, x_test, y_train, y_test = train_test_split(
    bert_embeddings, 
    labels, 
    test_size=0.2,  # 20% for testing
    random_state=42
)

In [90]:
x_train, x_val, y_train, y_val = train_test_split(
    x_train, 
    y_train, 
    test_size=0.2,  # 20% for validation (20% of the 80% training set, so 16% of total data)
    random_state=42
)


In [98]:
def build_model(hp):
    model = models.Sequential([
        layers.InputLayer(input_shape=(bert_embeddings.shape[1],)),  # Input layer with the shape of BERT embeddings (768,)
        layers.Reshape((1, bert_embeddings.shape[1])),  # Reshape to (1, 768) for compatibility with LSTM
        layers.Bidirectional(layers.LSTM(
            units=hp.Int('lstm_units', min_value=64, max_value=256, step=64), 
            return_sequences=False
        )),
        layers.Dense(
            hp.Int('dense_units', min_value=32, max_value=128, step=32),
            activation='relu'
        ),
        layers.Dense(1, activation='sigmoid')  # Output layer for binary classification
    ])

    model.compile(
        optimizer=hp.Choice('optimizer', values=['adam', 'rmsprop']),
        loss='binary_crossentropy',
        metrics=['accuracy']
    )

    return model

In [100]:
tuner = kt.Hyperband(
    build_model,
    objective='val_accuracy',  # Optimize validation accuracy
    max_epochs=10,
    hyperband_iterations=2,
    directory='tuner_dir',
    project_name='blstm_tuning'
)




In [102]:
tuner.search(x_train, y_train, epochs=10, validation_data=(x_val, y_val))


Trial 40 Complete [00h 07m 13s]
val_accuracy: 0.7942500114440918

Best val_accuracy So Far: 0.7996992468833923
Total elapsed time: 06h 01m 50s


In [104]:
best_model = tuner.get_best_models(num_models=1)[0]

# Evaluate the model on the test set
test_loss, test_acc = best_model.evaluate(x_test, y_test, verbose=2)
print(f"Test accuracy: {test_acc}")

  saveable.load_own_variables(weights_store.get(inner_path))


10000/10000 - 20s - 2ms/step - accuracy: 0.8004 - loss: 0.4283
Test accuracy: 0.8003906011581421


In [106]:
# Predict on test set
y_pred = best_model.predict(x_test)
y_pred = (y_pred > 0.5).astype('int32')  # Convert probabilities to binary labels

# Print the classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

[1m10000/10000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 2ms/step
Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.80      0.80    159494
           1       0.80      0.80      0.80    160506

    accuracy                           0.80    320000
   macro avg       0.80      0.80      0.80    320000
weighted avg       0.80      0.80      0.80    320000



In [112]:
best_model.save('best_bilstm_model.keras')


Epoch 1/10
[1m30000/30000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 2ms/step - accuracy: 0.7735 - loss: 0.4727 - val_accuracy: 0.7919 - val_loss: 0.4440
Epoch 2/10
[1m30000/30000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 3ms/step - accuracy: 0.7903 - loss: 0.4445 - val_accuracy: 0.7922 - val_loss: 0.4427
Epoch 3/10
[1m30000/30000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m74s[0m 2ms/step - accuracy: 0.7946 - loss: 0.4379 - val_accuracy: 0.7944 - val_loss: 0.4382
Epoch 4/10
[1m30000/30000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m77s[0m 3ms/step - accuracy: 0.7979 - loss: 0.4321 - val_accuracy: 0.7956 - val_loss: 0.4361
Epoch 5/10
[1m30000/30000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m74s[0m 2ms/step - accuracy: 0.7996 - loss: 0.4280 - val_accuracy: 0.7921 - val_loss: 0.4430
Epoch 6/10
[1m30000/30000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m74s[0m 2ms/step - accuracy: 0.8014 - loss: 0.4261 - val_accuracy: 0.7980 - val_loss: 0.432

<keras.src.callbacks.history.History at 0x1af6e4c7fe0>

[1m10000/10000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 865us/step - accuracy: 0.7994 - loss: 0.4312
Test Loss: 0.4318515658378601
Test Accuracy: 0.7991218566894531


[1m10000/10000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 656us/step



Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.81      0.80    159790
           1       0.80      0.79      0.80    160210

    accuracy                           0.80    320000
   macro avg       0.80      0.80      0.80    320000
weighted avg       0.80      0.80      0.80    320000

