In [1]:
import os
import sys

# Locate the project's 'src' directory so that `models` can be imported.
#
# Preferred location: '../src' resolved against the current working directory,
# i.e. the same relative layout this notebook uses for '../data/processed/' and
# '../outputs/'. (Assumes the notebook is started from a sibling directory of
# 'src' -- TODO confirm.)
# Fallback: the original author's machine-specific absolute path, kept so the
# notebook behaves exactly as before on that machine.
_legacy_src_dir = r"C:\Users\shali\Documents\shalin\ASU_2nd_SEM\APM 523 Optimization\APM523_HybridSwarm_TextClassification\src"
_relative_src_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir, "src"))
module_path = _relative_src_dir if os.path.isdir(_relative_src_dir) else _legacy_src_dir

# Add it to sys.path if it's not already there (keeps re-runs of this cell idempotent)
if module_path not in sys.path:
    sys.path.append(module_path)

# Verify that the path exists
print("SRC Path Exists:", os.path.exists(module_path))
print("SRC Path:", module_path)

# Try importing your models; a failure is reported here so the path problem is
# visible before the later cells attempt the same import.
try:
    from models import build_lstm_model, build_cnn_model, build_bert_model
    print("Successfully imported models!")
except ModuleNotFoundError as e:
    print("Error importing models:", e)


SRC Path Exists: True
SRC Path: C:\Users\shali\Documents\shalin\ASU_2nd_SEM\APM 523 Optimization\APM523_HybridSwarm_TextClassification\src
Successfully imported models!


In [2]:
import pandas as pd
import numpy as np
import pickle
from sklearn.metrics import accuracy_score, f1_score
import tensorflow as tf
from transformers import AutoTokenizer
import matplotlib.pyplot as plt
import os
from models import build_lstm_model, build_cnn_model, build_bert_model
from tensorflow.keras.callbacks import EarlyStopping

# Seed NumPy's and TensorFlow's global random generators with one shared value
# so repeated runs start from the same random state.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [18]:
# Load Preprocessed Data
import os
import sys
import pandas as pd
import numpy as np
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer  # Add this

processed_dir = '../data/processed/'
train_tfidf_path = os.path.join(processed_dir, 'train_tfidf.pkl')
test_tfidf_path = os.path.join(processed_dir, 'test_tfidf.pkl')
train_csv_path = os.path.join(processed_dir, 'train_preprocessed.csv')
test_csv_path = os.path.join(processed_dir, 'test_preprocessed.csv')

# The preprocessed CSVs supply the labels in every case (and the text when the
# TF-IDF cache is missing), so parse each file exactly once up front.
train_df = pd.read_csv(train_csv_path)
test_df = pd.read_csv(test_csv_path)

# Load or create TF-IDF data
if os.path.exists(train_tfidf_path) and os.path.exists(test_tfidf_path):
    # SECURITY NOTE: pickle.load can execute arbitrary code. Only load cache
    # files that were written by this notebook (see the else-branch below).
    with open(train_tfidf_path, 'rb') as f:
        X_train_tfidf = pickle.load(f)
    with open(test_tfidf_path, 'rb') as f:
        X_test_tfidf = pickle.load(f)
else:
    # Empty text cells are read back from CSV as NaN, which TfidfVectorizer
    # rejects; treat them as empty documents instead of failing.
    train_texts = train_df['processed_text'].fillna('').astype(str)
    test_texts = test_df['processed_text'].fillna('').astype(str)

    # Fit the vocabulary on the training text only, then reuse it for the test text.
    vectorizer = TfidfVectorizer(max_features=5000)
    X_train_tfidf = vectorizer.fit_transform(train_texts).toarray()
    X_test_tfidf = vectorizer.transform(test_texts).toarray()
    with open(train_tfidf_path, 'wb') as f:
        pickle.dump(X_train_tfidf, f)
    with open(test_tfidf_path, 'wb') as f:
        pickle.dump(X_test_tfidf, f)
    # Save the vectorizer
    with open(os.path.join(processed_dir, 'tfidf_vectorizer.pkl'), 'wb') as f:
        pickle.dump(vectorizer, f)

# A stale cache would pair feature rows with the wrong labels; fail loudly here
# rather than train on misaligned data.
assert X_train_tfidf.shape[0] == len(train_df), \
    "train TF-IDF rows do not match train CSV rows; delete the cached .pkl files and re-run"
assert X_test_tfidf.shape[0] == len(test_df), \
    "test TF-IDF rows do not match test CSV rows; delete the cached .pkl files and re-run"

# Shift 'Class Index' down by one (presumably 1..4 -> 0..3; verify against the
# dataset) and one-hot encode into 4 classes.
y_train = train_df['Class Index'].values - 1
y_test = test_df['Class Index'].values - 1
y_train_cat = tf.keras.utils.to_categorical(y_train, num_classes=4)
y_test_cat = tf.keras.utils.to_categorical(y_test, num_classes=4)

print("TF-IDF X_train shape:", X_train_tfidf.shape)
print("TF-IDF X_test shape:", X_test_tfidf.shape)

TF-IDF X_train shape: (120000, 5000)
TF-IDF X_test shape: (7600, 5000)


In [4]:
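# # NOTE: this entire cell is commented out and does not run. It is an earlier
# # draft of the BERT data preparation (10% subset, max_length = 128).
# # Load raw text for BERT
# # Load tokenizer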
# tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
# max_length = 128

# def tokenize_text(texts, tokenizer, max_length):
#     return tokenizer(texts.tolist(), max_length=max_length, padding='max_length', 
#                      truncation=True, return_tensors='tf')

# # Use a subset for faster testing (e.g., 10% of data)
# subset_size = int(0.1 * len(train_df))  # 12,000 samples
# train_subset_df = train_df.sample(n=subset_size, random_state=42)
# test_subset_df = test_df.sample(n=int(0.1 * len(test_df)), random_state=42)  # 760 samples

# # Tokenize subset
# train_encodings = tokenize_text(train_subset_df['processed_text'], tokenizer, max_length)
# test_encodings = tokenize_text(test_subset_df['processed_text'], tokenizer, max_length)

# X_train_bert = {'input_ids': train_encodings['input_ids'], 'attention_mask': train_encodings['attention_mask']}
# X_test_bert = {'input_ids': test_encodings['input_ids'], 'attention_mask': test_encodings['attention_mask']}

# # Subset labels
# y_train_subset = train_subset_df['Class Index'].values - 1
# y_test_subset = test_subset_df['Class Index'].values - 1
# y_train_subset_cat = tf.keras.utils.to_categorical(y_train_subset, num_classes=4)
# y_test_subset_cat = tf.keras.utils.to_categorical(y_test_subset, num_classes=4)

# print("BERT X_train input_ids shape:", X_train_bert['input_ids'].shape)
# print("BERT X_test input_ids shape:", X_test_bert['input_ids'].shape)

In [5]:
# Baseline LSTM on the TF-IDF features, with early stopping on validation loss.
input_dim = X_train_tfidf.shape[1]  # number of TF-IDF columns
output_dim = 4
lstm_default = build_lstm_model(input_dim, output_dim)

# Monitor val_loss with a patience of 3 epochs; restore_best_weights is enabled.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)

history_lstm = lstm_default.fit(
    X_train_tfidf,
    y_train_cat,
    epochs=10,
    batch_size=32,
    validation_split=0.2,
    verbose=1,
    callbacks=[early_stopping],
)

# Score the test set: take the arg-max class per row and compare it with the
# integer labels.
y_pred_lstm = lstm_default.predict(X_test_tfidf)
y_pred_labels_lstm = np.argmax(y_pred_lstm, axis=1)

lstm_accuracy = accuracy_score(y_test, y_pred_labels_lstm)
lstm_f1 = f1_score(y_test, y_pred_labels_lstm, average='weighted')

print("LSTM - Test Accuracy:", lstm_accuracy)
print("LSTM - Test F1-Score:", lstm_f1)

Epoch 1/10
[1m3000/3000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 15ms/step - accuracy: 0.7860 - loss: 0.6428 - val_accuracy: 0.8859 - val_loss: 0.3213
Epoch 2/10
[1m3000/3000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 15ms/step - accuracy: 0.8964 - loss: 0.3317 - val_accuracy: 0.8839 - val_loss: 0.3252
Epoch 3/10
[1m3000/3000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 14ms/step - accuracy: 0.9032 - loss: 0.3023 - val_accuracy: 0.8815 - val_loss: 0.3362
Epoch 4/10
[1m3000/3000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 15ms/step - accuracy: 0.9067 - loss: 0.2804 - val_accuracy: 0.8822 - val_loss: 0.3420
[1m238/238[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step
LSTM - Test Accuracy: 0.8898684210526315
LSTM - Test F1-Score: 0.8893417663179489


In [6]:
# CNN Training with Adjusted Architecture and EarlyStopping
subset_size = 12000  # Use 10% subset for faster initial training

# Never request more rows than exist (choice(..., replace=False) would raise).
subset_size = min(subset_size, len(train_df))

# Draw the subset from a dedicated, seeded generator instead of NumPy's global
# RNG: the chosen rows then no longer depend on which cells ran before this one
# or on how many times this cell has been re-executed.
subset_rng = np.random.default_rng(42)
train_subset_idx = subset_rng.choice(len(train_df), size=subset_size, replace=False)
X_train_tfidf_subset = X_train_tfidf[train_subset_idx]
y_train_cat_subset = y_train_cat[train_subset_idx]

# Use a callback instance of its own (same settings as the LSTM run) so this
# cell does not rely on a callback object created by another cell.
cnn_early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

cnn_default = build_cnn_model(input_dim, output_dim, filters=64, kernel_size=3)  # Reduced filters and kernel size
history_cnn = cnn_default.fit(X_train_tfidf_subset, y_train_cat_subset, epochs=10, batch_size=32,
                              validation_split=0.2, verbose=1, callbacks=[cnn_early_stopping])

# Evaluate on the full test set: arg-max class per row vs. the integer labels.
y_pred_cnn = cnn_default.predict(X_test_tfidf, verbose=0)
y_pred_labels_cnn = np.argmax(y_pred_cnn, axis=1)
cnn_accuracy = accuracy_score(y_test, y_pred_labels_cnn)
cnn_f1 = f1_score(y_test, y_pred_labels_cnn, average='weighted')
print("CNN - Test Accuracy:", cnn_accuracy)
print("CNN - Test F1-Score:", cnn_f1)

Epoch 1/10
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 76ms/step - accuracy: 0.2502 - loss: 1.3874 - val_accuracy: 0.2446 - val_loss: 1.3856
Epoch 2/10
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 70ms/step - accuracy: 0.2457 - loss: 1.3868 - val_accuracy: 0.2463 - val_loss: 1.3846
Epoch 3/10
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 72ms/step - accuracy: 0.2560 - loss: 1.3855 - val_accuracy: 0.2871 - val_loss: 1.3829
Epoch 4/10
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 72ms/step - accuracy: 0.2575 - loss: 1.3854 - val_accuracy: 0.2912 - val_loss: 1.3814
Epoch 5/10
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 76ms/step - accuracy: 0.2554 - loss: 1.3846 - val_accuracy: 0.2800 - val_loss: 1.3807
Epoch 6/10
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 75ms/step - accuracy: 0.2743 - loss: 1.3844 - val_accuracy: 0.2925 - val_loss: 1.3801
Epoch 7/10
[1m3

In [7]:
# BERT baseline setup: load the tokenizer for the pretrained checkpoint and fix
# the sequence length that inputs are padded/truncated to.
bert_checkpoint = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(bert_checkpoint)
max_length = 64  # Reduced from 128 for speed

def tokenize_text(texts, tokenizer, max_length):
    """Tokenize a collection of texts into fixed-length TensorFlow tensors.

    Args:
        texts: A pandas Series / NumPy array (anything exposing ``.tolist()``)
            or any other iterable of texts.
        tokenizer: Callable invoked as ``tokenizer(list_of_str, **options)``.
        max_length: Length every sequence is padded or truncated to.

    Returns:
        Whatever ``tokenizer`` returns for the cleaned list of strings, called
        with ``padding='max_length'``, ``truncation=True`` and
        ``return_tensors='tf'``.
    """
    raw_texts = texts.tolist() if hasattr(texts, "tolist") else list(texts)
    # Text columns read back from CSV contain NaN for empty cells; map missing
    # values to an empty string and make sure every entry is a str.
    cleaned_texts = ["" if pd.isna(item) else str(item) for item in raw_texts]
    return tokenizer(cleaned_texts, max_length=max_length, padding='max_length',
                     truncation=True, return_tensors='tf')

# Train on a small random sample of the training frame; evaluate on the whole
# test frame.
subset_size = 2000  # Reduced subset for faster training
train_subset_df = train_df.sample(n=subset_size, random_state=42)
test_subset_df = test_df  # Full test set for evaluation

# Tokenize both splits and keep only the two tensors passed to the model.
train_encodings = tokenize_text(train_subset_df['processed_text'], tokenizer, max_length)
test_encodings = tokenize_text(test_subset_df['processed_text'], tokenizer, max_length)

bert_input_keys = ('input_ids', 'attention_mask')
X_train_bert = {key: train_encodings[key] for key in bert_input_keys}
X_test_bert = {key: test_encodings[key] for key in bert_input_keys}

# Labels: 'Class Index' shifted down by one, plus one-hot versions (4 classes).
y_train_subset = train_subset_df['Class Index'].values - 1
y_test_subset = test_df['Class Index'].values - 1
y_train_subset_cat, y_test_subset_cat = (
    tf.keras.utils.to_categorical(label_vector, num_classes=4)
    for label_vector in (y_train_subset, y_test_subset)
)

bert_default = build_bert_model(trainable=False)  # Freeze BERT for speed
history_bert = bert_default.fit(
    X_train_bert,
    y_train_subset_cat,
    epochs=3,
    batch_size=16,  # Smaller batch size
    validation_split=0.2,
    verbose=1,
    callbacks=[early_stopping],
)

# Score the test set: arg-max class per row vs. the integer labels.
y_pred_bert = bert_default.predict(X_test_bert, verbose=0)
y_pred_labels_bert = np.argmax(y_pred_bert, axis=1)

bert_accuracy = accuracy_score(y_test_subset, y_pred_labels_bert)
bert_f1 = f1_score(y_test_subset, y_pred_labels_bert, average='weighted')

print("BERT - Test Accuracy:", bert_accuracy)
print("BERT - Test F1-Score:", bert_f1)





Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

Epoch 1/3
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m260s[0m 2s/step - accuracy: 0.2580 - loss: 1.5831 - val_accuracy: 0.3900 - val_loss: 1.2958
Epoch 2/3
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m242s[0m 2s/step - accuracy: 0.3091 - loss: 1.4523 - val_accuracy: 0.5325 - val_loss: 1.2074
Epoch 3/3
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m163s[0m 2s/step - accuracy: 0.3684 - loss: 1.3283 - val_accuracy: 0.6050 - val_loss: 1.1301
BERT - Test Accuracy: 0.6275
BERT - Test F1-Score: 0.6261429058778213


In [8]:
# Persist the three trained baseline models (.keras files) and a CSV summary of
# their test metrics under the outputs directory.
output_dir = '../outputs/'
models_dir = os.path.join(output_dir, 'models')
results_dir = os.path.join(output_dir, 'results')
for _target_dir in (models_dir, results_dir):
    os.makedirs(_target_dir, exist_ok=True)


def _model_path(file_name):
    """Return the path of `file_name` inside the models output directory."""
    return os.path.join(models_dir, file_name)


lstm_default.save(_model_path('baseline_lstm_default.keras'))
cnn_default.save(_model_path('baseline_cnn_default.keras'))
bert_default.save(_model_path('baseline_bert_default.keras'))  # Unified .keras format

# One row per model; columns are Model, Accuracy, F1-Score.
results = pd.DataFrame(
    [
        {'Model': 'LSTM', 'Accuracy': lstm_accuracy, 'F1-Score': lstm_f1},
        {'Model': 'CNN', 'Accuracy': cnn_accuracy, 'F1-Score': cnn_f1},
        {'Model': 'BERT (Subset)', 'Accuracy': bert_accuracy, 'F1-Score': bert_f1},
    ]
)
results_csv_path = os.path.join(results_dir, 'baseline_results.csv')
results.to_csv(results_csv_path, index=False)

print("Models and results saved to", output_dir)

Models and results saved to ../outputs/
