In [4]:
import tensorflow as tf
from keras.layers import Embedding, Conv1D, MaxPooling1D, Bidirectional, LSTM, Dense
from keras.models import Sequential, load_model
from gensim.models import FastText
import fasttext.util
from sklearn.preprocessing import LabelEncoder
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import xgboost as xgb  # Import XGBoost
from keras.backend import floatx



# Load and preprocess text data
data = pd.read_csv("D:\Mini Project\minipj\data\Suicide_Detection.csv")
X_train, X_test, y_train, y_test = train_test_split(data["text"], data["class"], test_size=0.2) # Replace with your data splitting function

# Tokenize text (if not already pre-tokenized)
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(X_train)
X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_test_sequences = tokenizer.texts_to_sequences(X_test)

# Pad sequences to a uniform length
max_length = 100  # Adjust as needed
X_train_padded = tf.keras.preprocessing.sequence.pad_sequences(X_train_sequences, maxlen=max_length)
X_test_padded = tf.keras.preprocessing.sequence.pad_sequences(X_test_sequences, maxlen=max_length)

# Load or create FastText model
fasttext_model = fasttext.load_model("D:\Mini Project\minipj\models\cc.en.300.bin")
vocab_size = len(tokenizer.word_index) + 1  # Account for unknown words
embedding_dim = 300  # Match FastText embedding dimension
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in tokenizer.word_index.items():
    if i < vocab_size:
        embedding_vector = fasttext_model.get_word_vector(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

#label encoding
encoder = LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.transform(y_test)

# Load the saved CNN-BiLSTM model
cnn_bilstm_model = load_model('D:\Mini Project\minipj\models\CNN_BiLSTM_model2.h5')

# Extract features from the CNN-BiLSTM model (assuming a suitable layer is named 'bidirectional_1')
cnn_bilstm_features = cnn_bilstm_model.get_layer('bidirectional_2').output  # Access intermediate layer
intermediate_model = tf.keras.models.Model(inputs=cnn_bilstm_model.input, outputs=cnn_bilstm_features)

# Extract features for training and testing sets
cnn_bilstm_features_train = intermediate_model.predict(X_train_padded)
cnn_bilstm_features_test = intermediate_model.predict(X_test_padded)

# Prepare data for XGBoost
dtrain = xgb.DMatrix(cnn_bilstm_features_train, label=y_train)
dtest = xgb.DMatrix(cnn_bilstm_features_test, label=y_test)

# ... (rest of the code for XGBoost training and evaluation, as provided earlier)
# Define XGBoost parameters
params = {
    'objective': 'binary:logistic',  # Adjust for your task
    'max_depth': 5,
    'learning_rate': 0.1,
    # ... (other hyperparameters as needed)
}

# Train the XGBoost model
xgb_model = xgb.train(params, dtrain)

# Evaluate the XGBoost model on the test set
predictions = xgb_model.predict(dtest)

# Calculate evaluation metrics (e.g., accuracy, precision, recall, F1-score)
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

accuracy = accuracy_score(y_test, predictions.round())
precision = precision_score(y_test, predictions.round())
recall = recall_score(y_test, predictions.round())
f1 = f1_score(y_test, predictions.round())

print("XGBoost Test Accuracy:", accuracy)
print("XGBoost Test Precision:", precision)
print("XGBoost Test Recall:", recall)
print("XGBoost Test F1-score:", f1)


AttributeError: module 'keras.src.backend' has no attribute 'floatx'

In [6]:
import tensorflow as tf
from keras.layers import Embedding, Conv1D, MaxPooling1D, Bidirectional, LSTM, Dense
from keras.models import Sequential, Model  # Use Model for intermediate model creation
from gensim.models import FastText
import fasttext.util
from sklearn.preprocessing import LabelEncoder
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import xgboost as xgb # Import XGBoost


# Load and preprocess text data
data = pd.read_csv("D:\Mini Project\minipj\data\Suicide_Detection.csv")

# First split into training and testing sets
X_train_and_val, X_test, y_train_and_val, y_test = train_test_split(data["text"], data["class"], test_size=0.2, random_state=42)

# Then split the training set further into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train_and_val, y_train_and_val, test_size=0.25, random_state=42)  # 25% for validation

# Replace with your data splitting function

# Tokenize text (if not already pre-tokenized)
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(X_train)
X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_test_sequences = tokenizer.texts_to_sequences(X_test)
X_val_sequences = tokenizer.texts_to_sequences(X_val)

# Pad sequences to a uniform length
max_length = 100  # Adjust as needed
X_train_padded = tf.keras.preprocessing.sequence.pad_sequences(X_train_sequences, maxlen=max_length)
X_test_padded = tf.keras.preprocessing.sequence.pad_sequences(X_test_sequences, maxlen=max_length)
X_val_padded = tf.keras.preprocessing.sequence.pad_sequences(X_val_sequences, maxlen=max_length)


# Load or create FastText model
fasttext_model = fasttext.load_model("D:\Mini Project\minipj\models\cc.en.300.bin")
vocab_size = len(tokenizer.word_index) + 1  # Account for unknown words
embedding_dim = 300  # Match FastText embedding dimension
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in tokenizer.word_index.items():
    if i < vocab_size:
        embedding_vector = fasttext_model.get_word_vector(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

#label encoding
encoder = LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.transform(y_test)
y_val_encoded = encoder.fit_transform(y_val)

# Load the saved CNN-BiLSTM model
model = load_model('D:\Mini Project\minipj\models\CNN_BiLSTM_model2.h5')

# Extract features from an intermediate layer
# Correctly create intermediate model for feature extraction
intermediate_model = Model(inputs=model.input, outputs=model.layers[4].output)  # Assuming the BiLSTM layer is the 4th
cnn_bilstm_features = intermediate_model.predict(X_train_padded)
cnn_bilstm_features_val = intermediate_model.predict(X_val_padded)  # Assuming you have X_val_padded prepared
cnn_bilstm_predictions = model.predict(X_val_padded)  # Use the full CNN-BiLSTM model for predictions



# Prepare data for XGBoost
xgb_train = xgb.DMatrix(cnn_bilstm_features, label=y_train)
xgb_val = xgb.DMatrix(cnn_bilstm_features_val, label=y_val_encoded)

# Train XGBoost model
xgb_params = {
    'objective': 'binary:logistic',  # Adjust for your task
    'max_depth': 5,  # Tune hyperparameters
    'learning_rate': 0.1,
    # ... (other hyperparameters)
}
xgb_model = xgb.train(params=xgb_params, dtrain=xgb_train, evals=[(xgb_val, 'val')])

# Ensemble predictions (example using weighted averaging)
ensemble_predictions = 0.6 * cnn_bilstm_predictions + 0.4 * xgb_model.predict(xgb_val)

# Evaluate ensemble performance
accuracy = accuracy_score(y_val, ensemble_predictions.round())
print("Ensemble Accuracy:", accuracy)


[0]	val-logloss:0.66425
[1]	val-logloss:0.64069
[2]	val-logloss:0.62131
[3]	val-logloss:0.60490
[4]	val-logloss:0.59143
[5]	val-logloss:0.58006
[6]	val-logloss:0.57062
[7]	val-logloss:0.56264
[8]	val-logloss:0.55585
[9]	val-logloss:0.55024


MemoryError: Unable to allocate 8.03 GiB for an array with shape (2154352225,) and data type float32