In [None]:
# ---------------------------------------------------------------------------
# Text preprocessing: raw strings -> fixed-length integer sequences.
#
# NOTE(review): the tokenizer is fitted on the *entire* balanced corpus, i.e.
# before the train/test split that follows, so the vocabulary also reflects
# test-set text. Fit on the training texts only if a strict hold-out is needed.
# ---------------------------------------------------------------------------
texts = df_balanced["text"]

# Only the most frequent words (capped by num_words) are used when encoding;
# every other word is encoded as the "<OOV>" token.
tokenizer = Tokenizer(oov_token="<OOV>", num_words=10000)
tokenizer.fit_on_texts(texts)

# Encode every document as a list of integer word indices.
sequences = tokenizer.texts_to_sequences(texts)

# Force every sequence to exactly `max_length` tokens. "pre" is the Keras
# default for both options and is spelled out here for clarity: shorter texts
# are zero-padded at the front, longer texts lose their leading tokens.
max_length = 200  # must match the model's input length
padded_sequences = pad_sequences(
    sequences,
    maxlen=max_length,
    padding="pre",
    truncating="pre",
)

# Persist the fitted tokenizer so the same word -> index mapping can be
# reloaded later (e.g. at inference time).
joblib.dump(tokenizer, "..../1vectorizer.pkl")

# Targets as a plain NumPy array, row-aligned with `padded_sequences`.
labels = np.array(df_balanced["label"])

# Quick sanity report. `word_index` holds every word seen during fitting,
# so the reported size is not capped by `num_words`.
print(f"✅ Tokenizer Created! Vocabulary Size: {len(tokenizer.word_index)}")
print(f"First Text Sequence: {sequences[0]}")
print(f"Padded Sequence Shape: {padded_sequences.shape}")

In [None]:
# Hold out 20% of the samples for testing.
# - `stratify=labels` keeps the class ratio of the full dataset in both the
#   training and the test partition; a purely random split can drift away
#   from that ratio, which distorts accuracy on a classification task.
# - `random_state=42` fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

print(f"✅ Data Split! Training Samples: {len(X_train)}, Testing Samples: {len(X_test)}")

In [None]:
# ---------------------------------------------------------------------------
# Stacked-LSTM binary text classifier: build, train, save.
# ---------------------------------------------------------------------------

# The embedding table needs one row per possible token index. Take the size
# from the fitted tokenizer instead of repeating a literal, so the two cannot
# drift apart. With `num_words=N` the tokenizer only emits indices below N;
# without a cap the largest index is len(word_index), hence the +1.
embedding_vocab_size = tokenizer.num_words or len(tokenizer.word_index) + 1

# NOTE(review): `input_length` is deprecated in Keras 3 — confirm the
# installed Keras version before removing or keeping it.
text_model = Sequential([
    Embedding(embedding_vocab_size, 128, input_length=max_length),  # token index -> 128-d vector
    LSTM(128, return_sequences=True),  # emits the full sequence for the next LSTM
    Dropout(0.4),  # regularization against overfitting
    LSTM(64, return_sequences=True),  # second recurrent layer, still sequence-to-sequence
    Dropout(0.4),
    LSTM(32),  # last recurrent layer: returns only the final output
    Dropout(0.3),
    Dense(1, activation='sigmoid')  # single probability for the binary label
])

# Binary cross-entropy pairs with the single sigmoid output unit.
text_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# NOTE(review): the test split doubles as validation data here, so the
# per-epoch validation metrics are not an untouched hold-out estimate.
# NOTE(review): 100 epochs with no early stopping or checkpointing means the
# saved weights are simply those of the final epoch — consider an
# EarlyStopping callback with restore_best_weights=True.
text_model.fit(
    X_train,
    y_train,
    epochs=100,
    batch_size=8,
    validation_data=(X_test, y_test),
)

# Persist the trained network (HDF5 format, per the .h5 extension).
text_model.save("..../text_model1.h5")
print("✅ Text Fraud Detection Model Saved!")