In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split

# ==========================================
# PART 1: LOAD LINKED DATA
# ==========================================
print("1. Loading Linked Session Data...")

# Read the session table that the rest of the script consumes (later steps
# read its `session_sequence` and `is_attack` columns).
try:
    df = pd.read_csv(r'user_sessions.csv')
except FileNotFoundError:
    print("❌ user_sessions.csv not found! Run the session generator first.")
    # Raise SystemExit rather than calling exit(): inside a notebook kernel
    # exit() does not stop the running cell, so execution would fall through
    # to the first use of `df` and die with an unrelated NameError. Raising
    # halts both plain scripts and notebook cells, and the non-zero status
    # signals the failure to any calling process.
    raise SystemExit(1)

# Integer vocabulary for session actions. Ids are handed out in the order the
# names are listed, starting at 1, so id 0 never refers to a named action.
# Reordering this tuple changes the ids.
_ACTION_NAMES = (
    'LOGIN',
    'VIEW_BALANCE',
    'VIEW_TRANSACTIONS',
    'TRANSFER_SMALL',
    'TRANSFER_LARGE',
    'CHANGE_PASSWORD',
    'ADD_RECIPIENT',
    'LOGOUT',
)
ACTIONS = {name: action_id for action_id, name in enumerate(_ACTION_NAMES, start=1)}

# Report how many rows were read from user_sessions.csv (the message calls them sessions).
print(f"   Loaded {len(df)} sessions.")

# ==========================================
# PART 2: PREPROCESSING
# ==========================================
print("2. Converting text sequences to vectors...")

# Encode every session: split the comma-separated action string and map each
# action name to its integer id from ACTIONS.
#   * Names are stripped first, so "LOGIN, VIEW_BALANCE" encodes the same as
#     "LOGIN,VIEW_BALANCE" instead of silently turning " VIEW_BALANCE" into 0.
#   * Any name not present in ACTIONS becomes 0. NOTE: 0 is also the fill
#     value pad_sequences uses below, so unknown actions are indistinguishable
#     from padding once the sequences are padded.
#   * The column is iterated directly; DataFrame.iterrows() would build a full
#     Series object per row just to read two fields.
sequences = [
    [ACTIONS.get(action.strip(), 0) for action in session.split(',')]
    for session in df['session_sequence']
]

# One label per session, in the same row order as `sequences`.
labels = df['is_attack'].tolist()

# Bring every session to exactly MAX_SEQ_LENGTH ids. Shorter sessions get
# zeros appended (padding='post'); longer ones are cut using pad_sequences'
# default truncating='pre', i.e. their earliest actions are dropped.
MAX_SEQ_LENGTH = 10
X = pad_sequences(sequences, maxlen=MAX_SEQ_LENGTH, padding='post')
y = np.array(labels)

# Hold out 20% of the sessions; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ==========================================
# PART 3: BUILD LSTM MODEL
# ==========================================
print("\n3. Building LSTM Architecture...")

# The embedding table needs one row for every integer id that can appear in
# the input: 0 (used for padding and unknown actions) up to the largest id in
# ACTIONS. Deriving the size from ACTIONS keeps the layer in step with the
# vocabulary; a hand-maintained constant would cause out-of-range lookups as
# soon as a new action is added to the mapping.
VOCAB_SIZE = max(ACTIONS.values()) + 1

model = Sequential([
    # Learn a 32-dimensional vector per action id.
    # NOTE(review): newer Keras releases deprecate `input_length`; it is kept
    # here so the built model's input shape is unchanged.
    Embedding(input_dim=VOCAB_SIZE, output_dim=32, input_length=MAX_SEQ_LENGTH),
    # return_sequences=False: only the final LSTM output feeds the classifier.
    LSTM(64, return_sequences=False),
    Dropout(0.2),
    # Single sigmoid unit: one value in [0, 1] per session.
    Dense(1, activation='sigmoid')
])

# Binary cross-entropy matches the single sigmoid output.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# ==========================================
# PART 4: TRAIN
# ==========================================
print("\n4. Training LSTM on Real Linked Data...")

# Stop once validation loss has failed to improve for 3 consecutive epochs and
# roll the model back to the weights from its best epoch.
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
    verbose=1
)

# Early stopping picks the weights that score best on the validation data, so
# that data must not be the test set -- otherwise the test set has influenced
# model selection and any metric measured on it is optimistically biased.
# validation_split carves the validation set out of the training data instead
# (Keras takes the last 20% of the arrays passed to fit).
# epochs=20 is only an upper bound; early stopping decides the real end point.
model.fit(
    X_train,
    y_train,
    epochs=20,
    batch_size=32,
    verbose=1,
    validation_split=0.2,
    callbacks=[early_stop]
)

# Score once on the untouched test split. The model was compiled with a single
# metric (accuracy), so evaluate() returns [loss, accuracy].
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f"   Held-out test loss: {test_loss:.4f} | accuracy: {test_accuracy:.4f}")

model.save("model_lstm.h5")
print("\n✅ LSTM Model Saved (Trained on user_sessions.csv).")

1. Loading Linked Session Data...
❌ user_sessions.csv not found! Run the session generator first.


NameError: name 'df' is not defined

: 