In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
import joblib
import tensorflow as tf
import os  # <--- Added OS module
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from tensorflow.keras.models import load_model
from geopy.distance import geodesic
from tqdm.auto import tqdm

# Initialize tqdm for pandas
tqdm.pandas()

# ==========================================
# 0. SETUP DYNAMIC PATHS
# ==========================================
try:
    # This gets the directory where THIS script is located (e.g., .../Project/phase2)
    BASE_DIR = os.path.dirname(os.path.abspath(__file__))
except NameError:
    # Fallback for Jupyter Notebooks where __file__ is not defined
    # We assume the notebook is running from the 'phase2' directory
    BASE_DIR = os.getcwd()
    print("‚ö†Ô∏è Running in interactive mode. Assuming current directory is the script location.")

# We need to go UP one level to find 'phase1' (e.g., .../Project/phase1)
PROJECT_ROOT = os.path.dirname(BASE_DIR)
PHASE1_DIR = os.path.join(PROJECT_ROOT, 'phase1')

print(f"üìÇ Script Location: {BASE_DIR}")
print(f"üìÇ Loading Data from: {PHASE1_DIR}")

# ==========================================
# PART 1: LOAD & CONSOLIDATE DATA
# ==========================================
print("\n1. Consolidating outputs from all models...")

# A. Load Base Data (User Logins)
# We look for the file inside the phase1 folder
logins_path = os.path.join(PHASE1_DIR, 'user_logins.csv')

if not os.path.exists(logins_path):
    print(f"‚ùå Error: Could not find {logins_path}")
    print("   Make sure 'user_logins.csv' is inside the 'phase1' folder.")
    exit()

df = pd.read_csv(logins_path)
df['timestamp'] = pd.to_datetime(df['timestamp'])
df = df.sort_values(by=['user_id', 'timestamp'])

# --- FIX: Handle Column Names ---
if 'device_user_agent' not in df.columns and 'device' in df.columns:
    df.rename(columns={'device': 'device_user_agent'}, inplace=True)

# B. Load Network Scores (Phase 1.C)
network_path = os.path.join(PHASE1_DIR, 'network_risk_scores.csv')

if os.path.exists(network_path):
    network_scores = pd.read_csv(network_path)
    df = pd.merge(df, network_scores, on='user_id', how='left')
    df['network_risk_score'] = df['network_risk_score'].fillna(0)
    print("   ‚úÖ Loaded Network Graph scores.")
else:
    print("   ‚ö†Ô∏è Network scores not found. Defaulting to 0.")
    df['network_risk_score'] = 0.0

# ==========================================
# PART 2: GENERATE SCORES FROM PHASE 1 MODELS
# ==========================================
print("2. Generating Meta-Features (Model Scores)...")

# Define Model Paths
iso_path = os.path.join(PHASE1_DIR, 'model_isolation_forest.pkl')
scaler_path = os.path.join(PHASE1_DIR, 'scaler.pkl')
ae_path = os.path.join(PHASE1_DIR, 'model_autoencoder.h5')

# A. Load Models
try:
    iso_forest = joblib.load(iso_path)
    scaler = joblib.load(scaler_path)
    autoencoder = load_model(ae_path)
    print("   ‚úÖ Loaded Phase 1 behavior models.")
except Exception as e:
    print(f"   ‚ùå CRITICAL: Could not load models from {PHASE1_DIR}")
    print(f"   Error Details: {e}")
    exit()

# B. Re-Engineer Features (Same logic as before)
print("   Re-calculating features for inference...")

# 1. Time Diff
df['prev_time'] = df.groupby('user_id')['timestamp'].shift(1)
df['time_diff_hours'] = (df['timestamp'] - df['prev_time']).dt.total_seconds() / 3600
df['time_diff_hours'] = df['time_diff_hours'].fillna(0)

# 2. Velocity
df['prev_lat'] = df.groupby('user_id')['lat'].shift(1)
df['prev_lon'] = df.groupby('user_id')['lon'].shift(1)

def get_geo_dist(row):
    if pd.isna(row['prev_lat']): return 0.0
    try:
        return geodesic((row['prev_lat'], row['prev_lon']), (row['lat'], row['lon'])).km
    except:
        return 0.0

df['dist_km'] = df.progress_apply(get_geo_dist, axis=1)
df['velocity_kmh'] = df['dist_km'] / (df['time_diff_hours'] + 0.1)

# 3. Device Trust
device_counts = df.groupby(['user_id', 'device_user_agent']).size().reset_index(name='count')
total_counts = df.groupby('user_id').size().reset_index(name='total')
device_stats = pd.merge(device_counts, total_counts, on='user_id')
device_stats['device_trust_score'] = device_stats['count'] / device_stats['total']
df = pd.merge(df, device_stats[['user_id', 'device_user_agent', 'device_trust_score']], 
              on=['user_id', 'device_user_agent'], how='left')

# 4. Hour
df['hour_of_day'] = df['timestamp'].dt.hour

# Prepare Features
features_p1 = ['velocity_kmh', 'time_diff_hours', 'device_trust_score', 'hour_of_day']
X_behavior = scaler.transform(df[features_p1])

# --- SCORE 1: Isolation Forest ---
print("   Running Isolation Forest Inference...")
iso_preds = iso_forest.predict(X_behavior)
df['score_if'] = np.where(iso_preds == -1, 1.0, 0.0)

# --- SCORE 2: Autoencoder ---
print("   Running Autoencoder Inference...")
reconstructions = autoencoder.predict(X_behavior, verbose=0)
mse = np.mean(np.power(X_behavior - reconstructions, 2), axis=1)
df['score_ae'] = mse

# --- SCORE 3: LSTM (Sequence Simulation) ---
print("   Simulating LSTM Session scores...")
df['score_lstm'] = 0.05 
high_risk_sequences = ['Brute Force Success', 'Device Spoofing']
df.loc[df['attack_type'].isin(high_risk_sequences), 'score_lstm'] = 0.95

# ==========================================
# PART 3: TRAIN THE RISK ENGINE (XGBoost)
# ==========================================
print("\n3. Training the Master Ensemble (XGBoost)...")

ensemble_features = ['score_if', 'score_ae', 'score_lstm', 'network_risk_score']
X = df[ensemble_features]
y = df['is_attack']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

xgb_model = xgb.XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=4,
    use_label_encoder=False,
    eval_metric='logloss'
)

xgb_model.fit(X_train, y_train)

# ==========================================
# PART 4: EVALUATION & SAVE
# ==========================================
print("\n4. Ensemble Model Evaluation:")
preds = xgb_model.predict(X_test)
print(f"   Accuracy: {accuracy_score(y_test, preds)*100:.2f}%")
print(classification_report(y_test, preds))

# Feature Importance
importance = xgb_model.feature_importances_
print("\nFeature Importance:")
for i, feat in enumerate(ensemble_features):
    print(f"   {feat}: {importance[i]:.4f}")

# Save Model in the CURRENT directory (phase2)
save_path = os.path.join(BASE_DIR, "model_risk_engine.json")
xgb_model.save_model(save_path)
print(f"\n‚úÖ Risk Engine Saved at: {save_path}")

  from .autonotebook import tqdm as notebook_tqdm


‚ö†Ô∏è Running in interactive mode. Assuming current directory is the script location.
üìÇ Script Location: c:\Users\abhis\OneDrive\Desktop\Models\phase2
üìÇ Loading Data from: c:\Users\abhis\OneDrive\Desktop\Models\phase1

1. Consolidating outputs from all models...
   ‚úÖ Loaded Network Graph scores.
2. Generating Meta-Features (Model Scores)...
   ‚úÖ Loaded Phase 1 behavior models.
   Re-calculating features for inference...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20000/20000 [00:04<00:00, 4313.77it/s]


   Running Isolation Forest Inference...
   Running Autoencoder Inference...
   Simulating LSTM Session scores...

3. Training the Master Ensemble (XGBoost)...

4. Ensemble Model Evaluation:
   Accuracy: 86.75%
              precision    recall  f1-score   support

           0       0.86      0.99      0.92      3185
           1       0.92      0.38      0.54       815

    accuracy                           0.87      4000
   macro avg       0.89      0.69      0.73      4000
weighted avg       0.87      0.87      0.84      4000


Feature Importance:
   score_if: 0.1704
   score_ae: 0.0262
   score_lstm: 0.7859
   network_risk_score: 0.0174

‚úÖ Risk Engine Saved at: c:\Users\abhis\OneDrive\Desktop\Models\phase2\model_risk_engine.json


Parameters: { "use_label_encoder" } are not used.

