In [36]:
import os
import joblib
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Where the raw login metadata lives and where fitted artefacts are written
RAW_DATA = "../data/expanded_login_metadata.csv"
MODEL_DIR = "../models"

# Create the artefact directory up front; an already-existing one is not an error
os.makedirs(MODEL_DIR, exist_ok=True)

# 1) Read the raw CSV, parsing the "timestamp" column into datetimes
df_raw = pd.read_csv(
    RAW_DATA,
    parse_dates=["timestamp"],
)



In [37]:
# 2) Encoder artefact file name -> raw column it encodes.
#    ip_country and user_id are encoded alongside the other categoricals.
#    Insertion order matters: later cells iterate this mapping to add columns.
ENCODER_MAPPING = {
    "location_encoder.pkl": "location_city",
    "ip_country_encoder.pkl": "ip_country",
    "isp_encoder.pkl": "isp",
    "role_encoder.pkl": "role",
    "device_os_encoder.pkl": "device_os",
    "browser_encoder.pkl": "browser",
    "device_type_encoder.pkl": "device_type",
    "user_id_encoder.pkl": "user_id",
}

# 3) Fit one LabelEncoder per column and write it to MODEL_DIR.
for fname, col in ENCODER_MAPPING.items():
    # Missing values become the "UNKNOWN" sentinel; every value is treated as text.
    labels = df_raw[col].fillna("UNKNOWN").astype(str)
    # Put the sentinel in front of the data so it is a fitted class even when
    # the column itself has no missing values.
    fit_values = pd.concat(
        [pd.Series(["UNKNOWN"]), labels],
        ignore_index=True,
    )
    le = LabelEncoder().fit(fit_values)
    joblib.dump(le, os.path.join(MODEL_DIR, fname))

print("✅ All LabelEncoders (including ip_country & user_id) fitted and saved to", MODEL_DIR)

✅ All LabelEncoders (including ip_country & user_id) fitted and saved to ../models


In [38]:
# Build the all-numeric feature frame, using the encoders saved in MODEL_DIR

import os
import joblib

# Base features: two calendar fields derived from the timestamp plus the
# columns that are copied over from the raw frame unchanged.
# (login_type is currently left out of the feature set.)
timestamp_parts = df_raw["timestamp"].dt
df_num = pd.DataFrame(
    {
        "hour": timestamp_parts.hour,
        "weekday": timestamp_parts.weekday,
        "login_success": df_raw["login_success"],
        "session_duration": df_raw["session_duration"],
        "attempt_count": df_raw["attempt_count"],
    }
)

# Add one integer-coded column per categorical (ip_country and user_id included).
# The preprocessing mirrors what the encoders were fitted on: NaN -> "UNKNOWN", then str.
for fname, col in ENCODER_MAPPING.items():
    encoder_path = os.path.join(MODEL_DIR, fname)
    enc = joblib.load(encoder_path)
    labels = df_raw[col].fillna("UNKNOWN").astype(str)
    df_num[col] = enc.transform(labels)

# (Optional) Keep a copy on disk for inspection or downstream use
df_num.to_csv("../data/numeric_login_data.csv", index=False)

df_num.head()

Unnamed: 0,hour,weekday,login_success,session_duration,attempt_count,location_city,ip_country,isp,role,device_os,browser,device_type,user_id
0,0,5,1,72,3,0,1,1,2,0,1,1,4
1,0,5,0,0,1,0,1,0,1,3,2,2,3
2,16,5,1,91,3,1,1,3,3,4,0,0,3
3,19,5,1,91,2,3,1,3,1,3,1,3,4
4,22,5,1,15,2,1,1,1,0,0,0,1,3


In [39]:
# Cell 4: Fit an Isolation Forest on the numeric features and persist it

from sklearn.ensemble import IsolationForest
import joblib
import os

# 1) The output directory must exist before anything is written to it
os.makedirs(MODEL_DIR, exist_ok=True)

# 2) Configure the forest and fit it on every column of df_num.
#    random_state is pinned so the same settings are used on every run.
model = IsolationForest(
    n_estimators=100,
    contamination=0.2,
    random_state=42,
).fit(df_num)

# 3) Write the fitted model next to the encoders
model_path = os.path.join(MODEL_DIR, "isolation_forest.pkl")
joblib.dump(model, model_path)

print(f"✅ Model trained and saved to {model_path}")

✅ Model trained and saved to ../models\isolation_forest.pkl
