In [54]:
import pandas as pd
import numpy as np
import os
import gc
import joblib
import psutil
import warnings
from datetime import datetime

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_regression, f_classif
from sklearn.metrics import r2_score, mean_squared_error, accuracy_score, roc_auc_score

warnings.filterwarnings('ignore')


In [55]:
def get_memory_usage():
    """Get current memory usage in GB"""
    process = psutil.Process(os.getpid())
    return process.memory_info().rss / 1024 / 1024 / 1024


In [56]:
def safe_load_data(file_path, sample_size=1_000_000, chunk_size=100_000):
    print(f"🔄 Loading {sample_size:,} rows from {file_path}")
    rows_loaded, chunks = 0, []

    for chunk in pd.read_csv(file_path, chunksize=chunk_size):
        if rows_loaded >= sample_size:
            break
        rows_needed = sample_size - rows_loaded
        chunks.append(chunk.iloc[:rows_needed])
        rows_loaded += len(chunks[-1])

    df = pd.concat(chunks, ignore_index=True)
    print(f"✅ Loaded shape: {df.shape}, Memory: {get_memory_usage():.2f} GB")
    return df


In [57]:
def create_features(df):
    df = df.copy()

    if 'Login Timestamp' in df:
        df['Login Timestamp'] = pd.to_datetime(df['Login Timestamp'], errors='coerce')
        df['hour'] = df['Login Timestamp'].dt.hour
        df['weekday'] = df['Login Timestamp'].dt.dayofweek

    if 'Round-Trip Time [ms]' in df:
        df['rtt_log'] = np.log1p(df['Round-Trip Time [ms]'])

    if 'Login Successful' in df:
        df['Login Successful'] = df['Login Successful'].fillna(0).astype(int)

    if 'Is Attack IP' in df:
        df['Is Attack IP'] = df['Is Attack IP'].fillna(0).astype(int)

    numeric = df.select_dtypes(include=np.number)
    imputer = SimpleImputer(strategy='median')
    numeric_imputed = pd.DataFrame(imputer.fit_transform(numeric), columns=numeric.columns)
    return numeric_imputed


In [58]:
def select_features(X, y, task='regression', k=10):
    selector = SelectKBest(f_regression if task == 'regression' else f_classif, k=min(k, X.shape[1]))
    X_new = selector.fit_transform(X, y)
    selected = X.columns[selector.get_support()]
    return X_new, selected.tolist(), selector


In [59]:
def train_regression(X, y):
    X_sel, features, selector = select_features(X, y, 'regression')
    X_train, X_test, y_train, y_test = train_test_split(X_sel, y, test_size=0.2, random_state=42)

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    model = LinearRegression().fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)

    r2 = r2_score(y_test, y_pred)
    print(f"📊 Regression R²: {r2:.4f}")
    return model, scaler, selector, features, r2


In [60]:
def train_classifier(X, y):
    X_sel, features, selector = select_features(X, y, 'classification')
    X_train, X_test, y_train, y_test = train_test_split(X_sel, y, test_size=0.2, stratify=y)

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    model = LogisticRegression(max_iter=500).fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)

    acc = accuracy_score(y_test, y_pred)
    print(f"✅ Classification Accuracy: {acc:.4f}")
    return model, scaler, selector, features, acc


In [61]:
def save_model(model, scaler, selector, features, name, score):
    os.makedirs("models", exist_ok=True)
    joblib.dump(model, f"models/{name}_model.pkl")
    joblib.dump(scaler, f"models/{name}_scaler.pkl")
    joblib.dump(selector, f"models/{name}_selector.pkl")
    joblib.dump(features, f"models/{name}_features.pkl")
    print(f"💾 Saved '{name}' model | Score: {score:.4f}")


In [62]:
def run_pipeline(file_path, sample_size=1_000_000):
    df = safe_load_data(file_path, sample_size=sample_size)
    df_features = create_features(df)

    # Regression Task
    if "Round-Trip Time [ms]" in df.columns:
        df_clean = df.dropna(subset=["Round-Trip Time [ms]"])
        y = df_clean["Round-Trip Time [ms]"]
        X = df_features.loc[df_clean.index].drop(columns=["Round-Trip Time [ms]"], errors='ignore')
        model, scaler, selector, feats, score = train_regression(X, y)
        save_model(model, scaler, selector, feats, "rtt_model", score)

    # Login Classification
    if "Login Successful" in df.columns:
        df_clean = df.dropna(subset=["Login Successful"])
        y = df_clean["Login Successful"]
        X = df_features.loc[df_clean.index].drop(columns=["Login Successful"], errors='ignore')
        model, scaler, selector, feats, score = train_classifier(X, y)
        save_model(model, scaler, selector, feats, "login_model", score)

    # Attack Detection
    if "Is Attack IP" in df.columns:
        df_clean = df.dropna(subset=["Is Attack IP"])
        y = df_clean["Is Attack IP"]
        X = df_features.loc[df_clean.index].drop(columns=["Is Attack IP"], errors='ignore')
        model, scaler, selector, feats, score = train_classifier(X, y)
        save_model(model, scaler, selector, feats, "attack_model", score)

    print("\n✅ All models trained and saved successfully")
    print(f"Final memory usage: {get_memory_usage():.2f} GB")


In [64]:
run_pipeline("login/rba-dataset.csv", sample_size=1000_000_000_000)


🔄 Loading 1,000,000,000,000 rows from login/rba-dataset.csv
✅ Loaded shape: (31269264, 16), Memory: 5.44 GB
📊 Regression R²: 0.1839
💾 Saved 'rtt_model' model | Score: 0.1839
✅ Classification Accuracy: 0.7367
💾 Saved 'login_model' model | Score: 0.7367
✅ Classification Accuracy: 0.9010
💾 Saved 'attack_model' model | Score: 0.9010

✅ All models trained and saved successfully
Final memory usage: 1.94 GB
