# Account Security Monitoring - Refactored

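Clean, standardized implementation of the anomaly detection pipeline: Isolation Forest scores and PCA reconstruction error are min-max normalized, averaged, and thresholded at a percentile to flag anomalous rows.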

In [None]:
# Install the Kaggle CLI that download_kaggle_data() invokes (IPython shell magic).
!pip install kaggle

In [None]:
import os
import zipfile
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer

try:
    from google.colab import files
    COLAB_ENV = True
except ImportError:
    COLAB_ENV = False

In [None]:
# Configuration
# Seed passed as `random_state` to both IsolationForest and PCA.
RANDOM_STATE = 42
# Passed to IsolationForest as `contamination`.
CONTAMINATION_RATE = 0.15
# NOTE(review): despite the name, train_pca_reconstruction() multiplies the
# feature count by this value to get `n_components`; it is not passed to PCA
# as an explained-variance target.
PCA_VARIANCE_RATIO = 0.8
# Default percentile of the combined score at or above which
# predict_anomalies() flags a row.
ANOMALY_THRESHOLD_PERCENTILE = 85
# Passed to IsolationForest as `n_estimators`.
N_ESTIMATORS = 300

## Data Loading Functions

In [None]:
def download_kaggle_data():
    """Download and extract the `cpe342-karena` Kaggle competition archive.

    Steps, in order:

    1. In Colab, open the file-upload widget (presumably so the user can
       provide `kaggle.json`); an upload failure is printed and otherwise
       ignored. Outside Colab only a reminder is printed.
    2. If `kaggle.json` is in the current directory, move it to
       `~/.kaggle/` and set its permissions to 600.
    3. Run the Kaggle CLI download unless `cpe342-karena.zip` already exists
       in the current directory.
    4. If the archive exists, extract it into the current directory.

    The shell steps use IPython `!` magics, so this function only runs inside
    a notebook/IPython session. Returns None.
    """
    if COLAB_ENV:
        try:
            # The returned mapping is not used; the upload is done for its
            # side effect of writing the file into the working directory.
            uploaded = files.upload()
        except Exception as e:
            print(f"File upload failed: {e}")
    else:
        print("Running outside of Colab. Ensure kaggle.json is in ~/.kaggle/")

    # Only the current directory is checked for the credentials file.
    if 'kaggle.json' in os.listdir('.'):
        !mkdir -p ~/.kaggle
        !mv kaggle.json ~/.kaggle/
        !chmod 600 ~/.kaggle/kaggle.json
    else:
        # NOTE(review): this is also printed when the credentials are already
        # in ~/.kaggle/, because that location is never inspected.
        print("kaggle.json not found.")

    # Skip the download when the archive is already present.
    if not os.path.exists('cpe342-karena.zip'):
        print("Downloading data...")
        !kaggle competitions download -c cpe342-karena
    else:
        print("Data already downloaded.")

    # Re-check existence: the download above may not have produced the file.
    if os.path.exists('cpe342-karena.zip'):
        print("Extracting data...")
        with zipfile.ZipFile('cpe342-karena.zip', 'r') as zip_ref:
            zip_ref.extractall('.')
        print("Data extracted.")

## Feature Engineering Functions

In [None]:
def extract_base_features(df):
    """Return the set of base names of the time-series columns in ``df``.

    A column marks the start of a series when its name ends in ``_1``; the
    base name is that column name with the two-character suffix removed.
    """
    return {name[:-2] for name in df.columns if name.endswith('_1')}

In [None]:
def create_aggregated_features(df):
    """Add per-row mean/std/range aggregates for each 4-step time series.

    For every base name reported by ``extract_base_features`` whose four
    columns ``<base>_1`` .. ``<base>_4`` are all present, three columns are
    added to a copy of ``df``: ``<base>_mean_agg``, ``<base>_std_agg`` and
    ``<base>_range_agg`` (row maximum minus row minimum). Bases with an
    incomplete set of columns are skipped.

    Args:
        df: DataFrame holding the raw columns. It is not modified.

    Returns:
        A new DataFrame with the original columns plus the aggregates, added
        in sorted order of the base names.
    """
    X_eng = df.copy()

    # Iterate in sorted order: the iteration order of a set of strings changes
    # between interpreter runs (hash randomization), which would reorder the
    # output columns and make the seeded downstream models non-reproducible.
    for base in sorted(extract_base_features(df)):
        cols = [f"{base}_{i}" for i in range(1, 5)]
        if not all(c in df.columns for c in cols):
            continue

        # Slice the four steps once and reuse the view for every aggregate.
        steps = df[cols]
        X_eng[f'{base}_mean_agg'] = steps.mean(axis=1)
        X_eng[f'{base}_std_agg'] = steps.std(axis=1)
        X_eng[f'{base}_range_agg'] = steps.max(axis=1) - steps.min(axis=1)

    return X_eng

In [None]:
def preprocess_data(df):
    """Turn the raw frame into a scaled numeric matrix.

    Stages: engineered aggregates -> numeric columns only -> median
    imputation -> standardization.

    Returns:
        Tuple ``(X_scaled, imputer, scaler)``: the transformed array together
        with the fitted ``SimpleImputer`` and ``StandardScaler``.
    """
    numeric = create_aggregated_features(df).select_dtypes(include=[np.number])

    imputer = SimpleImputer(strategy='median')
    scaler = StandardScaler()

    # Fill missing values first; scaling then sees a complete matrix.
    filled = imputer.fit_transform(numeric)
    return scaler.fit_transform(filled), imputer, scaler

## Anomaly Detection Models

In [None]:
def train_isolation_forest(X_scaled):
    """Fit an Isolation Forest on ``X_scaled`` and score the same rows.

    The model is configured from the module-level constants
    ``N_ESTIMATORS``, ``CONTAMINATION_RATE`` and ``RANDOM_STATE`` and uses
    all available cores (``n_jobs=-1``).

    Returns:
        The fitted model's ``decision_function`` values for ``X_scaled``.
    """
    forest_params = {
        'n_estimators': N_ESTIMATORS,
        'contamination': CONTAMINATION_RATE,
        'random_state': RANDOM_STATE,
        'n_jobs': -1,
    }
    model = IsolationForest(**forest_params)
    # fit() returns the estimator itself, so scoring can be chained.
    return model.fit(X_scaled).decision_function(X_scaled)

In [None]:
def train_pca_reconstruction(X_scaled):
    """Score each row by its PCA reconstruction error.

    A PCA is fitted on ``X_scaled``, the data is projected onto the retained
    components and mapped back, and the per-row mean squared difference
    between the input and its reconstruction is returned.

    The number of components is ``int(n_features * PCA_VARIANCE_RATIO)``,
    i.e. a fraction of the feature count, capped at
    ``min(n_samples, n_features)``.

    Args:
        X_scaled: 2-D array of shape ``(n_samples, n_features)``.

    Returns:
        1-D array with one reconstruction error per row.
    """
    n_samples, n_features = X_scaled.shape[0], X_scaled.shape[1]
    n_components = int(n_features * PCA_VARIANCE_RATIO)
    # PCA rejects n_components larger than min(n_samples, n_features); cap it
    # so inputs with few rows do not raise.
    n_components = min(n_components, n_samples, n_features)

    pca = PCA(n_components=n_components, random_state=RANDOM_STATE)

    X_pca = pca.fit_transform(X_scaled)
    X_reconstructed = pca.inverse_transform(X_pca)

    # Mean over features of the squared residual, one value per row.
    reconstruction_error = np.mean(np.square(X_scaled - X_reconstructed), axis=1)
    return reconstruction_error

In [None]:
def combine_anomaly_scores(iso_scores, reconstruction_error):
    """Blend the two model outputs into one score where higher is worse.

    Both inputs are min-max scaled independently. The Isolation Forest score
    is then flipped (``1 - value``) so that, like the reconstruction error,
    larger values mean more anomalous. The result is the element-wise
    average of the two.
    """
    def _unit_scale(values):
        # Scale a 1-D array as a single column and return it flat again.
        return MinMaxScaler().fit_transform(values.reshape(-1, 1)).flatten()

    iso_risk = 1 - _unit_scale(iso_scores)
    pca_risk = _unit_scale(reconstruction_error)

    return (iso_risk + pca_risk) / 2

In [None]:
def predict_anomalies(final_score, threshold_percentile=ANOMALY_THRESHOLD_PERCENTILE):
    """Flag scores at or above the given percentile of ``final_score``.

    Returns:
        A list of ints, 1 for entries whose score is greater than or equal to
        the percentile cutoff and 0 otherwise.
    """
    cutoff = np.percentile(final_score, threshold_percentile)
    flagged = np.asarray(final_score) >= cutoff
    return flagged.astype(int).tolist()

# Training Pipeline

In [None]:
# Download data
# Sets up Kaggle credentials if present, downloads the competition archive
# when it is missing, and extracts it into the working directory.
download_kaggle_data()

In [None]:
# Load data
# Reads the extracted test split into `df`, which every later cell uses.
try:
    df = pd.read_csv('public_dataset/task5/test.csv')
except FileNotFoundError:
    print("Error: test.csv not found.")
    # Re-raise rather than call exit(): exit() is an interactive helper that
    # shuts down a notebook kernel and, in a script, ends with status 0
    # despite the failure. Propagating the error stops this cell with a
    # traceback that names the missing path.
    raise
else:
    print("Data loaded successfully.")

## Data Preprocessing

In [None]:
# Preprocess data
# preprocess_data() returns the scaled feature matrix together with the
# fitted imputer and scaler.
X_scaled, imputer, scaler = preprocess_data(df)
print(f"Preprocessed data shape: {X_scaled.shape}")

## Model Training and Prediction

In [None]:
# Train Isolation Forest
# iso_scores holds the model's decision_function value for every row of X_scaled.
print("Training Isolation Forest...")
iso_scores = train_isolation_forest(X_scaled)

In [None]:
# Train PCA Reconstruction
# reconstruction_error is the per-row mean squared difference between
# X_scaled and its PCA reconstruction.
print("Training PCA Reconstruction...")
reconstruction_error = train_pca_reconstruction(X_scaled)

In [None]:
# Combine scores and predict
# final_score: average of the two min-max-normalized scores (higher = more
# anomalous). final_predictions: list of 0/1 flags, 1 where the score is at or
# above the default percentile threshold.
final_score = combine_anomaly_scores(iso_scores, reconstruction_error)
final_predictions = predict_anomalies(final_score)

## Generate Submission

In [None]:
# Create submission
# All original columns plus the 0/1 `is_anomaly` flag, written without the index.
submission = df.assign(is_anomaly=final_predictions)
submission.to_csv('submission.csv', index=False)

# Summary of how many rows were flagged.
anomaly_count = sum(final_predictions)
anomaly_rate = anomaly_count / len(final_predictions) * 100

print("Submission created with ensemble approach.")
print(f"Total anomalies flagged: {anomaly_count} ({anomaly_rate:.2f}%)")

# In Colab, hand the file to the browser as a download.
if COLAB_ENV:
    files.download('submission.csv')