In [1]:
# Colab already has numpy, pandas, scikit-learn, matplotlib, seaborn
# Just verify versions
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns

print("‚úÖ All packages ready!")
print(f"NumPy: {np.__version__}")
print(f"Pandas: {pd.__version__}")
print(f"Scikit-learn: {sklearn.__version__}")


‚úÖ All packages ready!
NumPy: 2.0.2
Pandas: 2.2.2
Scikit-learn: 1.6.1


In [3]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [5]:
import os
BASE_DIR = "/content/drive/MyDrive/iForestAutoAI"
os.makedirs(BASE_DIR, exist_ok=True)

In [6]:
"""
Generate realistic synthetic automotive telemetry data
"""
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
import json

# Set seed for reproducibility
np.random.seed(42)


def generate_fleet_telemetry(
    n_vehicles=250,
    days_per_vehicle=30,
    readings_per_day=4,
    failure_rate=0.05
):
    """
    Generate synthetic automotive telemetry data.

    Args:
        n_vehicles: Number of vehicles in fleet
        days_per_vehicle: Days of history per vehicle
        readings_per_day: Telemetry readings per day
        failure_rate: Proportion of vehicles with issues (5%)

    Returns:
        DataFrame with telemetry data
    """

    print(f"üöó Generating data for {n_vehicles} vehicles...")
    print(f"   {days_per_vehicle} days √ó {readings_per_day} readings/day = {days_per_vehicle * readings_per_day} readings per vehicle")

    all_data = []
    n_failing = int(n_vehicles * failure_rate)

    # Randomly select which vehicles will have issues
    failing_vehicle_ids = np.random.choice(n_vehicles, n_failing, replace=False)

    for vehicle_idx in range(n_vehicles):
        vehicle_id = f"VEH_{vehicle_idx + 1:04d}"
        is_failing = vehicle_idx in failing_vehicle_ids

        # Vehicle-specific baseline (some vehicles run hotter, etc.)
        vehicle_baseline = {
            'engine_temp_offset': np.random.normal(0, 3),
            'oil_pressure_offset': np.random.normal(0, 2),
            'battery_voltage_offset': np.random.normal(0, 0.1),
            'driving_style': np.random.choice(['gentle', 'normal', 'aggressive'], p=[0.2, 0.6, 0.2])
        }

        # Generate readings
        for day in range(days_per_vehicle):
            for reading_idx in range(readings_per_day):

                # Time progression (0 to 1 over the period)
                time_progress = (day * readings_per_day + reading_idx) / (days_per_vehicle * readings_per_day)

                # Degradation factor (failing vehicles degrade faster)
                if is_failing:
                    degradation = time_progress * np.random.uniform(1.5, 2.5)
                else:
                    degradation = time_progress * np.random.uniform(0.1, 0.3)

                # Timestamp
                timestamp = datetime(2024, 1, 1) + timedelta(
                    days=day,
                    hours=reading_idx * 6  # Readings at 0h, 6h, 12h, 18h
                )

                # === ENGINE TEMPERATURE ===
                # Baseline: 85-95¬∞C normal operating temp
                engine_temp_base = 90 + vehicle_baseline['engine_temp_offset']
                engine_temp_noise = np.random.normal(0, 2)
                engine_temp_degradation = degradation * 15  # Up to +15¬∞C when failing

                # Time of day effect (hotter in afternoon)
                hour = timestamp.hour
                time_of_day_effect = 3 * np.sin((hour - 6) * np.pi / 12)  # Peak at 6pm

                engine_temp_c = engine_temp_base + engine_temp_noise + engine_temp_degradation + time_of_day_effect
                engine_temp_c = np.clip(engine_temp_c, 70, 120)

                # === OIL PRESSURE ===
                # Baseline: 35-45 psi normal
                oil_pressure_base = 40 + vehicle_baseline['oil_pressure_offset']
                oil_pressure_noise = np.random.normal(0, 1.5)
                oil_pressure_degradation = degradation * -10  # Drops when failing

                oil_pressure_psi = oil_pressure_base + oil_pressure_noise + oil_pressure_degradation
                oil_pressure_psi = np.clip(oil_pressure_psi, 15, 50)

                # === BRAKE PAD THICKNESS ===
                # New: 10mm, Replace at 2mm
                brake_pad_base = 10
                brake_wear_rate = 0.01 if vehicle_baseline['driving_style'] == 'gentle' else \
                                  0.015 if vehicle_baseline['driving_style'] == 'normal' else 0.025

                if is_failing:
                    brake_wear_rate *= 2  # Faster wear

                brake_pad_mm = brake_pad_base - (day * brake_wear_rate)
                brake_pad_mm += np.random.normal(0, 0.1)  # Measurement noise
                brake_pad_mm = np.clip(brake_pad_mm, 1, 10)

                # === BATTERY VOLTAGE ===
                # Healthy: 12.4-12.8V, Weak: <12.0V
                battery_voltage_base = 12.6 + vehicle_baseline['battery_voltage_offset']
                battery_voltage_noise = np.random.normal(0, 0.05)
                battery_voltage_degradation = degradation * -0.4

                battery_voltage_v = battery_voltage_base + battery_voltage_noise + battery_voltage_degradation
                battery_voltage_v = np.clip(battery_voltage_v, 11.0, 13.0)

                # === TIRE PRESSURES ===
                # Recommended: 32 psi, Variance between tires
                tire_base = 32
                tire_pressures_psi = [
                    tire_base + np.random.normal(0, 1.5) for _ in range(4)
                ]
                tire_pressures_psi = [np.clip(p, 25, 38) for p in tire_pressures_psi]

                # === ENGINE RPM ===
                # Idle: ~800, Highway: ~2500
                if reading_idx in [0, 3]:  # Night/early morning - lower usage
                    rpm_base = np.random.normal(1200, 300)
                else:
                    rpm_base = np.random.normal(2000, 500)
                engine_rpm = np.clip(rpm_base, 700, 6000)

                # === COOLANT TEMPERATURE ===
                # Usually tracks engine temp but slightly lower
                coolant_temp_c = engine_temp_c - np.random.uniform(3, 8)
                coolant_temp_c = np.clip(coolant_temp_c, 65, 110)

                # === FUEL CONSUMPTION ===
                # L/100km - affected by driving style
                fuel_consumption_base = 8 if vehicle_baseline['driving_style'] == 'gentle' else \
                                       10 if vehicle_baseline['driving_style'] == 'normal' else 13
                fuel_consumption = fuel_consumption_base + np.random.normal(0, 1)
                fuel_consumption = np.clip(fuel_consumption, 5, 20)

                # === THROTTLE POSITION ===
                # 0-100% - correlates with driving style
                throttle_mean = 30 if vehicle_baseline['driving_style'] == 'gentle' else \
                               50 if vehicle_baseline['driving_style'] == 'normal' else 70
                throttle_position_pct = np.random.normal(throttle_mean, 15)
                throttle_position_pct = np.clip(throttle_position_pct, 0, 100)

                # === OPERATIONAL METRICS ===
                daily_miles = np.random.uniform(30, 120)

                harsh_braking_events = 0
                if vehicle_baseline['driving_style'] == 'aggressive':
                    harsh_braking_events = np.random.poisson(3)
                elif vehicle_baseline['driving_style'] == 'normal':
                    harsh_braking_events = np.random.poisson(0.5)

                harsh_acceleration_events = 0
                if vehicle_baseline['driving_style'] == 'aggressive':
                    harsh_acceleration_events = np.random.poisson(2.5)
                elif vehicle_baseline['driving_style'] == 'normal':
                    harsh_acceleration_events = np.random.poisson(0.3)

                cold_starts = 1 if reading_idx == 0 else 0  # First reading of day

                # === MAINTENANCE TRACKING ===
                miles_since_last_service = day * 50 + np.random.uniform(-10, 10)

                # === AMBIENT CONDITIONS ===
                # Affects readings
                ambient_temp_c = 20 + 15 * np.sin((day / 365) * 2 * np.pi)  # Seasonal
                ambient_temp_c += 10 * np.sin((hour - 6) * np.pi / 12)  # Daily cycle
                ambient_temp_c = np.clip(ambient_temp_c, -5, 40)

                # === VIBRATION (NEW) ===
                vibration_base = 0.5  # Low vibration is good
                vibration_degradation = degradation * 2  # Increases with wear
                vibration_level = vibration_base + vibration_degradation + np.random.normal(0, 0.1)
                vibration_level = np.clip(vibration_level, 0, 5)

                # === TRANSMISSION TEMP (NEW) ===
                transmission_temp_base = 80
                transmission_temp = transmission_temp_base + degradation * 10 + np.random.normal(0, 3)
                transmission_temp = np.clip(transmission_temp, 70, 110)

                # === CONSTRUCT RECORD ===
                record = {
                    'vehicle_id': vehicle_id,
                    'timestamp': timestamp,
                    'is_failing': is_failing,  # Ground truth label

                    # Core sensors
                    'engine_temp_c': round(engine_temp_c, 1),
                    'oil_pressure_psi': round(oil_pressure_psi, 1),
                    'brake_pad_mm': round(brake_pad_mm, 2),
                    'battery_voltage_v': round(battery_voltage_v, 2),

                    # Extended sensors
                    'tire_pressure_fl': round(tire_pressures_psi[0], 1),
                    'tire_pressure_fr': round(tire_pressures_psi[1], 1),
                    'tire_pressure_rl': round(tire_pressures_psi[2], 1),
                    'tire_pressure_rr': round(tire_pressures_psi[3], 1),
                    'engine_rpm': int(engine_rpm),
                    'coolant_temp_c': round(coolant_temp_c, 1),
                    'fuel_consumption_l_per_100km': round(fuel_consumption, 1),
                    'throttle_position_pct': round(throttle_position_pct, 1),
                    'vibration_level': round(vibration_level, 2),
                    'transmission_temp_c': round(transmission_temp, 1),

                    # Operational
                    'daily_miles': round(daily_miles, 1),
                    'harsh_braking_events': harsh_braking_events,
                    'harsh_acceleration_events': harsh_acceleration_events,
                    'cold_starts': cold_starts,
                    'miles_since_last_service': round(miles_since_last_service, 1),

                    # Environmental
                    'ambient_temp_c': round(ambient_temp_c, 1),

                    # Metadata
                    'driving_style': vehicle_baseline['driving_style'],
                    'reading_of_day': reading_idx + 1
                }

                all_data.append(record)

        if (vehicle_idx + 1) % 50 == 0:
            print(f"   ‚úì Generated {vehicle_idx + 1}/{n_vehicles} vehicles")

    df = pd.DataFrame(all_data)

    print(f"\n‚úÖ Dataset generated:")
    print(f"   Total records: {len(df):,}")
    print(f"   Vehicles: {df['vehicle_id'].nunique()}")
    print(f"   Failing vehicles: {df[df['is_failing']]['vehicle_id'].nunique()} ({failure_rate*100}%)")
    print(f"   Date range: {df['timestamp'].min()} to {df['timestamp'].max()}")
    print(f"   Features: {len(df.columns)}")

    return df


def save_dataset(df, train_ratio=0.7, val_ratio=0.15):
    """
    Split and save dataset into train/val/test sets.
    Split by vehicle (not by rows) to avoid leakage.
    """

    vehicles = df['vehicle_id'].unique()
    np.random.shuffle(vehicles)

    n_train = int(len(vehicles) * train_ratio)
    n_val = int(len(vehicles) * val_ratio)

    train_vehicles = vehicles[:n_train]
    val_vehicles = vehicles[n_train:n_train + n_val]
    test_vehicles = vehicles[n_train + n_val:]

    train_df = df[df['vehicle_id'].isin(train_vehicles)]
    val_df = df[df['vehicle_id'].isin(val_vehicles)]
    test_df = df[df['vehicle_id'].isin(test_vehicles)]

    print(f"\nüìä Dataset split:")
    print(f"   Train: {len(train_vehicles)} vehicles ({len(train_df):,} records)")
    print(f"   Val:   {len(val_vehicles)} vehicles ({len(val_df):,} records)")
    print(f"   Test:  {len(test_vehicles)} vehicles ({len(test_df):,} records)")

    # === SAVE TO GOOGLE DRIVE ===
    train_path = os.path.join(BASE_DIR, "train_telemetry.csv")
    val_path = os.path.join(BASE_DIR, "val_telemetry.csv")
    test_path = os.path.join(BASE_DIR, "test_telemetry.csv")
    meta_path = os.path.join(BASE_DIR, "dataset_metadata.json")

    train_df.to_csv(train_path, index=False)
    val_df.to_csv(val_path, index=False)
    test_df.to_csv(test_path, index=False)

    metadata = {
        'generated_at': datetime.now().isoformat(),
        'n_vehicles_total': len(vehicles),
        'n_vehicles_train': len(train_vehicles),
        'n_vehicles_val': len(val_vehicles),
        'n_vehicles_test': len(test_vehicles),
        'n_records_total': len(df),
        'failure_rate': float(df['is_failing'].mean()),
        'features': list(df.columns),
        'date_range': {
            'start': df['timestamp'].min().isoformat(),
            'end': df['timestamp'].max().isoformat()
        }
    }

    with open(meta_path, "w") as f:
        json.dump(metadata, f, indent=2)

    print(f"\n‚úÖ Saved to Google Drive:")
    print(f"   {train_path}")
    print(f"   {val_path}")
    print(f"   {test_path}")
    print(f"   {meta_path}")

    return train_df, val_df, test_df



if __name__ == "__main__":
    print("=" * 60)
    print("üöÄ SYNTHETIC AUTOMOTIVE TELEMETRY GENERATOR")
    print("=" * 60)

    # Generate dataset
    df = generate_fleet_telemetry(
        n_vehicles=250,
        days_per_vehicle=30,
        readings_per_day=4,
        failure_rate=0.05
    )

    # Split and save
    train_df, val_df, test_df = save_dataset(df)

    print("\n" + "=" * 60)
    print("üéâ DATA GENERATION COMPLETE!")
    print("=" * 60)

üöÄ SYNTHETIC AUTOMOTIVE TELEMETRY GENERATOR
üöó Generating data for 250 vehicles...
   30 days √ó 4 readings/day = 120 readings per vehicle
   ‚úì Generated 50/250 vehicles
   ‚úì Generated 100/250 vehicles
   ‚úì Generated 150/250 vehicles
   ‚úì Generated 200/250 vehicles
   ‚úì Generated 250/250 vehicles

‚úÖ Dataset generated:
   Total records: 30,000
   Vehicles: 250
   Failing vehicles: 12 (5.0%)
   Date range: 2024-01-01 00:00:00 to 2024-01-30 18:00:00
   Features: 25

üìä Dataset split:
   Train: 175 vehicles (21,000 records)
   Val:   37 vehicles (4,440 records)
   Test:  38 vehicles (4,560 records)

‚úÖ Saved to Google Drive:
   /content/drive/MyDrive/iForestAutoAI/train_telemetry.csv
   /content/drive/MyDrive/iForestAutoAI/val_telemetry.csv
   /content/drive/MyDrive/iForestAutoAI/test_telemetry.csv
   /content/drive/MyDrive/iForestAutoAI/dataset_metadata.json

üéâ DATA GENERATION COMPLETE!


In [7]:
"""
Extract features from raw telemetry for ML model
"""

import pandas as pd
import numpy as np
from typing import Dict, List
import os

# ======================================================
# GOOGLE DRIVE PATH CONFIG
# ======================================================
BASE_DIR = "/content/drive/MyDrive/iForestAutoAI"

TRAIN_PATH = os.path.join(BASE_DIR, "train_telemetry.csv")
VAL_PATH   = os.path.join(BASE_DIR, "val_telemetry.csv")
TEST_PATH  = os.path.join(BASE_DIR, "test_telemetry.csv")


def extract_features_from_vehicle(vehicle_df: pd.DataFrame) -> Dict:
    """
    Extract feature vector from a single vehicle's telemetry history.
    """

    vehicle_df = vehicle_df.sort_values('timestamp').reset_index(drop=True)

    if len(vehicle_df) < 10:
        raise ValueError(f"Insufficient data: only {len(vehicle_df)} records")

    latest = vehicle_df.iloc[-1]
    df_7d = vehicle_df.tail(28)
    df_all = vehicle_df

    features = {}

    # ===== CURRENT STATE =====
    features['engine_temp_c'] = latest['engine_temp_c']
    features['oil_pressure_psi'] = latest['oil_pressure_psi']
    features['brake_pad_mm'] = latest['brake_pad_mm']
    features['battery_voltage_v'] = latest['battery_voltage_v']
    features['vibration_level'] = latest['vibration_level']
    features['transmission_temp_c'] = latest['transmission_temp_c']

    tire_pressures = [
        latest['tire_pressure_fl'],
        latest['tire_pressure_fr'],
        latest['tire_pressure_rl'],
        latest['tire_pressure_rr']
    ]
    features['tire_pressure_avg'] = np.mean(tire_pressures)
    features['tire_pressure_std'] = np.std(tire_pressures)
    features['tire_pressure_min'] = np.min(tire_pressures)

    # ===== 7-DAY STATS =====
    features['engine_temp_mean_7d'] = df_7d['engine_temp_c'].mean()
    features['engine_temp_std_7d'] = df_7d['engine_temp_c'].std()
    features['engine_temp_max_7d'] = df_7d['engine_temp_c'].max()
    features['engine_temp_min_7d'] = df_7d['engine_temp_c'].min()

    features['oil_pressure_mean_7d'] = df_7d['oil_pressure_psi'].mean()
    features['oil_pressure_std_7d'] = df_7d['oil_pressure_psi'].std()
    features['oil_pressure_min_7d'] = df_7d['oil_pressure_psi'].min()

    features['battery_voltage_mean_7d'] = df_7d['battery_voltage_v'].mean()
    features['battery_voltage_min_7d'] = df_7d['battery_voltage_v'].min()

    features['coolant_temp_mean_7d'] = df_7d['coolant_temp_c'].mean()
    features['coolant_temp_max_7d'] = df_7d['coolant_temp_c'].max()

    features['vibration_mean_7d'] = df_7d['vibration_level'].mean()
    features['vibration_max_7d'] = df_7d['vibration_level'].max()

    features['transmission_temp_mean_7d'] = df_7d['transmission_temp_c'].mean()
    features['transmission_temp_max_7d'] = df_7d['transmission_temp_c'].max()

    # ===== 30-DAY TRENDS =====
    days_elapsed = max(
        (df_all.iloc[-1]['timestamp'] - df_all.iloc[0]['timestamp']).days, 1
    )

    features['brake_wear_rate_30d'] = (
        df_all.iloc[0]['brake_pad_mm'] - df_all.iloc[-1]['brake_pad_mm']
    ) / days_elapsed

    features['oil_pressure_drop_rate_30d'] = (
        df_all.iloc[0]['oil_pressure_psi'] - df_all.iloc[-1]['oil_pressure_psi']
    ) / days_elapsed

    features['battery_voltage_drop_rate_30d'] = (
        df_all.iloc[0]['battery_voltage_v'] - df_all.iloc[-1]['battery_voltage_v']
    ) / days_elapsed

    features['vibration_increase_rate_30d'] = (
        df_all.iloc[-1]['vibration_level'] - df_all.iloc[0]['vibration_level']
    ) / days_elapsed

    # ===== OPERATIONAL =====
    features['miles_since_last_service'] = latest['miles_since_last_service']
    features['avg_daily_miles_30d'] = df_all['daily_miles'].mean()
    features['total_harsh_braking_7d'] = df_7d['harsh_braking_events'].sum()
    features['total_harsh_acceleration_7d'] = df_7d['harsh_acceleration_events'].sum()
    features['total_cold_starts_7d'] = df_7d['cold_starts'].sum()
    features['avg_fuel_consumption_7d'] = df_7d['fuel_consumption_l_per_100km'].mean()
    features['avg_rpm_7d'] = df_7d['engine_rpm'].mean()
    features['max_rpm_7d'] = df_7d['engine_rpm'].max()

    # ===== DERIVED =====
    features['engine_stress_index'] = (
        features['engine_temp_c'] / max(features['oil_pressure_psi'], 1)
    )

    features['cooling_efficiency'] = (
        features['engine_temp_c'] - features['coolant_temp_mean_7d']
    )

    features['brake_health_score'] = (
        features['brake_pad_mm'] - 10 * features['brake_wear_rate_30d']
    )

    features['battery_health_indicator'] = (
        features['battery_voltage_v'] - 100 * features['battery_voltage_drop_rate_30d']
    )

    features['temp_stability_score'] = 1 / (1 + features['engine_temp_std_7d'])

    total_harsh = (
        features['total_harsh_braking_7d'] +
        features['total_harsh_acceleration_7d']
    )
    features['driving_aggression_score'] = total_harsh / 7

    features['tire_balance_score'] = 1 / (1 + features['tire_pressure_std'])

    return features


def get_feature_names() -> List[str]:
    return list(extract_features_from_vehicle.__annotations__.keys())


def features_to_vector(features: Dict) -> np.ndarray:
    names = [
        'engine_temp_c','oil_pressure_psi','brake_pad_mm','battery_voltage_v',
        'vibration_level','transmission_temp_c','tire_pressure_avg',
        'tire_pressure_std','tire_pressure_min','engine_temp_mean_7d',
        'engine_temp_std_7d','engine_temp_max_7d','engine_temp_min_7d',
        'oil_pressure_mean_7d','oil_pressure_std_7d','oil_pressure_min_7d',
        'battery_voltage_mean_7d','battery_voltage_min_7d',
        'coolant_temp_mean_7d','coolant_temp_max_7d',
        'vibration_mean_7d','vibration_max_7d',
        'transmission_temp_mean_7d','transmission_temp_max_7d',
        'brake_wear_rate_30d','oil_pressure_drop_rate_30d',
        'battery_voltage_drop_rate_30d','vibration_increase_rate_30d',
        'miles_since_last_service','avg_daily_miles_30d',
        'total_harsh_braking_7d','total_harsh_acceleration_7d',
        'total_cold_starts_7d','avg_fuel_consumption_7d',
        'avg_rpm_7d','max_rpm_7d','engine_stress_index',
        'cooling_efficiency','brake_health_score',
        'battery_health_indicator','temp_stability_score',
        'driving_aggression_score','tire_balance_score'
    ]
    return np.array([features[n] for n in names])


def prepare_training_data(telemetry_df: pd.DataFrame):
    X, y, vehicle_ids = [], [], []

    for vid in telemetry_df['vehicle_id'].unique():
        vdf = telemetry_df[telemetry_df['vehicle_id'] == vid]
        try:
            feats = extract_features_from_vehicle(vdf)
            X.append(features_to_vector(feats))
            y.append(int(vdf.iloc[0]['is_failing']))
            vehicle_ids.append(vid)
        except Exception as e:
            print(f"‚ö†Ô∏è Skipping {vid}: {e}")

    return np.array(X), np.array(y), vehicle_ids


if __name__ == "__main__":
    print("üì• Loading training data from Google Drive...")
    df = pd.read_csv(TRAIN_PATH)
    df['timestamp'] = pd.to_datetime(df['timestamp'])

    X, y, vehicle_ids = prepare_training_data(df)

    print(f"‚úÖ Feature matrix shape: {X.shape}")
    print(f"‚úÖ Labels ‚Üí Failing: {y.sum()}, Normal: {(y==0).sum()}")


üì• Loading training data from Google Drive...
‚úÖ Feature matrix shape: (175, 43)
‚úÖ Labels ‚Üí Failing: 10, Normal: 165


In [10]:
"""
Train Isolation Forest model
"""
import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.metrics import (
    confusion_matrix,
    classification_report,
    roc_auc_score,
    precision_recall_curve,
    roc_curve
)
import joblib
import json
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns




def train_isolation_forest(X_train, contamination=0.05):
    """
    Train Isolation Forest model.

    Args:
        X_train: Feature matrix
        contamination: Expected proportion of anomalies

    Returns:
        Trained model
    """
    print(f"üèãÔ∏è  Training Isolation Forest...")
    print(f"   Samples: {X_train.shape[0]}")
    print(f"   Features: {X_train.shape[1]}")
    print(f"   Contamination: {contamination}")

    model = IsolationForest(
        n_estimators=100,
        max_samples=256,
        contamination=contamination,
        random_state=42,
        n_jobs=-1,
        verbose=1
    )

    model.fit(X_train)

    print("‚úÖ Training complete!")
    return model


def transform_scores_to_risk(scores):
    """
    Transform Isolation Forest anomaly scores to 0-1 risk scale.

    Isolation Forest returns negative scores (more negative = more anomalous).
    We use sigmoid to convert to intuitive 0-1 risk scale.
    """
    return 1 / (1 + np.exp(scores * 5))


def evaluate_model(model, X, y_true, dataset_name="Dataset"):
    """
    Comprehensive model evaluation.

    Args:
        model: Trained Isolation Forest
        X: Features
        y_true: Ground truth labels (1=failing, 0=normal)
        dataset_name: Name for logging

    Returns:
        Dictionary of metrics
    """
    print(f"\nüìä Evaluating on {dataset_name}...")

    # Get predictions
    anomaly_scores = model.score_samples(X)
    risk_scores = transform_scores_to_risk(anomaly_scores)
    predictions = model.predict(X)  # -1 = anomaly, 1 = normal

    # Convert to binary (1=anomaly, 0=normal)
    y_pred = (predictions == -1).astype(int)

    # Confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    tn, fp, fn, tp = cm.ravel()

    # Metrics
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    # ROC-AUC (using risk scores)
    roc_auc = roc_auc_score(y_true, risk_scores)

    metrics = {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'roc_auc': roc_auc,
        'confusion_matrix': {
            'true_negatives': int(tn),
            'false_positives': int(fp),
            'false_negatives': int(fn),
            'true_positives': int(tp)
        },
        'n_samples': len(y_true),
        'n_failing': int(y_true.sum()),
        'n_detected': int(y_pred.sum())
    }

    # Print report
    print(f"\n{'='*50}")
    print(f"  {dataset_name} Results")
    print(f"{'='*50}")
    print(f"  Samples: {metrics['n_samples']}")
    print(f"  Actual failing: {metrics['n_failing']}")
    print(f"  Detected as anomaly: {metrics['n_detected']}")
    print(f"\n  Accuracy:  {accuracy:.3f}")
    print(f"  Precision: {precision:.3f}")
    print(f"  Recall:    {recall:.3f}")
    print(f"  F1 Score:  {f1:.3f}")
    print(f"  ROC-AUC:   {roc_auc:.3f}")
    print(f"\n  Confusion Matrix:")
    print(f"    TN: {tn:3d}  |  FP: {fp:3d}")
    print(f"    FN: {fn:3d}  |  TP: {tp:3d}")
    print(f"{'='*50}")

    return metrics, risk_scores, y_pred


def plot_evaluation_results(y_true_train, risk_scores_train, y_pred_train,
                           y_true_val, risk_scores_val, y_pred_val):
    """Create visualizations of model performance."""
    fig, axes = plt.subplots(2, 3, figsize=(18, 12))

    # 1. Risk Score Distribution (Train)
    axes[0, 0].hist(risk_scores_train[y_true_train == 0], bins=30, alpha=0.5, label='Normal', color='green')
    axes[0, 0].hist(risk_scores_train[y_true_train == 1], bins=30, alpha=0.5, label='Failing', color='red')
    axes[0, 0].set_xlabel('Risk Score')
    axes[0, 0].set_ylabel('Count')
    axes[0, 0].set_title('Train: Risk Score Distribution')
    axes[0, 0].legend()
    axes[0, 0].axvline(0.5, color='black', linestyle='--', label='Threshold')

    # 2. Risk Score Distribution (Val)
    axes[0, 1].hist(risk_scores_val[y_true_val == 0], bins=30, alpha=0.5, label='Normal', color='green')
    axes[0, 1].hist(risk_scores_val[y_true_val == 1], bins=30, alpha=0.5, label='Failing', color='red')
    axes[0, 1].set_xlabel('Risk Score')
    axes[0, 1].set_ylabel('Count')
    axes[0, 1].set_title('Validation: Risk Score Distribution')
    axes[0, 1].legend()
    axes[0, 1].axvline(0.5, color='black', linestyle='--')

    # 3. Confusion Matrix (Val)
    cm = confusion_matrix(y_true_val, y_pred_val)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=axes[0, 2])
    axes[0, 2].set_xlabel('Predicted')
    axes[0, 2].set_ylabel('Actual')
    axes[0, 2].set_title('Validation: Confusion Matrix')
    axes[0, 2].set_xticklabels(['Normal', 'Failing'])
    axes[0, 2].set_yticklabels(['Normal', 'Failing'])

    # 4. ROC Curve (Train)
    fpr_train, tpr_train, _ = roc_curve(y_true_train, risk_scores_train)
    roc_auc_train = roc_auc_score(y_true_train, risk_scores_train)
    axes[1, 0].plot(fpr_train, tpr_train, label=f'Train (AUC = {roc_auc_train:.3f})')
    axes[1, 0].plot([0, 1], [0, 1], 'k--', label='Random')
    axes[1, 0].set_xlabel('False Positive Rate')
    axes[1, 0].set_ylabel('True Positive Rate')
    axes[1, 0].set_title('Train: ROC Curve')
    axes[1, 0].legend()
    axes[1, 0].grid(True, alpha=0.3)

    # 5. ROC Curve (Val)
    fpr_val, tpr_val, _ = roc_curve(y_true_val, risk_scores_val)
    roc_auc_val = roc_auc_score(y_true_val, risk_scores_val)
    axes[1, 1].plot(fpr_val, tpr_val, label=f'Val (AUC = {roc_auc_val:.3f})')
    axes[1, 1].plot([0, 1], [0, 1], 'k--', label='Random')
    axes[1, 1].set_xlabel('False Positive Rate')
    axes[1, 1].set_ylabel('True Positive Rate')
    axes[1, 1].set_title('Validation: ROC Curve')
    axes[1, 1].legend()
    axes[1, 1].grid(True, alpha=0.3)

    # 6. Precision-Recall Curve (Val)
    precision, recall, _ = precision_recall_curve(y_true_val, risk_scores_val)
    axes[1, 2].plot(recall, precision)
    axes[1, 2].set_xlabel('Recall')
    axes[1, 2].set_ylabel('Precision')
    axes[1, 2].set_title('Validation: Precision-Recall Curve')
    axes[1, 2].grid(True, alpha=0.3)

    plt.tight_layout()
    plt.savefig('logs/model_evaluation.png', dpi=150, bbox_inches='tight')
    print("‚úÖ Saved evaluation plot: logs/model_evaluation.png")
    plt.close()


def save_model_and_metadata(model, metrics_train, metrics_val, X_train):
    """Save model and metadata."""
    # Save model
    model_path = 'models/isolation_forest_v1.pkl'
    joblib.dump(model, model_path)
    print(f"‚úÖ Model saved: {model_path}")

    # Save metadata
    metadata = {
        'model_type': 'IsolationForest',
        'trained_at': datetime.now().isoformat(),
        'sklearn_version': joblib.__version__,
        'hyperparameters': {
            'n_estimators': model.n_estimators,
            'max_samples': model.max_samples,
            'contamination': model.contamination,
            'random_state': model.random_state
        },
        'training_data': {
            'n_samples': X_train.shape[0],
            'n_features': X_train.shape[1],
            'feature_names': get_feature_names()
        },
        'performance': {
            'train': metrics_train,
            'validation': metrics_val
        }
    }

    metadata_path = 'models/model_metadata.json'
    with open(metadata_path, 'w') as f:
        json.dump(metadata, f, indent=2)

    print(f"‚úÖ Metadata saved: {metadata_path}")


def main():
    """Main training pipeline."""
    print("=" * 60)
    print("üöÄ ISOLATION FOREST TRAINING PIPELINE")
    print("=" * 60)

    # Load data
    print("\nüì• Loading datasets...")
    train_df = pd.read_csv('/content/drive/MyDrive/iForestAutoAI/train_telemetry.csv')
    val_df = pd.read_csv('/content/drive/MyDrive/iForestAutoAI/val_telemetry.csv')

    train_df['timestamp'] = pd.to_datetime(train_df['timestamp'])
    val_df['timestamp'] = pd.to_datetime(val_df['timestamp'])

    print(f"‚úÖ Train: {len(train_df)} records, {train_df['vehicle_id'].nunique()} vehicles")
    print(f"‚úÖ Val:   {len(val_df)} records, {val_df['vehicle_id'].nunique()} vehicles")

    # Extract features
    print("\nüîß Extracting features...")
    X_train, y_train, train_vehicle_ids = prepare_training_data(train_df)
    X_val, y_val, val_vehicle_ids = prepare_training_data(val_df)

    print(f"‚úÖ Train features: {X_train.shape}")
    print(f"‚úÖ Val features:   {X_val.shape}")

    # Train model
    print("\nüèãÔ∏è  Training model...")
    model = train_isolation_forest(X_train, contamination=0.05)

    # Evaluate
    print("\nüìä Evaluating model...")
    metrics_train, risk_scores_train, y_pred_train = evaluate_model(model, X_train, y_train, "TRAIN")
    metrics_val, risk_scores_val, y_pred_val = evaluate_model(model, X_val, y_val, "VALIDATION")

    # Plot results
    print("\nüìà Creating visualizations...")
    plot_evaluation_results(
        y_train, risk_scores_train, y_pred_train,
        y_val, risk_scores_val, y_pred_val
    )

    # Save model
    print("\nüíæ Saving model...")
    save_model_and_metadata(model, metrics_train, metrics_val, X_train)

    print("\n" + "=" * 60)
    print("üéâ TRAINING COMPLETE!")
    print("=" * 60)
    print(f"\nüìä Final Performance:")
    print(f"   Train F1:     {metrics_train['f1_score']:.3f}")
    print(f"   Val F1:       {metrics_val['f1_score']:.3f}")
    print(f"   Train ROC-AUC: {metrics_train['roc_auc']:.3f}")
    print(f"   Val ROC-AUC:   {metrics_val['roc_auc']:.3f}")


if __name__ == "__main__":
    import os
    os.makedirs('logs', exist_ok=True)
    os.makedirs('models', exist_ok=True)

    main()

üöÄ ISOLATION FOREST TRAINING PIPELINE

üì• Loading datasets...
‚úÖ Train: 21000 records, 175 vehicles
‚úÖ Val:   4440 records, 37 vehicles

üîß Extracting features...
‚úÖ Train features: (175, 43)
‚úÖ Val features:   (37, 43)

üèãÔ∏è  Training model...
üèãÔ∏è  Training Isolation Forest...
   Samples: 175
   Features: 43
   Contamination: 0.05


  warn(
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    0.4s finished
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished


‚úÖ Training complete!

üìä Evaluating model...

üìä Evaluating on TRAIN...

  TRAIN Results
  Samples: 175
  Actual failing: 10
  Detected as anomaly: 9

  Accuracy:  0.994
  Precision: 1.000
  Recall:    0.900
  F1 Score:  0.947
  ROC-AUC:   1.000

  Confusion Matrix:
    TN: 165  |  FP:   0
    FN:   1  |  TP:   9

üìä Evaluating on VALIDATION...

  VALIDATION Results
  Samples: 37
  Actual failing: 1
  Detected as anomaly: 1

  Accuracy:  1.000
  Precision: 1.000
  Recall:    1.000
  F1 Score:  1.000
  ROC-AUC:   1.000

  Confusion Matrix:
    TN:  36  |  FP:   0
    FN:   0  |  TP:   1

üìà Creating visualizations...
‚úÖ Saved evaluation plot: logs/model_evaluation.png

üíæ Saving model...
‚úÖ Model saved: models/isolation_forest_v1.pkl
‚úÖ Metadata saved: models/model_metadata.json

üéâ TRAINING COMPLETE!

üìä Final Performance:
   Train F1:     0.947
   Val F1:       1.000
   Train ROC-AUC: 1.000
   Val ROC-AUC:   1.000


In [12]:
"""
Test trained model on holdout test set
"""
import numpy as np
import pandas as pd
import joblib
import json


def analyze_failure_cases(model, X_test, y_test, vehicle_ids, test_df):
    """Analyze false positives and false negatives."""

    anomaly_scores = model.score_samples(X_test)
    risk_scores = transform_scores_to_risk(anomaly_scores)
    predictions = (model.predict(X_test) == -1).astype(int)

    # Find misclassifications
    false_positives = np.where((predictions == 1) & (y_test == 0))[0]
    false_negatives = np.where((predictions == 0) & (y_test == 1))[0]

    print(f"\nüîç Analyzing Failure Cases...")
    print(f"   False Positives: {len(false_positives)}")
    print(f"   False Negatives: {len(false_negatives)}")

    # Show examples
    print(f"\n‚ùå False Positives (predicted failing, actually normal):")
    for idx in false_positives[:3]:
        vehicle_id = vehicle_ids[idx]
        risk = risk_scores[idx]
        print(f"   {vehicle_id}: risk={risk:.3f}")

    print(f"\n‚ùå False Negatives (predicted normal, actually failing):")
    for idx in false_negatives[:3]:
        vehicle_id = vehicle_ids[idx]
        risk = risk_scores[idx]
        print(f"   {vehicle_id}: risk={risk:.3f}")


def test_on_individual_vehicles(model, test_df, n_examples=5):
    """Test inference on individual vehicles (simulates production)."""

    print(f"\nüß™ Testing Individual Vehicle Inference...")

    vehicles = test_df['vehicle_id'].unique()[:n_examples]

    for vehicle_id in vehicles:
        vehicle_data = test_df[test_df['vehicle_id'] == vehicle_id]
        is_actually_failing = vehicle_data.iloc[0]['is_failing']

        # Extract features
        features = extract_features_from_vehicle(vehicle_data)
        vector = features_to_vector(features).reshape(1, -1)

        # Predict
        anomaly_score = model.score_samples(vector)[0]
        risk_score = transform_scores_to_risk(np.array([anomaly_score]))[0]
        prediction = "FAILING" if model.predict(vector)[0] == -1 else "NORMAL"

        status = "‚úÖ" if (prediction == "FAILING" and is_actually_failing) or \
                        (prediction == "NORMAL" and not is_actually_failing) else "‚ùå"

        print(f"\n{status} {vehicle_id}:")
        print(f"   Predicted: {prediction} (risk={risk_score:.3f})")
        print(f"   Actual: {'FAILING' if is_actually_failing else 'NORMAL'}")
        print(f"   Top concerns:")

        # Show top anomalous features
        concerns = []
        if features['brake_pad_mm'] < 4.0:
            concerns.append(f"Low brake pads ({features['brake_pad_mm']:.1f}mm)")
        if features['engine_temp_max_7d'] > 100:
            concerns.append(f"High engine temp ({features['engine_temp_max_7d']:.1f}¬∞C)")
        if features['battery_voltage_v'] < 12.2:
            concerns.append(f"Low battery ({features['battery_voltage_v']:.2f}V)")
        if features['oil_pressure_min_7d'] < 30:
            concerns.append(f"Low oil pressure ({features['oil_pressure_min_7d']:.1f} psi)")

        for concern in concerns[:3]:
            print(f"     - {concern}")


def main():
    """Test pipeline."""
    print("=" * 60)
    print("üß™ MODEL TESTING ON HOLDOUT SET")
    print("=" * 60)

    # Load model
    print("\nüì• Loading trained model...")
    model = joblib.load('/content/models/isolation_forest_v1.pkl')

    with open('models/model_metadata.json') as f:
        metadata = json.load(f)

    print(f"‚úÖ Loaded model trained on {metadata['trained_at']}")
    print(f"   Train F1: {metadata['performance']['train']['f1_score']:.3f}")
    print(f"   Val F1:   {metadata['performance']['validation']['f1_score']:.3f}")

    # Load test data
    print("\nüì• Loading test data...")
    test_df = pd.read_csv('/content/drive/MyDrive/iForestAutoAI/test_telemetry.csv')
    test_df['timestamp'] = pd.to_datetime(test_df['timestamp'])
    print(f"‚úÖ Test: {len(test_df)} records, {test_df['vehicle_id'].nunique()} vehicles")

    # Extract features
    print("\nüîß Extracting features...")
    X_test, y_test, test_vehicle_ids = prepare_training_data(test_df)
    print(f"‚úÖ Test features: {X_test.shape}")

    # Evaluate
    print("\nüìä Evaluating on test set...")
    metrics_test, risk_scores, y_pred = evaluate_model(model, X_test, y_test, "TEST")

    # Analyze failures
    analyze_failure_cases(model, X_test, y_test, test_vehicle_ids, test_df)

    # Test individual vehicles
    test_on_individual_vehicles(model, test_df, n_examples=5)

    # Save test results
    test_results = {
        'tested_at': datetime.now().isoformat(),
        'test_set_size': len(X_test),
        'metrics': metrics_test
    }

    with open('logs/test_results.json', 'w') as f:
        json.dump(test_results, f, indent=2)

    print("\n" + "=" * 60)
    print("üéâ TESTING COMPLETE!")
    print("=" * 60)
    print(f"\nüìä Test Performance:")
    print(f"   Accuracy:  {metrics_test['accuracy']:.3f}")
    print(f"   Precision: {metrics_test['precision']:.3f}")
    print(f"   Recall:    {metrics_test['recall']:.3f}")
    print(f"   F1 Score:  {metrics_test['f1_score']:.3f}")
    print(f"   ROC-AUC:   {metrics_test['roc_auc']:.3f}")


if __name__ == "__main__":
    from datetime import datetime
    main()

üß™ MODEL TESTING ON HOLDOUT SET

üì• Loading trained model...
‚úÖ Loaded model trained on 2025-12-15T04:28:01.452342
   Train F1: 0.947
   Val F1:   1.000

üì• Loading test data...
‚úÖ Test: 4560 records, 38 vehicles

üîß Extracting features...
‚úÖ Test features: (38, 43)

üìä Evaluating on test set...

üìä Evaluating on TEST...

  TEST Results
  Samples: 38
  Actual failing: 1
  Detected as anomaly: 1

  Accuracy:  1.000
  Precision: 1.000
  Recall:    1.000
  F1 Score:  1.000
  ROC-AUC:   1.000

  Confusion Matrix:
    TN:  37  |  FP:   0
    FN:   0  |  TP:   1

üîç Analyzing Failure Cases...
   False Positives: 0
   False Negatives: 0

‚ùå False Positives (predicted failing, actually normal):

‚ùå False Negatives (predicted normal, actually failing):

üß™ Testing Individual Vehicle Inference...

‚úÖ VEH_0001:
   Predicted: NORMAL (risk=0.901)
   Actual: NORMAL
   Top concerns:

‚úÖ VEH_0007:
   Predicted: FAILING (risk=0.964)
   Actual: FAILING
   Top concerns:
     - High

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: 


‚úÖ VEH_0030:
   Predicted: NORMAL (risk=0.886)
   Actual: NORMAL
   Top concerns:
     - High engine temp (103.4¬∞C)

‚úÖ VEH_0037:
   Predicted: NORMAL (risk=0.926)
   Actual: NORMAL
   Top concerns:
     - High engine temp (102.5¬∞C)

‚úÖ VEH_0038:
   Predicted: NORMAL (risk=0.908)
   Actual: NORMAL
   Top concerns:
     - High engine temp (101.0¬∞C)

üéâ TESTING COMPLETE!

üìä Test Performance:
   Accuracy:  1.000
   Precision: 1.000
   Recall:    1.000
   F1 Score:  1.000
   ROC-AUC:   1.000


[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished


In [14]:
import shutil
import os

os.makedirs("/content/drive/MyDrive/iForest/logs", exist_ok=True)
os.makedirs("/content/drive/MyDrive/iForest/models", exist_ok=True)

shutil.move("/content/logs/model_evaluation.png",
            "/content/drive/MyDrive/iForest/logs/model_evaluation.png")

shutil.move("/content/models/isolation_forest_v1.pkl",
            "/content/drive/MyDrive/iForest/models/isolation_forest_v1.pkl")

shutil.move("/content/models/model_metadata.json",
            "/content/drive/MyDrive/iForest/models/model_metadata.json")

print("‚úÖ Files moved to Google Drive")


‚úÖ Files moved to Google Drive
