### Setup & Imports

In [None]:

"""TRANSFER LEARNING DATA PREPROCESSING PIPELINE

This notebook preprocesses and aligns datasets for transfer learning
experiments. Run each checkpoint section, verify files saved to Drive,
then proceed to next checkpoint.

SOURCE: CICIOMT
TARGETS: CIC-IoT, IoT-23, IDS-2018
"""

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Core imports
import os
import gc
import time
import pickle
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder, RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.utils import resample

print("Libraries imported successfully")

# Project folders on the mounted Drive. The raw, preprocessed and aligned
# folders all live directly under BASE_DIR.
BASE_DIR = '/content/drive/My Drive/Project_Final_Submission'
RAW_DIR = BASE_DIR + '/raw_datasets'
PREPROCESSED_DIR = BASE_DIR + '/preprocessed_datasets'
ALIGNED_DIR = BASE_DIR + '/enhanced_aligned_datasets'

# Ensure the folder tree exists; exist_ok makes this a no-op on re-runs.
for directory in (BASE_DIR, RAW_DIR, PREPROCESSED_DIR, ALIGNED_DIR):
    os.makedirs(directory, exist_ok=True)
    print("Created: " + directory)

print("\n" + "=" * 70, "DIRECTORY STRUCTURE READY", "=" * 70, sep="\n")

### Install Kaggle & Setup Credentials

In [None]:
"""
=============================================================================
KAGGLE SETUP - Manual Credentials
=============================================================================
"""
# Install kagglehub
!pip install -q kagglehub

import kagglehub
import json

# Configure Kaggle credentials
print("="*70)
print("KAGGLE AUTHENTICATION")
print("="*70)

# Manual input for credentials
print("\nEnter your Kaggle credentials:")
print("(Find these at: https://www.kaggle.com/settings â†’ API section)\n")

kaggle_username = input("Enter your Kaggle username: ").strip()
kaggle_key = input("Enter your Kaggle API key: ").strip()

# Create kaggle.json content
kaggle_config = {
    "username": kaggle_username,
    "key": kaggle_key
}

# Create .kaggle directory
!mkdir -p ~/.kaggle

# Write kaggle.json file
with open('/root/.kaggle/kaggle.json', 'w') as f:
    json.dump(kaggle_config, f)

# Set proper permissions
!chmod 600 ~/.kaggle/kaggle.json

print("\nKaggle credentials configured successfully")
print(f"   Username: {kaggle_username}")
print(f"   Key: {'*' * (len(kaggle_key) - 4) + kaggle_key[-4:]}")  # Show last 4 chars only

# Test the credentials
try:
    test_path = kagglehub.dataset_download("akashdogra/cic-iot-2023")
    print("\nCredentials verified - test download successful!")
    print(f"   Test file location: {test_path}")
except Exception as e:
    print(f"\nError: {e}")
    print(" Please check your credentials and try again")

### Download Raw Datasets

In [None]:
"""

DOWNLOAD RAW DATASETS
This will take 10-15 minutes depending on connection speed

"""
print("="*70)
print("DOWNLOADING DATASETS FROM KAGGLE")
print("="*70)

import glob

# Download datasets: display name -> Kaggle dataset handle
datasets_to_download = {
    'CICIOMT': 'limamateus/cic-iomt-2024-wifi-mqtt',
    'CIC-IoT': 'akashdogra/cic-iot-2023',
    'IoT-23': 'engraqeel/iot23preprocesseddata',
    'IDS-2018': 'solarmainframe/ids-intrusion-csv'
}

# Display name -> local download directory, or None when the download failed
downloaded_paths = {}

for name, kaggle_path in datasets_to_download.items():
    print(f"\n{'='*70}")
    print(f"Downloading {name}...")
    print(f"{'='*70}")

    try:
        path = kagglehub.dataset_download(kaggle_path)
    except Exception as e:
        # Best effort: record the failure and keep going with the other datasets.
        print(f"Error downloading {name}: {e}")
        downloaded_paths[name] = None
        continue

    downloaded_paths[name] = path
    print(f"{name} downloaded to: {path}")

    # List files in directory
    csv_files = glob.glob(f"{path}/**/*.csv", recursive=True)
    print(f"   Found {len(csv_files)} CSV file(s)")
    for csv_file in csv_files[:3]:  # Show first 3
        print(f"   - {os.path.basename(csv_file)}")

# Save paths for later use
# NOTE(review): these are the directories returned by kagglehub; presumably a
# local cache that does not survive a runtime reset - confirm before relying
# on the "restart runtime" hint below.
with open(f'{RAW_DIR}/download_paths.pkl', 'wb') as f:
    pickle.dump(downloaded_paths, f)

# Report honestly: the original banner claimed success even after failures.
failed_downloads = [name for name, path in downloaded_paths.items() if path is None]

print("\n" + "="*70)
if failed_downloads:
    print(f"DOWNLOAD FINISHED WITH ERRORS - failed: {', '.join(failed_downloads)}")
else:
    print("ALL DATASETS DOWNLOADED")
print("="*70)
print("   Checkpoint 1B complete - paths saved to Drive")
print("   You can now stop and restart runtime if needed")

### Preprocessing Functions

In [None]:
"""
PREPROCESSING FUNCTIONS
Define all preprocessing utilities
"""

def clear_memory():
    """Run a garbage-collection pass to release unreferenced objects."""
    gc.collect()

def load_large_csv(filepath, sample_frac=1.0, chunksize=30000):
    """Load a large CSV in chunks, optionally keeping a random sample of rows.

    Args:
        filepath: Path of the CSV file to read.
        sample_frac: Fraction of rows to keep. Values below 1.0 draw that
            fraction of rows from every chunk, so the sample covers the whole
            file and a file smaller than one chunk is still sampled by row
            instead of being kept or dropped as a whole.
        chunksize: Number of rows read per chunk.

    Returns:
        A DataFrame with a fresh 0..n-1 index. Sampled rows keep their file
        order. A file without data rows yields an empty DataFrame that still
        carries the header columns.
    """
    print(f"Loading: {os.path.basename(filepath)}")
    chunks = []
    total_rows = 0

    reader = pd.read_csv(filepath, chunksize=chunksize, low_memory=False)
    for i, chunk in enumerate(reader):
        if sample_frac < 1.0:
            # With no explicit random_state, pandas draws from NumPy's global
            # RNG, so np.random.seed() still controls reproducibility.
            chunk = chunk.sample(frac=sample_frac).sort_index()
        chunks.append(chunk)
        total_rows += len(chunk)
        if (i + 1) % 10 == 0:
            print(f"  Processed {i+1} chunks, {total_rows:,} rows")

    if chunks:
        df = pd.concat(chunks, ignore_index=True)
    else:
        # pd.concat([]) raises; fall back to an empty frame with the header.
        df = pd.read_csv(filepath, nrows=0)

    print(f"Loaded {len(df):,} rows, {len(df.columns)} columns")
    gc.collect()
    return df

def create_security_features(df):
    """Derive rate, size and direction features from matching numeric columns.

    Numeric columns are grouped by case-insensitive substrings of their names
    (packet/pkt, byte/length, duration/time, fwd/forward, bwd/backward). Each
    derived feature is added only when every column group it needs is present.

    Args:
        df: DataFrame to extend. It is modified in place.

    Returns:
        Tuple ``(df, new_features)``: the same DataFrame object and the list of
        column names that were added, in creation order.
    """
    print("Creating security features...")

    numeric_names = df.select_dtypes(include=[np.number]).columns.tolist()

    def matching(*keywords):
        # Numeric columns whose lower-cased name contains any of the keywords.
        return [name for name in numeric_names
                if any(keyword in name.lower() for keyword in keywords)]

    packet_cols = matching('packet', 'pkt')
    byte_cols = matching('byte', 'length')
    duration_cols = matching('duration', 'time')
    fwd_cols = matching('fwd', 'forward')
    bwd_cols = matching('bwd', 'backward')

    created = []

    # Packets per unit of the first duration-like column (0 treated as 1).
    if packet_cols and duration_cols:
        safe_duration = df[duration_cols].iloc[:, 0].replace(0, 1)
        df['packet_rate'] = df[packet_cols].sum(axis=1) / safe_duration
        created.append('packet_rate')

    # Bytes per unit of the first duration-like column (0 treated as 1).
    if byte_cols and duration_cols:
        safe_duration = df[duration_cols].iloc[:, 0].replace(0, 1)
        df['byte_rate'] = df[byte_cols].sum(axis=1) / safe_duration
        created.append('byte_rate')

    # Bytes per packet (a packet total of 0 is treated as 1).
    if byte_cols and packet_cols:
        safe_packets = df[packet_cols].sum(axis=1).replace(0, 1)
        df['avg_packet_size'] = df[byte_cols].sum(axis=1) / safe_packets
        created.append('avg_packet_size')

    # Forward/backward imbalance; the +1 keeps the denominator away from zero
    # when both totals are 0.
    if fwd_cols and bwd_cols:
        forward = df[fwd_cols].sum(axis=1)
        backward = df[bwd_cols].sum(axis=1)
        df['flow_asymmetry'] = (forward - backward) / (forward + backward + 1)
        created.append('flow_asymmetry')

    # Replace any infinities and NaNs in the new columns with 0.
    for name in created:
        df[name] = df[name].replace([np.inf, -np.inf], np.nan).fillna(0)

    print(f"Created {len(created)} security features")
    return df, created

def create_statistical_features(df):
    """Add per-row mean, std, min and max over the first 15 numeric columns.

    Args:
        df: DataFrame to extend. It is modified in place.

    Returns:
        Tuple ``(df, new_features)`` where ``new_features`` is
        ``['flow_mean', 'flow_std', 'flow_min', 'flow_max']``. Missing results
        (e.g. the std of a single column) are stored as 0.
    """
    print("Creating statistical features...")

    # Cap the number of input columns so the aggregation stays cheap.
    base_names = df.select_dtypes(include=[np.number]).columns.tolist()[:15]
    base = df[base_names]

    created = []
    for statistic in ('mean', 'std', 'min', 'max'):
        column = f'flow_{statistic}'
        row_values = getattr(base, statistic)(axis=1)
        df[column] = row_values.fillna(0)
        created.append(column)

    print(f"Created {len(created)} statistical features")
    return df, created

def map_to_standard_categories(label):
    """Translate a dataset-specific label into one of the shared categories.

    The label is converted to a lower-case string and checked against the
    keyword groups below in order; the first group with a matching substring
    wins. Labels matching no group become ``'Other'``.
    """
    text = str(label).lower()

    # Order matters: e.g. a label containing both 'flood' and 'mirai' is DoS.
    rules = (
        ('DoS', ('dos', 'ddos', 'flood', 'slowloris')),
        ('Botnet', ('botnet', 'mirai', 'c&c', 'okiru', 'torii', 'malware')),
        ('Reconnaissance', ('scan', 'recon', 'reconnaissance', 'probe')),
        ('Benign', ('benign', 'normal')),
        ('Spoofing', ('spoof', 'arp')),
        ('WebAttack', ('xss', 'sql', 'injection', 'web')),
        ('BruteForce', ('brute', 'dictionary')),
    )

    for category, keywords in rules:
        if any(keyword in text for keyword in keywords):
            return category

    return 'Other'

def balance_classes(X, y, max_samples=10000, min_samples=100):
    """Cap large classes and drop rare ones.

    Classes with more than ``max_samples`` rows are randomly downsampled
    (without replacement, fixed seed 42) to exactly ``max_samples``. Classes
    with fewer than ``min_samples`` rows are removed. Nothing is oversampled.

    Args:
        X: Feature DataFrame, row-aligned with ``y``.
        y: pandas Series of class labels.
        max_samples: Upper bound on rows kept per class.
        min_samples: Classes smaller than this are discarded.

    Returns:
        Tuple ``(X_balanced, y_balanced)`` with a fresh 0..n-1 index, grouped
        by class in order of first appearance in ``y``.

    Raises:
        ValueError: If every class is removed as too rare.
    """
    print(f"\nBalancing classes (max={max_samples}, min={min_samples})...")

    kept_parts = []

    for cls in y.unique():
        mask = (y == cls)
        X_cls = X[mask]
        y_cls = y[mask]
        n_samples = len(y_cls)

        print(f"  {cls}: {n_samples} samples", end='')

        if n_samples > max_samples:
            # Downsample: draw positional indices, then select the same rows
            # from both the features and the labels.
            indices = resample(range(n_samples), n_samples=max_samples,
                               random_state=42, replace=False)
            X_cls = X_cls.iloc[indices] if isinstance(X_cls, pd.DataFrame) else X_cls[indices]
            y_cls = y_cls.iloc[indices] if isinstance(y_cls, pd.Series) else y_cls[indices]
            print(f" -> downsampled to {max_samples}")
        elif n_samples < min_samples:
            # Remove rare class
            print(" -> removed (too few samples)")
            continue
        else:
            print(" -> kept as is")

        kept_parts.append((X_cls, y_cls))

    if not kept_parts:
        raise ValueError("No classes remaining after balancing")

    X_balanced = pd.concat([x_part for x_part, _ in kept_parts], ignore_index=True)
    y_balanced = pd.concat([y_part for _, y_part in kept_parts], ignore_index=True)

    # Report the classes that survived, not the number seen in the input.
    print(f"\nBalanced: {len(X_balanced)} total samples, {len(kept_parts)} classes")
    return X_balanced, y_balanced

def preprocess_dataset(csv_path, dataset_name, sample_frac=0.3):
    """Run the complete preprocessing pipeline for one dataset.

    Steps: load the CSV (optionally sampled), locate the label column, map
    labels to the standard categories, keep numeric features only, replace
    NaN/inf with 0, add engineered features, balance the classes and split
    the result 70/15/15 into stratified train/validation/test sets.

    Args:
        csv_path: Path of the CSV file to preprocess.
        dataset_name: Name stored in the result and shown in log output.
        sample_frac: Sampling fraction passed to ``load_large_csv``.

    Returns:
        Dict with keys ``train_x``, ``train_y``, ``val_x``, ``val_y``,
        ``test_x``, ``test_y``, ``feature_names``, ``classes`` (sorted labels
        present after balancing) and ``dataset_name``.
    """

    print("\n" + "="*70)
    print(f"PREPROCESSING: {dataset_name}")
    print("="*70)

    start_time = time.time()

    # Load data
    df = load_large_csv(csv_path, sample_frac=sample_frac)
    print(f"Initial shape: {df.shape}")

    # Identify label column: first match among the known names
    label_candidates = ['label', 'Label', 'attack_type', 'Attack', 'class']
    label_col = next((col for col in label_candidates if col in df.columns), None)

    if label_col is None:
        # Fall back to the last column
        label_col = df.columns[-1]
        print(f"No standard label column found, using: {label_col}")

    # Separate features and labels
    y = df[label_col]
    X = df.drop(columns=[label_col])

    print(f"Features: {X.shape[1]}, Samples: {len(y)}")
    print(f"Original classes: {y.nunique()}")

    # Map to standard categories
    print("\nMapping labels to standard categories...")
    y = y.apply(map_to_standard_categories)
    print(f"Mapped to {y.nunique()} standard categories:")
    print(y.value_counts())

    # Drop non-numeric columns
    numeric_cols = X.select_dtypes(include=[np.number]).columns
    X = X[numeric_cols]
    print(f"\nNumeric features: {len(numeric_cols)}")

    # Handle missing values
    X = X.fillna(0)
    X = X.replace([np.inf, -np.inf], 0)

    # Create engineered features
    X, security_features = create_security_features(X)
    X, statistical_features = create_statistical_features(X)

    print(f"\nTotal features after engineering: {X.shape[1]}")

    # Balance classes
    X, y = balance_classes(X, y, max_samples=10000, min_samples=100)

    # Train/validation/test split: 70% train, then the remaining 30% is
    # halved into validation and test.
    print("\nSplitting into train/val/test...")
    X_train, X_temp, y_train, y_temp = train_test_split(
        X, y, test_size=0.3, random_state=42, stratify=y
    )
    # The labels of the second split must land in y_val / y_test; the original
    # overwrote y_temp and left y_test undefined (NameError below).
    X_val, X_test, y_val, y_test = train_test_split(
        X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
    )

    print(f"Train: {len(X_train)}, Val: {len(X_val)}, Test: {len(X_test)}")

    # Package results
    preprocessed_data = {
        'train_x': X_train,
        'train_y': y_train,
        'val_x': X_val,
        'val_y': y_val,
        'test_x': X_test,
        'test_y': y_test,
        'feature_names': X.columns.tolist(),
        'classes': sorted(y.unique()),
        'dataset_name': dataset_name
    }

    elapsed = time.time() - start_time
    print(f"\nPreprocessing complete in {elapsed:.1f}s")

    clear_memory()
    return preprocessed_data

print("Preprocessing functions defined")

### Preprocess Each Dataset

In [None]:
"""
PREPROCESS ALL DATASETS
This takes 60-90 minutes total
"""

# Load download paths
with open(f'{RAW_DIR}/download_paths.pkl', 'rb') as f:
    downloaded_paths = pickle.load(f)

import glob

# Process each dataset
preprocessed_datasets = {}

# (key in downloaded_paths, role shown in the banner, checkpoint file stem)
datasets_to_preprocess = [
    ('CICIOMT', 'SOURCE', 'ciciomt'),
    ('CIC-IoT', 'TARGET 1', 'ciciot'),
    ('IoT-23', 'TARGET 2', 'iot23'),
    ('IDS-2018', 'TARGET 3', 'ids2018'),
]

for position, (dataset_name, role, file_stem) in enumerate(datasets_to_preprocess, start=1):
    # The original banner rule was an empty string; use the same "=" rule as
    # the other cells.
    print("\n" + "="*70)
    print(f"PROCESSING {position}/{len(datasets_to_preprocess)}: {dataset_name} ({role})")
    print("="*70)

    download_dir = downloaded_paths.get(dataset_name)
    if download_dir is None:
        raise FileNotFoundError(
            f"No download path recorded for {dataset_name}; re-run the download cell"
        )

    # Sort so the chosen file does not depend on directory listing order.
    csv_candidates = sorted(glob.glob(f"{download_dir}/**/*.csv", recursive=True))
    if not csv_candidates:
        raise FileNotFoundError(f"No CSV files found under {download_dir}")

    # NOTE: only the first CSV of each dataset is preprocessed.
    csv_path = csv_candidates[0]
    dataset_result = preprocess_dataset(csv_path, dataset_name, sample_frac=0.3)
    preprocessed_datasets[dataset_name] = dataset_result

    # Save checkpoint
    with open(f'{PREPROCESSED_DIR}/{file_stem}_preprocessed.pkl', 'wb') as f:
        pickle.dump(dataset_result, f)
    print(f"Saved: {file_stem}_preprocessed.pkl")
    clear_memory()

# Keep the per-dataset variable names this cell has always defined.
ciciomt = preprocessed_datasets['CICIOMT']
ciciot = preprocessed_datasets['CIC-IoT']
iot23 = preprocessed_datasets['IoT-23']
ids2018 = preprocessed_datasets['IDS-2018']

print("\n" + "="*70)
print("CHECKPOINT 3 COMPLETE - ALL DATASETS PREPROCESSED")
print("="*70)
print("   All preprocessed files saved to Drive")
print("   You can now stop and restart runtime if needed")

### Feature Alignment Functions

In [None]:

"""FEATURE ALIGNMENT FUNCTIONS"""

def align_features_pairwise(source_data, target_data):
    """Align a source and a target dataset onto one shared feature space.

    The shared space consists of the columns present in both training sets
    (sorted by name) plus up to 15 PCA components computed from those columns.
    The PCA and the RobustScaler are both fitted on the source training set
    only and then applied unchanged to every other split, so all returned
    feature frames have identical columns and scaling.

    Args:
        source_data: Preprocessed dataset dict (as produced by
            ``preprocess_dataset``) used to fit the PCA and the scaler.
        target_data: Preprocessed dataset dict to project into the same space.

    Returns:
        Tuple ``(aligned_source, aligned_target)`` of dicts with ``train_x``,
        ``train_y``, ``val_x``, ``val_y``, ``feature_names``, ``classes`` and
        ``dataset_name``. ``aligned_target`` also has ``test_x``/``test_y``;
        ``aligned_source`` has them when the source dict provides them.

    Raises:
        ValueError: If the two datasets share no feature columns.
    """
    print(f"\nAligning {source_data['dataset_name']} -> {target_data['dataset_name']}")

    # Find common features
    source_features = set(source_data['train_x'].columns)
    target_features = set(target_data['train_x'].columns)
    common_features = sorted(source_features & target_features)
    print(f"  Common features: {len(common_features)}")

    if not common_features:
        raise ValueError(
            f"No common features between {source_data['dataset_name']} "
            f"and {target_data['dataset_name']}"
        )

    source_train = source_data['train_x'][common_features]

    # PCA cannot produce more components than features or samples.
    n_pca = min(15, len(common_features), len(source_train))
    pca = None
    pca_cols = []
    source_projection = None
    if n_pca > 0:
        print(f"  Adding {n_pca} PCA features...")
        pca = PCA(n_components=n_pca, random_state=42)
        pca_cols = [f'pca_{i}' for i in range(n_pca)]
        # Fit on source only; the target is projected with the same components.
        source_projection = pca.fit_transform(source_train)

    def with_pca(frame, projection=None):
        # Common columns of `frame` followed by its PCA components.
        base = frame[common_features]
        if pca is None:
            return base
        if projection is None:
            projection = pca.transform(base)
        components = pd.DataFrame(projection, columns=pca_cols, index=base.index)
        # One concat instead of inserting columns one at a time into a slice.
        return pd.concat([base, components], axis=1)

    source_train = with_pca(source_data['train_x'], source_projection)

    # Scale features
    print("  Scaling features...")
    scaler = RobustScaler()
    scaler.fit(source_train)

    def scaled(frame):
        # Apply the source-fitted scaler, keeping column names and row index.
        return pd.DataFrame(
            scaler.transform(frame),
            columns=frame.columns,
            index=frame.index
        )

    source_train_scaled = scaled(source_train)
    target_train_scaled = scaled(with_pca(target_data['train_x']))
    target_val_scaled = scaled(with_pca(target_data['val_x']))
    target_test_scaled = scaled(with_pca(target_data['test_x']))

    # Package aligned source. The validation split goes through the same PCA
    # and scaling as train_x so it actually matches 'feature_names'.
    aligned_source = {
        'train_x': source_train_scaled,
        'train_y': source_data['train_y'],
        'val_x': scaled(with_pca(source_data['val_x'])),
        'val_y': source_data['val_y'],
        'feature_names': source_train_scaled.columns.tolist(),
        'classes': source_data['classes'],
        'dataset_name': source_data['dataset_name']
    }
    # Carry the source test split along when it exists instead of dropping it.
    if 'test_x' in source_data and 'test_y' in source_data:
        aligned_source['test_x'] = scaled(with_pca(source_data['test_x']))
        aligned_source['test_y'] = source_data['test_y']

    # Package aligned target
    aligned_target = {
        'train_x': target_train_scaled,
        'train_y': target_data['train_y'],
        'val_x': target_val_scaled,
        'val_y': target_data['val_y'],
        'test_x': target_test_scaled,
        'test_y': target_data['test_y'],
        'feature_names': target_test_scaled.columns.tolist(),
        'classes': target_data['classes'],
        'dataset_name': target_data['dataset_name']
    }

    print(f"Alignment complete: {len(aligned_source['train_x'].columns)} features")

    return aligned_source, aligned_target

print("Alignment functions defined")

### Create Aligned Datasets

In [None]:
"""
CREATE PAIRWISE ALIGNED DATASETS
This creates transfer learning pairs: CICIOMT -> each target"""


print("\n" + "="*70)
print("CREATING ALIGNED DATASETS FOR TRANSFER LEARNING")
print("="*70)

# Load preprocessed data (if not in memory from checkpoint 3)
checkpoint_files = {
    'CICIOMT': 'ciciomt_preprocessed.pkl',
    'CIC-IoT': 'ciciot_preprocessed.pkl',
    'IoT-23': 'iot23_preprocessed.pkl',
    'IDS-2018': 'ids2018_preprocessed.pkl',
}

if 'preprocessed_datasets' not in globals():
    preprocessed_datasets = {}

# Load whatever is not in memory, so a partially filled dict left behind by an
# interrupted checkpoint 3 is completed from Drive as well.
missing_datasets = [name for name in checkpoint_files if name not in preprocessed_datasets]

if missing_datasets:
    print("Loading preprocessed datasets from Drive...")
    try:
        for dataset_name in missing_datasets:
            with open(f'{PREPROCESSED_DIR}/{checkpoint_files[dataset_name]}', 'rb') as f:
                preprocessed_datasets[dataset_name] = pickle.load(f)
    except Exception as e:
        print(f"Error loading preprocessed data: {e}")
        print("Make sure Checkpoint 3 completed successfully")
        # Stop here: continuing would only fail later with a less helpful
        # KeyError when the missing dataset is looked up.
        raise
    print("All datasets loaded")

# Create aligned pairs
ciciomt_source = preprocessed_datasets['CICIOMT']
targets = ['CIC-IoT', 'IoT-23', 'IDS-2018']

for target_name in targets:
    print(f"\n{'='*70}")
    print(f"ALIGNING: CICIOMT -> {target_name}")
    print(f"{'='*70}")

    target_data = preprocessed_datasets[target_name]

    # Create alignment
    aligned_source, aligned_target = align_features_pairwise(ciciomt_source, target_data)

    # Save aligned datasets; file names use the lower-cased target name
    # (e.g. 'cic-iot'), which the verification cell relies on.
    source_filename = f'{ALIGNED_DIR}/enhanced_aligned_ciciomt_for_{target_name.lower()}.pkl'
    target_filename = f'{ALIGNED_DIR}/enhanced_aligned_{target_name.lower()}.pkl'

    with open(source_filename, 'wb') as f:
        pickle.dump(aligned_source, f)
    print(f"Saved: {os.path.basename(source_filename)}")

    with open(target_filename, 'wb') as f:
        pickle.dump(aligned_target, f)
    print(f"Saved: {os.path.basename(target_filename)}")

    clear_memory()

print("\n" + "="*70)
print("CHECKPOINT 5 COMPLETE - ALL ALIGNMENTS CREATED")
print("="*70)
print("Enhanced aligned datasets saved to Drive")
print("Ready for transfer learning experiments (Notebook 2)")

### Verification & Summary

In [None]:
"""
VERIFY ALL FILES AND GENERATE SUMMARY
"""

print("\n" + "="*70)
print("PREPROCESSING PIPELINE SUMMARY")
print("="*70)

# Check all files exist
print("\n1  PREPROCESSED DATASETS:")
for name in ['ciciomt', 'ciciot', 'iot23', 'ids2018']:
    filepath = f'{PREPROCESSED_DIR}/{name}_preprocessed.pkl'
    if os.path.exists(filepath):
        size_mb = os.path.getsize(filepath) / (1024**2)
        print(f"   {name}_preprocessed.pkl ({size_mb:.1f} MB)")
    else:
        print(f"   {name}_preprocessed.pkl (MISSING)")

print("\n2  ALIGNED DATASETS:")
for target in ['cic-iot', 'iot-23', 'ids-2018']:
    source_file = f'{ALIGNED_DIR}/enhanced_aligned_ciciomt_for_{target}.pkl'
    target_file = f'{ALIGNED_DIR}/enhanced_aligned_{target}.pkl'

    # A pair counts as present only when both of its files exist.
    if os.path.exists(source_file) and os.path.exists(target_file):
        source_size = os.path.getsize(source_file) / (1024**2)
        target_size = os.path.getsize(target_file) / (1024**2)
        print(f"   Pair: CICIOMT -> {target.upper()} ({source_size:.1f} MB + {target_size:.1f} MB)")
    else:
        print(f"   Pair: CICIOMT -> {target.upper()} (MISSING)")

print("\n3  DATASET DETAILS:")
# Display name -> stem of the checkpoint file under PREPROCESSED_DIR
checkpoint_stems = {
    'CICIOMT': 'ciciomt',
    'CIC-IoT': 'ciciot',
    'IoT-23': 'iot23',
    'IDS-2018': 'ids2018',
}
for name, stem in checkpoint_stems.items():
    try:
        filepath = f'{PREPROCESSED_DIR}/{stem}_preprocessed.pkl'

        with open(filepath, 'rb') as f:
            data = pickle.load(f)

        print(f"\n   {name}:")
        print(f"      Train: {len(data['train_x']):,} samples")
        print(f"      Test: {len(data['test_x']):,} samples")
        print(f"      Features: {len(data['feature_names'])}")
        print(f"      Classes: {', '.join(data['classes'])}")

    except Exception as e:
        # Summary only: report the problem and continue with the next dataset.
        print(f"\n   {name}: Error loading - {e}")

print("\n" + "="*70)
print("PREPROCESSING COMPLETE!")
print("="*70)
print("\nAll datasets preprocessed and aligned")
print("Files saved to Google Drive")
print("Ready for Notebook 2 (Transfer Learning Training)")
# Report the directory the files were actually written to.
print(f"\nLocation: {BASE_DIR}/")
print("="*70)