In [3]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import KMeansSMOTE
from imblearn.over_sampling import SMOTE
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, LSTM, Dense, Dropout, BatchNormalization, Bidirectional, GlobalAveragePooling1D, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
import tensorflow as tf
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc, precision_recall_curve
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_recall_fscore_support
from tensorflow.keras.regularizers import l1, l2
import optuna
from concurrent.futures import ThreadPoolExecutor
from joblib import Parallel, delayed
import logging
from datetime import datetime
from sklearn.utils.class_weight import compute_class_weight
import json
import joblib
import pickle
import shutil
import tensorflow.keras.mixed_precision as mixed_precision
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import label_binarize

VIS_DIR = "model_artifacts/visualizations"
sns.set_theme(style="whitegrid", font_scale=1.2)
plt.rcParams['figure.dpi'] = 300
plt.rcParams['savefig.dpi'] = 300
plt.rcParams['font.family'] = 'serif'

# Configure logging
log_filename = f'model_training_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log'
logging.basicConfig(
    filename=log_filename,
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

# stream handler, also show logs in console
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.INFO)
logging.getLogger().addHandler(console_handler)

os.environ.setdefault('DATASET_PATH', r"/teamspace/studios/this_studio/CIC-IDS 2017")

input_shape = None
n_classes = None

# 1. Data Preparation
class DataPreprocessor:
    def __init__(self):
        self.scaler = StandardScaler()
        self.label_encoder = LabelEncoder()
        
    def load_and_clean_data(self, data):
        """Enhanced data cleaning with better handling of outliers and invalid values"""
        try:
            # Convert all columns to numeric, except 'Label'
            numeric_columns = [col for col in data.columns if col != ' Label']
            
            # First, convert all numeric columns to float64
            for col in numeric_columns:
                data[col] = data[col].astype('float64')
            
            # Process in smaller chunks for memory efficiency
            chunk_size = 100000
            total_rows = len(data)
            
            for i in range(0, total_rows, chunk_size):
                end_idx = min(i + chunk_size, total_rows)
                chunk = data.iloc[i:end_idx].copy()
                
                # Calculate statistics for outlier detection
                Q1 = chunk[numeric_columns].quantile(0.25)
                Q3 = chunk[numeric_columns].quantile(0.75)
                IQR = Q3 - Q1
                lower_bound = Q1 - 1.5 * IQR
                upper_bound = Q3 + 1.5 * IQR
                
                # Replace outliers with bounds (now all columns are float64)
                for col in numeric_columns:
                    chunk.loc[chunk[col] < lower_bound[col], col] = lower_bound[col]
                    chunk.loc[chunk[col] > upper_bound[col], col] = upper_bound[col]
                
                # Replace infinities with NaN
                chunk = chunk.replace([np.inf, -np.inf], np.nan)
                
                # Fill NaN with median
                chunk = chunk.fillna(chunk[numeric_columns].median())
                
                # Save cleaned chunk to CSV
                chunk.to_csv(f'cleaned_data_chunk_{i // chunk_size}.csv', index=False)
                print(f"Saved cleaned chunk {i // chunk_size} to disk.")
                
                # Update the original dataframe
                data.iloc[i:end_idx] = chunk
                
                progress = (end_idx / total_rows) * 100
                print(f"Cleaning Progress: {progress:.2f}%")
            
            return data
            
        except Exception as e:
            logging.error(f"Error in data cleaning: {str(e)}")
            raise
    
    def prepare_features_and_labels(self, data, feature_columns, label_column):
        """Prepare features and labels"""
        # Separate features and labels
        X = data[feature_columns]
        y = data[label_column]
        
        # Scale features
        X_scaled = self.scaler.fit_transform(X)
        
        # Encode labels
        y_encoded = self.label_encoder.fit_transform(y)
        
        return X_scaled, y_encoded
    
    def balance_dataset(self, X, y, max_samples_per_class=10000):
        """
        Balance the dataset:
        - Downsample classes with > `max_samples_per_class` to `max_samples_per_class`.
        - Apply Standard SMOTE to classes with < 50 samples to increase to `max_samples_per_class`.
        - Apply Cluster-Based SMOTE to classes with ≥ 50 samples to increase to `max_samples_per_class`.
        """
        print("\nBalancing dataset...")
        try:
            # Step 1: Log original class distribution
            unique_classes, class_counts = np.unique(y, return_counts=True)
            print("\nOriginal class distribution:")
            for cls, count in zip(unique_classes, class_counts):
                print(f"Class {cls}: {count} samples")

            # Step 2: Downsample classes with > max_samples_per_class
            print("\nDownsampling classes with more than max_samples_per_class...")
            X_balanced_list = []
            y_balanced_list = []

            for cls in unique_classes:
                idx = np.where(y == cls)[0]

                if len(idx) > max_samples_per_class:
                    # Downsample majority class
                    idx = np.random.choice(idx, max_samples_per_class, replace=False)
                    print(f"Class {cls} downsampled to {max_samples_per_class} samples.")
                else:
                    print(f"Class {cls} retained with {len(idx)} samples.")

                # Add to balanced list
                X_balanced_list.append(X[idx])
                y_balanced_list.append(y[idx])

            # Concatenate after downsampling
            X_balanced = np.vstack(X_balanced_list)
            y_balanced = np.concatenate(y_balanced_list)

            print("\nAfter downsampling:")
            unique_classes_ds, class_counts_ds = np.unique(y_balanced, return_counts=True)
            for cls, count in zip(unique_classes_ds, class_counts_ds):
                print(f"Class {cls}: {count} samples")

            # Step 3: Classify small and large classes for SMOTE
            small_classes = [cls for cls, count in zip(unique_classes_ds, class_counts_ds) if count < 50]
            large_classes = [cls for cls, count in zip(unique_classes_ds, class_counts_ds) if count >= 50]

            # Step 4: Apply Standard SMOTE to Small Classes
            if small_classes:
                print(
                    f"\nApplying Standard SMOTE to classes {small_classes} to increase to {max_samples_per_class} samples...")
                smote = SMOTE(sampling_strategy={cls: max_samples_per_class for cls in small_classes}, random_state=42)
                X_balanced, y_balanced = smote.fit_resample(X_balanced, y_balanced)

                print("\nAfter applying Standard SMOTE:")
                unique_classes_smote, class_counts_smote = np.unique(y_balanced, return_counts=True)
                for cls, count in zip(unique_classes_smote, class_counts_smote):
                    print(f"Class {cls}: {count} samples")

            # Step 5: Apply Cluster-Based SMOTE to Larger Classes
            if large_classes:
                print(
                    f"\nOversampling classes {large_classes} using Cluster-Based SMOTE to {max_samples_per_class} samples...")
                sampling_strategy = {cls: max_samples_per_class for cls in large_classes}

                kmeans_smote = KMeansSMOTE(
                    sampling_strategy=sampling_strategy,
                    random_state=42,
                    k_neighbors=5,  # Use 5 neighbors for SMOTE
                    cluster_balance_threshold=0.01,  # Balance threshold
                    kmeans_estimator=15  # Use 15 clusters for KMeans
            )
                X_res, y_res = kmeans_smote.fit_resample(X_balanced, y_balanced)
            else:
                print("\nNo classes require oversampling with Cluster-Based SMOTE.")
                X_res, y_res = X_balanced, y_balanced

            print("\nFinal class distribution:")
            unique_final, counts_final = np.unique(y_res, return_counts=True)
            for cls, count in zip(unique_final, counts_final):
                print(f"Class {cls}: {count} samples")

            return X_res, y_res

        except Exception as e:
            logging.error(f"Error balancing dataset: {str(e)}")
            raise

    
    def create_sequences(self, X, y, time_steps=10):
        """Create sequences with improved feature handling"""
        try:
            print("\nCreating sequences for LSTM...")
            print(f"Original input shape: {X.shape}")
            
            # Ensure number of features is appropriate for sequence length
            n_features = X.shape[1]
            n_features_per_timestep = n_features
            
            # Reshape maintaining all features
            n_samples = X.shape[0] - time_steps + 1
            X_seq = np.zeros((n_samples, time_steps, n_features_per_timestep))
            
            for i in range(n_samples):
                X_seq[i] = X[i:i + time_steps]
            
            # Adjust labels to match sequence length
            y_seq = y[time_steps-1:]
            
            print(f"Final sequence shape: {X_seq.shape}")
            return X_seq, y_seq
            
        except Exception as e:
            print(f"Error in sequence creation: {str(e)}")
            raise


# 2. Model Architecture
def create_model(input_shape, n_classes):
    print(f"\nCreating model with input shape: {input_shape}")
    model = Sequential([
        # CNN layers with regularization
        Conv1D(64, kernel_size=3, padding='same', activation='relu', 
               kernel_regularizer=l2(0.01), input_shape=input_shape),
        BatchNormalization(),
        GlobalAveragePooling1D(),  # Replaced MaxPooling1D
        Dropout(0.3),
        
        # LSTM layers with regularization
        Bidirectional(LSTM(64, return_sequences=True, kernel_regularizer=l2(0.01))),
        BatchNormalization(),
        Dropout(0.3),
        
        Bidirectional(LSTM(32, kernel_regularizer=l2(0.01))),
        BatchNormalization(),
        Dropout(0.3),
        
        # Dense layers with regularization
        Dense(32, activation='relu', kernel_regularizer=l2(0.01)),
        BatchNormalization(),
        Dropout(0.3),
        Dense(n_classes, activation='softmax')
    ])
    
    # Compile model
    model.compile(
        optimizer=Adam(learning_rate=0.001),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )
    
    return model
#Visualizations
def create_visualizations(history, y_true, y_pred, y_pred_proba, class_names):
    """Create comprehensive evaluation visualizations"""
    try:
        os.makedirs(VIS_DIR, exist_ok=True)
        
        # 1. Confusion Matrix
        plt.figure(figsize=(12, 10))
        cm = confusion_matrix(y_true, y_pred)
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                    xticklabels=class_names,
                    yticklabels=class_names)
        plt.title('Confusion Matrix')
        plt.ylabel('True Label')
        plt.xlabel('Predicted Label')
        plt.xticks(rotation=45, ha='right')
        plt.yticks(rotation=45)
        plt.tight_layout()
        plt.savefig(os.path.join(VIS_DIR, 'confusion_matrix.png'))
        plt.close()

        # 2. ROC Curves (One-vs-Rest)
        y_test_bin = label_binarize(y_true, classes=np.arange(len(class_names)))
        fpr = dict()
        tpr = dict()
        roc_auc = dict()
        
        plt.figure(figsize=(10, 8))
        for i in range(len(class_names)):
            fpr[i], tpr[i], _ = roc_curve(y_test_bin[:, i], y_pred_proba[:, i])
            roc_auc[i] = auc(fpr[i], tpr[i])
            plt.plot(fpr[i], tpr[i], label=f'{class_names[i]} (AUC = {roc_auc[i]:.2f})')
            
        plt.plot([0, 1], [0, 1], 'k--')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Multi-class ROC Curves')
        plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
        plt.tight_layout()
        plt.savefig(os.path.join(VIS_DIR, 'roc_curves.png'))
        plt.close()

        # 3. Precision-Recall Curves
        precision = dict()
        recall = dict()
        
        plt.figure(figsize=(10, 8))
        for i in range(len(class_names)):
            precision[i], recall[i], _ = precision_recall_curve(y_test_bin[:, i], 
                                                              y_pred_proba[:, i])
            plt.plot(recall[i], precision[i], 
                     label=f'{class_names[i]}')
            
        plt.xlabel('Recall')
        plt.ylabel('Precision')
        plt.title('Precision-Recall Curves')
        plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
        plt.tight_layout()
        plt.savefig(os.path.join(VIS_DIR, 'precision_recall_curves.png'))
        plt.close()

        # 4. Training History
        if history is not None:
            plt.figure(figsize=(12, 5))
            
            # Accuracy plot
            plt.subplot(1, 2, 1)
            plt.plot(history.history['accuracy'], label='Train')
            plt.plot(history.history['val_accuracy'], label='Validation')
            plt.title('Model Accuracy')
            plt.ylabel('Accuracy')
            plt.xlabel('Epoch')
            plt.legend()
            
            # Loss plot
            plt.subplot(1, 2, 2)
            plt.plot(history.history['loss'], label='Train')
            plt.plot(history.history['val_loss'], label='Validation')
            plt.title('Model Loss')
            plt.ylabel('Loss')
            plt.xlabel('Epoch')
            plt.legend()
            
            plt.tight_layout()
            plt.savefig(os.path.join(VIS_DIR, 'training_history.png'))
            plt.close()

    except Exception as e:
        logging.error(f"Error creating visualizations: {str(e)}")
        print(f"Error creating visualizations: {str(e)}")
        
# 3. Training and Evaluation
class ModelTrainer:
    def __init__(self, model, learning_rate=0.001):
        self.model = model
        self.learning_rate = learning_rate
        self.best_model = None
        self.history = None
    def train(self, X_train, y_train, batch_size=32, epochs=50, validation_split=0.2):
        callbacks = [
            EarlyStopping(
                monitor='val_loss',
                patience=10,
                restore_best_weights=True,
                verbose=1
            ),
            ReduceLROnPlateau(
                monitor='val_loss',
                factor=0.5,
                patience=5,
                min_lr=1e-6,
                verbose=1
            )
        ]
        
        self.history = self.model.fit(
            X_train, y_train,
            batch_size=batch_size,
            epochs=epochs,
            validation_split=validation_split,
            callbacks=callbacks,
    verbose=1
)
        return self.history
    
    def evaluate(self, X_test, y_test, label_encoder):
        """Enhanced evaluation with detailed metrics"""
        y_pred = self.model.predict(X_test)
        y_pred_classes = np.argmax(y_pred, axis=1)
        
        # Calculate detailed metrics
        precision, recall, f1, _ = precision_recall_fscore_support(
            y_test, y_pred_classes, average='weighted'
        )
        accuracy = accuracy_score(y_test, y_pred_classes)
        
        print("\nTest Results:")
        print(f"Accuracy: {accuracy:.4f}")
        print(f"Precision: {precision:.4f}")
        print(f"Recall: {recall:.4f}")
        print(f"F1 Score: {f1:.4f}")
        
        # Print detailed classification report
        print("\nClassification Report:")
        class_names = label_encoder.classes_
        print(classification_report(y_test, y_pred_classes, target_names=class_names))

        class_names = label_encoder.classes_
        create_visualizations(self.history, y_test, y_pred_classes, y_pred, class_names)
        
        # Plot training history if available
        if hasattr(self.model, 'history') and self.model.history is not None:
            history = self.model.history.history
            if history and len(history) > 0:  # Check if history exists and is not empty
                plt.figure(figsize=(12, 4))
                
                # Plot accuracy if available
                if 'accuracy' in history or 'acc' in history:
                    plt.subplot(1, 2, 1)
                    if 'accuracy' in history:
                        plt.plot(history['accuracy'], label='Train')
                        if 'val_accuracy' in history:
                            plt.plot(history['val_accuracy'], label='Validation')
                    elif 'acc' in history:
                        plt.plot(history['acc'], label='Train')
                        if 'val_acc' in history:
                            plt.plot(history['val_acc'], label='Validation')
                    plt.title('Model Accuracy')
                    plt.ylabel('Accuracy')
                    plt.xlabel('Epoch')
                    plt.legend()
                
                # Plot loss if available
                if 'loss' in history:
                    plt.subplot(1, 2, 2)
                    plt.plot(history['loss'], label='Train')
                    if 'val_loss' in history:
                        plt.plot(history['val_loss'], label='Validation')
                    plt.title('Model Loss')
                    plt.ylabel('Loss')
                    plt.xlabel('Epoch')
                    plt.legend()
                
                plt.tight_layout()
                plt.show()
            else:
                print("\nNo training history available to plot")
        else:
            print("\nNo training history available to plot")

    def train_with_improvements(self, X_train, y_train, batch_size=32, epochs=100, validation_split=0.2):
        """Enhanced training with improvements"""
        # Data augmentation for time series
        data_augmentation = tf.keras.Sequential([
            tf.keras.layers.GaussianNoise(0.1)
        ])
        
        # Apply augmentation
        X_train_aug = data_augmentation(X_train)
        X_train = np.concatenate([X_train, X_train_aug], axis=0)
        y_train = np.concatenate([y_train, y_train], axis=0)
        
        # Get callbacks
        callbacks = get_advanced_callbacks()
        
        # Train with class weights
        class_weights = compute_class_weight(
            'balanced',
            classes=np.unique(y_train),
            y=y_train
        )
        class_weight_dict = dict(enumerate(class_weights))
        
        # Train the model
        self.history = self.model.fit(
            X_train, y_train,
            batch_size=batch_size,
            epochs=epochs,
            validation_split=validation_split,
            callbacks=callbacks,
            class_weight=class_weight_dict,
            verbose=1
        )
        
        return self.history

def load_dataset():
    """Load dataset from the specified folder"""
    folder_path = os.getenv('DATASET_PATH')
    
    try:
        # List all CSV files in the folder
        csv_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]
        print(f"Found {len(csv_files)} CSV files: {csv_files}")
        
        # Load CSV files one by one and process them
        all_data = []
        for file in csv_files:
            file_path = os.path.join(folder_path, file)
            print(f"\nLoading {file}...")
            
            # Read CSV in chunks
            chunk_size = 50000
            chunks = pd.read_csv(file_path, chunksize=chunk_size)
            
            file_data = []
            for i, chunk in enumerate(chunks):
                print(f"Processing chunk {i+1}...")
                file_data.append(chunk)
            
            # Combine chunks for this file
            df = pd.concat(file_data, ignore_index=True)
            print(f"Shape: {df.shape}")
            all_data.append(df)
        
        # Combine all dataframes
        print("\nCombining all datasets...")
        combined_data = pd.concat(all_data, ignore_index=True)
        print(f"Final combined shape: {combined_data.shape}")
        
        return combined_data
        
    except Exception as e:
        print(f"Error loading dataset: {str(e)}")
        raise


def process_chunk(chunk, scaler=None):
    """Process a single data chunk"""
    try:
        # Convert numeric columns
        numeric_cols = [col for col in chunk.columns if col != ' Label']
        chunk[numeric_cols] = chunk[numeric_cols].astype('float64')
        
        # Handle outliers
        Q1 = chunk[numeric_cols].quantile(0.25)
        Q3 = chunk[numeric_cols].quantile(0.75)
        IQR = Q3 - Q1
        chunk[numeric_cols] = chunk[numeric_cols].clip(
            lower=Q1 - 1.5 * IQR,
            upper=Q3 + 1.5 * IQR
        )
        
        # Scale if scaler provided
        if scaler is not None:
            chunk[numeric_cols] = scaler.transform(chunk[numeric_cols])
        
        return chunk
        
    except Exception as e:
        logging.error(f"Error processing chunk: {str(e)}")
        raise


def evaluate(self, X_test, y_test, label_encoder):
    """Enhanced evaluation with detailed metrics"""
    y_pred = self.model.predict(X_test)
    y_pred_classes = np.argmax(y_pred, axis=1)
    
    # Calculate detailed metrics
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_test, y_pred_classes, average='weighted'
    )
    accuracy = accuracy_score(y_test, y_pred_classes)
    
    print("\nTest Results:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    
    # Print detailed classification report
    print("\nClassification Report:")
    class_names = label_encoder.classes_
    print(classification_report(y_test, y_pred_classes, target_names=class_names))

    # Create visualizations with the updated function signature
    create_visualizations(self.history, y_test, y_pred_classes, y_pred, class_names)
    
    # Plot confusion matrix
    plt.figure(figsize=(12, 10))
    cm = confusion_matrix(y_test, y_pred_classes)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_names,
                yticklabels=class_names)
    plt.title('Confusion Matrix')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.xticks(rotation=45, ha='right')
    plt.yticks(rotation=45)
    plt.tight_layout()
    plt.show()
    
    # 6. t-SNE Visualization
    from sklearn.manifold import TSNE
    tsne = TSNE(n_components=2, random_state=42)
    X_test_flat = X_test.reshape(X_test.shape[0], -1)  # Flatten sequences
    features_2d = tsne.fit_transform(X_test_flat)

    plt.figure(figsize=(12, 8))
    scatter = plt.scatter(features_2d[:, 0], features_2d[:, 1], 
                     c=y_test, cmap='viridis', alpha=0.6)
    
    plt.colorbar(scatter)
    plt.title('Feature Space Visualization with t-SNE')
    plt.xlabel('t-SNE Component 1')
    plt.ylabel('t-SNE Component 2')
    plt.savefig('tsne_visualization.png')
    plt.show()
    

def get_advanced_callbacks():
    """Get advanced callbacks for better training"""
    checkpoint_dir = 'checkpoints'
    os.makedirs(checkpoint_dir, exist_ok=True)
    
    return [
        EarlyStopping(
            monitor='val_loss',
            patience=15,
            restore_best_weights=True,
            verbose=1
        ),
        ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.5,
            patience=5,
            min_lr=1e-6,
            verbose=1
        ),
        tf.keras.callbacks.ModelCheckpoint(
            filepath=os.path.join(checkpoint_dir, 'model_epoch_{epoch:02d}.weights.h5'),
            save_weights_only=True,
            save_best_only=True,
            monitor='val_loss',
            verbose=1
        ),
        tf.keras.callbacks.CSVLogger('training_log.csv')
    ]

def create_improved_model(input_shape, n_classes):
    """Create an improved model with better training speed"""
    # Use mixed precision for faster training
    tf.keras.mixed_precision.set_global_policy('mixed_float16')
    
    model = Sequential([
        Input(shape=input_shape),
        
        # First CNN block - reduced complexity
        Conv1D(64, kernel_size=3, padding='same', activation='relu', 
               kernel_regularizer=l2(0.01)),
        BatchNormalization(),
        MaxPooling1D(pool_size=2),
        Dropout(0.3),
        
        # Second CNN block - reduced complexity
        Conv1D(128, kernel_size=3, padding='same', activation='relu',
               kernel_regularizer=l2(0.01)),
        BatchNormalization(),
        MaxPooling1D(pool_size=2),
        Dropout(0.4),
        
        # Single LSTM layer for faster training
        Bidirectional(LSTM(64, kernel_regularizer=l2(0.01))),
        BatchNormalization(),
        Dropout(0.4),
        
        # Dense layers
        Dense(64, activation='relu', kernel_regularizer=l2(0.01)),
        BatchNormalization(),
        Dropout(0.5),
        
        Dense(n_classes, activation='softmax')
    ])
    
    # Use a more efficient optimizer
    optimizer = tf.keras.optimizers.AdamW(
        learning_rate=0.001,
        weight_decay=0.01
    )
    
    model.compile(
        optimizer=optimizer,
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )
    
    return model

def save_model_artifacts(model, history, preprocessor, X_test, y_test, y_pred, y_pred_proba, 
                        selected_features, model_config, base_dir='model_artifacts'):
    """Save model artifacts with error handling"""
    
    # Create base directory and subdirectories
    os.makedirs(base_dir, exist_ok=True)
    subdirs = ['weights', 'logs', 'data', 'config', 'checkpoints', 
               'evaluation', 'preprocessing', 'visualizations', 'deployment']
    
    for subdir in subdirs:
        os.makedirs(os.path.join(base_dir, subdir), exist_ok=True)
    
    try:
        # 1. Save model weights with correct extension
        model.save_weights(os.path.join(base_dir, 'weights', 'model_weights.weights.h5'))
        
        # 2. Save complete model with correct extension
        model.save(os.path.join(base_dir, 'deployment', 'complete_model.h5'))
        
        # 3. Save training logs
        pd.DataFrame(history.history).to_csv(
            os.path.join(base_dir, 'logs', 'training_logs.csv')
        )
        
        # 4. Save preprocessed data samples
        np.savez(os.path.join(base_dir, 'data', 'preprocessed_data.npz'),
                 X_test=X_test, y_test=y_test)
        
        # 5. Save model configuration
        with open(os.path.join(base_dir, 'config', 'model_config.json'), 'w') as f:
            json.dump(model_config, f, indent=4)
        
        # 6. Save preprocessing objects
        joblib.dump(preprocessor.scaler, 
                   os.path.join(base_dir, 'preprocessing', 'scaler.pkl'))
        joblib.dump(preprocessor.label_encoder, 
                   os.path.join(base_dir, 'preprocessing', 'label_encoder.pkl'))
        
        print(f"\nAll model artifacts saved successfully in {base_dir}")
        
    except Exception as e:
        logging.error(f"Error saving model artifacts: {str(e)}")
        raise
    
def create_distribution_visualizations(X, y, class_names, output_dir='model_artifacts/visualizations'):
    """
    Create t-SNE and PCA visualizations for the final data distribution
    
    Parameters:
    -----------
    X : numpy.ndarray
        The feature matrix
    y : numpy.ndarray
        The target labels
    class_names : list
        List of class names
    output_dir : str
        Directory to save the visualizations
    """
    os.makedirs(output_dir, exist_ok=True)
    
    # Flatten the sequences if dealing with 3D data (samples, timesteps, features)
    if len(X.shape) == 3:
        X_flat = X.reshape(X.shape[0], -1)
    else:
        X_flat = X
    
    # Standardize the features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_flat)
    
    # Create PCA visualization
    print("\nGenerating PCA visualization...")
    pca = PCA(n_components=2)
    X_pca = pca.fit_transform(X_scaled)
    
    plt.figure(figsize=(12, 8))
    scatter = plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y, cmap='tab10', alpha=0.6)
    plt.colorbar(scatter, label='Class')
    plt.title('PCA Visualization of Data Distribution')
    plt.xlabel(f'First Principal Component\nExplained Variance: {pca.explained_variance_ratio_[0]:.3f}')
    plt.ylabel(f'Second Principal Component\nExplained Variance: {pca.explained_variance_ratio_[1]:.3f}')
    
    # Add legend
    legend_elements = [plt.Line2D([0], [0], marker='o', color='w', 
                                 markerfacecolor=plt.cm.tab10(i / len(class_names)), 
                                 label=class_names[i], markersize=10)
                      for i in range(len(class_names))]
    plt.legend(handles=legend_elements, title='Classes', 
              bbox_to_anchor=(1.15, 1), loc='upper left')
    
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, 'pca_distribution.png'), 
                bbox_inches='tight', dpi=300)
    plt.close()
    
    # Create t-SNE visualization
    print("\nGenerating t-SNE visualization...")
    tsne = TSNE(n_components=2, random_state=42, perplexity=30, 
                n_iter=1000, learning_rate='auto')
    X_tsne = tsne.fit_transform(X_scaled)
    
    plt.figure(figsize=(12, 8))
    scatter = plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=y, cmap='tab10', alpha=0.6)
    plt.colorbar(scatter, label='Class')
    plt.title('t-SNE Visualization of Data Distribution')
    plt.xlabel('First t-SNE Component')
    plt.ylabel('Second t-SNE Component')
    
    # Add legend
    legend_elements = [plt.Line2D([0], [0], marker='o', color='w', 
                                 markerfacecolor=plt.cm.tab10(i / len(class_names)), 
                                 label=class_names[i], markersize=10)
                      for i in range(len(class_names))]
    plt.legend(handles=legend_elements, title='Classes', 
              bbox_to_anchor=(1.15, 1), loc='upper left')
    
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, 'tsne_distribution.png'), 
                bbox_inches='tight', dpi=300)
    plt.close()
    
    # Create distribution analysis report
    create_distribution_report(X_pca, X_tsne, y, class_names, pca, output_dir)

def create_distribution_report(X_pca, X_tsne, y, class_names, pca, output_dir):
    """Create a detailed report of the distribution analysis"""
    report_path = os.path.join(output_dir, 'distribution_analysis_report.txt')
    
    with open(report_path, 'w') as f:
        f.write("Data Distribution Analysis Report\n")
        f.write("================================\n\n")
        
        # Overall statistics
        f.write("1. Overall Statistics\n")
        f.write("-----------------\n")
        f.write(f"Total samples: {len(y)}\n")
        unique_classes, class_counts = np.unique(y, return_counts=True)
        f.write("\nClass distribution:\n")
        for cls, count in zip(class_names, class_counts):
            f.write(f"{cls}: {count} samples ({count/len(y)*100:.2f}%)\n")
        
        # PCA analysis
        f.write("\n2. PCA Analysis\n")
        f.write("-------------\n")
        f.write("Explained variance ratios:\n")
        f.write(f"First component: {pca.explained_variance_ratio_[0]:.4f}\n")
        f.write(f"Second component: {pca.explained_variance_ratio_[1]:.4f}\n")
        f.write(f"Total variance explained: {sum(pca.explained_variance_ratio_[:2])*100:.2f}%\n")
        
        # Class separation analysis
        f.write("\n3. Class Separation Analysis\n")
        f.write("-------------------------\n")
        f.write("Mean distances between class centers:\n")
        
        # Calculate class centers in PCA space
        centers_pca = np.array([X_pca[y == i].mean(axis=0) for i in range(len(class_names))])
        centers_tsne = np.array([X_tsne[y == i].mean(axis=0) for i in range(len(class_names))])
        
        # Calculate distances between class centers
        for i in range(len(class_names)):
            for j in range(i + 1, len(class_names)):
                dist_pca = np.linalg.norm(centers_pca[i] - centers_pca[j])
                dist_tsne = np.linalg.norm(centers_tsne[i] - centers_tsne[j])
                f.write(f"\n{class_names[i]} vs {class_names[j]}:\n")
                f.write(f"  PCA distance: {dist_pca:.4f}\n")
                f.write(f"  t-SNE distance: {dist_tsne:.4f}\n")

# Main execution
def main():
    try:
        # Initialize preprocessor
        preprocessor = DataPreprocessor()
        
        # Define the selected features
        selected_features = [
            # Original High-Importance Features
            ' Flow IAT Min',
            ' Bwd Avg Packets/Bulk',
            'Fwd PSH Flags',
            'Init_Win_bytes_forward',
            ' PSH Flag Count',
            ' Bwd URG Flags',
            ' Fwd Packet Length Mean',
            ' Fwd IAT Std',
            'Fwd Packets/s',
            'Bwd Avg Bulk Rate',
            ' Destination Port',
            'Idle Mean',
            ' Packet Length Mean',
            # Additional Features for Complete Coverage
            ' Flow Duration',
            'Active Mean',
            ' Active Min',
            ' Total Length of Bwd Packets',
            ' Bwd Header Length',
            ' Subflow Fwd Bytes',
            ' Total Fwd Packets'
        ]
        
        # Load and preprocess data
        print("\nLoading dataset...")
        data = load_dataset()
        
        print("\nCleaning data...")
        data = preprocessor.load_and_clean_data(data)
        
        # Print available columns
        print("\nAvailable columns in dataset:")
        print(data.columns.tolist())
        
        # Verify all selected features are in the dataset
        missing_features = [f for f in selected_features if f not in data.columns]
        if missing_features:
            raise ValueError(f"Missing features in dataset: {missing_features}")
        
        print("\nUsing selected features:")
        print(selected_features)
        
        # Prepare features and labels
        print("\nPreparing features and labels...")
        X = data[selected_features]
        y = data[' Label']
        
        # Free memory
        del data
        
        # Scale features
        print("\nScaling features...")
        X_scaled = preprocessor.scaler.fit_transform(X)
        y_encoded = preprocessor.label_encoder.fit_transform(y)
        
        # Free more memory
        del X
        
        # Balance dataset
        print("\nBalancing dataset...")
        X_balanced, y_balanced = preprocessor.balance_dataset(
            X_scaled, y_encoded,
            max_samples_per_class=10000
        )
        
        # Create sequences
        print("\nCreating sequences...")
        X_seq, y = preprocessor.create_sequences(X_balanced, y_balanced)
        
        # Split data
        print("\nSplitting data...")
        X_train, X_test, y_train, y_test = train_test_split(
            X_seq, y, test_size=0.2, random_state=42, stratify=y
        )
        
        # Validate data shapes before training
        print("\nValidating data shapes:")
        print(f"X_train shape: {X_train.shape}")
        print(f"y_train shape: {y_train.shape}")
        print(f"X_test shape: {X_test.shape}")
        print(f"y_test shape: {y_test.shape}")
        
        if len(X_train.shape) != 3:
            raise ValueError(f"Expected X_train to be 3D, got shape {X_train.shape}")
        
        # Define input shape and number of classes
        input_shape = (X_train.shape[1], X_train.shape[2])
        n_classes = len(np.unique(y_train))
        
        # Save data shapes for model recovery
        np.savez('data_shapes.npz',
                 input_shape=input_shape,
                 n_classes=n_classes)
        
        # Create and train improved model
        print("\nCreating and training improved model...")
        
        # Create model configuration
        model_config = {
            'architecture': 'CNN-LSTM',
            'input_shape': input_shape,
            'n_classes': n_classes,
            'hyperparameters': {
                'initial_learning_rate': 0.001,
                'batch_size': 32,
                'epochs': 50
            }
        }
        
        # Create model
        model = create_improved_model(input_shape, n_classes)
        
        # Print model summary
        print("\nModel Architecture:")
        model.summary()
        
        # Save initial model weights
        os.makedirs('checkpoints', exist_ok=True)
        model.save_weights('checkpoints/initial_weights.weights.h5')
        
        # Train model with try-except
        try:
            trainer = ModelTrainer(model)
            history = trainer.train_with_improvements(X_train, y_train)
            
            # Save training history immediately
            with open('training_history.pkl', 'wb') as f:
                pickle.dump(history.history, f)
                
        except Exception as train_error:
            logging.error(f"Error during training: {str(train_error)}")
            # Try to load last checkpoint
            latest_checkpoint = tf.train.latest_checkpoint('checkpoints')
            if latest_checkpoint:
                print("\nAttempting to load last checkpoint...")
                model.load_weights(latest_checkpoint)
            raise
            
        # Generate predictions with error handling
        try:
            y_pred = model.predict(X_test, batch_size=32)
            y_pred_classes = np.argmax(y_pred, axis=1)
            
            # Save predictions immediately
            np.savez('predictions.npz',
                    y_pred=y_pred,
                    y_pred_classes=y_pred_classes)
                    
        except Exception as pred_error:
            logging.error(f"Error during prediction: {str(pred_error)}")
            raise
        
        # Save all model artifacts
        save_model_artifacts(
            model=model,
            history=history,
            preprocessor=preprocessor,
            X_test=X_test,
            y_test=y_test,
            y_pred=y_pred_classes,
            y_pred_proba=y_pred,
            selected_features=selected_features,
            model_config=model_config
        )
        
        # Continue with evaluation
        trainer.evaluate(X_test, y_test, preprocessor.label_encoder)
        #Distribution Visualizations
        print("\nGenerating distribution visualizations...")
        create_distribution_visualizations(
            X_test.reshape(X_test.shape[0], -1),  # Flatten sequences
            y_test,
            preprocessor.label_encoder.classes_,
            output_dir='model_artifacts/visualizations'
        )
        
        print("\nVisualization artifacts have been saved to 'model_artifacts/visualizations'")

        
    except Exception as e:
        logging.error(f"Error in main execution: {str(e)}")
        raise
    
def objective(trial, X_train, y_train, input_shape, n_classes):
    """Optuna objective function for hyperparameter optimization"""
    # Define hyperparameter search space
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-2, log=True)
    lstm_units_1 = trial.suggest_int('lstm_units_1', 32, 128)
    lstm_units_2 = trial.suggest_int('lstm_units_2', 16, 64)
    dropout_rate = trial.suggest_float('dropout_rate', 0.1, 0.5)
    l2_reg = trial.suggest_float('l2_reg', 1e-4, 1e-2, log=True)
    batch_size = trial.suggest_int('batch_size', 32, 128)
    
    # Create model with trial parameters
    model = Sequential([
        Conv1D(64, kernel_size=3, padding='same', activation='relu',
               kernel_regularizer=l2(l2_reg), input_shape=input_shape),
        BatchNormalization(),
        GlobalAveragePooling1D(),
        Dropout(dropout_rate),
        
        Bidirectional(LSTM(lstm_units_1, return_sequences=True, 
                          kernel_regularizer=l2(l2_reg))),
        BatchNormalization(),
        Dropout(dropout_rate),
        
        Bidirectional(LSTM(lstm_units_2, kernel_regularizer=l2(l2_reg))),
        BatchNormalization(),
        Dropout(dropout_rate),
        
        Dense(32, activation='relu', kernel_regularizer=l2(l2_reg)),
        BatchNormalization(),
        Dropout(dropout_rate),
        Dense(n_classes, activation='softmax')
    ])
    
    model.compile(
        optimizer=Adam(learning_rate=learning_rate),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )
    
    # Train with early stopping
    early_stop = EarlyStopping(monitor='val_loss', patience=5)
    history = model.fit(
        X_train, y_train,
        batch_size=batch_size,
        epochs=50,
        validation_split=0.2,
        callbacks=[early_stop],
        verbose=0
    )
    
    return history.history['val_loss'][-1]

if __name__ == "__main__":
    main()


Loading dataset...
Found 8 CSV files: ['Friday-WorkingHours-Afternoon-DDos.csv', 'Friday-WorkingHours-Afternoon-PortScan.csv', 'Friday-WorkingHours-Morning.csv', 'Monday-WorkingHours.csv', 'Thursday-WorkingHours-Afternoon-Infilteration.csv', 'Thursday-WorkingHours-Morning-WebAttacks.csv', 'Tuesday-WorkingHours.csv', 'Wednesday-workingHours.csv']

Loading Friday-WorkingHours-Afternoon-DDos.csv...
Processing chunk 1...
Processing chunk 2...
Processing chunk 3...
Processing chunk 4...
Processing chunk 5...
Shape: (225745, 79)

Loading Friday-WorkingHours-Afternoon-PortScan.csv...


Processing chunk 1...
Processing chunk 2...
Processing chunk 3...
Processing chunk 4...
Processing chunk 5...
Processing chunk 6...
Shape: (286467, 79)

Loading Friday-WorkingHours-Morning.csv...
Processing chunk 1...
Processing chunk 2...
Processing chunk 3...
Processing chunk 4...
Shape: (191033, 79)

Loading Monday-WorkingHours.csv...
Processing chunk 1...
Processing chunk 2...
Processing chunk 3...
Processing chunk 4...
Processing chunk 5...
Processing chunk 6...
Processing chunk 7...
Processing chunk 8...
Processing chunk 9...
Processing chunk 10...
Processing chunk 11...
Shape: (529918, 79)

Loading Thursday-WorkingHours-Afternoon-Infilteration.csv...
Processing chunk 1...
Processing chunk 2...
Processing chunk 3...
Processing chunk 4...
Processing chunk 5...
Processing chunk 6...
Shape: (288602, 79)

Loading Thursday-WorkingHours-Morning-WebAttacks.csv...
Processing chunk 1...
Processing chunk 2...
Processing chunk 3...
Processing chunk 4...
Shape: (170366, 79)

Loading Tuesday-

  super()._check_params_vs_input(X, default_n_init=3)
  super()._check_params_vs_input(X, default_n_init=3)
  super()._check_params_vs_input(X, default_n_init=3)
  super()._check_params_vs_input(X, default_n_init=3)
  super()._check_params_vs_input(X, default_n_init=3)
  super()._check_params_vs_input(X, default_n_init=3)
  super()._check_params_vs_input(X, default_n_init=3)



Final class distribution:
Class 0: 10000 samples
Class 1: 10003 samples
Class 2: 10000 samples
Class 3: 10000 samples
Class 4: 10000 samples
Class 5: 10006 samples
Class 6: 10004 samples
Class 7: 10001 samples
Class 8: 10000 samples
Class 9: 10000 samples
Class 10: 10000 samples
Class 11: 10003 samples
Class 12: 10000 samples
Class 13: 10000 samples
Class 14: 10000 samples

Creating sequences...

Creating sequences for LSTM...
Original input shape: (150017, 20)
Final sequence shape: (150008, 10, 20)

Splitting data...

Validating data shapes:
X_train shape: (120006, 10, 20)
y_train shape: (120006,)
X_test shape: (30002, 10, 20)
y_test shape: (30002,)

Creating and training improved model...

Model Architecture:


W0000 00:00:1738583542.528629    1595 gpu_device.cc:2344] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


Epoch 1/100
[1m5999/6001[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 10ms/step - accuracy: 0.8834 - loss: 1.8578
Epoch 1: val_loss improved from inf to 0.28417, saving model to checkpoints/model_epoch_01.weights.h5
[1m6001/6001[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m66s[0m 10ms/step - accuracy: 0.8834 - loss: 1.8572 - val_accuracy: 0.9919 - val_loss: 0.2842 - learning_rate: 0.0010
Epoch 2/100
[1m5997/6001[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 10ms/step - accuracy: 0.9741 - loss: 0.3428
Epoch 2: val_loss improved from 0.28417 to 0.27023, saving model to checkpoints/model_epoch_02.weights.h5
[1m6001/6001[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 10ms/step - accuracy: 0.9741 - loss: 0.3428 - val_accuracy: 0.9939 - val_loss: 0.2702 - learning_rate: 0.0010
Epoch 3/100
[1m5996/6001[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 10ms/step - accuracy: 0.9757 - loss: 0.3276
Epoch 3: val_loss improved from 0.27023 to 0.24819, saving mod

You are saving your model as an HDF5 file via `model.save()` or `keras.saving.save_model(model)`. This file format is considered legacy. We recommend using instead the native Keras format, e.g. `model.save('my_model.keras')` or `keras.saving.save_model(model, 'my_model.keras')`. 
You are saving your model as an HDF5 file via `model.save()` or `keras.saving.save_model(model)`. This file format is considered legacy. We recommend using instead the native Keras format, e.g. `model.save('my_model.keras')` or `keras.saving.save_model(model, 'my_model.keras')`. 
You are saving your model as an HDF5 file via `model.save()` or `keras.saving.save_model(model)`. This file format is considered legacy. We recommend using instead the native Keras format, e.g. `model.save('my_model.keras')` or `keras.saving.save_model(model, 'my_model.keras')`. 



All model artifacts saved successfully in model_artifacts
[1m938/938[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step

Test Results:
Accuracy: 0.9976
Precision: 0.9976
Recall: 0.9976
F1 Score: 0.9976

Classification Report:
                            precision    recall  f1-score   support

                    BENIGN       1.00      1.00      1.00      1998
                       Bot       1.00      1.00      1.00      2001
                      DDoS       1.00      1.00      1.00      2000
             DoS GoldenEye       1.00      1.00      1.00      2000
                  DoS Hulk       1.00      1.00      1.00      2000
          DoS Slowhttptest       1.00      1.00      1.00      2001
             DoS slowloris       1.00      1.00      1.00      2001
               FTP-Patator       1.00      1.00      1.00      2000
                Heartbleed       1.00      1.00      1.00      2000
              Infiltration       1.00      1.00      1.00      2000
           