In [None]:
#Data Cleaning and Preprocessing


import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder, RobustScaler
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

class NetworkDataPreprocessor:
    """
    Comprehensive preprocessor for CIC-IDS2017 network traffic data.

    Every step works on ``self.df`` (populated by ``load_data``) and either
    mutates it or rebinds it to a new frame. ``preprocess_pipeline`` runs the
    steps in order; each step can also be called on its own once ``self.df``
    has been set.
    """

    def __init__(self):
        self.scaler = RobustScaler()  # Better for outliers than StandardScaler
        self.label_encoder = LabelEncoder()
        self.feature_columns = None
        self.target_column = 'Label'
        self.processed_data = None
        # Defined up front so save_processed_data / get_preprocessing_summary
        # can run their ``is not None`` checks before any data is loaded.
        self.df = None

    def load_data(self, file_path):
        """Load the dataset from a CSV file into ``self.df``.

        Args:
            file_path: Path passed straight to ``pandas.read_csv``.

        Returns:
            True when the file was read, False when reading raised (the
            error is printed, not re-raised).
        """
        try:
            self.df = pd.read_csv(file_path)
            print(f"✅ Loaded {self.df.shape[0]:,} records with {self.df.shape[1]} features")
            return True
        except Exception as e:
            # Deliberate catch-all: callers branch on the boolean result.
            print(f"❌ Error loading data: {e}")
            return False

    def clean_column_names(self):
        """Clean and standardize column names.

        Strips surrounding whitespace, then replaces spaces with ``_``,
        ``/`` with ``_per_`` and ``-`` with ``_``.

        Returns:
            The cleaned column names as a list.
        """
        cleaned = self.df.columns.str.strip()
        # regex=False: these are literal characters, and the default for
        # ``regex`` differs between pandas versions.
        for old, new in ((' ', '_'), ('/', '_per_'), ('-', '_')):
            cleaned = cleaned.str.replace(old, new, regex=False)
        self.df.columns = cleaned

        print(f"✅ Cleaned {len(self.df.columns)} column names")
        return self.df.columns.tolist()

    def _drop_unusable_rows(self):
        """Drop rows that are entirely empty or have no target label.

        Such rows cannot be imputed meaningfully: a missing label would
        otherwise compare unequal to 'BENIGN' and be counted as an attack.

        Returns:
            Number of rows removed.
        """
        before = len(self.df)
        cleaned = self.df.dropna(how='all')
        if self.target_column in cleaned.columns:
            cleaned = cleaned.dropna(subset=[self.target_column])
        self.df = cleaned
        return before - len(cleaned)

    def handle_missing_values(self):
        """Handle missing values in the dataset.

        Rows that are completely empty or lack a target label are dropped.
        In the remaining rows, numeric columns are filled with the column
        median and object columns with the column mode ('Unknown' when no
        mode exists). The target column itself is never imputed.

        Returns:
            Per-column missing-value counts as measured *before* cleaning.
        """
        missing_summary = self.df.isnull().sum()
        missing_cols = missing_summary[missing_summary > 0]

        if not missing_cols.empty:
            print(f"Found missing values in {len(missing_cols)} columns:")
            total_rows = len(self.df)
            for col, count in missing_cols.items():
                pct = (count / total_rows) * 100
                print(f"  {col}: {count:,} ({pct:.2f}%)")

            dropped = self._drop_unusable_rows()
            if dropped:
                print(f"  Dropped {dropped:,} rows that were empty or unlabeled")

            numeric_cols = set(self.df.select_dtypes(include=[np.number]).columns)
            categorical_cols = set(self.df.select_dtypes(include=['object']).columns)

            # Only columns that still contain gaps after the row drop.
            still_missing = self.df.columns[self.df.isnull().any()]
            for col in still_missing:
                if col == self.target_column:
                    continue
                series = self.df[col]
                if col in numeric_cols:
                    fill_value = series.median()
                elif col in categorical_cols:
                    modes = series.mode()
                    fill_value = modes.iloc[0] if not modes.empty else 'Unknown'
                else:
                    continue
                # Assign back instead of chained ``inplace=True``: the chained
                # form does not update the frame under pandas Copy-on-Write.
                self.df[col] = series.fillna(fill_value)

        print("✅ Missing values handled")
        return missing_summary

    def handle_infinite_values(self):
        """Handle infinite values in numeric columns.

        Each +/-inf is replaced by the median of the column's finite values.

        Returns:
            Dict mapping column name to the number of infinite values found.
        """
        numeric_cols = self.df.select_dtypes(include=[np.number]).columns
        inf_counts = {}

        for col in numeric_cols:
            series = self.df[col]
            inf_mask = np.isinf(series)
            inf_count = int(inf_mask.sum())
            if not inf_count:
                continue
            inf_counts[col] = inf_count

            # Turn infinities into NaN first so they do not skew the median.
            finite = series.mask(inf_mask)
            self.df[col] = finite.fillna(finite.median())

        if inf_counts:
            print(f"Handled infinite values in {len(inf_counts)} columns:")
            for col, count in inf_counts.items():
                print(f"  {col}: {count:,} infinite values")
        else:
            print("✅ No infinite values found")

        return inf_counts

    def remove_duplicates(self):
        """Remove duplicate records.

        Returns:
            Number of rows removed.
        """
        initial_count = len(self.df)
        self.df.drop_duplicates(inplace=True)
        removed = initial_count - len(self.df)

        if removed > 0:
            print(f"✅ Removed {removed:,} duplicate records")
        else:
            print("✅ No duplicates found")

        return removed

    def encode_categorical_features(self):
        """Encode categorical features.

        Every object-dtype column except the target is label-encoded in
        place (values are cast to ``str`` first).

        Returns:
            List of the column names that were encoded.
        """
        categorical_cols = [
            col
            for col in self.df.select_dtypes(include=['object']).columns
            if col != self.target_column
        ]

        if categorical_cols:
            print(f"Found {len(categorical_cols)} categorical columns to encode:")
            for col in categorical_cols:
                unique_vals = self.df[col].nunique()
                print(f"  {col}: {unique_vals} unique values")

                # Use Label Encoding for now (you might want One-Hot for low cardinality)
                le = LabelEncoder()
                self.df[col] = le.fit_transform(self.df[col].astype(str))

        print("✅ Categorical encoding complete")
        return categorical_cols

    def create_binary_target(self):
        """Create binary target variable (Normal vs Attack).

        Adds ``Is_Attack`` (1 when the label is not 'BENIGN') and
        ``Label_Encoded`` (multi-class label via ``self.label_encoder``).
        Rows without a label are dropped first so they are not counted as
        attacks.

        Returns:
            ``value_counts()`` of ``Is_Attack``, or None when the target
            column is absent.
        """
        if self.target_column not in self.df.columns:
            print(f"❌ Target column '{self.target_column}' not found!")
            return None

        dropped = self._drop_unusable_rows()
        if dropped:
            print(f"⚠️ Dropped {dropped:,} rows with missing labels")

        labels = self.df[self.target_column]

        # Create binary classification target
        self.df['Is_Attack'] = (labels != 'BENIGN').astype(int)

        # Also encode the multi-class target
        self.df['Label_Encoded'] = self.label_encoder.fit_transform(labels)

        total = len(self.df)
        attack_count = int(self.df['Is_Attack'].sum())
        normal_count = total - attack_count
        # Guard against an empty frame when computing percentages.
        to_pct = 100 / total if total else 0.0

        print(f"✅ Binary target created:")
        print(f"  Normal: {normal_count:,} ({normal_count * to_pct:.1f}%)")
        print(f"  Attack: {attack_count:,} ({attack_count * to_pct:.1f}%)")

        return self.df['Is_Attack'].value_counts()

    def feature_engineering(self):
        """Create additional features for anomaly detection.

        Adds ``Packet_Ratio`` ((fwd + 1) / (bwd + 1) using the first matching
        forward/backward packet columns) and ``Log_<col>`` (``log1p`` of the
        non-negative part) for up to two columns whose name contains both
        'byte' and 'per'. Failures are reported and do not abort.

        Returns:
            List of the feature names that were added.
        """
        lowered = [(col, col.lower()) for col in self.df.columns]
        new_features = []

        # Example feature engineering (adjust based on actual column names)
        try:
            # Packet rate features
            fwd_cols = [col for col, low in lowered if 'fwd_packet' in low]
            bwd_cols = [
                col for col, low in lowered
                if 'bwd_packet' in low or 'backward_packet' in low
            ]
            if fwd_cols and bwd_cols:
                # +1 on both sides avoids division by zero.
                self.df['Packet_Ratio'] = (self.df[fwd_cols[0]] + 1) / (self.df[bwd_cols[0]] + 1)
                new_features.append('Packet_Ratio')

            # Byte rate features
            byte_cols = [col for col, low in lowered if 'byte' in low and 'per' in low]
            # Create log-transformed features for better distribution
            for col in byte_cols[:2]:  # Limit to avoid too many features
                new_col = f"Log_{col}"
                self.df[new_col] = np.log1p(self.df[col].clip(lower=0))
                new_features.append(new_col)

            print(f"✅ Created {len(new_features)} new features:")
            for feature in new_features:
                print(f"  {feature}")

        except Exception as e:
            # Best-effort step: report and keep whatever was created so far.
            print(f"⚠️ Feature engineering partially failed: {e}")

        return new_features

    def scale_features(self):
        """Scale numerical features.

        Fits ``self.scaler`` on every numeric column except the target
        columns and overwrites those columns with the scaled values.
        ``self.feature_columns`` is set to the scaled column names.

        Returns:
            List of the numeric feature column names.
        """
        # Get feature columns (exclude target columns)
        exclude_cols = {self.target_column, 'Is_Attack', 'Label_Encoded'}
        candidates = [col for col in self.df.columns if col not in exclude_cols]

        # Select only numeric columns for scaling
        numeric_features = (
            self.df[candidates].select_dtypes(include=[np.number]).columns.tolist()
        )

        # The scaler cannot be fitted on zero rows.
        if numeric_features and len(self.df) > 0:
            self.df[numeric_features] = self.scaler.fit_transform(self.df[numeric_features])
            print(f"✅ Scaled {len(numeric_features)} numerical features")

        self.feature_columns = numeric_features
        return numeric_features

    def sample_data(self, sample_size=100000, random_state=42):
        """Sample data for faster processing if dataset is too large.

        When ``Is_Attack`` exists, draws up to 80% of ``sample_size`` from
        normal rows and up to 20% from attack rows; a class with fewer rows
        than its quota contributes all of its rows, so the result may be
        smaller than ``sample_size``. Otherwise a plain random sample is
        taken.

        Args:
            sample_size: Target number of rows.
            random_state: Seed used for every sampling call.

        Returns:
            Number of rows in ``self.df`` afterwards.
        """
        if len(self.df) > sample_size:
            if 'Is_Attack' in self.df.columns:
                def take(frame, quota):
                    # Cap at the class size: sampling without replacement
                    # raises when asked for more rows than exist.
                    return frame.sample(
                        n=min(quota, len(frame)),
                        random_state=random_state,
                        replace=False,
                    )

                is_attack = self.df['Is_Attack']
                normal_sample = take(self.df[is_attack == 0], int(sample_size * 0.8))
                attack_sample = take(self.df[is_attack == 1], int(sample_size * 0.2))
                # Shuffle so the two classes are interleaved.
                self.df = pd.concat([normal_sample, attack_sample]).sample(
                    frac=1, random_state=random_state
                )
            else:
                self.df = self.df.sample(n=sample_size, random_state=random_state)

            print(f"✅ Sampled to {len(self.df):,} records")
        else:
            print(f"✅ Dataset size OK: {len(self.df):,} records")

        return len(self.df)

    def preprocess_pipeline(self, file_path, sample_size=100000):
        """Complete preprocessing pipeline.

        Args:
            file_path: CSV file to load.
            sample_size: Maximum number of rows to keep.

        Returns:
            The processed DataFrame, or None when loading failed.
        """
        # Step 1: Load data
        if not self.load_data(file_path):
            return None

        # Step 2: Clean column names
        self.clean_column_names()

        # Step 3: Handle missing and infinite values
        self.handle_missing_values()
        self.handle_infinite_values()

        # Step 4: Remove duplicates
        self.remove_duplicates()

        # Step 5: Sample data if too large
        # NOTE(review): 'Is_Attack' is only created in step 6, so this call
        # always takes the plain random-sample branch. Swap steps 5 and 6 if
        # the 80/20 class split is what is wanted here.
        self.sample_data(sample_size)

        # Step 6: Create target variables
        self.create_binary_target()

        # Step 7: Encode categorical features
        self.encode_categorical_features()

        # Step 8: Feature engineering
        self.feature_engineering()

        # Step 9: Scale features
        self.scale_features()

        self.processed_data = self.df

        print("\n" + "=" * 50)

        print(f"Final dataset shape: {self.df.shape}")
        print(f"Features for modeling: {len(self.feature_columns)}")

        return self.df

    def save_processed_data(self, output_path="processed_data.csv"):
        """Save the processed dataset as CSV (without the index).

        Args:
            output_path: Destination file path.
        """
        if self.df is None:
            print("⚠️ No data to save")
            return
        self.df.to_csv(output_path, index=False)
        print(f"💾 Processed data saved to: {output_path}")

    def get_preprocessing_summary(self):
        """Get summary of preprocessing steps.

        Returns:
            Dict with ``total_records``, ``total_features``,
            ``attack_percentage`` and ``feature_columns``, or None when no
            data has been loaded.
        """
        if self.df is None:
            return None
        has_target = 'Is_Attack' in self.df.columns
        return {
            'total_records': len(self.df),
            'total_features': len(self.feature_columns) if self.feature_columns else 0,
            'attack_percentage': self.df['Is_Attack'].mean() * 100 if has_target else 0,
            'feature_columns': self.feature_columns,
        }

def main(file_path=None, output_path="processed_network_data.csv", sample_size=100000):
    """Main function to run preprocessing.

    Runs the full pipeline on ``file_path``, saves the result to
    ``output_path`` and prints a short summary. Nothing is saved or
    summarised when the pipeline returns None (the load failed).

    Args:
        file_path: CSV file to process. Defaults to the original
            hard-coded raw CIC-IDS2017 location when omitted.
        output_path: Where the processed CSV is written.
        sample_size: Maximum number of rows kept by the pipeline.
    """
    print(" CIC-IDS2017 Data Preprocessing")
    print("=" * 60)

    if file_path is None:
        # Original default location; pass ``file_path`` to run elsewhere.
        file_path = r"C:\Users\S\Desktop\SecureNet\data\raw\CICIDS2017_FULL.csv"

    # Initialize preprocessor
    preprocessor = NetworkDataPreprocessor()

    # Run preprocessing pipeline
    processed_df = preprocessor.preprocess_pipeline(file_path, sample_size=sample_size)
    if processed_df is None:
        return

    # Save processed data
    preprocessor.save_processed_data(output_path)

    # Get summary
    summary = preprocessor.get_preprocessing_summary()
    print("\n📊 PREPROCESSING SUMMARY:")
    print(f"Records: {summary['total_records']:,}")
    print(f"Features: {summary['total_features']}")
    print(f"Attack Rate: {summary['attack_percentage']:.2f}%")
        
        
   

# Entry-point guard: call main() only when this code runs as the top-level
# module, not when it is imported.
if __name__ == "__main__":
    main()

 CIC-IDS2017 Data Preprocessing
✅ Loaded 3,119,345 records with 86 features
✅ Cleaned 86 column names
Found missing values in 85 columns:
  Flow_ID: 288,602 (9.25%)
  Source_IP: 288,602 (9.25%)
  Source_Port: 288,602 (9.25%)
  Destination_IP: 288,602 (9.25%)
  Destination_Port: 288,602 (9.25%)
  Protocol: 288,602 (9.25%)
  Timestamp: 288,602 (9.25%)
  Flow_Duration: 288,602 (9.25%)
  Total_Fwd_Packets: 288,602 (9.25%)
  Total_Backward_Packets: 288,602 (9.25%)
  Total_Length_of_Fwd_Packets: 288,602 (9.25%)
  Total_Length_of_Bwd_Packets: 288,602 (9.25%)
  Fwd_Packet_Length_Max: 288,602 (9.25%)
  Fwd_Packet_Length_Min: 288,602 (9.25%)
  Fwd_Packet_Length_Mean: 288,602 (9.25%)
  Fwd_Packet_Length_Std: 288,602 (9.25%)
  Bwd_Packet_Length_Max: 288,602 (9.25%)
  Bwd_Packet_Length_Min: 288,602 (9.25%)
  Bwd_Packet_Length_Mean: 288,602 (9.25%)
  Bwd_Packet_Length_Std: 288,602 (9.25%)
  Flow_Bytes_per_s: 289,960 (9.30%)
  Flow_Packets_per_s: 288,602 (9.25%)
  Flow_IAT_Mean: 288,602 (9.25%)
  Flo