In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
import warnings

# Notebook-wide configuration: pandas display, warning filters, seaborn theme
pd.options.display.max_columns = None        # show every column when a frame is displayed
warnings.filterwarnings(action='ignore')     # suppresses ALL warnings for the rest of the session
sns.set_style(style='whitegrid')

# Confirmation banner
print("Libraries imported successfully!")
print("=" * 60)

Libraries imported successfully!


In [3]:
# Load the dataset from the raw-data directory
df = pd.read_csv('../data/raw/cicids2017_cleaned.csv')

print(f"Dataset loaded: {df.shape[0]:,} rows, {df.shape[1]} columns")

# Create binary labels immediately: 0 = 'Normal Traffic', 1 = any other 'Attack Type'.
# The comparison is vectorized instead of calling a Python lambda once per row;
# missing values compare as "not equal", so they are still labelled 1 as before.
df['Binary_Label'] = (df['Attack Type'] != 'Normal Traffic').astype('int64')

# Count each class once and reuse the counts for both the total and the percentage
n_normal = (df['Binary_Label'] == 0).sum()
n_attack = (df['Binary_Label'] == 1).sum()

print("Binary labels created successfully!")
print(f"  - Normal Traffic (0): {n_normal:,} ({n_normal/len(df)*100:.2f}%)")
print(f"  - Attack (1): {n_attack:,} ({n_attack/len(df)*100:.2f}%)")

Dataset loaded: 2,520,751 rows, 53 columns
Binary labels created successfully!
  - Normal Traffic (0): 2,095,057 (83.11%)
  - Attack (1): 425,694 (16.89%)


In [4]:
# Build the model inputs: feature matrix X and binary target y
print("Separating features and target...")
print("=" * 60)

# The target is the derived binary label; both label columns are excluded from
# the features so that neither the original class name nor the target leaks into X
y = df['Binary_Label']
X = df.drop(columns=['Attack Type', 'Binary_Label'])

print(f"Features (X): {X.shape}")
print(f"Target (y): {y.shape}")
print(f"\nFeature columns: {list(X.columns[:10])}...")  # preview of the first 10 names
print(f"Total features: {X.shape[1]}")

Separating features and target...
Features (X): (2520751, 52)
Target (y): (2520751,)

Feature columns: ['Destination Port', 'Flow Duration', 'Total Fwd Packets', 'Total Length of Fwd Packets', 'Fwd Packet Length Max', 'Fwd Packet Length Min', 'Fwd Packet Length Mean', 'Fwd Packet Length Std', 'Bwd Packet Length Max', 'Bwd Packet Length Min']...
Total features: 52


In [5]:
# Check for highly correlated (potentially redundant) features
print("Checking feature correlations...")
print("="*60)

# Single source of truth for the cut-off, so the filter below and the printed
# messages cannot drift apart
CORR_THRESHOLD = 0.95

# Absolute pairwise correlation between the feature columns
correlation_matrix = X.corr().abs()

# Keep only the entries strictly above the diagonal (k=1): each pair is examined
# once and a column's correlation with itself is ignored. Masked cells become NaN.
upper_mask = np.triu(np.ones(correlation_matrix.shape, dtype=bool), k=1)
upper_triangle = correlation_matrix.where(upper_mask)

# A column is flagged when it exceeds the threshold against any column that
# precedes it in X. NaN compares as False, so masked cells never trigger a flag.
exceeds_threshold = (upper_triangle > CORR_THRESHOLD).any()
high_corr_features = exceeds_threshold[exceeds_threshold].index.tolist()

print(f"Features with correlation > {CORR_THRESHOLD}: {len(high_corr_features)}")
if high_corr_features:
    print(f"Highly correlated features: {high_corr_features[:10]}")  # Show first 10
    print("\n⚠ Note: We'll handle these during feature selection later")
else:
    print("✓ No highly correlated features found!")

Checking feature correlations...
Features with correlation > 0.95: 12
Highly correlated features: ['Fwd Packet Length Std', 'Bwd Packet Length Mean', 'Bwd Packet Length Std', 'Fwd IAT Total', 'Fwd IAT Max', 'Fwd Packets/s', 'Packet Length Std', 'Average Packet Size', 'Subflow Fwd Bytes', 'Idle Mean']

⚠ Note: We'll handle these during feature selection later


In [6]:
# Split data into training and testing sets with stratification
print("Splitting data into train and test sets...")
print("="*60)

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,      # 80% train, 20% test
    random_state=42,    # For reproducibility
    stratify=y          # Maintains class distribution
)

print(f"Training set: {X_train.shape[0]:,} samples")
print(f"Testing set: {X_test.shape[0]:,} samples")

# Verify class distribution is maintained. One loop handles both splits so the
# two reports cannot diverge, and each class count is computed only once.
for split_name, split_labels in (("training", y_train), ("testing", y_test)):
    normal_count = (split_labels == 0).sum()
    attack_count = (split_labels == 1).sum()
    print(f"\nClass distribution in {split_name} set:")
    print(f"  - Normal Traffic (0): {normal_count:,} ({normal_count/len(split_labels)*100:.2f}%)")
    print(f"  - Attack (1): {attack_count:,} ({attack_count/len(split_labels)*100:.2f}%)")

Splitting data into train and test sets...
Training set: 2,016,600 samples
Testing set: 504,151 samples

Class distribution in training set:
  - Normal Traffic (0): 1,676,045 (83.11%)
  - Attack (1): 340,555 (16.89%)

Class distribution in testing set:
  - Normal Traffic (0): 419,012 (83.11%)
  - Attack (1): 85,139 (16.89%)


In [7]:
# Feature Scaling using StandardScaler
print("Applying Feature Scaling (Standardization)...")
print("="*60)

# Initialize StandardScaler
scaler = StandardScaler()

# IMPORTANT: Fit ONLY on training data to prevent data leakage
train_scaled_values = scaler.fit_transform(X_train)
test_scaled_values = scaler.transform(X_test)  # Only transform, don't fit

# Convert back to DataFrames. The original row index is carried over so the scaled
# features stay label-aligned with y_train / y_test, which keep the shuffled index
# from the split; without it the frames would get a fresh 0..n-1 index and any
# index-aligned operation against the targets would pair the wrong rows.
X_train_scaled = pd.DataFrame(train_scaled_values, columns=X_train.columns, index=X_train.index)
X_test_scaled = pd.DataFrame(test_scaled_values, columns=X_test.columns, index=X_test.index)

# Cheap invariant: features and targets must describe the same rows in the same order
assert X_train_scaled.index.equals(y_train.index), "scaled train features misaligned with y_train"
assert X_test_scaled.index.equals(y_test.index), "scaled test features misaligned with y_test"

print("✓ Feature scaling completed!")
print(f"\nScaled training set shape: {X_train_scaled.shape}")
print(f"Scaled testing set shape: {X_test_scaled.shape}")

# Verify scaling worked (mean should be ~0, std should be ~1).
# pandas .std() defaults to the sample estimator (ddof=1) while StandardScaler divides
# by the population standard deviation, so values marginally above 1 are expected.
print("\nVerification (first 3 features):")
print(f"Mean values: {X_train_scaled.iloc[:, :3].mean().values}")
print(f"Std values: {X_train_scaled.iloc[:, :3].std().values}")

Applying Feature Scaling (Standardization)...
✓ Feature scaling completed!

Scaled training set shape: (2016600, 52)
Scaled testing set shape: (504151, 52)

Verification (first 3 features):
Mean values: [-5.02834245e-17  7.49794179e-18  1.03149552e-18]
Std values: [1.00000025 1.00000025 1.00000025]


In [10]:
# Save preprocessed data
import os
import joblib

print("Saving preprocessed data...")
print("="*60)

# Output locations are defined once and reused for writing, listing and size checks,
# so the three can never point at different files
PROCESSED_DIR = '../data/processed'
MODELS_DIR = '../saved_models'

# Create directories if they don't exist
os.makedirs(PROCESSED_DIR, exist_ok=True)
os.makedirs(MODELS_DIR, exist_ok=True)

print("✓ Directories created/verified")

# Destination path -> object to write; insertion order is the write and listing order
csv_outputs = {
    f'{PROCESSED_DIR}/X_train.csv': X_train_scaled,
    f'{PROCESSED_DIR}/y_train.csv': y_train,
    f'{PROCESSED_DIR}/X_test.csv': X_test_scaled,
    f'{PROCESSED_DIR}/y_test.csv': y_test,
}

# index=False: rows are matched by position between the X and y files, not by index
for csv_path, data in csv_outputs.items():
    data.to_csv(csv_path, index=False)

# Save the fitted scaler so the same transformation can be applied to new data
scaler_path = f'{MODELS_DIR}/scaler.pkl'
joblib.dump(scaler, scaler_path)

print("✓ Preprocessed data saved successfully!")
print("\nSaved files:")
for csv_path in csv_outputs:
    print(f"  - {csv_path}")
print(f"  - {scaler_path} (for future predictions)")

# Verify file sizes (feature files only, reported in MB)
print("\nFile sizes:")
for file_name in ('X_train.csv', 'X_test.csv'):
    size_mb = os.path.getsize(f'{PROCESSED_DIR}/{file_name}') / (1024*1024)
    print(f"  - {file_name}: {size_mb:.2f} MB")

Saving preprocessed data...
✓ Directories created/verified
✓ Preprocessed data saved successfully!

Saved files:
  - ../data/processed/X_train.csv
  - ../data/processed/y_train.csv
  - ../data/processed/X_test.csv
  - ../data/processed/y_test.csv
  - ../saved_models/scaler.pkl (for future predictions)

File sizes:
  - X_train.csv: 2036.98 MB
  - X_test.csv: 509.24 MB
