In [None]:
# Cell 1: Import Libraries and Setup Paths
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN, SMOTETomek
from collections import Counter
import os
import time
import warnings
import joblib
from sklearn.utils import resample
warnings.filterwarnings('ignore')

# Set random seed to ensure reproducibility
np.random.seed(42)

# Set better visualization style.
# The seaborn style sheets were renamed to 'seaborn-v0_8-*' in matplotlib 3.6
# (and the old names were removed in a later release), so use whichever name
# this installation provides instead of failing on the hard-coded one.
# If neither is available, matplotlib's default style is kept.
for _style_name in ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid'):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break
plt.rcParams['figure.figsize'] = (12, 8)
plt.rcParams['font.size'] = 12

# Define paths.
# The defaults are the original locations; set the CICIDS_INPUT_PATH and/or
# CICIDS_OUTPUT_DIR environment variables to run the notebook on another
# machine without editing this cell.
INPUT_PATH = os.environ.get(
    'CICIDS_INPUT_PATH',
    '/root/autodl-tmp/projects/DL/dataset/preprocessed/CICIDS2017_merged_preprocessed.csv',
)
OUTPUT_DIR = os.environ.get(
    'CICIDS_OUTPUT_DIR',
    '/root/autodl-tmp/projects/DL/dataset/feature_engineering',
)

# Ensure output directory exists
os.makedirs(OUTPUT_DIR, exist_ok=True)

print("CICIDS2017 Feature Engineering and Stratified Data Preparation")
print("="*80)

In [None]:
# Cell 2: Data Loading and Initial Exploration
print("\n1. Loading Preprocessed Data")
print("-"*50)

# start_time is also read by the summary cell to report total processing time
start_time = time.time()
print(f"Loading data from: {INPUT_PATH}")

# First read a small sample to determine data types
sample_df = pd.read_csv(INPUT_PATH, nrows=10000)
numeric_columns = sample_df.select_dtypes(include=['float64']).columns
int_columns = sample_df.select_dtypes(include=['int64']).columns

# Set optimized data types: 64-bit columns are read as 32-bit to save memory.
# NOTE(review): the dtypes are inferred from the first 10,000 rows only. A
# column that looks integral there but later holds NaN/float values, or values
# outside the int32 range, would break or distort the full read - confirm
# against the data.
optimized_dtypes = {col: 'float32' for col in numeric_columns}
optimized_dtypes.update({col: 'int32' for col in int_columns})

# Read in batches and merge
chunk_size = 500000  # Number of rows to read per batch
chunks = list(pd.read_csv(INPUT_PATH, chunksize=chunk_size, dtype=optimized_dtypes))
df = pd.concat(chunks, ignore_index=True)
# concat copies the data into df; drop the per-chunk frames so the dataset is
# not held in memory twice for the rest of the session
chunks.clear()

load_time = time.time() - start_time
print(f"Data loading completed, time elapsed: {load_time:.2f} seconds")
print(f"Dataset shape: {df.shape}")

# Display memory usage.
# deep=True also counts the contents of object (string) columns; the shallow
# default only counts the pointers and under-reports the real footprint.
memory_usage = df.memory_usage(deep=True).sum() / (1024 ** 2)
print(f"Memory usage: {memory_usage:.2f} MB")

In [None]:
# Cell 3: Exploratory Data Analysis
print("\n2. Exploratory Data Analysis")
print("-"*50)

# Frequency of every value in the raw 'Label' column (reused by later cells)
print("\nLabel distribution:")
label_counts = df['Label'].value_counts()
print(label_counts)

# Bar chart of the label frequencies, written to disk and then closed
fig, ax = plt.subplots(figsize=(14, 8))
label_counts.plot(kind='bar', color='skyblue', ax=ax)
ax.set_title('Attack Types Distribution')
ax.set_xlabel('Attack Type')
ax.set_ylabel('Count')
plt.setp(ax.get_xticklabels(), rotation=90)
fig.tight_layout()
fig.savefig(os.path.join(OUTPUT_DIR, 'attack_distribution.png'), dpi=300)
plt.close(fig)

# Share of the dataset taken by each label, in percent
attack_percentages = (label_counts / len(df)) * 100
print("\nPercentage of each attack type:")
for label_name, share in attack_percentages.items():
    print(f"{label_name}: {share:.4f}%")

# How many columns there are of each dtype
print("\nData types:")
print(df.dtypes.value_counts())

# Columns that contain at least one null, with their null counts
print("\nChecking for missing values:")
null_counts = df.isnull().sum()
missing_values = null_counts[null_counts > 0]
if missing_values.empty:
    print("No missing values")
else:
    print(missing_values)

# describe() per numeric column, transposed so each feature is a row, plus
# two derived columns: value range and std/mean
print("\nStatistical summary of numerical features:")
numeric_df = df.select_dtypes(include=['float32', 'float64', 'int32', 'int64'])
summary_stats = numeric_df.describe().T.assign(
    range=lambda stats: stats['max'] - stats['min'],
    coefficient_of_variation=lambda stats: stats['std'] / stats['mean'],
)
print(summary_stats[['mean', 'std', 'min', 'max', 'range', 'coefficient_of_variation']].head())

# Persist the full table; only its head is printed above
stats_path = os.path.join(OUTPUT_DIR, 'feature_statistics.csv')
summary_stats.to_csv(stats_path)
print(f"Complete statistical summary saved to: {stats_path}")

In [None]:
# Cell 4: Creating Binary and Multiclass Labels
print("\n3. Creating Binary and Multiclass Labels")
print("-"*50)

# Binary target: 0 where Label is exactly 'BENIGN', 1 for everything else
df['binary_label'] = df['Label'].ne('BENIGN').astype('int64')
print("Binary labels created, distribution as follows:")
print(df['binary_label'].value_counts())

# Rare attack detection: any non-BENIGN label with fewer than THRESHOLD rows
# (counts come from label_counts, computed in the exploration cell)
THRESHOLD = 1000
attack_counts = label_counts.drop('BENIGN', errors='ignore')
rare_attacks = [name for name, n_rows in attack_counts.items() if n_rows < THRESHOLD]

print(f"\nThe following rare attack types will be grouped as 'Other Attacks' (threshold: {THRESHOLD}):")
for rare_name in rare_attacks:
    print(f"- {rare_name}: {label_counts[rare_name]}")
# Create multiclass labels
def create_multiclass_label(label, rare=None):
    """Map a raw ``Label`` value to its grouped multiclass name.

    Rules, applied in this order:

    1. ``'BENIGN'`` becomes ``'Normal'``.
    2. A label listed in ``rare`` becomes ``'Other Attacks'``.
    3. A label containing ``DOS`` (any letter case) becomes ``'DOS'``.
    4. A label containing ``WEB ATTACK`` (any letter case) becomes
       ``'Web Attack'``.
    5. Anything else is returned unchanged.

    The substring checks ignore case so that mixed-case spellings such as
    ``'DoS Hulk'`` are grouped as well as upper-case ones.

    Args:
        label: Raw label string.
        rare: Optional collection of labels to fold into ``'Other Attacks'``.
            When omitted, the module-level ``rare_attacks`` list is used.

    Returns:
        The grouped label name as a string.
    """
    if label == 'BENIGN':
        return 'Normal'

    rare_labels = rare_attacks if rare is None else rare
    if label in rare_labels:
        return 'Other Attacks'

    upper_label = label.upper()
    # Group DOS attack types together
    if 'DOS' in upper_label:
        return 'DOS'
    # Group Web attack types together
    if 'WEB ATTACK' in upper_label:
        return 'Web Attack'
    return label

# Resolve the grouped name once per distinct raw label and broadcast it with
# map(), instead of calling the Python function for every row; this is much
# cheaper when the frame has many more rows than distinct labels.
raw_to_grouped = {raw: create_multiclass_label(raw) for raw in df['Label'].unique()}
df['multiclass_label'] = df['Label'].map(raw_to_grouped)
print("\nMulticlass labels created, distribution as follows:")
print(df['multiclass_label'].value_counts())

# Encode multiclass labels as numbers.
# Codes follow the order in which the grouped labels first appear in df, so
# they depend on row order; the mapping is saved below for decoding.
label_mapping = {label: idx for idx, label in enumerate(df['multiclass_label'].unique())}
df['multiclass_encoded'] = df['multiclass_label'].map(label_mapping)

print("\nLabel mapping:")
for label, code in label_mapping.items():
    print(f"{label}: {code}")

# Save label mapping for future use
joblib.dump(label_mapping, os.path.join(OUTPUT_DIR, 'label_mapping.joblib'))

In [None]:
# Cell 5: Feature Engineering and Selection
print("\n4. Feature Engineering and Selection")
print("-"*50)

# Columns kept out of the feature matrix: the raw and derived targets plus
# 'Day' and 'Scenario' (treated as potential leakage). errors='ignore' below
# means names that are not present in df are skipped without an error.
features_to_drop = [
    'Label',
    'binary_label',
    'multiclass_label',
    'multiclass_encoded',
    'Day',
    'Scenario',
]

# Targets first, then the feature matrix
y_binary = df['binary_label']
y_multi = df['multiclass_encoded']
X = df.drop(columns=features_to_drop, errors='ignore')

print(f"Feature matrix shape: {X.shape}")

# Correlation analysis on (a subset of) the numeric features
print("\nCalculating feature correlation matrix...")
correlation_time = time.time()

# Candidate columns: 32/64-bit floats and ints only
numeric_cols = X.select_dtypes(include=['float32', 'float64', 'int32', 'int64']).columns

# Cap the number of columns entering the correlation matrix; when over the
# cap, keep the columns with the largest variance
MAX_CORRELATION_FEATURES = 30
if len(numeric_cols) <= MAX_CORRELATION_FEATURES:
    selected_cols = numeric_cols.tolist()
else:
    variances = X[numeric_cols].var().sort_values(ascending=False)
    selected_cols = variances.head(MAX_CORRELATION_FEATURES).index.tolist()
    print(f"Selected {MAX_CORRELATION_FEATURES} high-variance features for correlation analysis")

# Pairwise correlation of the selected columns
corr_matrix = X[selected_cols].corr()
correlation_time = time.time() - correlation_time
print(f"Correlation calculation completed, time elapsed: {correlation_time:.2f} seconds")

# Heatmap of the matrix, written to disk and then closed
fig, ax = plt.subplots(figsize=(16, 14))
sns.heatmap(corr_matrix, annot=False, cmap='coolwarm', vmin=-1, vmax=1, ax=ax)
ax.set_title('Feature Correlation Heatmap')
fig.tight_layout()
fig.savefig(os.path.join(OUTPUT_DIR, 'correlation_heatmap.png'), dpi=300)
plt.close(fig)

# Every unordered pair (upper triangle, diagonal excluded) whose absolute
# correlation exceeds 0.9, as (feature_a, feature_b, correlation)
n_selected = len(selected_cols)
corr_pairs = [
    (selected_cols[row], selected_cols[col], corr_matrix.iloc[row, col])
    for row in range(n_selected)
    for col in range(row + 1, n_selected)
    if abs(corr_matrix.iloc[row, col]) > 0.9
]

print(f"\nFound {len(corr_pairs)} highly correlated feature pairs (|correlation| > 0.9):")
for first_feature, second_feature, pair_corr in corr_pairs[:5]:  # preview only
    print(f"- {first_feature} and {second_feature}: {pair_corr:.4f}")

if len(corr_pairs) > 5:
    print(f"...and {len(corr_pairs)-5} other highly correlated feature pairs")

# Detect constant and near-constant features.
# The distinct-value count of each column is computed once here and reused by
# all three checks below (constant, near-constant, high cardinality), since
# nunique() scans the whole column each time it is called.
unique_counts = {col: X[col].nunique() for col in X.columns}
const_features = [col for col, n_unique in unique_counts.items() if n_unique <= 1]
near_const_features = [col for col, n_unique in unique_counts.items() if n_unique == 2]

print(f"\nFound {len(const_features)} constant features and {len(near_const_features)} near-constant features")
if const_features:
    print("Constant features:")
    for feat in const_features:
        print(f"- {feat}")

    # Remove constant features (near-constant ones are only reported)
    X = X.drop(columns=const_features)
    print(f"Constant features removed, new feature matrix shape: {X.shape}")

# Identify high cardinality features: distinct values / rows above 0.8.
# Dropping columns does not change the row count or the remaining columns'
# distinct-value counts, so the cached counts are still valid here.
n_rows = len(X)
high_cardinality_cols = []
for col in X.columns:
    # An empty frame has no meaningful ratio; report 0.0 instead of dividing by zero
    unique_ratio = unique_counts[col] / n_rows if n_rows else 0.0
    if unique_ratio > 0.8:
        high_cardinality_cols.append((col, unique_ratio))

print(f"\nFound {len(high_cardinality_cols)} high cardinality features (unique value ratio > 80%):")
for col, ratio in high_cardinality_cols[:5]:  # Only show first 5
    print(f"- {col}: {ratio:.4f}")

In [None]:
# Cell 6: Stratified Dataset Split
print("\n5. Stratified Dataset Split")
print("-"*50)

# Two-stage split giving 70% train / 15% validation / 15% test. Both stages
# stratify on the encoded multiclass target and use the same fixed seed.
print("Performing stratified dataset split...")
split_time = time.time()

# Stage 1: 70% training, 30% held out in a temporary set
(
    X_train, X_temp,
    y_binary_train, y_binary_temp,
    y_multi_train, y_multi_temp,
) = train_test_split(
    X, y_binary, y_multi,
    test_size=0.3,
    stratify=y_multi,
    random_state=42,
)

# Stage 2: the temporary set is halved into validation and test
(
    X_val, X_test,
    y_binary_val, y_binary_test,
    y_multi_val, y_multi_test,
) = train_test_split(
    X_temp, y_binary_temp, y_multi_temp,
    test_size=0.5,
    stratify=y_multi_temp,
    random_state=42,
)

split_time = time.time() - split_time
print(f"Dataset split completed, time elapsed: {split_time:.2f} seconds")

# Row counts per split
print(f"\nTraining set: {X_train.shape[0]} samples")
print(f"Validation set: {X_val.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")

# Class counts per split, binary target
print("\nBinary label distribution:")
print(f"Training set: {Counter(y_binary_train)}")
print(f"Validation set: {Counter(y_binary_val)}")
print(f"Test set: {Counter(y_binary_test)}")

# Class counts per split, multiclass target
print("\nMulticlass label distribution:")
print(f"Training set: {Counter(y_multi_train)}")
print(f"Validation set: {Counter(y_multi_val)}")
print(f"Test set: {Counter(y_multi_test)}")

In [None]:
# Cell 7: Feature Scaling
print("\n6. Feature Scaling")
print("-"*50)

# RobustScaler is chosen here to cope with outliers
print("Using RobustScaler for feature scaling...")
scaling_time = time.time()

# Fit on the training split only, then apply the same fitted scaler to all
# three splits so validation/test statistics never influence the scaling
scaler = RobustScaler().fit(X_train)

# transform() returns a bare array; wrap each result back into a DataFrame
# carrying the source split's column names and index
X_train_scaled, X_val_scaled, X_test_scaled = (
    pd.DataFrame(scaler.transform(part), columns=part.columns, index=part.index)
    for part in (X_train, X_val, X_test)
)

scaling_time = time.time() - scaling_time
print(f"Feature scaling completed, time elapsed: {scaling_time:.2f} seconds")

# Persist the fitted scaler so the same transform can be reapplied later
joblib.dump(scaler, os.path.join(OUTPUT_DIR, 'robust_scaler.joblib'))

In [None]:
# Cell 8: Class Imbalance Handling
print("\n7. Class Imbalance Handling")
print("-"*50)

# Resampling touches the training split only; validation and test splits
# are left as they are.

# a. Binary task
print("\n7.1 Handling class imbalance for binary classification...")
binary_balance_time = time.time()

# SMOTEENN (imblearn.combine): combined over-sampling and cleaning, seeded
# for repeatable output
print("Applying SMOTEENN to balance binary data...")
smote_enn = SMOTEENN(random_state=42)
(
    X_train_binary_balanced,
    y_binary_train_balanced,
) = smote_enn.fit_resample(X_train_scaled, y_binary_train)

binary_balance_time = time.time() - binary_balance_time
print(f"Binary class balancing completed, time elapsed: {binary_balance_time:.2f} seconds")

# Class counts before and after resampling
print(f"Before balancing: {Counter(y_binary_train)}")
print(f"After balancing: {Counter(y_binary_train_balanced)}")

# b. Balance handling for multiclass classification
print("\n7.2 Handling class imbalance for multiclass classification...")
multi_balance_time = time.time()

# For multiclass, use stratified strategy - keep large classes unchanged, oversample small classes
# Determine minimum number of samples needed
min_samples_per_class = 5000  # Minimum samples per class

# Get current sample count for each class
class_counts = Counter(y_multi_train)
print("Original class distribution for multiclass task:")
print(class_counts)

# Perform stratified sampling.
# Per-class pieces are collected in lists and concatenated once after the
# loop. Growing a DataFrame/Series with concat inside the loop would re-copy
# everything gathered so far on each pass, and starting from a dtype-less
# empty pd.Series() relies on deprecated empty-object concat behaviour.
X_parts = []
y_parts = []

for class_label, count in class_counts.items():
    # Get samples for current class
    class_indices = y_multi_train[y_multi_train == class_label].index
    X_class = X_train_scaled.loc[class_indices]
    y_class = y_multi_train.loc[class_indices]

    # If sample count is less than threshold, oversample
    if count < min_samples_per_class:
        # Target class size after oversampling
        n_samples = min_samples_per_class
        print(f"Oversampling class {class_label}: {count} -> {n_samples}")

        # Random sampling with replacement: existing rows are duplicated, no
        # new rows are synthesized, so index labels repeat in the result
        X_resampled, y_resampled = resample(
            X_class, y_class,
            replace=True,
            n_samples=n_samples,
            random_state=42
        )
    else:
        # For large classes, keep unchanged
        X_resampled, y_resampled = X_class, y_class

    # Collect this class's piece
    X_parts.append(X_resampled)
    y_parts.append(y_resampled)

# Merge all pieces in one pass; pd.concat rejects an empty list, so fall back
# to empty slices that still carry the right columns and dtypes
if X_parts:
    X_multi_resampled = pd.concat(X_parts)
    y_multi_resampled = pd.concat(y_parts)
else:
    X_multi_resampled = X_train_scaled.iloc[0:0]
    y_multi_resampled = y_multi_train.iloc[0:0]

multi_balance_time = time.time() - multi_balance_time
print(f"Multiclass balancing completed, time elapsed: {multi_balance_time:.2f} seconds")
print(f"Class distribution after balancing: {Counter(y_multi_resampled)}")

In [None]:
# Cell 9: Save Processed Datasets
print("\n8. Save Processed Datasets")
print("-"*50)

# 8.1 Binary task: balanced training set, untouched validation/test sets
print("Saving binary classification datasets...")
binary_save_time = time.time()

binary_train_data = dict(X_train=X_train_binary_balanced, y_train=y_binary_train_balanced)
binary_val_data = dict(X_val=X_val_scaled, y_val=y_binary_val)
binary_test_data = dict(X_test=X_test_scaled, y_test=y_binary_test)

# Each payload is a dict of features and labels, dumped to its own file
for file_name, payload in (
    ('binary_train_balanced.joblib', binary_train_data),
    ('binary_val.joblib', binary_val_data),
    ('binary_test.joblib', binary_test_data),
):
    joblib.dump(payload, os.path.join(OUTPUT_DIR, file_name))

binary_save_time = time.time() - binary_save_time
print(f"Binary classification datasets saved, time elapsed: {binary_save_time:.2f} seconds")

# 8.2 Multiclass task: oversampled training set, untouched validation/test
# sets (same scaled features as the binary files, multiclass targets)
print("Saving multiclass classification datasets...")
multi_save_time = time.time()

multi_train_data = dict(X_train=X_multi_resampled, y_train=y_multi_resampled)
multi_val_data = dict(X_val=X_val_scaled, y_val=y_multi_val)
multi_test_data = dict(X_test=X_test_scaled, y_test=y_multi_test)

for file_name, payload in (
    ('multi_train_balanced.joblib', multi_train_data),
    ('multi_val.joblib', multi_val_data),
    ('multi_test.joblib', multi_test_data),
):
    joblib.dump(payload, os.path.join(OUTPUT_DIR, file_name))

multi_save_time = time.time() - multi_save_time
print(f"Multiclass classification datasets saved, time elapsed: {multi_save_time:.2f} seconds")

# 8.3 Column names of the training feature matrix, in order
feature_list = X_train.columns.tolist()
joblib.dump(feature_list, os.path.join(OUTPUT_DIR, 'feature_list.joblib'))
print(f"Feature list saved, total of {len(feature_list)} features")

In [None]:
# Cell 10: Summary
print("\n9. Data Processing Summary")
print("-"*50)

# Elapsed wall-clock time since start_time was set in the data loading cell
total_time = time.time() - start_time
print(f"Total processing time: {total_time:.2f} seconds")

# Headline numbers. df.shape here includes the label columns added to df in
# the labelling cell.
print("\nDataset Statistics:")
print(f"Original dataset: {df.shape}")
print(f"Number of features: {X.shape[1]}")
print(f"Binary classes: {np.unique(y_binary).size}")
print(f"Multiclass classes: {np.unique(y_multi).size}")

# Checklist of the pipeline stages, in notebook order
print("\nCompleted processes:")
for completed_line in (
    "- Data loading and exploration",
    "- Creation of binary and multiclass labels",
    "- Feature engineering and selection",
    "- Stratified dataset split",
    "- Feature scaling",
    "- Class imbalance handling",
    "- Saving processed datasets",
):
    print(completed_line)

print("\nProcessed files saved to:")
print(OUTPUT_DIR)