# Random Forest Model Training: DNS Abuse & Infrastructure Attack Detection

**Author**: Cybersecurity Data Science Team  
**Component**: AI/ML Detection of DNS Abuse and Infrastructure Attacks  
**Focus**: Volumetric attacks, DDoS signatures, amplification attacks, uncharacteristic flow patterns

**Key Objective**: Achieve > 72% accuracy on unseen data by preventing overfitting through aggressive hyperparameter tuning

---

## Table of Contents
1. [Data Loading & Initial Inspection](#1-data-loading--initial-inspection)
2. [Preprocessing & Data Quality](#2-preprocessing--data-quality)
3. [Feature Engineering & Selection](#3-feature-engineering--selection)
4. [Train-Test Split](#4-train-test-split)
5. [Hyperparameter Tuning with RandomizedSearchCV](#5-hyperparameter-tuning-with-randomizedsearchcv)
6. [Model Training](#6-model-training)
7. [Model Evaluation](#7-model-evaluation)
8. [Unseen Data Testing](#8-unseen-data-testing)
9. [Model Persistence](#9-model-persistence)

---

## 1. Data Loading & Initial Inspection

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import joblib
import glob
from datetime import datetime
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    confusion_matrix, 
    classification_report, 
    accuracy_score,
    precision_score,
    recall_score,
    f1_score
)

# Notebook-wide display configuration.
# NOTE(review): this filter hides *every* warning category, including
# scikit-learn metric/deprecation warnings — consider narrowing it.
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

# One seed for NumPy's global RNG; the same value is passed as `random_state`
# to the split, the forest and the randomized search further down.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# Status glyph restored (it had been stored as mis-decoded UTF-8).
print("✓ All libraries imported successfully")
print(f"Execution started at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

In [None]:
# Load the balanced training dataset from disk.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
DATA_PATH = r'C:\Users\shenal\Downloads\reseraach\CIC_IOT_2023\PCAP\FinalDataset\final_balanced_dataset.csv'

print("Loading dataset...")
df = pd.read_csv(DATA_PATH)
print("✓ Dataset loaded successfully\n")

# Shape plus absolute and relative class frequencies of the `label` column.
row_count, column_count = df.shape
separator = "=" * 80
print(separator)
print("DATASET OVERVIEW")
print(separator)
print(f"Shape: {row_count:,} rows × {column_count} columns")
print("\nLabel Distribution:")
print(df['label'].value_counts())
print("\nClass Balance:")
print(df['label'].value_counts(normalize=True) * 100)

## 2. Preprocessing & Data Quality

In [None]:
# Data quality: count and neutralise NaN / infinite values.
def _count_infinite(frame):
    """Return the number of +/-inf cells across the numeric columns of *frame*."""
    return int(np.isinf(frame.select_dtypes(include=[np.number])).sum().sum())


print("="*80)
print("DATA QUALITY CHECKS")
print("="*80)

print("\nBefore cleaning:")
print(f"  - NaN values: {df.isnull().sum().sum():,}")
print(f"  - Infinite values: {_count_infinite(df):,}")

# A row without a label has no ground truth. Filling it with 0 below would
# silently relabel it as class 0 (BENIGN), so such rows are removed first.
if 'label' in df.columns:
    unlabeled_rows = int(df['label'].isna().sum())
    if unlabeled_rows:
        df.dropna(subset=['label'], inplace=True)
        print(f"  - Dropped {unlabeled_rows:,} rows with a missing label")

# Replace infinity with NaN first, then fill every remaining NaN with 0.
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.fillna(0, inplace=True)

print("\nAfter cleaning:")
print(f"  - NaN values: {df.isnull().sum().sum():,}")
print(f"  - Infinite values: {_count_infinite(df):,}")
print("\n✓ Data cleaned successfully")

## 3. Feature Engineering & Selection

In [None]:
# Drop identity columns (addresses/ports) so the model cannot memorise endpoints.
print("="*80)
print("FEATURE SELECTION")
print("="*80)

columns_to_drop = ['src_ip', 'dst_ip', 'src_port', 'dst_port']

# The drop below uses errors='ignore', so only report columns that really exist;
# otherwise the log would claim columns were removed that were never there.
present_identity_cols = [col for col in columns_to_drop if col in df.columns]
absent_identity_cols = [col for col in columns_to_drop if col not in df.columns]

print(f"\nDropping {len(present_identity_cols)} identity columns:")
for col in present_identity_cols:
    print(f"  - {col}")
if absent_identity_cols:
    print(f"  (not in dataset, skipped: {', '.join(absent_identity_cols)})")

df_clean = df.drop(columns=columns_to_drop, errors='ignore')

print("\n✓ Dropped identity columns")
print(f"Remaining columns: {df_clean.shape[1]}")

In [None]:
# Encode the categorical `protocol` column as integers.
# LabelEncoder numbers classes in sorted order, so with exactly the values
# TCP and UDP this yields TCP -> 0, UDP -> 1 (presumably the only values
# present — check the distribution printed below).
print("="*80)
print("CATEGORICAL ENCODING")
print("="*80)

print("\nProtocol distribution before encoding:")
print(df_clean['protocol'].value_counts())

# The fitted encoder is kept at module level because the unseen-data
# evaluation reuses it to apply the identical mapping.
protocol_encoder = LabelEncoder()
df_clean['protocol'] = protocol_encoder.fit_transform(df_clean['protocol'])

print("\nProtocol encoding mapping:")
for code, protocol_name in enumerate(protocol_encoder.classes_):
    print(f"  {protocol_name} -> {code}")

print("\n✓ Categorical encoding complete")

In [None]:
# Build the final feature matrix / target vector and report the feature set.
print("="*80)
print("FINAL FEATURE SET (Infrastructure & Abuse Attack Detection)")
print("="*80)

# Everything except `label` is a model input.
X = df_clean.drop('label', axis=1)
y = df_clean['label']

print(f"\nTotal Features: {X.shape[1]}")
print("\nInfrastructure-Focused Features (Prioritized):")

# Features of particular interest for volumetric / amplification detection.
# The list is informational only: it does not change which columns are in X.
infrastructure_features = [
    'flow_bytes_per_sec',
    'flow_packets_per_sec',
    'dns_queries_per_second',
    'dns_amplification_factor',
    'total_fwd_packets',
    'total_bwd_packets',
    'flow_iat_mean',
    'flow_iat_std'
]

# Only show a check mark for features that actually exist in X, so a renamed
# or missing column is visible instead of being reported as available.
for feat in infrastructure_features:
    if feat in X.columns:
        print(f"  ✓ {feat}")
    else:
        print(f"  ✗ {feat} (not present in dataset)")

print("\nTarget Variable: label (0=BENIGN, 1=ATTACK)")
print("\n✓ Feature engineering complete")

## 4. Train-Test Split

In [None]:
# Stratified 80/20 train-test split (class proportions preserved in both parts).
print("="*80)
print("TRAIN-TEST SPLIT")
print("="*80)

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=RANDOM_STATE,
)

# Same report for both partitions: size, feature count and per-class counts.
for split_name, features_part, labels_part in (
    ("Training", X_train, y_train),
    ("Test", X_test, y_test),
):
    print(f"\n{split_name} Set:")
    print(f"  - Samples: {features_part.shape[0]:,}")
    print(f"  - Features: {features_part.shape[1]}")
    print(f"  - Class 0 (BENIGN): {(labels_part == 0).sum():,}")
    print(f"  - Class 1 (ATTACK): {(labels_part == 1).sum():,}")

print("\n✓ Stratified split complete")

## 5. Hyperparameter Tuning with RandomizedSearchCV

**Anti-Overfitting Strategy**: Using aggressive hyperparameter constraints to ensure generalization

In [None]:
# Randomized hyperparameter search for the random forest.
print("="*80)
print("HYPERPARAMETER TUNING (RandomizedSearchCV)")
print("="*80)

# Candidate values sampled by the search.
param_distributions = {
    'n_estimators': [100, 200, 300, 500],
    'max_depth': [5, 10, 15, 20, 25, 30],
    'min_samples_split': [2, 5, 10, 20, 50],
    'min_samples_leaf': [1, 2, 4, 8, 16],
    'max_features': ['sqrt', 'log2', 0.5],
    'bootstrap': [True, False]
}

# Search settings live in one place so the printed summary below can never
# disagree with what is actually passed to RandomizedSearchCV.
SEARCH_ITERATIONS = 50
SEARCH_CV_FOLDS = 5
SEARCH_SCORING = 'accuracy'

print("\nParameter Search Space:")
for param, values in param_distributions.items():
    print(f"  - {param}: {values}")

print("\nRandomizedSearchCV Configuration:")
print(f"  - Iterations: {SEARCH_ITERATIONS} random combinations")
print(f"  - Cross-validation: {SEARCH_CV_FOLDS}-fold")
print(f"  - Scoring metric: {SEARCH_SCORING}")

# NOTE(review): both the forest and the search request n_jobs=-1; nested
# all-core parallelism can oversubscribe the CPU. Left unchanged here.
random_search = RandomizedSearchCV(
    RandomForestClassifier(random_state=RANDOM_STATE, n_jobs=-1),
    param_distributions=param_distributions,
    n_iter=SEARCH_ITERATIONS,
    cv=SEARCH_CV_FOLDS,
    scoring=SEARCH_SCORING,
    verbose=2,
    random_state=RANDOM_STATE,
    n_jobs=-1
)

print("\n✓ RandomizedSearchCV initialized")
print("⏳ Starting hyperparameter search (this may take several minutes)...\n")

# Run the search on the training partition only; the test set stays untouched.
start_time = datetime.now()
random_search.fit(X_train, y_train)
end_time = datetime.now()

print("\n✓ Hyperparameter search complete")
print(f"Time taken: {(end_time - start_time).total_seconds():.2f} seconds")

In [None]:
# Report the winning configuration found by the randomized search.
print("="*80)
print("BEST HYPERPARAMETERS FOUND")
print("="*80)

print(f"\nBest Cross-Validation Accuracy: {random_search.best_score_:.4f}")
print("\nOptimal Hyperparameters:")
for param, value in random_search.best_params_.items():
    print(f"  - {param}: {value}")

# Estimator configured with the best parameters; used by all later cells.
best_rf = random_search.best_estimator_
print("\n✓ Best model extracted")

## 6. Model Training

Training the best model on the full training set

In [None]:
# Final model: make sure the best estimator is fitted on the full training set.
print("="*80)
print("FINAL MODEL TRAINING")
print("="*80)

# RandomizedSearchCV refits the best configuration on all of X_train by
# default (refit=True). The forest uses a fixed integer random_state, so
# fitting again would rebuild the same trees; the duplicate fit is skipped
# whenever an already-fitted model is available.
if hasattr(best_rf, "estimators_"):
    training_seconds = random_search.refit_time_
    print("\n✓ Best model was already refit on the full training set by RandomizedSearchCV")
else:
    print("\n⏳ Training final model on full training set...")
    start_time = datetime.now()
    best_rf.fit(X_train, y_train)
    end_time = datetime.now()
    training_seconds = (end_time - start_time).total_seconds()
    print("✓ Training complete")

print(f"Training time: {training_seconds:.2f} seconds")

# Accuracy on the data the model was fitted on; compared with the test
# accuracy later to estimate the overfitting gap.
train_pred = best_rf.predict(X_train)
train_accuracy = accuracy_score(y_train, train_pred)
print(f"\nTraining Accuracy: {train_accuracy:.4f} ({train_accuracy*100:.2f}%)")

## 7. Model Evaluation

In [None]:
# Evaluate on the held-out test partition and compare with training accuracy.
print("="*80)
print("MODEL EVALUATION ON TEST SET")
print("="*80)

y_pred = best_rf.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)

# Absolute train/test accuracy difference, used as the overfitting indicator.
overfit_gap = abs(train_accuracy - test_accuracy)

print(f"\nTest Accuracy: {test_accuracy:.4f} ({test_accuracy*100:.2f}%)")
print(f"Training Accuracy: {train_accuracy:.4f} ({train_accuracy*100:.2f}%)")
print(f"Overfitting Gap: {overfit_gap:.4f} ({overfit_gap*100:.2f}%)")

if overfit_gap < 0.05:
    print("\n✓ Model shows good generalization (overfitting gap < 5%)")
else:
    print("\n⚠ Warning: Potential overfitting detected (gap >= 5%)")

In [None]:
# Confusion matrix on the test set.
print("\n" + "="*80)
print("CONFUSION MATRIX")
print("="*80)

# Pin the label order so the matrix is always 2x2 with rows/columns in
# BENIGN(0), ATTACK(1) order; both the tick labels and the four-way
# unpacking below rely on that shape.
class_names = ['BENIGN', 'ATTACK']
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
print(f"\n{cm}")

# Visualize confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names,
            yticklabels=class_names)
plt.title('Confusion Matrix - Random Forest', fontsize=14, fontweight='bold')
plt.ylabel('True Label', fontsize=12)
plt.xlabel('Predicted Label', fontsize=12)
plt.tight_layout()
plt.show()

# Per-cell counts (row-major order: TN, FP, FN, TP).
tn, fp, fn, tp = cm.ravel()
print("\nBreakdown:")
print(f"  True Negatives (BENIGN correctly classified): {tn:,}")
print(f"  False Positives (BENIGN misclassified as ATTACK): {fp:,}")
print(f"  False Negatives (ATTACK misclassified as BENIGN): {fn:,}")
print(f"  True Positives (ATTACK correctly classified): {tp:,}")

In [None]:
# Per-class precision / recall / F1 on the test set, to four decimals.
report_banner = "=" * 80
print("\n" + report_banner)
print("CLASSIFICATION REPORT")
print(report_banner)

report_text = classification_report(
    y_test,
    y_pred,
    target_names=['BENIGN', 'ATTACK'],
    digits=4,
)
print("\n" + report_text)

In [None]:
# Feature importance analysis for the fitted forest.
print("="*80)
print("FEATURE IMPORTANCE ANALYSIS")
print("="*80)

# Pair each training column with the forest's importance score, highest first.
feature_importances = pd.DataFrame({
    'feature': X.columns,
    'importance': best_rf.feature_importances_
}).sort_values('importance', ascending=False)

top_20 = feature_importances.head(20)

print("\nTop 20 Most Important Features:")
print(top_20.to_string(index=False))

# Horizontal bar chart of the same top 20, most important at the top.
bar_positions = range(len(top_20))
plt.figure(figsize=(10, 8))
plt.barh(bar_positions, top_20['importance'], color='steelblue')
plt.yticks(bar_positions, top_20['feature'])
plt.xlabel('Importance', fontsize=12)
plt.title('Top 20 Feature Importances - Random Forest', fontsize=14, fontweight='bold')
plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()

# How many of the hand-picked infrastructure features rank in the top 10?
top_10_features = feature_importances.head(10)['feature'].tolist()
infrastructure_in_top10 = [f for f in infrastructure_features if f in top_10_features]

print(f"\nInfrastructure features in top 10: {len(infrastructure_in_top10)}/{len(infrastructure_features)}")
if infrastructure_in_top10:
    print("Features:")
    for feat in infrastructure_in_top10:
        print(f"  ✓ {feat}")

## 8. Unseen Data Testing

**Critical Test**: Testing on completely unseen data to validate real-world performance

In [None]:
def _load_labeled_csvs(folder, label, kind):
    """Read every ``*.csv`` directly inside *folder* and tag all rows with *label*.

    Args:
        folder: Directory that is globbed (non-recursively) for ``*.csv`` files.
        label: Class label written into a ``label`` column on every row.
        kind: Human-readable name ("attack" / "benign") used only in messages.

    Returns:
        One concatenated DataFrame carrying a ``label`` column, or an empty
        DataFrame when no file could be read.
    """
    print(f"\n⏳ Loading {kind} files from: {folder}")
    # Sorted so the row order does not depend on filesystem listing order.
    csv_paths = sorted(glob.glob(f"{folder}/*.csv"))
    print(f"Found {len(csv_paths)} {kind} files")

    frames = []
    for csv_path in csv_paths:
        try:
            frames.append(pd.read_csv(csv_path))
        except Exception as e:
            # Deliberate best-effort: one unreadable file is reported and
            # skipped instead of aborting the whole evaluation.
            print(f"  ⚠ Error loading {csv_path}: {e}")

    if not frames:
        print(f"⚠ No {kind} data loaded")
        return pd.DataFrame()

    combined = pd.concat(frames, ignore_index=True)
    combined['label'] = label
    print(f"✓ Loaded {len(combined):,} {kind} samples")
    return combined


def test_unseen_data(model, attack_folder, benign_folder, *,
                     feature_columns=None, encoder=None, xgboost_baseline=0.72):
    """
    Load and test model on completely unseen CSV files

    Rows from ``attack_folder`` are labelled 1 and rows from ``benign_folder``
    are labelled 0. The data then receives the same preprocessing as the
    training set (inf/NaN -> 0, identity columns dropped, protocol encoded)
    before the model predicts on it.

    Args:
        model: Trained Random Forest model
        attack_folder: Path to attack CSV files
        benign_folder: Path to benign CSV files
        feature_columns: Ordered feature names the model expects. Defaults to
            ``model.feature_names_in_`` when the model exposes it, otherwise
            to the columns of the module-level training matrix ``X``.
        encoder: Fitted encoder for the ``protocol`` column. Defaults to the
            module-level ``protocol_encoder`` fitted during training.
        xgboost_baseline: Unseen-data accuracy of the XGBoost baseline that
            the result is compared against.

    Returns:
        Dictionary with metrics (accuracy, precision, recall, f1) and the
        2x2 confusion matrix, or None when no unseen data could be loaded.

    Raises:
        KeyError: If the unseen data lacks a feature the model was trained on.
    """
    import math  # local import: only needed for the tolerant baseline comparison

    print("="*80)
    print("UNSEEN DATA TESTING")
    print("="*80)

    df_attack = _load_labeled_csvs(attack_folder, 1, "attack")
    df_benign = _load_labeled_csvs(benign_folder, 0, "benign")

    # Only concatenate frames that actually contain rows.
    loaded_frames = [frame for frame in (df_attack, df_benign) if not frame.empty]
    if not loaded_frames:
        print("\n❌ No unseen data loaded. Cannot perform testing.")
        return None

    df_unseen = pd.concat(loaded_frames, ignore_index=True)
    print(f"\nTotal unseen samples: {len(df_unseen):,}")

    # Preprocess unseen data (same as training)
    print("\n⏳ Preprocessing unseen data...")

    # Handle inf/NaN
    df_unseen = df_unseen.replace([np.inf, -np.inf], np.nan).fillna(0)

    # Drop identity columns
    df_unseen = df_unseen.drop(columns=['src_ip', 'dst_ip', 'src_port', 'dst_port'], errors='ignore')

    # Encode protocol if exists, using the mapping learned on the training data.
    if 'protocol' in df_unseen.columns:
        if encoder is None:
            encoder = protocol_encoder
        df_unseen['protocol'] = encoder.transform(df_unseen['protocol'])

    # Resolve the exact feature list/order the model was fitted with.
    if feature_columns is None:
        feature_columns = getattr(model, 'feature_names_in_', None)
        if feature_columns is None:
            feature_columns = X.columns
    feature_columns = list(feature_columns)

    missing_features = [col for col in feature_columns if col not in df_unseen.columns]
    if missing_features:
        raise KeyError(f"Unseen data is missing training feature(s): {missing_features}")

    # Separate features and labels; selecting by name also enforces column order.
    y_unseen = df_unseen['label']
    X_unseen = df_unseen[feature_columns]

    print("✓ Preprocessing complete")

    # Make predictions
    print("\n⏳ Making predictions...")
    y_unseen_pred = model.predict(X_unseen)

    # zero_division=0 keeps the metrics defined (as 0) when only one class was
    # loaded or the model never predicts the attack class.
    accuracy = accuracy_score(y_unseen, y_unseen_pred)
    precision = precision_score(y_unseen, y_unseen_pred, zero_division=0)
    recall = recall_score(y_unseen, y_unseen_pred, zero_division=0)
    f1 = f1_score(y_unseen, y_unseen_pred, zero_division=0)

    print("\n" + "="*80)
    print("UNSEEN DATA RESULTS")
    print("="*80)

    print(f"\n🎯 ACCURACY: {accuracy:.4f} ({accuracy*100:.2f}%)")
    print("\nDetailed Metrics:")
    print(f"  - Precision (Attack): {precision:.4f}")
    print(f"  - Recall (Attack): {recall:.4f}")
    print(f"  - F1-Score (Attack): {f1:.4f}")

    # Comparison with XGBoost baseline
    improvement = accuracy - xgboost_baseline

    print("\n📊 Comparison with XGBoost Baseline:")
    print(f"  - XGBoost unseen accuracy: {xgboost_baseline:.4f} ({xgboost_baseline*100:.2f}%)")
    print(f"  - Random Forest unseen accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")
    print(f"  - Improvement: {improvement:+.4f} ({improvement*100:+.2f}%)")

    # Exact float equality almost never holds, so "matches" uses a tolerance.
    if math.isclose(accuracy, xgboost_baseline, abs_tol=1e-9):
        print("\n⚠ NEUTRAL: Random Forest matches XGBoost performance")
    elif accuracy > xgboost_baseline:
        print(f"\n✅ SUCCESS: Random Forest outperforms XGBoost by {improvement*100:.2f}%!")
    else:
        print(f"\n❌ UNDERPERFORMANCE: Random Forest is {abs(improvement)*100:.2f}% below XGBoost")

    # Confusion matrix for unseen data; labels are pinned so the matrix stays
    # 2x2 (BENIGN, ATTACK) even if one class is absent, matching the tick labels.
    class_names = ['BENIGN', 'ATTACK']
    cm_unseen = confusion_matrix(y_unseen, y_unseen_pred, labels=[0, 1])
    print("\nConfusion Matrix (Unseen Data):")
    print(cm_unseen)

    plt.figure(figsize=(8, 6))
    sns.heatmap(cm_unseen, annot=True, fmt='d', cmap='Greens',
                xticklabels=class_names,
                yticklabels=class_names)
    plt.title('Confusion Matrix - Unseen Data', fontsize=14, fontweight='bold')
    plt.ylabel('True Label', fontsize=12)
    plt.xlabel('Predicted Label', fontsize=12)
    plt.tight_layout()
    plt.show()

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'confusion_matrix': cm_unseen
    }

In [None]:
# Evaluate the tuned forest on CSV captures that were never part of the
# training dataset. NOTE(review): both folders are absolute, machine-specific
# paths — adjust before running elsewhere.
ATTACK_FOLDER = r'C:\Users\shenal\Downloads\reseraach\Attacks\Attacks\attack_generated_new'
BENIGN_FOLDER = r'C:\Users\shenal\Downloads\reseraach\Attacks\Attacks\benign_generated_org'

# Holds the metrics dictionary, or None when no unseen file could be loaded.
unseen_results = test_unseen_data(
    best_rf,
    ATTACK_FOLDER,
    BENIGN_FOLDER,
)

## 9. Model Persistence

In [None]:
# Persist the trained model and verify that it round-trips from disk.
import os  # cell-local: only needed here to report the saved file's size

print("="*80)
print("MODEL PERSISTENCE")
print("="*80)

# Relative path: the file is written to the current working directory.
MODEL_PATH = 'random_forest_dns_infrastructure_model.pkl'

print(f"\n⏳ Saving model to: {MODEL_PATH}")
joblib.dump(best_rf, MODEL_PATH)

# Verify save
file_size = os.path.getsize(MODEL_PATH) / (1024 * 1024)  # Convert to MB

print("✓ Model saved successfully")
print(f"  - File: {MODEL_PATH}")
print(f"  - Size: {file_size:.2f} MB")

# Test loading.
# NOTE: joblib files are pickle-based; only load model files from a trusted
# source. Here the file was written a few lines above, so loading it is safe.
print("\n⏳ Verifying model can be loaded...")
loaded_model = joblib.load(MODEL_PATH)
print("✓ Model loaded successfully")
print(f"  - Type: {type(loaded_model).__name__}")
print(f"  - Features: {loaded_model.n_features_in_}")
print(f"  - Trees: {loaded_model.n_estimators}")

In [None]:
# Final summary of the whole training run.
print("\n" + "="*80)
print("TRAINING SUMMARY")
print("="*80)

# Unseen-data accuracy the model has to exceed (the 72% baseline).
TARGET_UNSEEN_ACCURACY = 0.72

print("\n📊 Model Performance:")
print(f"  - Training Accuracy: {train_accuracy:.4f} ({train_accuracy*100:.2f}%)")
print(f"  - Test Accuracy: {test_accuracy:.4f} ({test_accuracy*100:.2f}%)")
if unseen_results is not None:
    unseen_accuracy = unseen_results['accuracy']
    print(f"  - Unseen Data Accuracy: {unseen_accuracy:.4f} ({unseen_accuracy*100:.2f}%)")
    print("\n🎯 Goal Achievement:")
    if unseen_accuracy > TARGET_UNSEEN_ACCURACY:
        print(f"  ✅ PASSED: Exceeded {TARGET_UNSEEN_ACCURACY:.0%} baseline ({unseen_accuracy*100:.2f}%)")
    else:
        print(f"  ❌ FAILED: Did not exceed {TARGET_UNSEEN_ACCURACY:.0%} baseline ({unseen_accuracy*100:.2f}%)")
else:
    # Make the missing evaluation visible instead of silently omitting it.
    print("  - Unseen Data Accuracy: n/a (no unseen data was loaded)")

print("\n🛡️ Overfitting Check:")
gap = abs(train_accuracy - test_accuracy)
if gap < 0.05:
    print(f"  ✅ PASSED: Gap = {gap*100:.2f}% (< 5%)")
else:
    print(f"  ⚠ WARNING: Gap = {gap*100:.2f}% (>= 5%)")

print("\n📁 Saved Model:")
print(f"  - Path: {MODEL_PATH}")
print(f"  - Size: {file_size:.2f} MB")

print("\n✓ Training pipeline complete!")
print(f"Finished at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")