# LightGBM Model Training: DNS Abuse & Infrastructure Attack Detection

**Author**: AI Assistant  
**Component**: AI/ML Detection of DNS Abuse and Infrastructure Attacks  
**Focus**: High Accuracy & Recall on Volumetric, Amplification, and Protocol Attacks

---

## 1. Setup & Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
import warnings
import joblib
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (
    confusion_matrix, 
    classification_report, 
    accuracy_score,
    recall_score,
    precision_score,
    f1_score,
    roc_auc_score,
    roc_curve
)

# Notebook-wide configuration
# Silence every warning category for the rest of the session.
warnings.simplefilter('ignore')
# Show all DataFrame columns instead of eliding the middle ones.
pd.options.display.max_columns = None
# Default figure size (width, height in inches) and seaborn grid theme.
plt.rc('figure', figsize=(12, 6))
sns.set_style('whitegrid')

# Record the library version alongside the results for reproducibility.
print(f"LightGBM Version: {lgb.__version__}")

## 2. Data Loading
Loading the balanced dataset.

In [None]:
# Path to dataset (Adjust if necessary)
DATA_PATH = r'C:\Users\shenal\Downloads\reseraach\CIC_IOT_2023\PCAP\FinalDataset\final_balanced_dataset.csv'

# Read the CSV into `df`, which every later cell depends on.
print("Loading dataset...")
try:
    df = pd.read_csv(DATA_PATH)
except FileNotFoundError:
    print(f"❌ File not found at {DATA_PATH}. Please check the path.")
    # Re-raise so execution stops here: without `df` every later cell would
    # fail with a NameError that hides the real cause (the missing file).
    raise
else:
    print(f"✓ Dataset loaded. Shape: {df.shape}")

## 3. Preprocessing & Feature Engineering
1. **Clean**: Handle infinite/NaN values.
2. **Drop**: Remove identity columns (`src_ip`, `dst_ip`, `src_port`, `dst_port`) to ensure the model learns traffic patterns, not specific hosts.
3. **Encode**: Convert `protocol` (UDP/TCP) to numeric.

In [None]:
# 1. Handle Infinite/NaN
# Infinities are mapped to NaN first so a single fillna pass zeroes both.
# NOTE(review): the zero-fill is applied in place to every column of `df`.
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.fillna(0, inplace=True)

# 2. Drop Identity Columns
# errors='ignore' keeps this step working when some of these columns are absent.
cols_to_drop = ['src_ip', 'dst_ip', 'src_port', 'dst_port']
df_clean = df.drop(columns=cols_to_drop, errors='ignore')

# 3. Encode Protocol
# Any non-numeric dtype (object, pandas string, categorical) needs encoding,
# not only 'object'.
if not pd.api.types.is_numeric_dtype(df_clean['protocol']):
    le = LabelEncoder()
    # Cast to str before fitting: the fillna(0) above can leave integer 0s in an
    # otherwise string column, and LabelEncoder rejects mixed str/int input.
    df_clean['protocol'] = le.fit_transform(df_clean['protocol'].astype(str))
    print("Encoded protocol column.")
    # LabelEncoder assigns codes by position in the sorted `classes_` array,
    # so the mapping can be read off directly.
    mapping = {str(name): code for code, name in enumerate(le.classes_)}
    print(f"Protocol Mapping: {mapping}")

print(f"Final Feature Count: {len(df_clean.columns) - 1}") # -1 for label

## 4. Train-Test Split
Using an 80-20 split with stratification to maintain class balance.

In [None]:
# Separate the feature matrix from the target column.
X = df_clean.drop(columns='label')
y = df_clean['label']

# 80/20 hold-out; stratifying on y keeps the class ratio equal in both parts,
# and the fixed seed makes the partition reproducible.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
X_train, X_test, y_train, y_test = split_parts

print(f"Train Shape: {X_train.shape}")
print(f"Test Shape:  {X_test.shape}")

## 5. LightGBM Model Training
Training a Gradient Boosting Decision Tree (GBDT) with parameters optimized for stability and accuracy.

In [None]:
# Initialize LightGBM Classifier
# Using default boosting type 'gbdt' which is generally accurate and fast.
# n_estimators=1000 is an upper bound; early stopping (below) ends training
# once the validation metrics stop improving.
clf = lgb.LGBMClassifier(
    boosting_type='gbdt',
    objective='binary',
    metric='binary_logloss',
    n_estimators=1000,
    learning_rate=0.05,
    num_leaves=31,
    random_state=42,
    n_jobs=-1,
    importance_type='gain'  # Use gain for feature importance
)

# Hold out part of the TRAINING data for early stopping.
# Early stopping picks the number of boosting rounds by watching eval_set; if
# the test set were used for that, the scores later reported on the test set
# would be optimistically biased. The test set therefore stays untouched here.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.1, random_state=42, stratify=y_train
)

# Train with Early Stopping
# callbacks=[lgb.early_stopping(stopping_rounds=50)] is the new way in recent versions
print("Training LightGBM model...")
ts = datetime.now()

# Stop after 50 rounds without improvement; log evaluation every 100 rounds.
callbacks = [lgb.early_stopping(stopping_rounds=50), lgb.log_evaluation(period=100)]

clf.fit(
    X_fit, y_fit,
    eval_set=[(X_val, y_val)],
    eval_metric=['auc', 'binary_logloss'],
    callbacks=callbacks
)

# Wall-clock training duration as a timedelta.
train_time = datetime.now() - ts
print(f"Training completed in {train_time}")

## 6. Model Evaluation
Comprehensive metrics including Confusion Matrix, Classification Report, and ROC Curve.

In [None]:
# Hard class predictions, plus the predicted probability of the positive
# class (column 1 of predict_proba) for the threshold-free ROC AUC.
y_pred = clf.predict(X_test)
y_prob = clf.predict_proba(X_test)[:, 1]

# Headline metrics
banner = "=" * 60
print(banner)
print("MODEL PERFORMANCE REPORT")
print(banner)

# (caption, scoring function, model output it is scored on); each score is
# computed right before it is printed.
metric_rows = (
    ("Accuracy:  ", accuracy_score, y_pred),
    ("Recall:    ", recall_score, y_pred),
    ("Precision: ", precision_score, y_pred),
    ("F1 Score:  ", f1_score, y_pred),
    ("ROC AUC:   ", roc_auc_score, y_prob),
)
for caption, scorer, model_output in metric_rows:
    print(f"{caption}{scorer(y_test, model_output):.4f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Confusion matrix heatmap (rows: true label, columns: predicted label).
# NOTE(review): the tick labels assume class 0 is benign and class 1 is attack
# - confirm against the dataset's label encoding.
class_names = ['Benign', 'Attack']
plt.figure(figsize=(8, 6))
cm = confusion_matrix(y_test, y_pred)
heatmap_ax = sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
heatmap_ax.set_title('Confusion Matrix')
heatmap_ax.set_ylabel('True Label')
heatmap_ax.set_xlabel('Predicted Label')
plt.show()

## 7. Feature Importance
Understanding which features contribute most to the detection.

In [None]:
# Feature Importance Plot
# Create the Axes explicitly and hand it to plot_importance. When no `ax` is
# given, plot_importance opens its own figure, so a preceding plt.figure(...)
# would only leave an empty extra figure and its figsize would not apply.
fig, ax = plt.subplots(figsize=(12, 10))
lgb.plot_importance(
    clf,
    ax=ax,
    importance_type='gain',
    max_num_features=25,
    height=0.7,
    title="LightGBM Feature Importance (Gain)",
)
plt.show()

# Display Top 10 Features textually
# Pair each importance value with its column name; the frame is built in
# ascending order of (value, feature name) and re-sorted descending for display.
feature_imp = pd.DataFrame(sorted(zip(clf.feature_importances_, X.columns)), columns=['Value','Feature'])
print("Top 10 Important Features:")
print(feature_imp.sort_values(by="Value", ascending=False).head(10))

## 8. Save Model
Saving the trained model for future use.

In [None]:
# Write the underlying booster with LightGBM's own save_model (a .txt file).
model_filename = 'lightgbm_dns_abuse_model.txt'
trained_booster = clf.booster_
trained_booster.save_model(model_filename)
print(f"Model saved to {model_filename}")

# Additionally serialize the whole scikit-learn wrapper with joblib, so the
# estimator object itself can be reloaded.
pkl_filename = 'lightgbm_dns_abuse_model.pkl'
joblib.dump(clf, pkl_filename)
print(f"Model (sklearn wrapper) saved to {pkl_filename}")