# IoT IDS Model Training and Saving
This notebook trains a LightGBM classifier for IoT intrusion detection and saves the full pipeline (model, scaler, label encoder, and metadata) as pickle and joblib files for deployment.

In [None]:
# Install required packages if needed
!pip install lightgbm --quiet

In [None]:
import pandas as pd
import numpy as np
import pickle
import joblib
from datetime import datetime
import os

# ML Libraries
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, f1_score
import lightgbm as lgb

import warnings
# Install a global "ignore" filter: every Python warning raised after this
# point (from any library) is suppressed for the rest of the session.
warnings.filterwarnings('ignore')

In [None]:
# Location of the input CSV - UPDATE THIS PATH before running.
# On Kaggle the pattern is: /kaggle/input/your-dataset-name/filename.csv
data_path = "/kaggle/input/iotcic-dataset/iot_dataset.csv"  # Update this!

print("üìä Loading dataset...")
df = pd.read_csv(data_path)

# Quick summary of what was loaded: dimensions first, then the column names.
print(f"Dataset shape: {df.shape}", f"Columns: {list(df.columns)}", sep="\n")

# Trailing expression so the notebook renders the first rows as a table.
df.head()

In [None]:
# Taxonomy mapping: raw dataset label -> coarse attack category.
# Several raw labels share one category, so the number of keys (raw labels)
# is larger than the number of distinct values (categories).
taxonomy_map = {
    # Flood Attacks
    'DoS-TCP_Flood': 'Flood Attacks',
    'DoS-UDP_Flood': 'Flood Attacks',
    'DoS-SYN_Flood': 'Flood Attacks',
    'DoS-HTTP_Flood': 'Flood Attacks',

    # Botnet/Mirai Attacks
    'Mirai-greeth_flood': 'Botnet/Mirai Attacks',
    'Mirai-greip_flood': 'Botnet/Mirai Attacks',
    'Mirai-udpplain': 'Botnet/Mirai Attacks',

    # Spoofing/MITM
    'MITM-ArpSpoofing': 'Spoofing / MITM',
    'DNS_Spoofing': 'Spoofing / MITM',

    # Reconnaissance
    'Recon-PingSweep': 'Reconnaissance',
    'Recon-OSScan': 'Reconnaissance',
    'Recon-PortScan': 'Reconnaissance',
    'VulnerabilityScan': 'Reconnaissance',

    # Backdoors & Exploits
    'Backdoor_Malware': 'Backdoors & Exploits',
    'BrowserHijacking': 'Backdoors & Exploits',
    'CommandInjection': 'Backdoors & Exploits',

    # Injection Attacks
    'SqlInjection': 'Injection Attacks',
    'XSS': 'Injection Attacks',

    # Benign
    'BenignTraffic': 'Benign'
}

# Report both counts: len(taxonomy_map) is the number of raw labels, not the
# number of categories they collapse into.
print(
    f"Taxonomy mapping created: {len(taxonomy_map)} labels -> "
    f"{len(set(taxonomy_map.values()))} categories"
)

In [None]:
# Prepare data: clean labels, map them to taxonomy categories, and build the
# numeric feature matrix X and the target series y.
print("üîß Preparing data...")

# Drop rows with missing labels (copy so the raw frame `df` is left untouched)
df_clean = df.dropna(subset=['label']).copy()
print(f"Clean dataset shape: {df_clean.shape}")

# Apply taxonomy mapping. Labels absent from taxonomy_map come back as NaN.
mapped_labels = df_clean['label'].map(taxonomy_map)

# Surface any labels the mapping does not cover instead of folding them into
# 'Unknown' silently; a large 'Unknown' class usually means the map is incomplete.
unmapped_counts = df_clean.loc[mapped_labels.isna(), 'label'].value_counts()
if not unmapped_counts.empty:
    print(
        f"WARNING: {len(unmapped_counts)} label value(s) "
        f"({int(unmapped_counts.sum())} rows) are not in taxonomy_map "
        f"and will be grouped as 'Unknown':"
    )
    print(unmapped_counts.head(20))

df_clean['taxonomy_label'] = mapped_labels.fillna('Unknown')

# Prepare features (drop label columns)
X = df_clean.drop(columns=['label', 'taxonomy_label'])
y = df_clean['taxonomy_label']

# Keep only numeric columns
numeric_columns = X.select_dtypes(include=[np.number]).columns
X = X[numeric_columns]

print(f"Feature matrix shape: {X.shape}")
print(f"Label distribution:")
print(y.value_counts())

In [None]:
# Encode labels and scale features
print("‚öôÔ∏è Encoding and scaling...")

le = LabelEncoder()
y_encoded = le.fit_transform(y)

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

print(f"Classes: {le.classes_}")
print(f"Scaled features shape: {X_scaled.shape}")

In [None]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y_encoded, test_size=0.2, stratify=y_encoded, random_state=42
)

print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")

In [None]:
# Train LightGBM model
print("üöÄ Training LightGBM model...")

lgb_model = lgb.LGBMClassifier(
    objective='multiclass',
    num_class=len(le.classes_),
    # LightGBM documents `is_unbalance` as used only for binary and
    # multiclassova objectives, so it has no effect with objective='multiclass'.
    # `class_weight='balanced'` is the documented way to reweight classes for
    # multi-class problems.
    class_weight='balanced',
    n_estimators=200,
    learning_rate=0.1,
    max_depth=10,
    num_leaves=31,
    random_state=42,
    verbose=-1
)

# Train the model
lgb_model.fit(X_train, y_train)

print("‚úÖ Model training completed!")

In [None]:
# Score the trained classifier on the held-out test split
print("üìä Evaluating model...")

y_pred = lgb_model.predict(X_test)

# Headline metrics; both are reused later when the pipeline is saved.
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

for metric_label, metric_value in (
    ("Accuracy", accuracy),
    ("F1-Score (weighted)", f1),
):
    print(f"{metric_label}: {metric_value:.4f}")

# Per-class breakdown, labelled with the original category names
print("\nüìã Classification Report:")
per_class_report = classification_report(y_test, y_pred, target_names=le.classes_)
print(per_class_report)

In [None]:
# Persist the trained pipeline to disk in two formats
print("üíæ Saving model pipeline...")

# One timestamp shared by every artifact written in this run
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# Bundle the fitted objects needed at inference time together with metadata
model_pipeline = dict(
    model=lgb_model,
    scaler=scaler,
    label_encoder=le,
    feature_names=X.columns.tolist(),
    model_type='LightGBM',
    timestamp=timestamp,
    taxonomy_map=taxonomy_map,
    accuracy=accuracy,
    f1_score=f1,
)

# Output file names (written to the current working directory)
pickle_filename = f"iot_ids_lightgbm_{timestamp}.pkl"
joblib_filename = f"iot_ids_lightgbm_{timestamp}.joblib"

# Pickle copy
with open(pickle_filename, 'wb') as pickle_file:
    pickle.dump(model_pipeline, pickle_file)

# Joblib copy (alternative format)
joblib.dump(model_pipeline, joblib_filename)

saved_artifacts = (("Pickle", pickle_filename), ("Joblib", joblib_filename))

print("‚úÖ Model saved successfully!")
for format_label, saved_path in saved_artifacts:
    print(f"{format_label} file: {saved_path}")

# Sizes are reported in MB (bytes / 1024 / 1024)
print("File sizes:")
for format_label, saved_path in saved_artifacts:
    print(f"  {format_label}: {os.path.getsize(saved_path) / 1024 / 1024:.2f} MB")

In [None]:
# Write a human-readable summary of the saved model next to the artifacts
info_filename = f"model_info_{timestamp}.txt"

# Fixed header: model identity, metrics, and class information
summary_lines = [
    "IoT IDS Model Information\n",
    "========================\n",
    "Model Type: LightGBM Classifier\n",
    f"Timestamp: {timestamp}\n",
    f"Accuracy: {accuracy:.4f}\n",
    f"F1-Score: {f1:.4f}\n",
    f"Features: {len(X.columns)}\n",
    f"Classes: {len(le.classes_)}\n",
    f"Class Names: {', '.join(le.classes_)}\n",
    "\nTaxonomy Mapping:\n",
]

# One indented line per raw label -> category pair, in dictionary order
mapping_lines = [
    f"  {raw_label} -> {category}\n"
    for raw_label, category in taxonomy_map.items()
]

with open(info_filename, 'w') as info_file:
    info_file.writelines(summary_lines + mapping_lines)

print(f"üìÑ Model info saved to: {info_filename}")

In [None]:
# Test loading the saved model: round-trip the pickle and run one prediction
print("üîç Testing model loading...")

# Load the model.
# NOTE: pickle.load executes arbitrary code from the file; only load artifacts
# you produced yourself (here: the file written earlier in this notebook).
with open(pickle_filename, 'rb') as f:
    loaded_pipeline = pickle.load(f)

loaded_model = loaded_pipeline['model']
loaded_scaler = loaded_pipeline['scaler']
loaded_le = loaded_pipeline['label_encoder']

# Test prediction on a RAW (unscaled) row, mirroring what deployment will feed
# the pipeline. X_test is already standardized, so passing it through the
# scaler again would scale the sample twice and test the wrong input.
test_sample = X[loaded_pipeline['feature_names']].iloc[:1]
test_scaled = loaded_scaler.transform(test_sample)
prediction = loaded_model.predict(test_scaled)
prediction_proba = loaded_model.predict_proba(test_scaled)

# The reloaded pipeline must agree with the in-memory one on the same input
assert (prediction == lgb_model.predict(scaler.transform(test_sample))).all(), \
    "Reloaded pipeline disagrees with the in-memory model"

class_name = loaded_le.inverse_transform(prediction)[0]
confidence = max(prediction_proba[0])

print(f"‚úÖ Model loaded and tested successfully!")
print(f"Test prediction: {class_name} (confidence: {confidence:.4f})")
print(f"\nüéâ Your IoT IDS model is ready for deployment!")