In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
import joblib
import os

# --- Configuration: adjust these to match your environment ---
# Folder that holds the cleaned CSV; model artifacts are written beneath it.
csv_folder_path = '/content/drive/MyDrive/Deep Learning mini project/dataset'
output_filename = "CICIDS2017_FULL_CLEANED_DATASET.csv"
output_path = os.path.join(csv_folder_path, output_filename)
model_output_path_reduced = os.path.join(
    csv_folder_path,
    "CICIDS2017_DT_ReducedFeatures_Model.joblib",
)

# --- Feature selection ---
# The 8 hand-picked columns the reduced model is trained on. The order of this
# list is the column order of the feature matrix built from it.
SELECTED_FEATURES = [
    'Flow Duration',
    'Total Fwd Packets',
    'Total Backward Packets',
    'Fwd Packet Length Max',
    'Flow IAT Mean',
    'Flow IAT Std',
    'Bwd Packets/s',
    'min_seg_size_forward',
]

# --- Load the cleaned dataset ---
print("Loading cleaned dataset...")
# low_memory=False makes pandas parse the file in one pass instead of chunks.
df = pd.read_csv(
    output_path,
    low_memory=False,
)
print(f"Dataset loaded with shape: {df.shape}")

# Normalise the header: remove leading/trailing whitespace from every column
# name (presumably so the names in SELECTED_FEATURES match exactly).
df.columns = df.columns.str.strip()

print(f"\nTraining model using only {len(SELECTED_FEATURES)} selected features.")

Loading cleaned dataset...
Dataset loaded with shape: (2829385, 79)

Training model using only 8 selected features.


In [None]:
# 2.1. Split the frame into features and target
# X_full keeps every column except the target; X keeps only the columns named
# in SELECTED_FEATURES (this is where feature selection takes effect).
X_full = df.drop(columns='Label')
X = df[SELECTED_FEATURES]
y = df['Label']

# 2.2. Encode the string class labels as integers
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

print("\n--- Label Encoding Mappings ---")
# Print the class-name -> integer-code mapping so it can be checked by eye.
print({
    class_name: class_code
    for class_name, class_code in zip(
        label_encoder.classes_,
        label_encoder.transform(label_encoder.classes_),
    )
})

# 2.3. Standardise the reduced feature matrix
# The scaler only ever sees the columns in X, not the full feature set.
# NOTE(review): it is fitted on every row of X here, i.e. before any
# train/test partition exists in this cell.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

print("\n--- Preprocessing Complete ---")
print(f"Scaled feature matrix shape: {X_scaled.shape}")


--- Label Encoding Mappings ---
{'BENIGN': np.int64(0), 'Bot': np.int64(1), 'DDoS': np.int64(2), 'DoS GoldenEye': np.int64(3), 'DoS Hulk': np.int64(4), 'DoS Slowhttptest': np.int64(5), 'DoS slowloris': np.int64(6), 'FTP-Patator': np.int64(7), 'Heartbleed': np.int64(8), 'Infiltration': np.int64(9), 'PortScan': np.int64(10), 'SSH-Patator': np.int64(11), 'Web Attack ï¿½ Brute Force': np.int64(12), 'Web Attack ï¿½ Sql Injection': np.int64(13), 'Web Attack ï¿½ XSS': np.int64(14)}

--- Preprocessing Complete ---
Scaled feature matrix shape: (2829385, 8)


In [None]:
# Perform a stratified train-test split (70% train, 30% test).
#
# The split is taken over the RAW reduced feature frame `X`, not over a matrix
# that was standardised with statistics from all rows. Splitting first and
# fitting the scaler afterwards keeps test-set means/variances out of the
# preprocessing (no train/test leakage). The partition itself depends only on
# `y_encoded`, `test_size`, `random_state` and `stratify`, so the same rows land
# in each side as when splitting an already-scaled matrix.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.3,
    random_state=42, # for reproducibility
    stratify=y_encoded
)

# Re-create `scaler` and fit it on the training rows only, then apply that same
# transformation to both partitions. Rebinding the notebook-level `scaler` name
# means any later cell that persists `scaler` stores the one that matches the
# data the model was actually trained on. Re-running this cell is idempotent.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

print("\n--- Train-Test Split (Stratified) ---")
print(f"Training set size: {len(X_train):,}")
print(f"Testing set size: {len(X_test):,}")


--- Train-Test Split (Stratified) ---
Training set size: 1,980,569
Testing set size: 848,816


In [None]:
print("\n--- Model Training (Decision Tree on 8 Features) ---")

# 4.1. Model Creation and Training
# random_state is fixed so repeated runs build the same tree.
model_reduced = DecisionTreeClassifier(random_state=42)
print("Starting training on reduced feature set...")
model_reduced.fit(X_train, y_train)
print("Training complete.")

# 4.2. Model Evaluation
y_pred_reduced = model_reduced.predict(X_test)

print("\n--- Model Evaluation (Test Set) ---")
# Report per-class metrics under the original class names.
# `labels` is passed explicitly so each entry of `target_names` is tied to its
# encoded integer. Without it, the report matches names by position to whatever
# labels happen to occur in y_test / y_pred, which misaligns (or fails) as soon
# as one of the rare classes is absent from both.
target_names = label_encoder.classes_
report_labels = label_encoder.transform(target_names)
print(
    classification_report(
        y_test,
        y_pred_reduced,
        labels=report_labels,
        target_names=target_names,
    )
)


--- Model Training (Decision Tree on 8 Features) ---
Starting training on reduced feature set...
Training complete.

--- Model Evaluation (Test Set) ---
                            precision    recall  f1-score   support

                    BENIGN       0.99      0.99      0.99    681807
                       Bot       0.75      0.47      0.58       590
                      DDoS       1.00      1.00      1.00     38408
             DoS GoldenEye       0.95      0.95      0.95      3088
                  DoS Hulk       0.91      0.97      0.94     69037
          DoS Slowhttptest       0.92      0.93      0.93      1650
             DoS slowloris       0.99      0.99      0.99      1739
               FTP-Patator       0.99      0.98      0.99      2381
                Heartbleed       1.00      1.00      1.00         3
              Infiltration       0.10      0.09      0.10        11
                  PortScan       0.99      1.00      1.00     47679
               SSH-Patator   

In [None]:
import os
import joblib

# --- Configuration ---
# Base folder and flat artifact paths are (re)declared here so the cell carries
# its own path setup. The three *_output_path names below are not used by the
# save logic in this cell, which writes into the 'models' sub-folder instead.
csv_folder_path = '/content/drive/MyDrive/Deep Learning mini project/dataset'
model_output_path_reduced = os.path.join(csv_folder_path, "CICIDS2017_DT_ReducedFeatures_Model.joblib")
scaler_output_path = os.path.join(csv_folder_path, "CICIDS2017_Reduced_Scaler.joblib")
encoder_output_path = os.path.join(csv_folder_path, "CICIDS2017_Reduced_Encoder.joblib")

# --- 1. Dedicated output directory for the saved artifacts ---
model_dir = os.path.join(csv_folder_path, "models")

# exist_ok=True makes this a no-op when the directory is already there.
os.makedirs(model_dir, exist_ok=True)
print(f"Created/Ensured directory exists: {model_dir}")

# --- 2. Artifact file names and their full paths inside model_dir ---
model_filename = "DT_Reduced_Model.joblib"
scaler_filename = "Reduced_Scaler.joblib"
encoder_filename = "Reduced_Encoder.joblib"

final_model_path, final_scaler_path, final_encoder_path = (
    os.path.join(model_dir, artifact_name)
    for artifact_name in (model_filename, scaler_filename, encoder_filename)
)

# --- 3. Persist the model and its preprocessing components ---
# Relies on 'model_reduced', 'scaler' and 'label_encoder' already existing in
# the kernel. Written in the order: model, scaler, label encoder.
for _artifact, _destination in (
    (model_reduced, final_model_path),
    (scaler, final_scaler_path),
    (label_encoder, final_encoder_path),
):
    joblib.dump(_artifact, _destination)

# Summarise where everything was written.
print(
    "\nâœ… SUCCESS: Model and components saved to dedicated 'models' folder:",
    f"Model Path:  {final_model_path}",
    f"Scaler Path: {final_scaler_path}",
    f"Encoder Path: {final_encoder_path}",
    sep="\n",
)

Created/Ensured directory exists: /content/drive/MyDrive/Deep Learning mini project/dataset/models

âœ… SUCCESS: Model and components saved to dedicated 'models' folder:
Model Path:  /content/drive/MyDrive/Deep Learning mini project/dataset/models/DT_Reduced_Model.joblib
Scaler Path: /content/drive/MyDrive/Deep Learning mini project/dataset/models/Reduced_Scaler.joblib
Encoder Path: /content/drive/MyDrive/Deep Learning mini project/dataset/models/Reduced_Encoder.joblib
