In [1]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier 
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix # For evaluation
import joblib # For saving the trained model
import os # For path manipulation

In [2]:
# Directory that holds the processed dataset. Set the NIDS_PROCESSED_DATA_DIR
# environment variable to point the notebook at another machine's copy; when it
# is unset or empty, the original location is used, so existing setups behave
# exactly as before.
PROJECT_ROOT_DIR = os.environ.get("NIDS_PROCESSED_DATA_DIR") or R"C:\Users\seena\Downloads\Network-Intrusion-Detection-System\processed_data"

In [3]:

# File name (not a full path) of the parquet dataset to load.
DATASET_FILENAME = "final_netshield_cleaned_scaled_dataset.parquet"

In [4]:


# Full path of the dataset file: PROJECT_ROOT_DIR joined with DATASET_FILENAME.
DATASET_PATH = os.path.join(
    PROJECT_ROOT_DIR,
    DATASET_FILENAME,
)

In [5]:


# Directory and file path for a persisted model, located under PROJECT_ROOT_DIR.
# Note: the model-saving cell further down reassigns both names before using them.
MODEL_SAVE_DIR = os.path.join(PROJECT_ROOT_DIR, "trained_model")
MODEL_SAVE_PATH = os.path.join(MODEL_SAVE_DIR, "network_intrusion_model.pkl")

In [6]:
# Load the parquet dataset into `df` and print a quick overview of it.
# A load failure is reported and then re-raised: every later cell needs `df`,
# so carrying on would only resurface as an unrelated NameError further down.
print("---  Loading the Cleaned Dataset ---")
print(f"Attempting to load dataset from: {DATASET_PATH}")
try:
    # Load the Parquet file into a pandas DataFrame using pyarrow engine
    df = pd.read_parquet(DATASET_PATH, engine="pyarrow")
except FileNotFoundError:
    print(f"ERROR: Dataset file not found at: {DATASET_PATH}")
    print("Please double-check that the file name and the 'PROJECT_ROOT_DIR' are correct.")
    raise
except Exception as e:
    print("ERROR: An unexpected problem occurred while loading the dataset:")
    print(f"Error type: {type(e).__name__}")
    print(f"Error message: {e}")
    raise  # the re-raised exception carries the full traceback
else:
    print(f"Dataset loaded successfully from: {DATASET_PATH}")
    print(f"Dataset shape: {df.shape}") # Shows (number of rows, number of columns)
    print("First 5 rows of the dataset:")
    print(df.head()) # Displays the first 5 rows of the data

    print("\nDataset information (data types, non-null values):")
    df.info() # Prints a summary of the columns and their data types
finally:
    # Visual separator, printed whether or not the load succeeded.
    print("-" * 50)

---  Loading the Cleaned Dataset ---
Attempting to load dataset from: C:\Users\seena\Downloads\Network-Intrusion-Detection-System\processed_data\final_netshield_cleaned_scaled_dataset.parquet
Dataset loaded successfully from: C:\Users\seena\Downloads\Network-Intrusion-Detection-System\processed_data\final_netshield_cleaned_scaled_dataset.parquet
Dataset shape: (2596603, 79)
First 5 rows of the dataset:
   protocol  flow_duration  total_fwd_packets  total_backward_packets  \
0         6      -0.485030          -0.011998               -0.010310   
1         6      -0.485033          -0.011998               -0.009399   
2        17      -0.482243          -0.011998               -0.010310   
3        17      -0.484001          -0.011998               -0.010310   
4         0       2.619704           0.164207               -0.011221   

   fwd_packets_length_total  bwd_packets_length_total  fwd_packet_length_max  \
0                 -0.062106                 -0.007681              -0.33055

In [7]:
from sklearn.preprocessing import LabelEncoder

# Build the mapping from integer class codes back to label names, and look up
# the code of the 'NORMAL' class; both are used by the evaluation cell.
label_encoder = LabelEncoder()
label_encoder.fit(df['label'])

# LabelEncoder numbers the classes 0..n-1 in the order of `classes_`.
encoded_to_label = dict(enumerate(label_encoder.classes_))

# The model target is the 'label_encoded' column stored in the dataset, not the
# output of this encoder. Check that both agree, otherwise every decoded class
# name (and the NORMAL code) below would silently refer to the wrong class.
if 'label_encoded' in df.columns:
    stored_pairs = df[['label_encoded', 'label']].drop_duplicates()
    mismatched_pairs = [
        (code, name)
        for code, name in zip(stored_pairs['label_encoded'], stored_pairs['label'])
        if encoded_to_label.get(code) != name
    ]
    if mismatched_pairs:
        raise ValueError(
            "df['label_encoded'] does not match a LabelEncoder fitted on df['label']; "
            f"conflicting (code, label) pairs: {mismatched_pairs}"
        )

# Fail with a clear message (instead of a bare StopIteration) when the
# expected benign class is missing from the data.
normal_codes = [code for code, name in encoded_to_label.items() if name == 'NORMAL']
if not normal_codes:
    raise ValueError("Label 'NORMAL' was not found in df['label']; cannot define NORMAL_ENCODED_VALUE.")
NORMAL_ENCODED_VALUE = normal_codes[0]

print("Mappings defined: encoded_to_label and NORMAL_ENCODED_VALUE.")

Mappings defined: encoded_to_label and NORMAL_ENCODED_VALUE.


In [8]:
print("---  Dividing Data into Training and Test Sets ---")

# Every column except the raw label and its integer encoding is a model input.
features_to_exclude = ['label', 'label_encoded']
all_columns = df.columns.tolist()
X_columns = list(filter(lambda name: name not in features_to_exclude, all_columns))

X = df[X_columns]         # feature matrix
y = df['label_encoded']   # integer-encoded target

print(f"\nShape of Features (X):{X.shape}")
print(f"Shape of Target (y):{y.shape}")

# Preview: first five rows of the first five feature columns.
print("\nFirst 5 columns of X (Features):")
print(X.head().iloc[:, :5])

---  Dividing Data into Training and Test Sets ---

Shape of Features (X):(2596603, 77)
Shape of Target (y):(2596603,)

First 5 columns of X (Features):
   protocol  flow_duration  total_fwd_packets  total_backward_packets  \
0         6      -0.485030          -0.011998               -0.010310   
1         6      -0.485033          -0.011998               -0.009399   
2        17      -0.482243          -0.011998               -0.010310   
3        17      -0.484001          -0.011998               -0.010310   
4         0       2.619704           0.164207               -0.011221   

   fwd_packets_length_total  
0                 -0.062106  
1                 -0.062106  
2                 -0.057573  
3                 -0.057573  
4                 -0.062106  


In [9]:
print("\n--- Performing Train-Test Split ---")

# Hold out 20% of the rows for testing. The fixed seed makes the split
# reproducible, and stratifying on y keeps the class proportions the same in
# the training and test sets. (train_test_split is imported in the first cell.)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print("Data split into training and test sets successfully!")
# One "<name> shape: <shape>" line per split, in the same order as before.
print("\n".join(
    f"{part_name} shape: {part.shape}"
    for part_name, part in (
        ("X_train", X_train),
        ("y_train", y_train),
        ("X_test", X_test),
        ("y_test", y_test),
    )
))


--- Performing Train-Test Split ---
Data split into training and test sets successfully!
X_train shape: (2077282, 77)
y_train shape: (2077282,)
X_test shape: (519321, 77)
y_test shape: (519321,)


In [10]:
# Row count per class label across the full dataset (shows the class imbalance).
df["label"].value_counts()

label
NORMAL                        2261548
DOS HULK                       172846
DDOS                           128014
DOS GOLDENEYE                   10286
FTP-PATATOR                      5931
DOS SLOWLORIS                    5385
DOS SLOWHTTPTEST                 5228
SSH-PATATOR                      3219
PORTSCAN                         1956
WEB ATTACK � BRUTE FORCE         1470
WEB ATTACK � XSS                  652
INFILTRATION                       36
WEB ATTACK � SQL INJECTION         21
HEARTBLEED                         11
Name: count, dtype: int64

In [11]:
# Sanity check: show which container type the split produced for X_train.
print("Type of X_train: {}".format(type(X_train)))

Type of X_train: <class 'pandas.core.frame.DataFrame'>


In [12]:
# In your Jupyter Notebook cell, while newvenv kernel is active:

# 1. Ensure imbalanced-learn is at the desired version
#    (It already seems to be 0.13.0, but this doesn't hurt)
!pip install imbalanced-learn
import sklearn
print(sklearn.__version__)



Collecting imbalanced-learn
  Using cached imbalanced_learn-0.13.0-py3-none-any.whl.metadata (8.8 kB)
Collecting sklearn-compat<1,>=0.1 (from imbalanced-learn)
  Using cached sklearn_compat-0.1.3-py3-none-any.whl.metadata (18 kB)
Using cached imbalanced_learn-0.13.0-py3-none-any.whl (238 kB)
Using cached sklearn_compat-0.1.3-py3-none-any.whl (18 kB)
Installing collected packages: sklearn-compat, imbalanced-learn
Successfully installed imbalanced-learn-0.13.0 sklearn-compat-0.1.3
1.6.1



[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [13]:
from imblearn.over_sampling import SMOTE

# Current class distribution of the training split, ordered by encoded label.
class_counts = pd.Series(y_train).value_counts().sort_index()

# Oversampling target: every attack class with fewer training rows than this is
# grown to exactly this size. Attack classes that are already larger, and the
# NORMAL class, keep their current size (SMOTE only adds rows, never removes).
# Adjust this number based on available memory and the desired balance.
target_minority_size = 50000

# Upper bound on the number of neighbours SMOTE interpolates between.
smote_k_neighbors = 5

# Desired number of rows per class after resampling.
sampling_strategy = {
    cls: int(count) if cls == NORMAL_ENCODED_VALUE else int(max(count, target_minority_size))
    for cls, count in class_counts.items()
}

print("Proposed SMOTE sampling strategy:")
print(sampling_strategy)

# SMOTE looks up k_neighbors neighbours *in addition to* the sample itself
# inside each class it oversamples, so k must be smaller than the size of the
# smallest such class. Cap it instead of letting fit_resample fail on a rare class.
oversampled_class_sizes = [
    int(count)
    for cls, count in class_counts.items()
    if sampling_strategy[cls] > count
]
if oversampled_class_sizes:
    smallest_oversampled_class = min(oversampled_class_sizes)
    if smallest_oversampled_class < 2:
        raise ValueError(
            "SMOTE needs at least 2 training samples in every class it oversamples; "
            f"the smallest such class has {smallest_oversampled_class}."
        )
    if smallest_oversampled_class <= smote_k_neighbors:
        smote_k_neighbors = smallest_oversampled_class - 1
        print(f"Reducing SMOTE k_neighbors to {smote_k_neighbors} to fit the smallest oversampled class.")

# random_state fixes the synthetic samples so the resampling is reproducible.
smote = SMOTE(random_state=42, k_neighbors=smote_k_neighbors, sampling_strategy=sampling_strategy)

print("\n--- Applying SMOTE with custom sampling_strategy... ---")
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

print("\n--- Class distribution AFTER SMOTE with custom strategy ---")
print(pd.Series(y_train_resampled).value_counts().sort_index())



Proposed SMOTE sampling strategy:
{0: 102411, 1: 50000, 2: 138277, 3: 50000, 4: 50000, 5: 50000, 6: 50000, 7: 50000, 8: 1809238, 9: 50000, 10: 50000, 11: 50000, 12: 50000, 13: 50000}

--- Applying SMOTE with custom sampling_strategy... ---

--- Class distribution AFTER SMOTE with custom strategy ---
label_encoded
0      102411
1       50000
2      138277
3       50000
4       50000
5       50000
6       50000
7       50000
8     1809238
9       50000
10      50000
11      50000
12      50000
13      50000
Name: count, dtype: int64


In [14]:



# Train a random forest on the SMOTE-resampled training data, then evaluate it
# on the untouched test split: multi-class metrics, binary (attack vs. normal)
# metrics, and a breakdown of the attacks that were predicted as NORMAL.
print("\n--- Training RandomForestClassifier on SMOTE-resampled data ---")

rf_model_smote = RandomForestClassifier(
    n_estimators=100,        # Number of trees (can be increased for more performance)
    random_state=42,         # For reproducibility
    n_jobs=-1,               # Use all available CPU cores for faster training
    verbose=1                # Show training progress
)

print(f"Training data shape (resampled): {X_train_resampled.shape}")
print(f"Training target shape (resampled): {y_train_resampled.shape}")

# Long-running step: the recorded run took roughly 10 minutes on 8 workers.
rf_model_smote.fit(X_train_resampled, y_train_resampled)
print("\nRandomForestClassifier training on SMOTE-resampled data complete!")


print("\n--- Evaluating the Model (trained with SMOTE) on the ORIGINAL Test Set (Multi-class) ---")

y_pred_smote = rf_model_smote.predict(X_test)

accuracy_smote = accuracy_score(y_test, y_pred_smote)
print(f"Accuracy on Test Set (after SMOTE training): {accuracy_smote:.4f}")

print("\nClassification Report (Multi-class, after SMOTE training):")

# Pass the full list of class codes explicitly. Without `labels`, the report
# derives the classes from y_test/y_pred and raises if any class is missing
# there, because the count would no longer match `target_names`. It also keeps
# the confusion-matrix rows/columns in a fixed, known order.
class_codes = sorted(encoded_to_label.keys())
target_names_multi = [encoded_to_label[code] for code in class_codes]
# zero_division=0 avoids warnings for classes with no predicted/true samples.
print(classification_report(
    y_test,
    y_pred_smote,
    labels=class_codes,
    target_names=target_names_multi,
    zero_division=0,
))

print("\nConfusion Matrix (Multi-class, after SMOTE training):")
print(confusion_matrix(y_test, y_pred_smote, labels=class_codes))

# --- Binary view of the same predictions: 0 = NORMAL, 1 = any attack class ---
print("\n--- Binary (Malicious vs. Not Malicious) Classification Report (after SMOTE training) ---")

binary_codes = [0, 1]

# Collapse the true labels and the predictions to the two binary classes.
y_test_binary_smote_eval = np.where(y_test == NORMAL_ENCODED_VALUE, 0, 1)
y_pred_binary_smote_eval = np.where(y_pred_smote == NORMAL_ENCODED_VALUE, 0, 1)

print(classification_report(
    y_test_binary_smote_eval,
    y_pred_binary_smote_eval,
    labels=binary_codes,
    target_names=['Not Malicious (0)', 'Malicious (1)'],
    zero_division=0,
))

print("\n--- Binary Confusion Matrix (after SMOTE training) ---")
print(confusion_matrix(y_test_binary_smote_eval, y_pred_binary_smote_eval, labels=binary_codes))


print("\n--- Investigating FALSE NEGATIVES (Actual Malicious, Predicted Not Malicious) after SMOTE training ---")

# Positions (not index labels) of test rows that are attacks but were predicted NORMAL.
false_negatives_indices_smote = np.where((y_test_binary_smote_eval == 1) & (y_pred_binary_smote_eval == 0))[0]

if len(false_negatives_indices_smote) > 0:
    print(f"\nTotal FALSE NEGATIVES (after SMOTE training): {len(false_negatives_indices_smote)}")

    # The indices are positional, so use .iloc to fetch the true multi-class codes.
    actual_labels_for_fns_smote = y_test.iloc[false_negatives_indices_smote]

    # Convert the encoded labels back to their human-readable names.
    actual_fn_labels_smote = [encoded_to_label[label] for label in actual_labels_for_fns_smote]

    # Count how often each attack type was missed.
    missed_attack_counts_smote = pd.Series(actual_fn_labels_smote).value_counts()
    print("\nDistribution of Missed Attack Types (False Negatives) after SMOTE training:")
    print(missed_attack_counts_smote)
else:
    print("No FALSE NEGATIVES found after SMOTE training. Model is perfect at not missing attacks!")


--- Training RandomForestClassifier on SMOTE-resampled data ---
Training data shape (resampled): (2599926, 77)
Training target shape (resampled): (2599926,)


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  4.0min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 10.3min finished



RandomForestClassifier training on SMOTE-resampled data complete!

--- Evaluating the Model (trained with SMOTE) on the ORIGINAL Test Set (Multi-class) ---


[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    1.6s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    4.6s finished


Accuracy on Test Set (after SMOTE training): 0.9990

Classification Report (Multi-class, after SMOTE training):
                            precision    recall  f1-score   support

                      DDOS       1.00      1.00      1.00     25603
             DOS GOLDENEYE       1.00      1.00      1.00      2057
                  DOS HULK       1.00      1.00      1.00     34569
          DOS SLOWHTTPTEST       0.95      0.99      0.97      1046
             DOS SLOWLORIS       1.00      0.99      1.00      1077
               FTP-PATATOR       1.00      1.00      1.00      1186
                HEARTBLEED       1.00      1.00      1.00         2
              INFILTRATION       1.00      0.71      0.83         7
                    NORMAL       1.00      1.00      1.00    452310
                  PORTSCAN       0.91      0.95      0.93       391
               SSH-PATATOR       1.00      0.98      0.99       644
  WEB ATTACK � BRUTE FORCE       0.78      0.77      0.78       294
WEB

In [15]:

# Persist the trained model to "<current working directory>/model/rf_model_smote.pkl".
# The working directory is stored under its own name so that PROJECT_ROOT_DIR
# (the dataset directory DATASET_PATH is built from) is not overwritten here;
# re-running the earlier path cells after this one keeps pointing at the data.
MODEL_ROOT_DIR = os.getcwd()

# Define the directory where models will be saved
MODEL_SAVE_DIR = os.path.join(MODEL_ROOT_DIR, "model")

MODEL_FILENAME = "rf_model_smote.pkl"
MODEL_SAVE_PATH = os.path.join(MODEL_SAVE_DIR, MODEL_FILENAME)

os.makedirs(MODEL_SAVE_DIR, exist_ok=True)

print(f"Model will be saved to: {MODEL_SAVE_PATH}")

# --- Save the Trained Model ---
# Failures are reported and re-raised so a failed save cannot go unnoticed.
try:
    joblib.dump(rf_model_smote, MODEL_SAVE_PATH)
except NameError:
    print("Error: 'rf_model_smote' is not defined. Please ensure your model training cell was run.")
    raise
except Exception as e:
    print(f"Error saving model: {e}")
    raise
else:
    print(f"Model '{MODEL_FILENAME}' successfully saved!")

Model will be saved to: C:\Users\seena\Downloads\Network-Intrusion-Detection-System\model\rf_model_smote.pkl
Model 'rf_model_smote.pkl' successfully saved!
