In [1]:
import pandas as pd
import numpy as np

# --- Configuration ---
ORIGINAL_FILE_PATH = 'data/Friday-02-03-2018_TrafficForML_CICFlowMeter.csv'
TEST_FILE_PATH = 'traffic_test_data.csv'
TARGET_COLUMN = 'Label'
SAMPLE_SIZE_PER_CLASS = 500  # Maximum number of rows to sample for each class
RANDOM_STATE = 42  # Seed shared by the per-class sampling and the final shuffle

# Purpose of this cell: build a small, class-balanced sample of the original
# traffic CSV and write it to TEST_FILE_PATH.

print(f"Loading data from: {ORIGINAL_FILE_PATH}")

# Load the original data.
# Failures are raised instead of calling exit(): inside a Jupyter kernel exit()
# requests a kernel shutdown rather than raising, so the rest of the cell is not
# reliably stopped, and as a plain script it would exit with a success status.
try:
    df = pd.read_csv(ORIGINAL_FILE_PATH)
except FileNotFoundError as err:
    raise FileNotFoundError(
        f"ERROR: File not found at {ORIGINAL_FILE_PATH}. Please check the path."
    ) from err

# --- Data Cleaning (Consistency with Training) ---
# Strip whitespace from the header names, turn +/-inf into NaN so that dropna()
# removes those rows as well, then remove exact duplicate rows.
df.columns = df.columns.str.strip()
df = (
    df.replace([np.inf, -np.inf], np.nan)
      .dropna()
      .drop_duplicates()
)

if TARGET_COLUMN not in df.columns:
    raise ValueError(f"ERROR: Target column '{TARGET_COLUMN}' not found.")

# Without this guard an empty frame would only fail later, inside pd.concat,
# with an error message that does not point at the real cause.
if df.empty:
    raise ValueError(
        f"ERROR: No rows left in {ORIGINAL_FILE_PATH} after cleaning; nothing to sample."
    )

# Get the list of unique labels (in order of first appearance)
labels = df[TARGET_COLUMN].unique()
print(f"Found traffic labels: {labels}")

# --- Stratified Sampling ---
# Every label present in the data is sampled; no class names are hard-coded.
sampled_data = []
for label in labels:
    class_df = df[df[TARGET_COLUMN] == label]

    # Take SAMPLE_SIZE_PER_CLASS rows, or every row when the class is smaller
    sample_count = min(SAMPLE_SIZE_PER_CLASS, len(class_df))

    sampled_rows = class_df.sample(n=sample_count, random_state=RANDOM_STATE)
    sampled_data.append(sampled_rows)
    print(f"  Sampled {sample_count} rows for label: '{label}'")

# Combine the per-class samples, shuffle them so the classes are interleaved,
# and renumber the rows from 0.
test_df = (
    pd.concat(sampled_data)
      .sample(frac=1, random_state=RANDOM_STATE)
      .reset_index(drop=True)
)

# Save the combined sample to a new CSV file
test_df.to_csv(TEST_FILE_PATH, index=False)

print(f"\n✅ Success! Test data generated with {len(test_df)} rows and saved to {TEST_FILE_PATH}")

Loading data from: data/Friday-02-03-2018_TrafficForML_CICFlowMeter.csv
Found traffic labels: ['Benign' 'Bot']
  Sampled 500 rows for label: 'Benign'
  Sampled 500 rows for label: 'Bot'

✅ Success! Test data generated with 1000 rows and saved to traffic_test_data.csv


In [2]:
import pandas as pd
import numpy as np
import pickle

# --- Configuration ---
MODEL_FILENAME = 'random_forest_traffic_classifier.pkl'
UNLABELED_TEST_FILE_PATH = 'traffic_test_data.csv'  # <<< RENAME THIS TO YOUR FILE NAME!
OUTPUT_FILE_PATH = 'unseen_predictions.csv'

# Mapping the numerical predictions back to labels.
# CRITICAL: This MUST match the LabelEncoder from your training phase.
# Assumes 'Benign' was encoded as 0 and 'Bot' as 1 -- TODO confirm against the
# training notebook. Predictions outside this map raise an error further down.
PREDICTION_MAP = {
    0: 'Benign',
    1: 'Bot'
}

# Purpose of this cell: load the pickled classifier, clean the test CSV, predict
# a label for every remaining row and write features + prediction to
# OUTPUT_FILE_PATH.

# --- 1. Load Model and Unlabeled Test Data ---
# Failures are raised instead of calling exit(): inside a Jupyter kernel exit()
# requests a kernel shutdown rather than raising, so the rest of the cell is not
# reliably stopped.
print(f"Loading model from {MODEL_FILENAME}...")
try:
    # SECURITY: unpickling can execute arbitrary code stored in the file. Only
    # load model files that you created yourself or that come from a trusted source.
    with open(MODEL_FILENAME, 'rb') as model_file:
        rf_model = pickle.load(model_file)
except FileNotFoundError as err:
    raise FileNotFoundError(f"ERROR: Model file {MODEL_FILENAME} not found.") from err
print("Model loaded successfully.")

print(f"Loading unlabeled test data from {UNLABELED_TEST_FILE_PATH}...")
try:
    X_unlabeled = pd.read_csv(UNLABELED_TEST_FILE_PATH)
except FileNotFoundError as err:
    raise FileNotFoundError(
        f"ERROR: Test data file {UNLABELED_TEST_FILE_PATH} not found."
    ) from err

# --- 2. Prepare Data for Prediction ---
# This cleaning MUST be identical to the training data preparation.
X_unlabeled.columns = X_unlabeled.columns.str.strip()

# Handle Missing Values (Assuming you dropped NaNs in training):
# +/-inf is converted to NaN first so that dropna() removes those rows too.
rows_before_cleaning = len(X_unlabeled)
X_unlabeled = X_unlabeled.replace([np.inf, -np.inf], np.nan).dropna()
print(f"Cleaned data: Dropped {rows_before_cleaning - len(X_unlabeled)} rows with NaN/Inf values.")

# Drop any non-numeric features (consistent with training)
object_cols = X_unlabeled.select_dtypes(include=['object']).columns
if len(object_cols) > 0:
    X_unlabeled = X_unlabeled.drop(columns=object_cols)
    print(f"Dropped non-numeric columns: {list(object_cols)}")

if X_unlabeled.empty:
    raise ValueError(
        f"ERROR: No usable rows/columns left in {UNLABELED_TEST_FILE_PATH} after cleaning."
    )

# Align the columns with the features the model was fitted on. scikit-learn
# estimators fitted on a DataFrame with string column names expose them as
# `feature_names_in_`; when the attribute is absent the frame is passed through
# unchanged and its column order must already match the training data.
expected_features = getattr(rf_model, 'feature_names_in_', None)
if expected_features is not None:
    missing_features = [name for name in expected_features if name not in X_unlabeled.columns]
    if missing_features:
        raise ValueError(
            f"ERROR: Test data is missing features the model was trained on: {missing_features}"
        )
    X_model_input = X_unlabeled[list(expected_features)]
else:
    X_model_input = X_unlabeled

# --- 3. Make Predictions ---
print("\nMaking predictions on the unseen data...")
y_pred_encoded = rf_model.predict(X_model_input)

encoded_predictions = pd.Series(y_pred_encoded)
y_predictions_labels = encoded_predictions.map(PREDICTION_MAP)

# Series.map() yields NaN for keys missing from the dict. Fail loudly rather
# than writing empty labels when PREDICTION_MAP does not match the model output.
unmapped_values = encoded_predictions[y_predictions_labels.isna()].unique()
if len(unmapped_values) > 0:
    raise ValueError(
        f"ERROR: Model produced predictions missing from PREDICTION_MAP: {list(unmapped_values)}"
    )

# dropna() may have left gaps in the index; renumber from 0 so the rows line up
# with the freshly created, 0-based prediction Series on assignment.
predictions_df = X_unlabeled.reset_index(drop=True)
predictions_df['Predicted_Label'] = y_predictions_labels

# Save the results to a new CSV file
predictions_df.to_csv(OUTPUT_FILE_PATH, index=False)

print(f"\n✅ Predictions complete. Results saved to {OUTPUT_FILE_PATH}")
print(f"Total rows predicted: {len(predictions_df)}")

Loading model from random_forest_traffic_classifier.pkl...
Model loaded successfully.
Loading unlabeled test data from traffic_test_data.csv...
Cleaned data: Dropped 0 rows with NaN/Inf values.
Dropped non-numeric columns: ['Timestamp', 'Label']

Making predictions on the unseen data...

✅ Predictions complete. Results saved to unseen_predictions.csv
Total rows predicted: 1000
