In [6]:
import joblib
import pandas as pd
import numpy as np

# Load the fitted scaler only to recover the feature names it was trained with.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code -- only load scaler files that come from a trusted source.
scaler = joblib.load("models/scaler.pkl")
feature_names = list(scaler.feature_names_in_)  # 78 features

# Example class names (edit as needed)
# NOTE(review): the "�" in the "Web Attack" labels looks like a mis-decoded
# dash. It is left untouched here because it may have to match the labels the
# model was trained on -- confirm before normalizing.
class_names = [
    "BENIGN","Bot","DDoS","DoS GoldenEye","DoS Hulk","DoS Slowhttptest","DoS slowloris",
    "FTP-Patator","Heartbleed","Infiltration","PortScan","SSH-Patator",
    "Web Attack � Brute Force","Web Attack � Sql Injection","Web Attack � XSS"
]

# Seeded generator so that re-running the cell recreates the same sample file.
rng = np.random.default_rng(42)

# One synthetic row per class; derived from class_names so the two cannot drift.
n_rows = len(class_names)

# Draw every feature value uniformly from [0, 30) in a single call, then
# append the class label as the last element of each row.
feature_values = rng.uniform(0, 30, size=(n_rows, len(feature_names)))
rows = [list(values) + [label] for values, label in zip(feature_values, class_names)]

df = pd.DataFrame(rows, columns=feature_names + ['label'])
df.to_csv("sample_78_features.csv", index=False)
# Report the real feature count instead of a hard-coded number.
print(f"Created sample_78_features.csv with {len(feature_names)} features and all attack types.")

Created sample_78_features.csv with 78 features and all attack types.


In [11]:
import os
import pandas as pd
import random

# Parameters
input_folder = 'data'  # Change this to your folder path
output_file = 'random_rows_combined.csv'
rows_per_file = 10  # Number of random rows to select from each file

# Collect all CSV files in the folder. os.listdir returns entries in arbitrary
# order, so sort them to keep the combined row order stable between runs.
csv_files = sorted(file for file in os.listdir(input_folder) if file.endswith('.csv'))

# List to store sampled DataFrames
sampled_rows = []

# Process each file
for file_name in csv_files:
    file_path = os.path.join(input_folder, file_name)
    try:
        df = pd.read_csv(file_path)
        sample_count = min(rows_per_file, len(df))  # In case file has fewer rows
        # Fixed random_state makes the per-file selection reproducible.
        # assign() returns a new frame carrying the originating file name,
        # so the sampled frame is not mutated after the fact.
        sampled_df = df.sample(n=sample_count, random_state=42).assign(source_file=file_name)
        sampled_rows.append(sampled_df)
    except Exception as e:
        # Best-effort: report the failing file and carry on with the rest.
        print(f"Error processing {file_name}: {e}")

# Combine all sampled rows; skip writing when nothing could be sampled.
if sampled_rows:
    result_df = pd.concat(sampled_rows, ignore_index=True)
    result_df.to_csv(output_file, index=False)
    print(f"Saved {len(result_df)} rows to '{output_file}'")
else:
    print("No rows sampled from any file.")


Saved 100 rows to 'random_rows_combined.csv'


In [13]:
import pandas as pd

# Load the file
df = pd.read_csv('data/Wednesday-workingHours.pcap_ISCX.csv')

# Sample up to 100 rows with a fixed seed so the selection is reproducible.
# Clamp to the file length: DataFrame.sample raises ValueError when n exceeds
# the number of rows (and replace=False), which the other sampling cells
# already guard against.
sampled_df = df.sample(n=min(100, len(df)), random_state=42)

# Save to new file
sampled_df.to_csv('Wednesday-workingHours_sampled2.csv', index=False)


In [16]:
import pandas as pd
import os
import random

# Parameters
input_folder = 'data'  # Folder containing the CSV files
output_file = 'Output/combined_sampled_2.csv'  # Final output file
sample_size = 1000  # Number of random rows to select from each file
random_seed = 42  # For reproducibility

# Make sure output folder exists. dirname is '' when output_file has no
# directory component, and os.makedirs('') raises, so only create a real path.
output_dir = os.path.dirname(output_file)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# List to collect sampled data
all_samples = []

# Process each file. os.listdir order is arbitrary, so sort the names to keep
# the combined row order stable between runs and machines.
for filename in sorted(os.listdir(input_folder)):
    if not filename.endswith('.csv'):
        continue

    file_path = os.path.join(input_folder, filename)
    df = pd.read_csv(file_path)

    # If file has fewer rows than the sample size, use all rows
    n = min(sample_size, len(df))

    # Sample with a fixed seed and tag each row with its source file for
    # traceability; assign() returns a new frame instead of mutating the sample.
    sampled_df = df.sample(n=n, random_state=random_seed).assign(source_file=filename)

    all_samples.append(sampled_df)

    print(f"Sampled {n} rows from {filename}")

# Combine all samples and save to one file. pd.concat raises ValueError on an
# empty list, so report the empty case explicitly instead of crashing.
if all_samples:
    combined_df = pd.concat(all_samples, ignore_index=True)
    combined_df.to_csv(output_file, index=False)

    print(f"\nSaved combined sampled data to {output_file}")
else:
    print(f"No CSV files found in '{input_folder}'; nothing was written.")


Sampled 1000 rows from Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv
Sampled 1000 rows from Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv
Sampled 1000 rows from Friday-WorkingHours-Morning.pcap_ISCX.csv
Sampled 1000 rows from Monday-WorkingHours.pcap_ISCX.csv
Sampled 1000 rows from Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv
Sampled 1000 rows from Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv
Sampled 1000 rows from Tuesday-WorkingHours.pcap_ISCX.csv
Sampled 1000 rows from Wednesday-workingHours.pcap_ISCX.csv

Saved combined sampled data to Output/combined_sampled_2.csv
