In [1]:
#!/usr/bin/env python
# coding: utf-8

import pandas as pd
import os
from sklearn.model_selection import train_test_split

class DataProcessor:
    """Small CSV pipeline helper: load, compare, label and split datasets.

    Files passed to ``load_data`` / ``combine_data_with_labels`` are resolved
    relative to ``input_dir``; everything written through ``save_data`` lands
    in ``output_dir``. ``compare_csv`` and ``filter_and_save_data`` read the
    paths they are given as-is (relative to the current working directory).
    """

    def __init__(self, input_dir='sample_data', output_dir='sample_data/output'):
        """Remember the input/output directories and create the output one.

        Args:
            input_dir: Directory that ``load_data`` reads from.
            output_dir: Directory that ``save_data`` writes to. It is created
                (including parents) if it does not exist yet.
        """
        self.input_dir = input_dir
        self.output_dir = output_dir
        # Create the folder up front so later save_data calls cannot fail on it.
        os.makedirs(self.output_dir, exist_ok=True)

    def load_data(self, filename, column_names=None):
        """Read ``filename`` from ``input_dir`` into a DataFrame.

        The first line of the file is always consumed as the header row.

        Args:
            filename: File name (or relative path) inside ``input_dir``.
            column_names: Optional iterable of replacement column labels. When
                given and non-empty it overwrites the labels read from the
                header; ``None`` or an empty iterable keeps the file's labels.

        Returns:
            The loaded DataFrame, or ``None`` (after printing a message) when
            the file does not exist.
        """
        filepath = os.path.join(self.input_dir, filename)
        try:
            data = pd.read_csv(filepath, header=0, low_memory=False)
        except FileNotFoundError:
            print(f"File '{filename}' not found in '{self.input_dir}'.")
            return None

        # Test against None and emptiness explicitly: a bare truthiness test
        # raises for pandas Index / NumPy array arguments.
        if column_names is not None:
            names = list(column_names)
            if names:
                data.columns = names
        return data

    def save_data(self, data, filename):
        """Write ``data`` to ``output_dir/filename`` as CSV without the index.

        Args:
            data: DataFrame to persist.
            filename: Target file name inside ``output_dir``.
        """
        output_path = os.path.join(self.output_dir, filename)
        data.to_csv(output_path, index=False)
        print(f"Data saved to {output_path}")

    def compare_csv(self, file1, file2):
        """Print a comparison report for two CSV files.

        Both paths are used as given (they are not joined with ``input_dir``).
        The report prints both shapes and then one of: a shape mismatch, a
        column-label mismatch, a confirmation that the files are identical, or
        the cell-level differences produced by ``DataFrame.compare``.

        Args:
            file1: Path of the first CSV file.
            file2: Path of the second CSV file.

        Returns:
            None. All results are printed.
        """
        try:
            df1 = pd.read_csv(file1)
            df2 = pd.read_csv(file2)
        except FileNotFoundError as e:
            print(f"Error loading files: {e}")
            return

        print(f"{file1} shape: {df1.shape}")
        print(f"{file2} shape: {df2.shape}")

        if df1.shape != df2.shape:
            print("Files have different shapes.")
            return

        # DataFrame.compare only accepts identically-labelled frames and raises
        # ValueError otherwise, so report a label mismatch instead of crashing.
        if not df1.columns.equals(df2.columns):
            print("Files have different column labels.")
            print(f"{file1} columns: {list(df1.columns)}")
            print(f"{file2} columns: {list(df2.columns)}")
            return

        if df1.equals(df2):
            print("The two CSV files are identical.")
        else:
            differences = df1.compare(df2)
            print("Differences found between files:")
            print(differences)

    def combine_data_with_labels(self, data_file, label_file, output_file):
        """Append the ``x`` label column to a data file and save the result.

        Both inputs are loaded through ``load_data`` (relative to
        ``input_dir``); the combined frame is written through ``save_data``
        (into ``output_dir``). Rows are paired by position, so both files must
        have the same number of rows.

        Args:
            data_file: CSV with the data rows.
            label_file: CSV that contains a column named ``x``.
            output_file: File name for the combined CSV.

        Returns:
            None. On any failure (missing file, length mismatch, missing ``x``
            column) a message is printed and nothing is written.
        """
        data = self.load_data(data_file)
        labels = self.load_data(label_file)

        if data is None or labels is None:
            print("Failed to load data or labels.")
            return

        if len(data) != len(labels):
            print("Data and labels have different lengths...")
            print(len(data), len(labels))
            return

        # Fail the same way as the other checks instead of raising KeyError.
        if 'x' not in labels.columns:
            print(f"Label file '{label_file}' has no 'x' column.")
            return

        combined = pd.concat([data, labels[['x']]], axis=1)
        self.save_data(combined, output_file)
        print(f"Data and labels combined and saved as '{output_file}'")

    def filter_and_save_data(self, input_file, output_train_file, output_test_file,
                             rows_to_select=20000, target_ratio=0.5, test_size=0.2,
                             random_state=42):
        """Sample a class-balanced subset of ``input_file`` and save a train/test split.

        Rows with ``x == 0`` are treated as 'normal' and rows with ``x == 1``
        as 'malicious'. If one class has too few rows, the other class is asked
        to make up the difference; if that is not possible either, every
        available row of both classes is used and fewer than
        ``rows_to_select`` rows are produced.

        Args:
            input_file: Path of the CSV to read (used as given, not joined
                with ``input_dir``). Must contain a column named ``x``.
            output_train_file: File name for the training split (written into
                ``output_dir``).
            output_test_file: File name for the testing split (written into
                ``output_dir``).
            rows_to_select: Total number of rows to sample before splitting.
            target_ratio: Desired fraction of ``x == 1`` rows, within [0, 1].
            test_size: Forwarded to ``train_test_split``.
            random_state: Seed used for sampling, shuffling and splitting.

        Returns:
            None. Class counts are printed and both splits are saved.

        Raises:
            ValueError: If ``rows_to_select`` is negative or ``target_ratio``
                is outside [0, 1].
            KeyError: If ``input_file`` has no ``x`` column.
        """
        if rows_to_select < 0:
            raise ValueError(f"rows_to_select must be non-negative, got {rows_to_select}.")
        if not 0 <= target_ratio <= 1:
            raise ValueError(f"target_ratio must be within [0, 1], got {target_ratio}.")

        # Read the dataset
        data = pd.read_csv(input_file)
        if 'x' not in data.columns:
            raise KeyError(f"Column 'x' not found in '{input_file}'.")

        # Separate the data into 'normal' and 'malicious' samples
        normal_data = data[data['x'] == 0]
        malicious_data = data[data['x'] == 1]

        # Calculate required number of samples for each type
        normal_count = int(rows_to_select * (1 - target_ratio))
        malicious_count = rows_to_select - normal_count  # Remaining rows for 'x=1'

        # Check if there are enough samples to meet the target counts; a
        # shortfall in one class is first pushed onto the other class.
        if len(normal_data) < normal_count:
            print(f"Warning: Not enough normal samples to reach {normal_count}. Adjusting to {len(normal_data)}.")
            normal_count = len(normal_data)
            malicious_count = rows_to_select - normal_count

        if len(malicious_data) < malicious_count:
            print(f"Warning: Not enough malicious samples to reach {malicious_count}. Adjusting to {len(malicious_data)}.")
            malicious_count = len(malicious_data)
            normal_count = rows_to_select - malicious_count

        # The adjustment above can ask for more normal rows than exist (both
        # classes short); clamp so that sampling without replacement cannot fail.
        if len(normal_data) < normal_count:
            print(f"Warning: Only {len(normal_data) + malicious_count} usable rows available, "
                  f"fewer than the requested {rows_to_select}. Using all {len(normal_data)} normal samples.")
            normal_count = len(normal_data)

        # Sample data from each group
        sampled_normal = normal_data.sample(n=normal_count, random_state=random_state)
        sampled_malicious = malicious_data.sample(n=malicious_count, random_state=random_state)

        # Combine and shuffle the sampled data
        filtered_data = (
            pd.concat([sampled_normal, sampled_malicious])
            .sample(frac=1, random_state=random_state)
            .reset_index(drop=True)
        )

        # Split the filtered data into training and testing sets
        train_data, test_data = train_test_split(filtered_data, test_size=test_size, random_state=random_state)

        # Count the distribution of 'x=0' and 'x=1' in each set
        train_count_0 = (train_data['x'] == 0).sum()
        train_count_1 = (train_data['x'] == 1).sum()
        test_count_0 = (test_data['x'] == 0).sum()
        test_count_1 = (test_data['x'] == 1).sum()

        print(f"Training set: x=0 count: {train_count_0}, x=1 count: {train_count_1}")
        print(f"Testing set: x=0 count: {test_count_0}, x=1 count: {test_count_1}")

        # Save the training and testing data to their respective output files
        self.save_data(train_data, output_train_file)
        self.save_data(test_data, output_test_file)
        print(f"Training data saved to '{output_train_file}', Testing data saved to '{output_test_file}'")


In [2]:
# Usage example: one processor instance that reads from sample_data
# and writes its results to sample_data/output.
processor = DataProcessor(
    input_dir='sample_data',
    output_dir='sample_data/output',
)

In [3]:
# Step 3: merge the data file with its label file.
# Dataset name used to build the file names below; 'Mirai' is the other value
# that was used here. NOTE: the variable is spelled `targrt` (sic) and is read
# again by the following cell, so the spelling is kept.
targrt = 'Fuzzing'
processor.combine_data_with_labels(
    data_file=f'output/{targrt}_pcap_context.csv',
    label_file=f'{targrt}_labels.csv',
    output_file=f'{targrt}_data.csv',
)

Data saved to sample_data/output/Fuzzing_data.csv
Data and labels combined and saved as 'Fuzzing_data.csv'


In [4]:
# Step 4: select a subset of rows and save it as a train/test pair.
# The input path is given in full because filter_and_save_data reads it as-is;
# the two output names are written into the processor's output directory.
processor.filter_and_save_data(
    f'sample_data/output/{targrt}_data.csv',
    f'{targrt}_data1.csv',
    f'{targrt}_data2.csv',
    rows_to_select=30000,
    target_ratio=0.5,
    test_size=0.3,
)

Training set: x=0 count: 10471, x=1 count: 10529
Testing set: x=0 count: 4529, x=1 count: 4471
Data saved to sample_data/output/Fuzzing_data1.csv
Data saved to sample_data/output/Fuzzing_data2.csv
Training data saved to 'Fuzzing_data1.csv', Testing data saved to 'Fuzzing_data2.csv'
