In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import ast

# Load data.csv
data = pd.read_csv('./feature_extraction/dataset_with_codeT5_features_array.csv')

# Convert the 'vulnerability_list' column from string to list
data['vulnerability_list'] = data['vulnerability_list'].apply(ast.literal_eval)

# Convert the list of vulnerabilities into a numpy array
vulnerability_array = np.array(data['vulnerability_list'].tolist())

# Known distribution of vulnerabilities
# target_dist = np.array([0.445, 0.057, 0.012, 0.012, 0.044, 0.084, 0.153, 0.040, 0.153])
# Calculate the true target distribution from the entire dataset
target_dist = vulnerability_array.mean(axis=0)
print("True target distribution from data:", target_dist)
print("Sum of true target distribution:", target_dist.sum())

# Total number of samples
N = len(data)
train_size = 0.8
test_size = 0.2
train_n = int(N * train_size)  # Number of training samples
test_n = N - train_n  # Number of testing samples

# Define a loss function: weighted squared error between current and target distributions
def compute_loss(train_dist, test_dist, target_dist):
    weights = 1 / (target_dist + 0.01)  # Give higher weight to rare vulnerabilities, avoid division by zero
    train_loss = np.sum(weights * (train_dist - target_dist) ** 2)
    test_loss = np.sum(weights * (test_dist - target_dist) ** 2)
    return train_loss + test_loss

# Try multiple random initializations and optimize
best_train_idx = None
best_test_idx = None
best_loss = float('inf')
max_attempts = 10  # Number of different initial splits to try

for attempt in range(max_attempts):
    # Perform initial random split
    train_idx, test_idx = train_test_split(np.arange(N), train_size=train_size, shuffle=True)
    train_data = vulnerability_array[train_idx]
    test_data = vulnerability_array[test_idx]
    train_dist = train_data.mean(axis=0)  # Proportion of each vulnerability in training set
    test_dist = test_data.mean(axis=0)  # Proportion of each vulnerability in testing set
    
    # Iterative optimization
    max_iterations = 500
    for iteration in range(max_iterations):
        current_loss = compute_loss(train_dist, test_dist, target_dist)
        
        # If the error is small enough and size constraints are met, stop early
        if (np.all(np.abs(train_dist - target_dist) < 0.02) and 
            np.all(np.abs(test_dist - target_dist) < 0.02) and 
            abs(len(train_idx) - train_n) <= 5 and 
            abs(len(test_idx) - test_n) <= 5):
            break
        
        # Randomly select a vulnerability to adjust (where error > 0.02)
        vuln_to_adjust = np.random.choice(np.where(np.abs(test_dist - target_dist) > 0.02)[0])
        
        if test_dist[vuln_to_adjust] > target_dist[vuln_to_adjust]:  # Too many in testing
            if len(test_idx) > test_n:  # Only move if test set is larger than desired
                candidates = test_idx[vulnerability_array[test_idx, vuln_to_adjust] == 1]
                if len(candidates) > 0:
                    move_idx = np.random.choice(candidates)
                    test_idx = np.setdiff1d(test_idx, move_idx)
                    train_idx = np.append(train_idx, move_idx)
        else:  # Too few in testing
            if len(train_idx) > train_n:  # Only move if train set is larger than desired
                candidates = train_idx[vulnerability_array[train_idx, vuln_to_adjust] == 1]
                if len(candidates) > 0:
                    move_idx = np.random.choice(candidates)
                    train_idx = np.setdiff1d(train_idx, move_idx)
                    test_idx = np.append(test_idx, move_idx)
        
        # Update distributions
        train_data = vulnerability_array[train_idx]
        test_data = vulnerability_array[test_idx]
        train_dist = train_data.mean(axis=0)
        test_dist = test_data.mean(axis=0)
    
    # Record the best result if size constraints are approximately met
    final_loss = compute_loss(train_dist, test_dist, target_dist)
    if (final_loss < best_loss and 
        abs(len(train_idx) - train_n) <= 5 and 
        abs(len(test_idx) - test_n) <= 5):
        best_loss = final_loss
        best_train_idx = train_idx.copy()
        best_test_idx = test_idx.copy()
        best_train_dist = train_dist.copy()
        best_test_dist = test_dist.copy()

# Ensure exact sizes by trimming or adding samples if necessary
if len(best_train_idx) > train_n:
    best_train_idx = best_train_idx[:train_n]
    best_test_idx = np.setdiff1d(np.arange(N), best_train_idx)
elif len(best_train_idx) < train_n:
    additional_idx = np.random.choice(best_test_idx, train_n - len(best_train_idx), replace=False)
    best_train_idx = np.append(best_train_idx, additional_idx)
    best_test_idx = np.setdiff1d(np.arange(N), best_train_idx)

# Final update of distributions
best_train_dist = vulnerability_array[best_train_idx].mean(axis=0)
best_test_dist = vulnerability_array[best_test_idx].mean(axis=0)

# Store the best split back into DataFrames
train_df = data.iloc[best_train_idx].reset_index(drop=True)
test_df = data.iloc[best_test_idx].reset_index(drop=True)

# Print results
print("Training vulnerability distribution:", best_train_dist)
print("Testing vulnerability distribution:", best_test_dist)
print("Target distribution:", target_dist)
print("Maximum error (Training):", np.max(np.abs(best_train_dist - target_dist)))
print("Maximum error (Testing):", np.max(np.abs(best_test_dist - target_dist)))
print("Training set size:", len(best_train_idx))
print("Testing set size:", len(best_test_idx))
print("Sum of training distribution:", best_train_dist.sum())
print("Sum of testing distribution:", best_test_dist.sum())

# Save results to CSV
train_df.to_csv('./codeT5/train_data.csv', index=False)
test_df.to_csv('./codeT5/test_data.csv', index=False)
print("Finish data split and save into train_data.csv及test_data.csv")

True target distribution from data: [0.53520864 0.06892697 0.01434426 0.01434426 0.05271982 0.1009687
 0.18461252 0.04806259 0.18442623]
Sum of true target distribution: 1.2036140089418776
Training vulnerability distribution: [0.53539823 0.06800186 0.01397299 0.01397299 0.05379599 0.10130415
 0.18327899 0.04913833 0.18304611]
Testing vulnerability distribution: [0.53445065 0.0726257  0.01582868 0.01582868 0.04841713 0.09962756
 0.18994413 0.04376164 0.18994413]
Target distribution: [0.53520864 0.06892697 0.01434426 0.01434426 0.05271982 0.1009687
 0.18461252 0.04806259 0.18442623]
Maximum error (Training): 0.0013801186558446144
Maximum error (Testing): 0.005517904570015553
Training set size: 4294
Testing set size: 1074
Sum of training distribution: 1.2019096413600376
Sum of testing distribution: 1.2104283054003724
Finish data split and save into train_data.csv及test_data.csv
