In [22]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import ast

# Read the feature table; switch the active line to use another encoder's features
data = pd.read_csv('./feature_extraction/dataset_with_codebert_features_array.csv')
#data = pd.read_csv('./feature_extraction/dataset_with_longformer_features_array.csv')
#data = pd.read_csv('./feature_extraction/dataset_with_codeT5_features_array.csv')

# 'vulnerability_list' is stored as text in the CSV; parse each cell back into a list
data['vulnerability_list'] = data['vulnerability_list'].map(ast.literal_eval)

# Stack the per-row label lists into one array (one row per sample)
vulnerability_array = np.array(list(data['vulnerability_list']))

# Target proportion for each of the nine vulnerability labels
target_dist = np.array([
    0.445, 0.057, 0.012,
    0.012, 0.044, 0.084,
    0.153, 0.040, 0.153,
])

# Split fractions and the sample counts they imply
train_size = 0.8
test_size = 0.2
N = data.shape[0]  # Total number of samples
train_n = int(train_size * N)  # Number of training samples
test_n = N - train_n  # Number of testing samples

def compute_loss(train_dist, test_dist, target_dist):
    """Return the combined weighted squared error of both splits against the target.

    Each component is weighted by ``1 / (target_dist + 0.01)``, so rare
    vulnerabilities (small target proportion) contribute more; the 0.01
    offset keeps the weight finite when a target proportion is zero.

    Args:
        train_dist: Per-vulnerability proportions in the training split.
        test_dist: Per-vulnerability proportions in the testing split.
        target_dist: Desired per-vulnerability proportions.

    Returns:
        Sum of the training and testing weighted squared errors.
    """
    inverse_freq = 1 / (target_dist + 0.01)

    def weighted_sq_error(dist):
        # Weighted squared distance of one split from the target
        return np.sum(inverse_freq * (dist - target_dist) ** 2)

    return weighted_sq_error(train_dist) + weighted_sq_error(test_dist)

# Search for the train/test split whose per-vulnerability proportions are
# closest to target_dist: several random restarts, each refined by moving
# single samples between the two splits.
# NOTE(review): neither train_test_split nor np.random is seeded here, so
# every run produces a different split.
# NOTE(review): samples are moved one at a time in one direction (no swap),
# so the train/test size ratio drifts away from train_size as moves
# accumulate -- confirm this is acceptable.
best_train_idx = None
best_test_idx = None
best_train_dist = None
best_test_dist = None
best_loss = float('inf')
max_attempts = 10  # Number of different initial splits to try
max_iterations = 500  # Upper bound on single-sample moves per attempt
tolerance = 0.02  # Largest acceptable per-vulnerability deviation from the target

for attempt in range(max_attempts):
    # Perform initial random split
    train_idx, test_idx = train_test_split(np.arange(N), train_size=train_size, shuffle=True)
    train_data = vulnerability_array[train_idx]
    test_data = vulnerability_array[test_idx]
    train_dist = train_data.mean(axis=0)  # Proportion of each vulnerability in training set
    test_dist = test_data.mean(axis=0)  # Proportion of each vulnerability in testing set

    # Iterative optimization
    for iteration in range(max_iterations):
        train_err = train_dist - target_dist
        test_err = test_dist - target_dist
        off_test = np.flatnonzero(np.abs(test_err) >= tolerance)
        off_train = np.flatnonzero(np.abs(train_err) >= tolerance)

        # If every proportion in both splits is within tolerance, stop early
        if off_test.size == 0 and off_train.size == 0:
            break

        # Pick a vulnerability that is out of tolerance. Prefer the testing
        # split; fall back to the training split so that np.random.choice is
        # never called on an empty array when only training is off target.
        if off_test.size > 0:
            vuln_to_adjust = np.random.choice(off_test)
            # Too many in testing -> push a positive sample into training
            move_to_train = test_err[vuln_to_adjust] > 0
        else:
            vuln_to_adjust = np.random.choice(off_train)
            # Too few in training -> pull a positive sample from testing
            move_to_train = train_err[vuln_to_adjust] < 0

        if move_to_train:
            source_idx, dest_idx = test_idx, train_idx
        else:
            source_idx, dest_idx = train_idx, test_idx

        # Samples in the source split that carry the chosen vulnerability
        candidates = source_idx[vulnerability_array[source_idx, vuln_to_adjust] == 1]
        # Skip the move if nothing qualifies or if it would empty the source
        # split (an empty split has NaN proportions).
        if len(candidates) == 0 or len(source_idx) <= 1:
            continue

        move_idx = np.random.choice(candidates)
        source_idx = np.setdiff1d(source_idx, move_idx)
        dest_idx = np.append(dest_idx, move_idx)
        if move_to_train:
            test_idx, train_idx = source_idx, dest_idx
        else:
            train_idx, test_idx = source_idx, dest_idx

        # Update distributions
        train_data = vulnerability_array[train_idx]
        test_data = vulnerability_array[test_idx]
        train_dist = train_data.mean(axis=0)
        test_dist = test_data.mean(axis=0)

    # Record the best result across attempts (lowest final loss wins)
    final_loss = compute_loss(train_dist, test_dist, target_dist)
    if final_loss < best_loss:
        best_loss = final_loss
        best_train_idx = train_idx.copy()
        best_test_idx = test_idx.copy()
        best_train_dist = train_dist.copy()
        best_test_dist = test_dist.copy()

# Materialise the winning split as two freshly indexed DataFrames
train_df = data.iloc[best_train_idx].reset_index(drop=True)
test_df = data.iloc[best_test_idx].reset_index(drop=True)

# Report how closely each split follows the target distribution
train_max_err = np.abs(best_train_dist - target_dist).max()
test_max_err = np.abs(best_test_dist - target_dist).max()
for label, value in (
    ("Training vulnerability distribution:", best_train_dist),
    ("Testing vulnerability distribution:", best_test_dist),
    ("Target distribution:", target_dist),
    ("Maximum error (Training):", train_max_err),
    ("Maximum error (Testing):", test_max_err),
):
    print(label, value)

# Write both splits to disk; switch the active pair to match the encoder loaded above
train_df.to_csv('./codebert/train_data.csv', index=False)
test_df.to_csv('./codebert/test_data.csv', index=False)
#train_df.to_csv('./longformer/train_data.csv', index=False)
#test_df.to_csv('./longformer/test_data.csv', index=False)
#train_df.to_csv('./codeT5/train_data.csv', index=False)
#test_df.to_csv('./codeT5/test_data.csv', index=False)
print("Finish data split and save into train_data.csv及test_data.csv")

Training vulnerability distribution: [0.53796412 0.06800167 0.0139758  0.0139758  0.05152274 0.10075094
 0.18606592 0.04672507 0.18585732]
Testing vulnerability distribution: [0.51219512 0.07665505 0.0174216  0.0174216  0.06271777 0.10278746
 0.17247387 0.05923345 0.17247387]
Target distribution: [0.445 0.057 0.012 0.012 0.044 0.084 0.153 0.04  0.153]
Maximum error (Training): 0.09296412181894037
Maximum error (Testing): 0.06719512195121952
Finish data split and save into train_data.csv及test_data.csv
