In [6]:
import os
import re

def preprocess_c_program(c_program):
    preprocessed_program = re.sub(r'\/\*[\s\S]*?\*\/|\/\/.*', '', c_program)  
    preprocessed_program = re.sub(r'#.*', '', preprocessed_program)          
    preprocessed_program = preprocessed_program.strip()                       
    return preprocessed_program

def preprocess_c_programs_in_directory(directory_path):
    total_files = 0
    for filename in os.listdir(directory_path):
        file_path = os.path.join(directory_path, filename)
        if filename.endswith(".c"):
            total_files += 1
            with open(file_path, 'r', encoding='utf-8', errors='replace') as file:
                c_program = file.read()

                preprocessed_program = preprocess_c_program(c_program)

            with open(file_path, 'w') as file:
                file.write(preprocessed_program)

            print(f"Preprocessed {file_path}")

    return total_files

# Dataset directory
dataset_directory = "/home/shaurya/BTP/1/Dataset/Raw/dataset 1/Dataset_raw"

# Preprocess code snippets in the Vulnerable directory
vulnerable_directory = os.path.join(dataset_directory, "Vulnerable")
total_vulnerable_files = preprocess_c_programs_in_directory(vulnerable_directory)

# Preprocess code snippets in the Non-vulnerable directory
non_vulnerable_directory = os.path.join(dataset_directory, "Non_vulnerable")
total_non_vulnerable_files = preprocess_c_programs_in_directory(non_vulnerable_directory)

# Print total files preprocessed for each directory
print(f"Total Vulnerable files preprocessed: {total_vulnerable_files}")
print(f"Total Non-vulnerable files preprocessed: {total_non_vulnerable_files}")


Preprocessed /home/shaurya/BTP/1/Dataset/Raw/dataset 1/Dataset_raw/Vulnerable/PavanNikhilesh_dpdk-drivers2Fnet2Fena2Fena_ethdev.c
Preprocessed /home/shaurya/BTP/1/Dataset/Raw/dataset 1/Dataset_raw/Vulnerable/juliet_testsuite_CWE121_S01-CWE121_Stack_Based_Buffer_Overflow__CWE135_53d.c
Preprocessed /home/shaurya/BTP/1/Dataset/Raw/dataset 1/Dataset_raw/Vulnerable/ssocopsacc_openldap-serveradm-libraries2Fliblber2Fbprint.c
Preprocessed /home/shaurya/BTP/1/Dataset/Raw/dataset 1/Dataset_raw/Vulnerable/iprouteold-ip2Fipxfrm.c
Preprocessed /home/shaurya/BTP/1/Dataset/Raw/dataset 1/Dataset_raw/Vulnerable/binutils-bfd2Fpeigen.c
Preprocessed /home/shaurya/BTP/1/Dataset/Raw/dataset 1/Dataset_raw/Vulnerable/iproutenewver-misc2Fnstat.c
Preprocessed /home/shaurya/BTP/1/Dataset/Raw/dataset 1/Dataset_raw/Vulnerable/rdesktop-integration-integration-printer.c
Preprocessed /home/shaurya/BTP/1/Dataset/Raw/dataset 1/Dataset_raw/Vulnerable/PavanNikhilesh_dpdk-lib2Flibrte_power2Frte_power_empty_poll.c
Preproce

In [8]:
import os
import random
import shutil

# Set paths for source and destination directories
source_directory = dataset_directory + '/Vulnerable'  # Change this to the path of your dataset
train_directory = dataset_directory + '/Vulnerable/TRAIN'
test_directory = dataset_directory + '/Vulnerable/TEST'

# Check if source directory exists
if not os.path.exists(source_directory):
    print("Source directory does not exist:", source_directory)
else:
    # Create destination directories if they don't exist
    os.makedirs(train_directory, exist_ok=True)
    os.makedirs(test_directory, exist_ok=True)
    print("Destination directories created")

    # Get a list of all C files in the source directory
    c_files = [file for file in os.listdir(source_directory) if file.endswith('.c')]
    print(len(c_files))

    # Shuffle the list of C files
    random.shuffle(c_files)

    # Calculate the index to split the files
    split_index = int(0.7 * len(c_files))

    # Divide files into training and testing sets
    train_files = c_files[:split_index]
    test_files = c_files[split_index:]

    # Move training files to the train directory
    for file in train_files:
        print("ah")
        source_path = os.path.join(source_directory, file)
        destination_path = os.path.join(train_directory, file)
        shutil.move(source_path, destination_path)
        print("Moved", file, "to", train_directory)

    # # Move testing files to the test directory
    for file in test_files:
        source_path = os.path.join(source_directory, file)
        destination_path = os.path.join(test_directory, file)
        shutil.move(source_path, destination_path)
        print("Moved", file, "to", test_directory)

    print("Dataset divided into train and test sets.")

Destination directories created
2306
ah
Moved rodinia_heartwall_orig-AVI2Favilib.c to /home/shaurya/BTP/1/Dataset/Raw/dataset 1/Dataset_raw/Vulnerable/TRAIN
ah
Moved juliet_testsuite_CWE121_S01-CWE121_Stack_Based_Buffer_Overflow__CWE135_65b.c to /home/shaurya/BTP/1/Dataset/Raw/dataset 1/Dataset_raw/Vulnerable/TRAIN
ah
Moved vimxenial-src2Fnetbeans.c to /home/shaurya/BTP/1/Dataset/Raw/dataset 1/Dataset_raw/Vulnerable/TRAIN
ah
Moved e2fsprogtrusty-lib2Fe2p2Fuuid.c to /home/shaurya/BTP/1/Dataset/Raw/dataset 1/Dataset_raw/Vulnerable/TRAIN
ah
Moved _CVE-2010-1624.c to /home/shaurya/BTP/1/Dataset/Raw/dataset 1/Dataset_raw/Vulnerable/TRAIN
ah
Moved libav-libavformat2Frtpdec_amr.c to /home/shaurya/BTP/1/Dataset/Raw/dataset 1/Dataset_raw/Vulnerable/TRAIN
ah
Moved ssocopsacc_radiusclient-src2Fradius.c to /home/shaurya/BTP/1/Dataset/Raw/dataset 1/Dataset_raw/Vulnerable/TRAIN
ah
Moved cve-2007-5629-3.c to /home/shaurya/BTP/1/Dataset/Raw/dataset 1/Dataset_raw/Vulnerable/TRAIN
ah
Moved e2fsprogdebst