In [14]:
import os
import subprocess
import time
from unidecode import unidecode


# Module-level placeholder. Nothing in the code visible here reads this value:
# main() binds its own local list named composite_scores, and
# save_composite_scores() receives the scores through its parameter.
# NOTE(review): probably safe to delete - confirm no other cell relies on it.
composite_scores = 0
# Extracts the fixed-length chunks of a given sequence, depending on the n value.
def extract_chunks(sequence, n, sliding = True):
    """Split ``sequence`` into pieces of exactly ``n`` items.

    Args:
        sequence: Any sliceable object with a length (a string in this file).
        n: Chunk length; must be at least 1.
        sliding: When True, return every window of length ``n`` moving one
            position at a time (overlapping). When False, return consecutive
            non-overlapping windows; a trailing remainder shorter than ``n``
            is dropped so that every chunk has the same length.

    Returns:
        A list of slices of ``sequence``. The list is empty when ``sequence``
        is shorter than ``n``.

    Raises:
        ValueError: If ``n`` is smaller than 1. Without this check a zero or
            negative ``n`` silently yields empty or arbitrary slices in
            sliding mode.
    """
    if n < 1:
        raise ValueError(f"chunk length n must be a positive integer, got {n!r}")
    # Both modes walk the same start positions; only the stride differs.
    step = 1 if sliding else n
    return [sequence[start:start + n] for start in range(0, len(sequence) - n + 1, step)]

# Turns a training file (one sequence per line) into a file of chunks (one chunk per line)
def preprocess_training(training_file, output_file, n, sliding):
    """Chunk every non-blank line of ``training_file`` into ``output_file``.

    Each input line is stripped of surrounding whitespace; lines that are
    empty after stripping are ignored. The remaining lines are passed to
    ``extract_chunks`` with ``n`` and ``sliding``, and every resulting chunk
    is written on its own line. ``output_file`` is opened in write mode, so
    any existing content is replaced. Both files are opened in text mode
    without an explicit encoding.

    Args:
        training_file: Path of the input file to read.
        output_file: Path of the chunk file to (over)write.
        n: Chunk length forwarded to ``extract_chunks``.
        sliding: Chunking mode forwarded to ``extract_chunks``.
    """
    with open(training_file, 'r') as source, open(output_file, 'w') as sink:
        for raw_line in source:
            stripped = raw_line.strip()
            if stripped:
                sink.writelines(
                    piece + "\n" for piece in extract_chunks(stripped, n, sliding)
                )
    print(f"Preprocessed training data saved to {output_file}")

# Splits one test sequence into chunks
def process_test_sequence(seq, n, sliding):
    """Return the chunks of a single test sequence.

    This is a thin wrapper: it forwards ``seq``, ``n`` and ``sliding``
    unchanged to ``extract_chunks`` and returns that result. It does not
    read any file itself.
    """
    pieces = extract_chunks(seq, n, sliding)
    return pieces

# Run NSA on the chunks
def run_negsel_on_chunks(chunks, jar_command):
    """Pipe ``chunks`` to an external command and parse one score per output line.

    The chunks are sent to the child process on standard input, one per line.
    The child's stdout and stderr are both captured, echoed for diagnostics,
    and every non-blank stdout line is converted to a float.

    Args:
        chunks: Iterable of strings to score.
        jar_command: Argument list handed to ``subprocess.run`` (in this file,
            the ``java -jar negsel2.jar ...`` invocation).

    Returns:
        A list of floats, one per non-blank line the command printed to
        stdout. Empty when the command printed nothing. The return code is
        reported but not enforced.

    Raises:
        ValueError: If a non-blank stdout line is not a valid float.
    """
    print(chunks)

    # Build the stdin payload in memory. The child only ever reads stdin, so
    # no scratch file is needed (and none can be left behind on failure).
    payload = "".join(chunk + "\n" for chunk in chunks).encode("utf-8")

    # subprocess.run blocks until the child has exited, so no extra waiting is
    # required afterwards. stderr must be captured explicitly; otherwise
    # proc.stderr is None and cannot be decoded.
    proc = subprocess.run(
        jar_command,
        input=payload,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    stdout = proc.stdout.decode('utf-8', errors='replace')
    stderr = proc.stderr.decode('utf-8', errors='replace')

    print(f"→ negsel2 returncode: {proc.returncode}")
    print("=== STDOUT ===")
    print(stdout or "(no stdout)")
    print("=== STDERR ===")
    print(stderr or "(no stderr)")

    # Each line of output is assumed to be a score (or log2(1+x)).
    return [float(line.strip()) for line in stdout.splitlines() if line.strip()]

# Write the scores to an output file, one per line
def save_composite_scores(composite_scores, filename):
    """Write each score in ``composite_scores`` on its own line of ``filename``.

    The file is opened in write mode, so existing content is replaced. Scores
    are formatted with their default string representation.

    Args:
        composite_scores: Iterable of scores to store.
        filename: Path of the file to (over)write.
    """
    with open(filename, 'w') as sink:
        sink.writelines(f"{value}\n" for value in composite_scores)


In [16]:


def main():
    """Score every test sequence with negsel2.jar and save the per-sequence averages.

    For each detector parameter ``r`` in ``range(3, 4)``:

    1. Chunk the training file into non-overlapping pieces of length 10 and
       write them to the chunk file used as the "self" set.
    2. Build the ``java -jar negsel2.jar`` command line.
    3. Read the test file, chunk each non-blank line the same way, run the
       command on the chunks, and average the returned scores. A sequence
       with no chunks is recorded as 0; one with no returned scores as 0.0.
    4. Print all composite scores and write them to the results file.

    All input and output locations are hard-coded absolute paths.
    """
    for r in range(3, 4):
        chunk_len = 10              # Fixed length of chunks (and detectors)
        results_path = r"C:\Users\muqad\OneDrive\Desktop\Semester_2\Natural Computing\intrusion\NaivephishingURLStrings100Samples_result.txt"
        benign_train_path = r"C:\Users\muqad\OneDrive\Desktop\Semester_2\Anomalous_URL_MDP_Strings\MDP_Strings\StringSet1\BenignMDPURLs_0Drift.train"
        phishing_test_path = r"C:\Users\muqad\OneDrive\Desktop\Semester_2\Anomalous_URL_MDP_Strings\MDP_Strings\StringSet1\PhishingMDPURLs_0Drift.test"
        # NOTE(review): this path says "Semester 2" (with a space) while the
        # three paths above say "Semester_2" - confirm which directory exists.
        self_chunks_path = r"C:\Users\muqad\OneDrive\Desktop\Semester 2\Anomalous_URL_MDP_Strings\MDP_Strings\StringSet1\train.train"

        preprocess_training(benign_train_path, self_chunks_path, chunk_len, sliding = False)
        print("preprocessing done")

        # ----- Build the negsel2.jar command -----
        # java -jar negsel2.jar -self <self_chunks_path> -n <chunk_len> -r <r> -c -l
        negsel_cmd = ["java", "-jar", "negsel2.jar"]
        negsel_cmd += ["-self", self_chunks_path]
        negsel_cmd += ["-n", str(chunk_len)]
        negsel_cmd += ["-r", str(r)]
        negsel_cmd += ["-c", "-l"]
        print("Running jar command:")
        print(" ".join(negsel_cmd))

        # Keep only the non-blank lines of the test file, stripped.
        with open(phishing_test_path, 'r') as test_handle:
            stripped_lines = (raw.strip() for raw in test_handle)
            test_sequences = [text for text in stripped_lines if text]
        print(len(test_sequences))

        composite_scores = []  # One composite anomaly score per test sequence
        for position, sequence in enumerate(test_sequences):
            # Split the variable-length test sequence into fixed-length chunks.
            pieces = process_test_sequence(sequence, chunk_len, sliding = False)
            if pieces:
                # Run negsel2.jar on these chunks and aggregate by averaging.
                chunk_scores = run_negsel_on_chunks(pieces, negsel_cmd)
                print(chunk_scores)
                if chunk_scores:
                    mean_score = sum(chunk_scores) / len(chunk_scores)
                else:
                    print(f"Warning: no scores returned for sequence {position}, assigning 0")
                    mean_score = 0.0
                composite_scores.append(mean_score)
                print(f"Test sequence {position}: composite anomaly score = {mean_score}")
            else:
                # Sequence shorter than one chunk: record a zero score.
                composite_scores.append(0)
                print(f"Warning: No valid chunks found for sequence {position}, skipping...")

        print("\nComposite anomaly scores for test sequences:")
        for position, value in enumerate(composite_scores):
            print(f"Sequence {position}: {value}")

        save_composite_scores(composite_scores, results_path)

# Run the experiment only when this code executes as the top-level program,
# not when it is imported as a module.
if __name__ == '__main__':
    main()


FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\muqad\\OneDrive\\Desktop\\Semester_2\\Anomalous_URL_MDP_Strings\\MDP_Strings\\StringSet1\\BenignMDPURLs_0Drift.train'