In [None]:
import time
import os
import tracemalloc
import subprocess
import random
import re
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from huggingface_hub import login
login(token="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")

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b-it")
model = AutoModelForCausalLM.from_pretrained("google/gemma-2-9b-it",torch_dtype=torch.bfloat16).to(device)
torch.backends.cudnn.benchmark = True

In [None]:
def generate_deterministic_variants(filter_code):
    print("Generating deterministic code variants...")
    variants = []
    


    # Prompt per riduzione degli attributi in Falcon
    prompt_attr_reduction = f"""
    <start_of_turn>user
    The following definition refers to data minimization: 
    'Data minimization is a principle restricting data collection to what is necessary in relation to the purposes for which they are processed.'
    Apply data minimization principles to the following JavaScript code according to this criteria:
    Keep only essential API calls for minimal, efficient processing
    If compliant with data minimization, add '// No changes needed.'
    Don't write anything else but the code.
    code:
    {filter_code}<end_of_turn>
    <start_of_turn>model
    """

    # Prompt per riduzione delle funzioni API in Falcon
    prompt_api_reduction = f"""
    <start_of_turn>user
    The following definition refers to data minimization: 
    'Data minimization is a principle restricting data collection to what is necessary in relation to the purposes for which they are processed.'
    Apply data minimization principles to the following JavaScript code according to this criteria:
    Keep only essential API calls for minimal, efficient processing
    If compliant with data minimization, add '// No changes needed.'
    Don't write anything else but the code.
    code:
    {filter_code}<end_of_turn>
    <start_of_turn>model
    """

    # Prompt per riduzione del livello di anonimizzazione degli attributi in Falcon
    prompt_anon_reduction = f"""
    <start_of_turn>user
    The following definition refers to data minimization: 
    'Data minimization is a principle restricting data collection to what is necessary in relation to the purposes for which they are processed.'
    Apply data minimization principles to the following JavaScript code according to this criteria:
    Reduce anonymization to retain necessary utility only.
    If compliant with data minimization, add '// No changes needed.'
    Don't write anything else but the code.
    code:
    {filter_code}<end_of_turn>
    <start_of_turn>model
    """
    prompts = [prompt_attr_reduction, prompt_api_reduction, prompt_anon_reduction]
    
    seed_value = 42
    torch.manual_seed(seed_value)
    random.seed(seed_value)
    
    for i, prompt in enumerate(prompts):
        #print(f"Generating variant {i+1} based on prompt {i+1}...")
        
        input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
        
        generated_ids = model.generate(
            input_ids, 
            max_length=1000,
            do_sample=False,
            top_k=5,         
            top_p=0.8
        )
        
        prompt_ids = tokenizer.encode(prompt, add_special_tokens=False)
        generated_ids_no_prompt = generated_ids[0][len(prompt_ids):]
        cleaned_response = tokenizer.decode(generated_ids_no_prompt, skip_special_tokens=True).strip()
        print(cleaned_response)
        
        variants.append(cleaned_response)

    return variants


In [None]:
def generate_non_deterministic_variants(filter_code):
    print("Generating non deterministic code variants...")
    variants = []
    
    # Prompt per riduzione degli attributi in Falcon
    prompt_attr_reduction = f"""
    <start_of_turn>user
    The following definition refers to data minimization: 
    'Data minimization is a principle restricting data collection to what is necessary in relation to the purposes for which they are processed.'
    Apply data minimization principles to the following JavaScript code according to this criteria:
    Keep only essential API calls for minimal, efficient processing
    If compliant with data minimization, add '// No changes needed.'
    Don't write anything else but the code.
    code:
    {filter_code}<end_of_turn>
    <start_of_turn>model
    """

    # Prompt per riduzione delle funzioni API in Falcon
    prompt_api_reduction = f"""
    <start_of_turn>user
    The following definition refers to data minimization: 
    'Data minimization is a principle restricting data collection to what is necessary in relation to the purposes for which they are processed.'
    Apply data minimization principles to the following JavaScript code according to this criteria:
    Keep only essential API calls for minimal, efficient processing
    If compliant with data minimization, add '// No changes needed.'
    Don't write anything else but the code.
    code:
    {filter_code}<end_of_turn>
    <start_of_turn>model
    """

    # Prompt per riduzione del livello di anonimizzazione degli attributi in Falcon
    prompt_anon_reduction = f"""
    <start_of_turn>user
    The following definition refers to data minimization: 
    'Data minimization is a principle restricting data collection to what is necessary in relation to the purposes for which they are processed.'
    Apply data minimization principles to the following JavaScript code according to this criteria:
    Reduce anonymization to retain necessary utility only.
    If compliant with data minimization, add '// No changes needed.'
    Don't write anything else but the code.
    code:
    {filter_code}<end_of_turn>
    <start_of_turn>model
    """
    prompts = [prompt_attr_reduction, prompt_api_reduction, prompt_anon_reduction]
    
    seed_value = random.randint(0, 10000)
    torch.manual_seed(seed_value)
    random.seed(seed_value)
    
    for i, prompt in enumerate(prompts):
        #print(f"Generating variant {i+1} based on prompt {i+1}...")
        
        input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
        
        generated_ids = model.generate(
            input_ids, 
            max_length=1000,
            temperature=0.1,
            top_k=5,         
            top_p=0.8
        )
        
        prompt_ids = tokenizer.encode(prompt, add_special_tokens=False)
        generated_ids_no_prompt = generated_ids[0][len(prompt_ids):]
        cleaned_response = tokenizer.decode(generated_ids_no_prompt, skip_special_tokens=True).strip()
        print(cleaned_response)
        variants.append(cleaned_response)

    return variants


In [None]:
def extract_attributes(js_code):
    attributes = re.findall(r'var\s+(\w+)', js_code)
    return set(attributes)

def count_attributes(js_code):
    attributes=extract_attributes(js_code)
    attributes_count=len(attributes)
    function_count = len(re.findall(r':\s*function\b|:\s*\([^)]*\)\s*=>', js_code))
    all_properties = re.findall(r'\b\w+\s*:', js_code)
    property_count = len(all_properties) - function_count
    return attributes_count+property_count

def count_function_calls(js_code):
    function_call_pattern = r'\b\w+\.\w+\s*\.\w+\s*\(.*?\)\s*;?'
    function_calls = re.findall(function_call_pattern, js_code)
    return len(function_calls)


def analyze_filter_code(js_code):
    tracemalloc.start()
    start_time = time.time()
    
    try:
        process = subprocess.run(["node", "-e", js_code], capture_output=True, text=True)
        
        if process.returncode != 0:
            print(f"JavaScript error: {process.stderr}")
            return None,None
    except Exception as e:
        print(f"Error during code execution: {e}")
        return None, None
    
    end_time = time.time()
    current_memory, peak_memory = tracemalloc.get_traced_memory()
    tracemalloc.stop()

    execution_time = end_time - start_time
    peak_memory_kb = peak_memory / 1024
    
    return execution_time, peak_memory_kb

def generate_variants(filter_code, n_variants=3):
    variants = []


    # Example prompt modification
    text = f"""<start_of_turn>user
    The following definition refers to data minimization: 
    'Data minimization is a principle restricting data collection to what is necessary in relation to the purposes for which they are processed.'
    Apply data minimization principles to the following JavaScript code according to these criteria:
    Reduce anonymization to retain necessary utility only,Keep only essential API calls for minimal, efficient processing,Remove unnecessary data attributes, ensuring only essential ones are processed.
    If compliant with data minimization, add '// No changes needed.'
    Don't write anything else but the code.
    Code:
    {filter_code}<end_of_turn>
    <start_of_turn>model
    """



    for i in range(n_variants):
        print(f"Generating variant {i+1}...")
        
        seed_value = random.randint(0, 10000)
        torch.manual_seed(seed_value)
        random.seed(seed_value)
        

        
        input_ids = tokenizer(text, return_tensors="pt").input_ids.to(device)
        
        generated_ids = model.generate(
            input_ids, 
            max_length=1000, 
            temperature=0.1,
            top_k=5,         
            top_p=0.8,
            do_sample=True
        )
        prompt_ids = tokenizer.encode(text, add_special_tokens=False)
        generated_ids_no_prompt = generated_ids[0][len(prompt_ids):]
        cleaned_response = tokenizer.decode(generated_ids_no_prompt, skip_special_tokens=True).strip()
        print(cleaned_response)
        
        
        variants.append(cleaned_response)

    return variants

#Score 
def evaluate_code(filter_code):
    execution_time, memory_used = analyze_filter_code(filter_code)
    if execution_time is None:
        return None
    number_attributes = count_attributes(filter_code)
    api_calls = count_function_calls(filter_code)
        
    score = (
        execution_time * 0.1 + 
        memory_used * 0.1 + 
        number_attributes * 0.5 + 
        api_calls * 0.3 
    )
    
    print(f"Calculated score: {score}")
    return score

def save(iteration, mode, codes, name_code):
    if mode == 1:
        folder_path='Test/Gemma/Test_singolo_prompt/'+str(name_code)+'/Iterazione_'+str(iteration)
    elif mode == 2:
        folder_path='Test/Gemma/Test_3_prompt_deterministico/'+str(name_code)+'/Iterazione_'+str(iteration)
    else:
        folder_path='Test/Gemma/Test_3_prompt_non_deterministico/'+str(name_code)+'/Iterazione_'+str(iteration)

    os.makedirs(folder_path, exist_ok=True)
    for i, variant in enumerate(codes):
        file_path = os.path.join(folder_path, f"variant_{i+1}.js")
        with open(file_path, "w", encoding="utf-8") as file:
            file.write(variant)

def beam_search(initial_code, mode, name_code, beam_width=3, iterations=3):
    print("Starting beam search...")
    beam = [(initial_code, evaluate_code(initial_code))]
    
    for i in range(iterations):
        print(f"\n--- Iteration {i+1} ---")
        candidates = []
        
        for code, score in beam:
            if mode==1:
                print(f"Generating variants with generate_variants...")
                variants = generate_variants(code)
            elif mode==2:
                print(f"Generating variants with generate_deterministic_variants...")
                variants = generate_deterministic_variants(code)
            else:
                print(f"Generating variants with generate_non_deterministic_variants...")
                variants = generate_non_deterministic_variants(code)

            

            for variant in variants:
                variant_score = evaluate_code(variant)
                if variant_score is None:
                    candidates.append((code, evaluate_code(code)))
                else:
                    candidates.append((variant, variant_score))
        
        candidates.sort(key=lambda x: x[1] if x[1] is not None else float('+inf'))
        beam = candidates[:min(beam_width, len(candidates))]
        codes = [code for code,score in beam]
        save(i+1,mode, codes, name_code)

            
    print("\nBeam search completed.")
    return beam[0]


In [None]:
def calculate_error_rates(original_code, LLM_code, manual_code):
    original_attributes = extract_attributes(original_code)
    LLM_attributes = extract_attributes(LLM_code)
    manual_attributes = extract_attributes(manual_code)
    
    attributes_to_remove = original_attributes - manual_attributes
    total_possible_removals = len(attributes_to_remove)
    print(f'Il numero di attributi che possono essere rimossi è {total_possible_removals}\n')
    attributes_not_removed_by_LLM = [attr for attr in attributes_to_remove if attr in LLM_attributes]
    print(f'Il numero di attributi che il LLM non ha rimosso è {len(attributes_not_removed_by_LLM)}\n')

    #Primo error rate
    error_rate_1 = len(attributes_not_removed_by_LLM) / total_possible_removals if total_possible_removals > 0 else 0
    necessary_attributes = manual_attributes.intersection(original_attributes)
    print(f'Il numero di attributi necessari è {len(necessary_attributes)}\n')
    necessary_not_removed_by_LLM = [attr for attr in necessary_attributes if attr in LLM_attributes]
    print(f'Il numero di attributi necessari non rimossi dal LLM è {len(necessary_not_removed_by_LLM)}\n')

    #Secondo error rate
    total_necessary = len(necessary_attributes)
    error_rate_2 = 1-len(necessary_not_removed_by_LLM) / total_necessary if total_necessary > 0 else 0

    return error_rate_1, error_rate_2

In [13]:
folder_filter_codes_50 = 'filter_codes_50'
folder_filter_code_test='filter_code_test'
files = [f for f in os.listdir(folder_filter_codes_50) if os.path.isfile(os.path.join(folder_filter_codes_50, f))]
for file in files:
    with open(folder_filter_codes_50+'/'+file, 'r', encoding='utf-8') as f:
            content = f.read()
            print(file)
            code, score = beam_search(content,3,file) #Cambiare mode per ogni test

dm_01.js
Starting beam search...
Calculated score: 11.133434224128724

--- Iteration 1 ---
Generating variants with generate_non_deterministic_variants...
Generating non deterministic code variants...




const GoogleCalendar = {
  newEventAdded: {
      Where: "[some street address]", 
      Starts: "9:00 AM",
      Ends: "10:00 AM"
  },
  addDetailedEvent: {
      skip: () => console.log("Event skipped."),
      setDescription: (description) => console.log(`Description set: ${description}`),
      setAllDay: (isAllDay) => console.log(`All-day set: ${isAllDay}`),
      setStartTime: (startTime) => console.log(`Start time set: ${startTime}`),
      setEndTime: (endTime) => console.log(`End time set: ${endTime}`)
  }
};

if (GoogleCalendar.newEventAdded.Where.indexOf("[some street address]") < 0) {
  GoogleCalendar.addDetailedEvent.skip();
} else {
  GoogleCalendar.addDetailedEvent.setDescription("In the office from " 
      + GoogleCalendar.newEventAdded.Starts 
      + " to " + GoogleCalendar.newEventAdded.Ends);
  GoogleCalendar.addDetailedEvent.setAllDay("true");
  GoogleCalendar.addDetailedEvent.setStartTime(GoogleCalendar.newEventAdded.Starts);
  GoogleCalendar.addDetailedEvent.set

In [14]:
import os


def evaluate_all_filters(base_dir):
    best_variants = {}
    
    for filter_name in os.listdir(base_dir):
        filter_path = os.path.join(base_dir, filter_name)
        
        if os.path.isdir(filter_path):
            iter3_path = os.path.join(filter_path, 'Iterazione_3')
            
            if os.path.isdir(iter3_path):
                best_score = float('inf')
                best_variant = None
                
                for variant in ['variant_1.js', 'variant_2.js', 'variant_3.js']:
                    variant_path = os.path.join(iter3_path, variant)
                    
                    with open(variant_path, 'r', encoding='utf-8') as file:
                        code = file.read()
                    
                    score = evaluate_code(code)
                    
                    if score is not None and score < best_score:
                        best_score = score
                        best_variant = variant_path
                
                best_variants[filter_name] = (best_variant, best_score)
    
    return best_variants

def errors(original_code_path, best_code, test_code_path):
    with open(original_code_path, 'r',  encoding='utf-8') as original:
        original_code=original.read()
        with open(test_code_path,'r', encoding='utf-8') as test:
            test_code=test.read()
            with open(best_code,'r', encoding='utf-8') as LLM:
                LLM_code=LLM.read()
                error_rate_var_removed, error_rate_var_removed_necessary = calculate_error_rates(original_code,LLM_code,test_code)
    return error_rate_var_removed, error_rate_var_removed_necessary


base_directory = 'Test/Gemma/Test_3_prompt_non_deterministico/' #Cambiare per ogni test
text_file_path='Risultati/Gemma/error_rate_generate_non_deterministic_variants.txt' #Cambiare per ogni test
best_variants = evaluate_all_filters(base_directory)
with open(text_file_path,'a',encoding='utf-8') as error_file:
    error_file.write('filter_name,error_rate_1,error_rate_2\n')
    for filter_name, (best_variant, score) in best_variants.items():
        print(f"Best variant for {filter_name}: {best_variant} with score {score}")
        original_code_path='filter_codes_50/' + filter_name
        test_file_name = filter_name.replace('.js', '_test.js')
        test_code_path='filter_code_test/' + test_file_name
        error_rate_1, error_rate_2=errors(original_code_path,best_variant,test_code_path)
        error_file.write(f"{filter_name},{error_rate_1},{error_rate_2}\n")

Calculated score: 11.093076300621032
Calculated score: 11.03309063911438
Calculated score: 11.04285626411438
Calculated score: 10.982359886169434
Calculated score: 11.163464069366457
Calculated score: 10.927190518379211
Calculated score: 8.4305997133255
Calculated score: 8.13983132839203
Calculated score: 8.132018852233887
Calculated score: 7.352769804000855
Calculated score: 7.987687563896179
Calculated score: 7.942132592201233
Calculated score: 5.364931297302246
Calculated score: 5.00907199382782
Calculated score: 5.547794318199158
Calculated score: 9.456628155708312
Calculated score: 9.026940846443177
Calculated score: 9.103152465820312
Calculated score: 7.129672694206238
Calculated score: 6.842631602287293
Calculated score: 6.846684598922729
Calculated score: 13.632218909263612
Calculated score: 13.642516183853148
Calculated score: 13.631818795204163
Calculated score: 10.234360098838806
Calculated score: 9.732028794288635
Calculated score: 9.716491484642027
Calculated score: 8.9430