In [1]:
import pandas as pd
import re
import json
import csv

from gpt_prompt import generate_correct_samples

In [2]:
def get_gpt_snippets(sample, verbose=False, **kwargs):
    """
    Prompt GPT and get its generated snippets based on the example from the sample row
    in the dataset.csv dataframe.

    Args:
        sample: A row-like object exposing ``aspects`` (abbreviations separated by '/'),
            ``style`` and ``code`` attributes.
        verbose: Forwarded to generate_correct_samples(); when true the raw response text
            is also printed.
        **kwargs: Extra keyword arguments forwarded unchanged to generate_correct_samples().

    Returns:
        A list of non-empty code snippet strings, with the newlines next to the ``--``
        delimiters removed.

    Raises:
        ValueError: If the response text has no ``<result>...</result>`` section.
    """
    # Parse the aspects as the list of abbreviations that generate_correct_samples() function expects
    aspects = sample.aspects.split('/')

    response = generate_correct_samples(prompt_rule=sample.style, prompt_code=sample.code,
                                        aspects_list=aspects, verbose=verbose, **kwargs)
    # Guard against a missing (None) message content so the checks below work on a string
    response_text = response.choices[0].message.content or ''

    if verbose:
        print(response_text)

    # Extract the generated snippets from GPT's response
    result_match = re.search(r'<result>(.*)</result>', response_text, re.DOTALL)
    if result_match is None:
        raise ValueError('GPT response does not contain a <result>...</result> section')
    result_section = result_match.group(1).strip()

    # Split on lines that consist solely of the `--` delimiter. Matching whole lines
    # (optionally ending in '\r') also consumes a trailing delimiter that has lost its
    # newline, so it no longer ends up glued to the last snippet.
    raw_snippets = re.split(r'^[ \t]*--[ \t\r]*$', result_section, flags=re.MULTILINE)

    # Drop empty chunks and the line breaks that surrounded each delimiter; indentation
    # inside a snippet is left untouched.
    code_snippets = [chunk.strip('\r\n') for chunk in raw_snippets if chunk.strip()]

    return code_snippets

In [3]:
# Generation settings, passed as the `model` and `temperature` keyword arguments
# when get_gpt_snippets() is called in the processing loop.
MODEL = 'gpt-4-turbo-2024-04-09'
TEMPERATURE = 1

In [4]:
# CSV file paths: the dataset is read from (and later written back to) the input
# file, while generated snippets are appended to the output file.
input_fname = 'dataset_correct.csv'
output_fname = 'result_gpt.csv'

def save_snippets(style, snippets, label, processed):
    """
    Append one fully-quoted CSV row per snippet to the file named by `output_fname`.

    Args:
        style: Style rule text, written as the first column of every row.
        snippets: Iterable of snippet strings; each one produces its own row.
        label: Label value written as the third column.
        processed: Processed flag written as the fourth column.

    The file is opened in append mode, so existing rows are kept and the file is
    created if it does not exist yet (even when `snippets` is empty).
    """
    rows = [[style, snippet, label, processed] for snippet in snippets]

    # newline='' leaves line-ending handling to the csv module. The explicit utf-8
    # encoding keeps non-ASCII characters in generated snippets from failing on
    # platforms whose default encoding is not UTF-8.
    with open(output_fname, 'a', newline='', encoding='utf-8') as csvfile:
        csv_writer = csv.writer(csvfile, quoting=csv.QUOTE_ALL)
        csv_writer.writerows(rows)

In [5]:
# Load the dataset of examples from the input CSV; the bare name on the last
# line makes the notebook display the loaded frame.
df = pd.read_csv(filepath_or_buffer=input_fname)
df

Unnamed: 0,style,code,aspects,label,processed
0,Aligned with opening delimiter.,"foo = long_function_name(var_one, var_two,\r\n...",n/fs,correct,False
1,Add 4 spaces (an extra level of indentation) t...,"def long_function_name(\r\n var_one, va...",n/fs,correct,False
2,Hanging indents should add a level.,"foo = long_function_name(\r\n var_one, var_...",n/fs,correct,False
3,No extra indentation (same level of indent).,if (this_is_one_thing and\r\n that_is_anoth...,n,correct,False
4,Add some extra indentation on the conditional ...,if (this_is_one_thing\r\n and that_is_a...,n,correct,False
5,The closing brace/bracket/parenthesis on multi...,"my_list = [\r\n 1, 2, 3,\r\n 4, 5, 6,\r\n]",n/cl,correct,False
6,The closing brace/bracket/parenthesis on multi...,result = some_function_that_takes_arguments(\r...,n/fs,correct,False


In [6]:
# Keep only the rows whose 'processed' flag is not set, i.e. the examples that
# still have to be handled; the original index labels are preserved.
unprocessed_df = df.loc[~df['processed']]
unprocessed_df

Unnamed: 0,style,code,aspects,label,processed
0,Aligned with opening delimiter.,"foo = long_function_name(var_one, var_two,\r\n...",n/fs,correct,False
1,Add 4 spaces (an extra level of indentation) t...,"def long_function_name(\r\n var_one, va...",n/fs,correct,False
2,Hanging indents should add a level.,"foo = long_function_name(\r\n var_one, var_...",n/fs,correct,False
3,No extra indentation (same level of indent).,if (this_is_one_thing and\r\n that_is_anoth...,n,correct,False
4,Add some extra indentation on the conditional ...,if (this_is_one_thing\r\n and that_is_a...,n,correct,False
5,The closing brace/bracket/parenthesis on multi...,"my_list = [\r\n 1, 2, 3,\r\n 4, 5, 6,\r\n]",n/cl,correct,False
6,The closing brace/bracket/parenthesis on multi...,result = some_function_that_takes_arguments(\r...,n/fs,correct,False


In [7]:
# Interactive review loop: generate snippets for every unprocessed example, show
# them, and ask whether the example can be marked as processed.
for i, sample in unprocessed_df.iterrows():
    print(f'Sample #{i}: {sample.style}', end='\n' * 2)

    code_snippets = get_gpt_snippets(sample, verbose=False, model=MODEL, temperature=TEMPERATURE)

    # Show the generated snippets so the user can judge whether they are satisfactory
    for snippet in code_snippets:
        print(snippet.strip())
        print('-' * 20)  # for better readability

    print("Mark the sample as processed? y/n: ")
    # Strip surrounding whitespace before comparing, so a reply such as 'y ' is not
    # silently treated as "no"; 'yes' is accepted as well.
    processed_input = input().strip().lower()
    processed_answer = processed_input in ('y', 'yes')

    # `i` is the row's index label in `df` (the filtered frame keeps the original
    # labels), so the flag is set on the full dataset. The snippets are saved
    # regardless of the answer, together with the flag.
    df.loc[i, 'processed'] = processed_answer
    save_snippets(sample.style, code_snippets, sample.label, processed_answer)

Sample #0: Aligned with opening delimiter.

process_data = analyze_metrics(input_value, baseline_data,
                               growth_rate, yearly_output)
--------------------
draw = render_chart(x_axis_data, y_label,
                    z_values, color_map)
--------------------
compute = calculate_results(parameter_a, benchmark,
                            delta_points, extra_metrics)
--------------------
send = transmit_information(source_data, target_node,
                            enabled_flags, connection_params)
--------------------
create = build_object(instance_id, factory_settings,
                      model_type, configuration)
--------------------
display = show_graphics(frame_buffer, render_target,
                       depth_blog, opts_quality)
--------------------
update = modify_entry(current_status, revision_number,
                      field_to_change, update_payload)
--------------------
fetch = retrieve_record(search_key, directory_path,
                 

 n


Sample #1: Add 4 spaces (an extra level of indentation) to distinguish arguments from the rest.

def calculate_area(
        length, width, height,
        depth):
--------------------
def process_data(
        input_value, output_value, ratio,
        factor):
--------------------
def create_user(
        username, email, password,
        phone_number):
--------------------
def draw_shape(
        corner, side_length, color,
        texture, opacity):
--------------------
def send_message(
        sender, receiver, content,
        date_time, priority):
--------------------
def connect_devices(
        primary_device, secondary_device, method,
        protocol, bandwidth):
--------------------
def configure_system(
        user_config, system_config, environment,
        access_level):
--------------------
def analyze_performance(
        metric_one, metric_two, baseline,
        target, variance):
--------------------
def start_engine(
        ignition_key, fuel_level, oil_pressure,

 y


Sample #2: Hanging indents should add a level.

data = process_data(
    input_vals, configuration,
    parameters, options_set)
--------------------
record = save_entry(
    record_id, data_object,
    metadata, timestamp)
--------------------
result = evaluate_performance(
    matrix, vector,
    coefs, bias)
--------------------
info = extract_information(
    source, destination,
    route_map, time_frame)
--------------------
person = update_profile(
    user_id, account_data,
    settings, history_log)
--------------------
calc = calculate_result(
    operand_one, operand_two,
    aux_data, extra_params)
--------------------
draw = render_image(
    canvas, shape_info,
    color_map, depth_field)
--------------------
content = parse_document(
    header_data, body_content,
    footer_info, doc_style)
--------------------
setup = configure_system(
    main_config, backup_plan,
    user_prefs, opt_mode)
--------------------
merge = combine_records(
    primary_rec, secondary_rec,
 

 y


Sample #3: No extra indentation (same level of indent).

if (item_valid and
    list_complete):
--------------------
if (user_active and
    profile_filled):
--------------------
if (game_started and
    level_reached):
--------------------
if (files_exist and
    permission_granted):
--------------------
if (connection_established and
    signal_strong):
--------------------
if (sun_shining and
    birds_chirping):
--------------------
if (data_loaded and
    no_errors_found):
--------------------
if (engine_running and
    oil_checked):
--------------------
if (doors_closed and
    lights_on):
--------------------
if (fridge_full and
    milk_cold):
--------------------
Mark the sample as processed? y/n: 


 y


Sample #4: Add some extra indentation on the conditional continuation line.

if (item_is_available
        and cart_is_not_full):
--------------------
if (user_is_authenticated
        and access_is_granted):
--------------------
if (file_exists
        and not file_is_corrupted):
--------------------
if (connection_is_secure
        and received_data_is_valid):
--------------------
if (order_is_processed
        and payment_is_complete):
--------------------
if (day_is_sunny
        and park_is_open):
--------------------
if (license_is_active
        and user_is_compliant):
--------------------
if (memory_is_sufficient
        and disk_space_is_enough):
--------------------
if (temperature_is_optimal
        and humidity_is_within_range):
--------------------
if (app_is_installed
        and service_is_running):
--
--------------------
Mark the sample as processed? y/n: 


 y


Sample #5: The closing brace/bracket/parenthesis on multiline constructs must be lined up under the first character of the line that starts the multiline construct (collection case)

ages_tuple = (
    45, 34, 23,
    32, 52, 67,
)
--------------------
color_dict = {
    'red': 1, 'blue': 2,
    'green': 3, 'yellow': 4,
}
--------------------
coords_list = [
    (1, 2), (3, 4),
    (5, 6), (7, 8),
]
--------------------
inventory_dict = {
    'apples': 120, 'oranges': 80,
    'bananas': 150, 'grapes': 90,
}
--------------------
scores_tuple = (
    76, 89, 92,
    85, 72, 88,
)
--------------------
user_ids = [
    101, 102, 103,
    104, 105, 106,
]
--------------------
price_list = [
    2.99, 5.49, 1.89,
    9.99, 0.99, 7.49,
]
--------------------
status_codes = {
    200: 'Success', 404: 'Not Found',
    500: 'Server Error', 302: 'Redirect',
}
--------------------
shapes_tuple = (
    'circle', 'square', 'triangle',
    'rectangle', 'pentagon',
)
--------------------
months = [
  

 y


Sample #6: The closing brace/bracket/parenthesis on multiline constructs must be lined up under the first character of the line that starts the multiline construct (function call case)

output = calculate_sum(
    'one', 'two', 'three',
    'four', 'five', 'six',
)
--------------------
data = fetch_records(
    'id', 'info', 'type',
    'date', 'status', 'source',
)
--------------------
response = handle_request(
    'user', 'email', 'password',
    'session', 'auth', 'profile',
)
--------------------
config = set_configuration(
    'mode', 'state', 'level',
    'type', 'zone', 'setting',
)
--------------------
values = compute_totals(
    'item', 'cost', 'count',
    'tax', 'total', 'net',
)
--------------------
details = get_information(
    'part', 'batch', 'code',
    'size', 'label', 'rule',
)
--------------------
result = process_input(
    'char', 'code', 'symbol',
    'format', 'style', 'flow',
)
--------------------
readings = collect_data(
    'temp', 'press', 'humid',
    's

 y


In [9]:
# Persist the updated 'processed' flags by overwriting the input dataset file,
# quoting every field and leaving out the dataframe index.
df.to_csv(
    input_fname,
    index=False,
    quoting=csv.QUOTE_ALL,
)