In [1]:
import pandas as pd
import re
import json
import csv

from gpt_prompt import GPTClient

In [14]:
def extract_gpt_snippets(response_text):
    """
    Pull the individual code snippets out of a GPT response.

    The answer is expected inside a <result>...</result> section, with
    snippets separated by the literal delimiter "--" followed by a newline.
    The match is greedy, so it spans from the first opening tag to the last
    closing tag in the text.

    Args:
        response_text: Raw text of the model's reply.

    Returns:
        List of snippet strings, in the order they appear in the result section.

    Raises:
        ValueError: If the text contains no <result>...</result> section.
    """
    match = re.search(r'<result>(.*)</result>', response_text, re.DOTALL)
    if match is None:
        # Without this guard a malformed reply surfaces as an opaque
        # "'NoneType' object has no attribute 'group'" error.
        raise ValueError("GPT response does not contain a <result>...</result> section")

    # Surrounding whitespace is dropped before splitting on the delimiter
    return match.group(1).strip().split('--\n')

def _parse_snippet_numbers(spec):
    """Turn a spec such as "1,2,4-6,9" into [1, 2, 4, 5, 6, 9]; raise ValueError if it is malformed."""
    numbers = []
    for part in spec.split(','):
        if '-' in part:
            start, end = map(int, part.split('-'))
            if start > end:
                # A reversed range would expand to nothing and be silently ignored
                raise ValueError(f"reversed range: {part.strip()}")
            numbers.extend(range(start, end + 1))
        else:
            numbers.append(int(part))
    return numbers


def prompt_user_feedback(snippets):
    """
    Show the generated snippets and ask the user, via standard input, what to do with them.

    The user is first asked whether the snippets are satisfactory; only "y" or
    "yes" counts as a yes. Otherwise a second prompt asks which snippets, if
    any, should be regenerated.

    Args:
        snippets: List of snippet strings; they are printed numbered from 1.

    Returns:
        A (satisfied, numbers) tuple:
          * (True, [])  - the user answered "y" or "yes" to the first prompt;
          * (False, []) - the user was not satisfied and answered "", "n" or
            "no" to the second prompt;
          * (False, [1, 2, 4, 5, 6, 9]) - the user typed a list such as
            "1,2,4-6,9"; the numbers are 1-based, unique and sorted.

    Malformed input (non-numeric parts, a reversed range such as "6-4") or
    numbers outside 1..len(snippets) print an error message and restart the
    whole dialogue.
    """
    valid_indices = range(1, len(snippets) + 1)

    # Loop instead of recursing so repeated bad input cannot grow the call stack
    while True:
        # Step 1: Print each snippet with a number and separator
        for i, snippet in enumerate(snippets, start=1):
            print(f"{i}.\n{snippet.strip()}")
            print('-' * 20)  # Separator for better readability

        # Step 2: Ask whether the user is satisfied
        user_input = input("Are you satisfied with the generated snippets? (y/yes/n/no): ").strip().lower()
        if user_input in ['y', 'yes']:
            return True, []

        # Step 3: Ask which snippets (if any) should be regenerated
        user_input = input("Do you want to regenerate any of the snippets? If not, type n/no or hit Enter, " + \
                           "otherwise specify a comma-separated list of values (e.g. 1,2,4-6,9)").strip().lower()
        if user_input in ['', 'n', 'no']:
            return False, []

        try:
            numbers = _parse_snippet_numbers(user_input)
        except ValueError:
            print("Invalid input format. Please provide either 'y', 'yes', or a comma-separated list of numbers.")
            continue  # Retry prompting if input is invalid

        # Step 4: Validate numbers as 1-based indices of the snippets list
        invalid_numbers = [num for num in numbers if num not in valid_indices]
        if invalid_numbers:
            print(f"Invalid snippet numbers: {invalid_numbers}. Please provide valid indices.")
            continue  # Retry prompting if input is invalid

        return False, sorted(set(numbers))  # Return unique sorted list of numbers

In [3]:
# Keyword arguments unpacked into every GPTClient request made in this notebook;
# `verbose` is also consulted locally to decide whether to echo the raw response
params = dict(
    model='gpt-3.5-turbo-16k-0613',
    temperature=1,
    verbose=True,
)

In [4]:
# Dataset of samples: read near the top of the notebook and rewritten at the
# end to persist the `processed` flags
input_fname = 'dataset_correct.csv'
# CSV file that save_snippets() appends generated snippets to
output_fname = 'result_gpt.csv'

def save_snippets(style, snippets, label, processed, fname=None):
    """
    Append generated snippets to a CSV file, one row per snippet.

    Each row is written as [style, snippet, label, processed] with every
    field quoted.

    Args:
        style: Style-rule text stored in the first column of every row.
        snippets: Iterable of snippet strings; each produces one row.
        label: Label stored in the third column of every row.
        processed: Flag stored in the fourth column of every row.
        fname: Path of the CSV file to append to. Defaults to the
            module-level `output_fname` when omitted.
    """
    target = output_fname if fname is None else fname

    # newline='' lets the csv module control line endings itself; the explicit
    # UTF-8 encoding keeps the file independent of the platform default
    with open(target, 'a', newline='', encoding='utf-8') as csvfile:
        csv_writer = csv.writer(csvfile, quoting=csv.QUOTE_ALL)
        csv_writer.writerows([style, snippet, label, processed] for snippet in snippets)

In [5]:
# Read the samples dataset into a DataFrame and display it
df = pd.read_csv(filepath_or_buffer=input_fname)
df

Unnamed: 0,style,code,aspects,label,processed
0,Aligned with opening delimiter.,"foo = long_function_name(var_one, var_two,\r\n...",n/fs,correct,False
1,Add 4 spaces (an extra level of indentation) t...,"def long_function_name(\r\n var_one, va...",n/fs,correct,True
2,Hanging indents should add a level.,"foo = long_function_name(\r\n var_one, var_...",n/fs,correct,True
3,No extra indentation (same level of indent).,if (this_is_one_thing and\r\n that_is_anoth...,n,correct,True
4,Add some extra indentation on the conditional ...,if (this_is_one_thing\r\n and that_is_a...,n,correct,True
5,The closing brace/bracket/parenthesis on multi...,"my_list = [\r\n 1, 2, 3,\r\n 4, 5, 6,\r\n]",n/cl,correct,True
6,The closing brace/bracket/parenthesis on multi...,result = some_function_that_takes_arguments(\r...,n/fs,correct,True


In [6]:
# Keep only the rows whose `processed` flag is not set yet
pending_mask = ~df['processed']
unprocessed_df = df.loc[pending_mask]
unprocessed_df

Unnamed: 0,style,code,aspects,label,processed
0,Aligned with opening delimiter.,"foo = long_function_name(var_one, var_two,\r\n...",n/fs,correct,False


In [15]:
client = GPTClient()

# For every unprocessed sample: ask GPT for similar snippets, let the user
# review them (regenerating rejected ones until none are left), then record
# the user's verdict in the `processed` column of `df`.
for row_idx, sample in unprocessed_df.iterrows():
    # Bracket access avoids any clash between column names and Series attributes
    print(f'Sample #{row_idx}: {sample["style"]}', end='\n' * 2)

    # Parse the aspects as the list of abbreviations that generate_correct_samples() function expects
    aspects = sample['aspects'].split('/')

    # Prompt GPT and get its generated snippets based on the example from the sample row
    response = client.generate_correct_samples(prompt_rule=sample['style'], prompt_code=sample['code'],
                                               aspects_list=aspects, **params)
    response_text = response.choices[0].message.content
    if params['verbose']:
        print(response_text)

    code_snippets = extract_gpt_snippets(response_text)

    # Prompt user if he is satisfied with the generated snippets
    processed_answer, snippets_ids_to_regenerate = prompt_user_feedback(code_snippets)

    # If not, ask GPT to regenerate the rejected snippets until the user requests no more
    while snippets_ids_to_regenerate:
        # Snippet ids are 1-based, list positions are 0-based
        snippets_to_regenerate = [code_snippets[snippet_id - 1] for snippet_id in snippets_ids_to_regenerate]
        print(snippets_to_regenerate)

        new_snippets_response = client.regenerate_correct_samples(snippets_to_regenerate, **params)
        new_snippets = extract_gpt_snippets(new_snippets_response.choices[0].message.content)

        if len(new_snippets) != len(snippets_ids_to_regenerate):
            # zip() below stops at the shorter sequence, so make the mismatch visible
            print(f"Warning: requested {len(snippets_ids_to_regenerate)} snippets but received "
                  f"{len(new_snippets)}; extra or missing snippets are ignored.")

        # Use a dedicated name here: reusing the outer loop variable would make the
        # df.loc[...] update below write to a snippet id instead of the sample row
        for snippet_id, new_snippet in zip(snippets_ids_to_regenerate, new_snippets):
            code_snippets[snippet_id - 1] = new_snippet

        processed_answer, snippets_ids_to_regenerate = prompt_user_feedback(code_snippets)

    # NOTE(review): persisting the snippets is currently disabled; the original
    # intent noted here was to mark only the failed snippets as unprocessed.
    # save_snippets(sample.style, code_snippets, sample.label, processed_answer)
    df.loc[row_idx, 'processed'] = processed_answer
    client.clear_history()

Sample #0: Aligned with opening delimiter.


USER:
 
You are an expert programmer with extensive experience and vast knowledge of different industries. You will help me create a dataset for training an ML model to correct a code style according to the PEP 8 Python guideline.

You will be given an example of a stylistically correct code snippet, and your task is to generate a total of 10 similar snippets. When generating snippets, you must strictly adhere to the following instructions:

<instructions>
1. Snippets are valid pieces of Python code that must be identical to the provided <example> in the <style_rule>, but not in all <other_aspects> (all relevant data are provided to you below in the corresponding XML sections).
2. Generated snippets must be different in <other_aspects> both from the provided <example> and from each other. If some aspects from <other_aspects> are irrelevant to the provided examples, you can safely ignore them, and focus on relevant ones instead. You may chang

Are you satisfied with the generated snippets? (y/yes/n/no):  т
Do you want to regenerate any of the snippets? If not, type n/no or hit Enter, otherwise specify a comma-separated list of values (e.g. 1,2,4-6,9) 1, 2-3,10


['# Code Snippet 1\nabc = long_function_name(\n    arg1, arg2,\n    arg3, arg4\n)\n\n', '# Code Snippet 2\nxyz = long_function_name(\n    parameter1, parameter2,\n    parameter3\n)\n\n', '# Code Snippet 3\ndef_name = long_function_name(\n    val1, val2,\n    val3, val4,\n    val5\n)\n\n', '# Code Snippet 10\nvariable = long_function_name(\n    arg_i, arg_ii,\n    arg_iii, arg_iv\n)']

USER:
 
You are an expert programmer with extensive experience and vast knowledge of different industries. You will help me create a dataset for training an ML model to correct a code style according to the PEP 8 Python guideline.

You will be given an example of a stylistically correct code snippet, and your task is to generate a total of 10 similar snippets. When generating snippets, you must strictly adhere to the following instructions:

<instructions>
1. Snippets are valid pieces of Python code that must be identical to the provided <example> in the <style_rule>, but not in all <other_aspects> (all r

Are you satisfied with the generated snippets? (y/yes/n/no):  n
Do you want to regenerate any of the snippets? If not, type n/no or hit Enter, otherwise specify a comma-separated list of values (e.g. 1,2,4-6,9) 4-9


['# Code Snippet 4\nsome_func = long_function_name(\n    var_a, var_b,\n    var_c, var_d,\n    var_e, var_f\n)\n\n', '# Code Snippet 5\ncustom_func = long_function_name(\n    parameter_x, parameter_y,\n    argument_z, argument_w\n)\n\n', '# Code Snippet 6\ntemp_var = long_function_name(\n    arg_one, arg_two,\n    arg_three, arg_four\n)\n\n', '# Code Snippet 7\nreturn_val = long_function_name(\n    val_1, val_2,\n    val_3, val_4\n)\n\n', '# Code Snippet 8\nresult = long_function_name(\n    arg_01, arg_02,\n    arg_03, arg_04,\n    arg_05\n)\n\n', '# Code Snippet 9\nfunction_name = long_function_name(\n    parameter_1, parameter_2,\n    parameter_3\n)\n\n']

USER:
 
You are an expert programmer with extensive experience and vast knowledge of different industries. You will help me create a dataset for training an ML model to correct a code style according to the PEP 8 Python guideline.

You will be given an example of a stylistically correct code snippet, and your task is to generate a 

Are you satisfied with the generated snippets? (y/yes/n/no):  y


In [8]:
# Persist the updated `processed` flags by overwriting the input dataset
df.to_csv(
    path_or_buf=input_fname,
    mode='w',
    index=False,
    quoting=csv.QUOTE_ALL,
)