In [1]:
import pandas as pd
import re
import json
import csv

from gpt_prompt import GPTClient

In [2]:
def extract_gpt_snippets(response_text):
    """
    Pull the individual code snippets out of a GPT response.

    The response is expected to wrap its payload in a single
    ``<result>...</result>`` section, with snippets separated by a line
    ending in ``--``.

    Args:
        response_text: Full text of the GPT response.

    Returns:
        List of snippet strings, in the order they appear in the response.

    Raises:
        ValueError: If the response contains no ``<result>...</result>`` section.
    """
    # DOTALL lets `.` span newlines, since the result section is multi-line
    match = re.search(r'<result>(.*)</result>', response_text, re.DOTALL)
    if match is None:
        # re.search returns None on no match; fail with a clear message
        # instead of an AttributeError on `.group`
        raise ValueError("GPT response does not contain a <result>...</result> section.")

    # Split the result section into individual code snippets
    return match.group(1).strip().split('--\n')

def prompt_user_feedback(snippets):
    """
    Print the generated snippets and ask the user, via standard input, whether they are acceptable.

    The dialogue consists of two questions:

    1. "Are you satisfied ...?" -- answering "y" or "yes" returns the (True, []) tuple.
       Any other answer (including just pressing Enter) leads to question 2.
    2. "Do you want to regenerate ...?" -- answering "n", "no" or just pressing Enter
       returns the (False, []) tuple. Otherwise the answer is parsed as a comma-separated
       list of 1-based snippet numbers and inclusive ranges, e.g. "1,2,4-6,9" returns
       the (False, [1, 2, 4, 5, 6, 9]) tuple (unique, sorted).

    If the list is malformed, contains a reversed range (e.g. "6-4") or refers to
    snippet numbers that do not exist, an error is printed and the whole dialogue
    is restarted.
    """
    # Step 1: Print each snippet with a number and separator
    for i, snippet in enumerate(snippets, start=1):
        print(f"{i}.\n{snippet.strip()}")
        print('-' * 20)  # Separator for better readability

    # Step 2: Prompt user for feedback
    user_input = input("Are you satisfied with the generated snippets? (y/yes/n/no): ").strip().lower()

    # Step 3: Check user input for satisfaction
    if user_input in ['y', 'yes']:
        return True, []

    # Step 4: Check if user wants to regenerate any of the samples
    user_input = input("Do you want to regenerate any of the snippets? If not, type n/no or hit Enter, " + \
                       "otherwise specify a comma-separated list of values (e.g. 1,2,4-6,9)").strip().lower()
    if user_input in ['', 'n', 'no']:
        return False, []

    try:
        numbers = []
        for part in user_input.split(','):
            if '-' in part:
                # Unpacking also raises ValueError for inputs like "1-2-3"
                start, end = map(int, part.split('-'))
                if start > end:
                    # A reversed range would expand to nothing and be silently ignored
                    raise ValueError(f"reversed range: {part.strip()}")
                numbers.extend(range(start, end + 1))
            else:
                numbers.append(int(part))

        # Validate numbers as indices of the snippets list
        valid_indices = range(1, len(snippets) + 1)
        invalid_numbers = [num for num in numbers if num not in valid_indices]

        if invalid_numbers:
            print(f"Invalid snippet numbers: {invalid_numbers}. Please provide valid indices.")
            return prompt_user_feedback(snippets)  # Retry prompting if input is invalid

        return False, sorted(set(numbers))  # Return unique sorted list of numbers
    except ValueError:
        print("Invalid input format. Please provide either 'y', 'yes', or a comma-separated list of numbers.")
        return prompt_user_feedback(snippets)  # Retry prompting if input is invalid

In [3]:
# Keyword arguments passed along (via **params) with each GPTClient call below;
# `verbose` is additionally checked before printing the raw response.
params = dict(
    model='gpt-4o',  # alternative: 'gpt-4-turbo-2024-04-09'
    temperature=1,
    verbose=True,
)

In [4]:
# File locations: the CSV that generated snippets are written to,
# and the two input datasets (wrong / correct examples).
output_fname = 'result_gpt.csv'
fname_wrong = 'dataset_wrong.csv'
fname_correct = 'dataset_correct.csv'

# Both datasets are loaded with their `style` column as the index
df_wrong, df_correct = (
    pd.read_csv(path, index_col='style') for path in (fname_wrong, fname_correct)
)

df_wrong

Unnamed: 0_level_0,code,label,processed
style,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Aligned with opening delimiter.,"foo = long_function_name(var_one, var_two,\r\n...",wrong,False
Add 4 spaces (an extra level of indentation) to distinguish arguments from the rest.,"def long_function_name(\r\n var_one, va...",wrong,True
Hanging indents should add a level.,"foo = long_function_name(\r\nvar_one, var_two,...",wrong,True
No extra indentation (same level of indent).,if (this_is_one_thing and\r\n that_is_another_...,wrong,True
Add some extra indentation on the conditional continuation line.,if (this_is_one_thing\r\n and that_is_anot...,wrong,True
The closing brace/bracket/parenthesis on multiline constructs must be lined up under the first character of the line that starts the multiline construct (collection case),"my_list = [\r\n 1, 2, 3,\r\n 4, 5, 6,\r\n]",wrong,True
The closing brace/bracket/parenthesis on multiline constructs must be lined up under the first character of the line that starts the multiline construct (function call case),result = some_function_that_takes_arguments(\r...,wrong,True


In [5]:
# Keep only the examples that have not been processed yet
not_processed_mask = ~df_wrong['processed']
unprocessed_df = df_wrong.loc[not_processed_mask]
unprocessed_df

Unnamed: 0_level_0,code,label,processed
style,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Aligned with opening delimiter.,"foo = long_function_name(var_one, var_two,\r\n...",wrong,False


In [6]:
# Pair every unprocessed wrong example with its correct counterpart (matched on the
# `style` index). Overlapping column names coming from df_correct get the `_correct` suffix.
df = unprocessed_df.join(df_correct, how='left', rsuffix='_correct')
df = df[['code', 'code_correct', 'aspects', 'label', 'processed']]

N_before_join = len(unprocessed_df.index)

# A left join never drops rows: examples without a match in df_correct are kept with NaN
# in `code_correct`, so they have to be detected and removed explicitly.
has_correct = df['code_correct'].notna()
N_discarded = int((~has_correct).sum())
df = df[has_correct]

N_items = len(df.index)

if N_discarded != 0:
    print(f'Warning: {N_discarded} items discarded because they didn\'t have corresponding correct snippets.')

# Turn the `style` index back into a regular column
df = df.reset_index()
df

Unnamed: 0,style,code,code_correct,aspects,label,processed
0,Aligned with opening delimiter.,"foo = long_function_name(var_one, var_two,\r\n...","foo = long_function_name(var_one, var_two,\r\n...",n/fs,wrong,False


In [7]:
def save_snippets(style, snippets, label, processed_snippets_ids):
    """
    Append one fully-quoted CSV row per snippet to the file named by the
    module-level `output_fname`.

    Each row is [style, snippet, label, flag], where flag is True when the
    snippet's 1-based position appears in `processed_snippets_ids`.
    """
    with open(output_fname, 'a', newline='') as out_file:
        writer = csv.writer(out_file, quoting=csv.QUOTE_ALL)
        writer.writerows(
            [style, text, label, position in processed_snippets_ids]
            for position, text in enumerate(snippets, start=1)
        )

In [8]:
client = GPTClient()

for sample_idx, sample in df.iterrows():
    # Bracket access avoids any clash between column names and pandas attributes
    style_rule = sample['style']
    print(f'Sample #{sample_idx}: {style_rule}', end='\n' * 2)

    # Parse the aspects as the list of abbreviations passed to generate_wrong_samples()
    aspects = sample['aspects'].split('/')

    # Prompt GPT and get its generated snippets based on the example from the current row of df
    response = client.generate_wrong_samples(prompt_rule=style_rule, aspects_list=aspects,
                                             wrong_code=sample['code'], correct_code=sample['code_correct'],
                                             **params)
    response_text = response.choices[0].message.content
    if params['verbose']:
        print(response_text)

    code_snippets = extract_gpt_snippets(response_text)

    # 1-based ids of everything GPT actually returned (not assumed to be exactly 10)
    all_snippet_ids = list(range(1, len(code_snippets) + 1))

    # Ids the user most recently asked to regenerate. Reset for every sample so that no state
    # leaks from the previous one; until the user singles out specific snippets, a
    # "not satisfied" answer is taken to apply to all of them.
    last_snippets = all_snippet_ids

    # Prompt user if he is satisfied with the generated snippets
    processed_answer, snippets_ids_to_regenerate = prompt_user_feedback(code_snippets)

    # If not, ask GPT to regenerate wrong snippets
    while snippets_ids_to_regenerate:
        snippets_to_regenerate = [code_snippets[n - 1] for n in snippets_ids_to_regenerate]

        new_snippets_response = client.regenerate_wrong_samples(snippets_to_regenerate, **params)
        new_snippets = extract_gpt_snippets(new_snippets_response.choices[0].message.content)

        # Put each regenerated snippet back into the slot of the one it replaces
        for n, new_snippet in zip(snippets_ids_to_regenerate, new_snippets):
            code_snippets[n - 1] = new_snippet

        last_snippets = snippets_ids_to_regenerate
        processed_answer, snippets_ids_to_regenerate = prompt_user_feedback(code_snippets)

    # mark last_snippets as unprocessed if the user is still dissatisfied with them:
    if processed_answer:
        processed_snippets_ids = all_snippet_ids
    else:
        processed_snippets_ids = [n for n in all_snippet_ids if n not in last_snippets]
    save_snippets(style_rule, code_snippets, sample['label'], processed_snippets_ids)

    # df_wrong is indexed by style, so this updates the flag of the source example
    df_wrong.loc[style_rule, 'processed'] = processed_answer
    client.clear_history()

Sample #0: 0


USER:
 
You are an expert programmer with extensive experience and vast knowledge of different industries. You will help me create a dataset for training an ML model to correct a code style according to the PEP 8 Python guideline.

You will be given an example of a stylistically *wrong* snippet, and your task is to generate a total of 10 similar snippets. When generating snippets, you must strictly adhere to the following instructions:

<instructions>
1. Snippets are valid pieces of Python code that must be similar to the provided <wrong_snippet> example in a sense that they *must violate* a certain <style_rule> (all relevant data are provided to you below in the corresponding XML sections).
2. Snippets will be part of a ML dataset, so they must have some variability, being different both from the provided examples and from each other. To provide this variability, you need to modify the provided examples in the aspects from the <other_aspects> section. If some aspects fr

Are you satisfied with the generated snippets? (y/yes/n/no):  n
Do you want to regenerate any of the snippets? If not, type n/no or hit Enter, otherwise specify a comma-separated list of values (e.g. 1,2,4-6,9) 4


[4] range(1, 11)
['value = complex_calculation(a1, a23, data_is_4m)\n']

USER:
 
You are an expert programmer with extensive experience and vast knowledge of different industries. You will help me create a dataset for training an ML model to correct a code style according to the PEP 8 Python guideline.

You will be given an example of a stylistically *wrong* snippet, and your task is to generate a total of 10 similar snippets. When generating snippets, you must strictly adhere to the following instructions:

<instructions>
1. Snippets are valid pieces of Python code that must be similar to the provided <wrong_snippet> example in a sense that they *must violate* a certain <style_rule> (all relevant data are provided to you below in the corresponding XML sections).
2. Snippets will be part of a ML dataset, so they must have some variability, being different both from the provided examples and from each other. To provide this variability, you need to modify the provided examples in the as

Are you satisfied with the generated snippets? (y/yes/n/no):  y


In [9]:
# Write df_wrong (index included) back over its source file, quoting every field.
# `mode='w'` and `index=True` are the to_csv defaults, so they are left implicit.
df_wrong.to_csv(fname_wrong, quoting=csv.QUOTE_ALL)

In [14]:
# Show the full (untruncated) correct snippet of the row labelled 0
df['code_correct'].loc[0]

'foo = long_function_name(var_one, var_two,\r\n                         var_three, var_four)'