In [2]:
import pandas as pd
import re
import json
import csv

from gpt_prompt import GPTClient

In [3]:
# File names: the CSV that generated snippets are appended to, and the two input datasets
output_fname = 'result_gpt.csv'
fname_wrong = 'dataset_wrong.csv'
fname_correct = 'dataset_correct.csv'

# Both datasets are loaded the same way, indexed by their 'style' column
df_wrong, df_correct = (
    pd.read_csv(path, index_col='style')
    for path in (fname_wrong, fname_correct)
)

df_wrong

Unnamed: 0_level_0,code,label,processed
style,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Aligned with opening delimiter.,"foo = long_function_name(var_one, var_two,\r\n...",wrong,True
Add 4 spaces (an extra level of indentation) to distinguish arguments from the rest.,"def long_function_name(\r\n var_one, va...",wrong,True
Hanging indents should add a level.,"foo = long_function_name(\r\nvar_one, var_two,...",wrong,True
No extra indentation (same level of indent).,if (this_is_one_thing and\r\n that_is_another_...,wrong,True
Add some extra indentation on the conditional continuation line.,if (this_is_one_thing\r\n and that_is_anot...,wrong,True
The closing brace/bracket/parenthesis on multiline constructs must be lined up under the first character of the line that starts the multiline construct (collection case),"my_list = [\r\n 1, 2, 3,\r\n 4, 5, 6,\r\n]",wrong,True
The closing brace/bracket/parenthesis on multiline constructs must be lined up under the first character of the line that starts the multiline construct (function call case),result = some_function_that_takes_arguments(\r...,wrong,True


In [4]:
# Keep only the rows whose 'processed' flag is still False
unprocessed_df = df_wrong.loc[lambda frame: ~frame['processed']]
unprocessed_df

Unnamed: 0_level_0,code,label,processed
style,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1


In [5]:
# Pair every unprocessed wrong snippet with its correct counterpart, matched on
# the shared 'style' index. This must be an inner join: a left join keeps the
# unmatched rows (with NaN in 'code_correct' and 'aspects'), so nothing would
# ever be discarded, the warning below could never fire, and the NaN 'aspects'
# would later break the `.split('/')` call in get_gpt_snippets().
df = unprocessed_df.join(df_correct, how='inner', rsuffix='_correct')
df = df[['code', 'code_correct', 'aspects', 'label', 'processed']]

N_items = len(df.index)
N_before_join = len(unprocessed_df.index)

# Count the dropped rows from the index itself rather than from the row-count
# difference, which would be skewed if one style matched several correct rows.
N_discarded = int((~unprocessed_df.index.isin(df_correct.index)).sum())

if N_discarded != 0:
    print(f'Warning: {N_discarded} items discarded because they didn\'t have corresponding correct snippets.')

df

Unnamed: 0_level_0,code,code_correct,aspects,label,processed
style,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1


In [6]:
def get_gpt_snippets(style, sample, verbose=False, **kwargs):
    """
    Prompt GPT with one dataset row and return the code snippets it generated.

    Parameters:
        style: the style rule text, forwarded as ``prompt_rule``.
        sample: a row exposing ``aspects`` (a '/'-separated string), ``code``
            (the wrong example) and ``code_correct`` (the correct example).
        verbose: when True, print the raw response text; also forwarded to
            ``generate_wrong_samples``.
        **kwargs: forwarded unchanged to ``generate_wrong_samples``.

    Returns:
        list[str]: the contents of the ``<result>`` section split on ``'--\\n'``,
        with whitespace-only entries removed. Snippets are not stripped, so
        their own leading indentation is preserved.

    Raises:
        ValueError: if the response has no ``<result>...</result>`` section.

    NOTE(review): ``generate_wrong_samples`` is neither defined nor imported in
    the visible cells (only ``GPTClient`` is imported) -- confirm where it
    comes from, otherwise this fails on a fresh kernel.
    """
    # Parse the aspects as the list of abbreviations that generate_wrong_samples() expects
    aspects = sample.aspects.split('/')

    response = generate_wrong_samples(prompt_rule=style, aspects_list=aspects, wrong_code=sample.code,
                                      correct_code=sample.code_correct, verbose=verbose, **kwargs)
    # A missing (None) message content is treated as an empty reply so that it
    # ends in the explicit ValueError below instead of a TypeError.
    response_text = response.choices[0].message.content or ''

    if verbose:
        print('RESPONSE:\n' + response_text)

    # Extract the generated snippets from GPT's response
    result_match = re.search(r'<result>(.*)</result>', response_text, re.DOTALL)
    if result_match is None:
        raise ValueError(
            f'No <result>...</result> section found in the model response for style {style!r}.'
        )
    result_section = result_match.group(1)

    # Split the result section into individual code snippets; a leading or
    # duplicated separator yields blank entries, which are dropped here so
    # they are not saved as samples.
    code_snippets = [
        snippet
        for snippet in result_section.strip().split('--\n')
        if snippet.strip()
    ]

    return code_snippets

def save_snippets(style, snippets, label, processed):
    """
    Append one CSV row per snippet to the file named by ``output_fname``.

    Every row has the form ``[style, snippet, label, processed]`` and all
    fields are quoted.
    """
    rows = ([style, snippet, label, processed] for snippet in snippets)

    with open(output_fname, 'a', newline='') as out_file:
        csv.writer(out_file, quoting=csv.QUOTE_ALL).writerows(rows)

In [7]:
# Settings for the generation run below
MODEL = 'gpt-4-turbo-2024-04-09'  # model name
TEMPERATURE = 1                   # sampling temperature
VERBOSE = True                    # verbosity flag

In [7]:
# For every unprocessed sample: generate snippets, show them for manual review,
# record the reviewer's verdict in df_wrong and append the snippets to the output CSV.
for i, (style, sample) in enumerate(df.iterrows()):
    print(f'Sample #{i+1}: {style}', end='\n' * 2)

    # Honour the VERBOSE setting instead of a hard-coded True
    code_snippets = get_gpt_snippets(style, sample, verbose=VERBOSE, model=MODEL, temperature=TEMPERATURE)

    # Ask the user whether the generated snippets are satisfactory
    for snippet in code_snippets:
        print(snippet.strip())
        print('-' * 20) # for better readability

    print("Mark the sample as processed? y/n: ")
    # Strip surrounding whitespace so an answer such as "y " is not silently read as "no"
    processed_input = input().strip().lower()
    processed_answer = processed_input == 'y'

    # The flag is only updated in memory here; nothing in this loop writes df_wrong to disk
    df_wrong.loc[style, 'processed'] = processed_answer
    save_snippets(style, code_snippets, sample.label, processed_answer)

Sample #1: Aligned with opening delimiter.

PROMPT:

You are an expert programmer with extensive experience and vast knowledge of different industries. You will help me create a dataset for correcting a code style according to the PEP 8 Python guideline.

You will be given an example of a stylistically *wrong* snippet, and your task is to generate a total of 10 similar snippets. When generating snippets, you must strictly adhere to the following instructions:

<instructions>
1. Snippets are valid pieces of Python code that must be similar to the provided <wrong_snippet> example in a sense that they *must violate* a certain <style_rule> (all relevant data are provided to you below in the corresponding XML sections).
2. When generating similar wrong snippets, you may and should change the <wrong_snippet> example in all possible ways, except for one rule:
3. Snippets must not be identical to the <correct_snippet> in the <style_rule>
4. Since you will generate snippets for a ML dataset, th

 n


Sample #2: Add 4 spaces (an extra level of indentation) to distinguish arguments from the rest.

PROMPT:

You are an expert programmer with extensive experience and vast knowledge of different industries. You will help me create a dataset for correcting a code style according to the PEP 8 Python guideline.

You will be given an example of a stylistically *wrong* snippet, and your task is to generate a total of 10 similar snippets. When generating snippets, you must strictly adhere to the following instructions:

<instructions>
1. Snippets are valid pieces of Python code that must be similar to the provided <wrong_snippet> example in a sense that they *must violate* a certain <style_rule> (all relevant data are provided to you below in the corresponding XML sections).
2. When generating similar wrong snippets, you may and should change the <wrong_snippet> example in all possible ways, except for one rule:
3. Snippets must not be identical to the <correct_snippet> in the <style_rule>
4. 

 y


Sample #3: Hanging indents should add a level.

PROMPT:

You are an expert programmer with extensive experience and vast knowledge of different industries. You will help me create a dataset for correcting a code style according to the PEP 8 Python guideline.

You will be given an example of a stylistically *wrong* snippet, and your task is to generate a total of 10 similar snippets. When generating snippets, you must strictly adhere to the following instructions:

<instructions>
1. Snippets are valid pieces of Python code that must be similar to the provided <wrong_snippet> example in a sense that they *must violate* a certain <style_rule> (all relevant data are provided to you below in the corresponding XML sections).
2. When generating similar wrong snippets, you may and should change the <wrong_snippet> example in all possible ways, except for one rule:
3. Snippets must not be identical to the <correct_snippet> in the <style_rule>
4. Since you will generate snippets for a ML dataset

 n


Sample #4: No extra indentation (same level of indent).

PROMPT:

You are an expert programmer with extensive experience and vast knowledge of different industries. You will help me create a dataset for correcting a code style according to the PEP 8 Python guideline.

You will be given an example of a stylistically *wrong* snippet, and your task is to generate a total of 10 similar snippets. When generating snippets, you must strictly adhere to the following instructions:

<instructions>
1. Snippets are valid pieces of Python code that must be similar to the provided <wrong_snippet> example in a sense that they *must violate* a certain <style_rule> (all relevant data are provided to you below in the corresponding XML sections).
2. When generating similar wrong snippets, you may and should change the <wrong_snippet> example in all possible ways, except for one rule:
3. Snippets must not be identical to the <correct_snippet> in the <style_rule>
4. Since you will generate snippets for a M

 y


Sample #5: Add some extra indentation on the conditional continuation line.

PROMPT:

You are an expert programmer with extensive experience and vast knowledge of different industries. You will help me create a dataset for correcting a code style according to the PEP 8 Python guideline.

You will be given an example of a stylistically *wrong* snippet, and your task is to generate a total of 10 similar snippets. When generating snippets, you must strictly adhere to the following instructions:

<instructions>
1. Snippets are valid pieces of Python code that must be similar to the provided <wrong_snippet> example in a sense that they *must violate* a certain <style_rule> (all relevant data are provided to you below in the corresponding XML sections).
2. When generating similar wrong snippets, you may and should change the <wrong_snippet> example in all possible ways, except for one rule:
3. Snippets must not be identical to the <correct_snippet> in the <style_rule>
4. Since you will gener

 n


Sample #6: The closing brace/bracket/parenthesis on multiline constructs must be lined up under the first character of the line that starts the multiline construct (collection case)

PROMPT:

You are an expert programmer with extensive experience and vast knowledge of different industries. You will help me create a dataset for correcting a code style according to the PEP 8 Python guideline.

You will be given an example of a stylistically *wrong* snippet, and your task is to generate a total of 10 similar snippets. When generating snippets, you must strictly adhere to the following instructions:

<instructions>
1. Snippets are valid pieces of Python code that must be similar to the provided <wrong_snippet> example in a sense that they *must violate* a certain <style_rule> (all relevant data are provided to you below in the corresponding XML sections).
2. When generating similar wrong snippets, you may and should change the <wrong_snippet> example in all possible ways, except for one ru

 n


Sample #7: The closing brace/bracket/parenthesis on multiline constructs must be lined up under the first character of the line that starts the multiline construct (function call case)

PROMPT:

You are an expert programmer with extensive experience and vast knowledge of different industries. You will help me create a dataset for correcting a code style according to the PEP 8 Python guideline.

You will be given an example of a stylistically *wrong* snippet, and your task is to generate a total of 10 similar snippets. When generating snippets, you must strictly adhere to the following instructions:

<instructions>
1. Snippets are valid pieces of Python code that must be similar to the provided <wrong_snippet> example in a sense that they *must violate* a certain <style_rule> (all relevant data are provided to you below in the corresponding XML sections).
2. When generating similar wrong snippets, you may and should change the <wrong_snippet> example in all possible ways, except for one

 y


In [8]:
# Overwrite the wrong-snippets dataset with the current df_wrong, index included
df_wrong.to_csv(
    fname_wrong,
    index=True,
    mode='w',
    quoting=csv.QUOTE_ALL,
)