In [2]:
import pandas as pd
import re
import json
import csv

from claude_prompt import generate_wrong_samples

In [5]:
def get_claude_snippets(sample, verbose=False):
    """
    Prompt Claude and get its generated snippets based on the example from the sample row in the dataset.csv dataframe
    """
    
    
    response = generate_wrong_samples(prompt_rule=sample.style, prompt_code=sample.code,
                                      aspects_list=aspects, verbose=verbose)
    response_text = response[0].text
    
    # Extract the generated snippets from the Claude's response
    result_section = re.search(r'<result>(.*)</result>', response_text, re.DOTALL).group(1)
    # Split the result section into individual code snippets
    code_snippets = result_section.strip().split('--\n')

    return code_snippets

def save_snippets(style_id, snippets, label, processed):
    with open(output_fname, 'a', newline='') as csvfile:
        csv_writer = csv.writer(csvfile, quoting=csv.QUOTE_ALL)

        for snippet in snippets:
            csv_writer.writerow([style_id, snippet, label, processed])

In [11]:
# CSV output file path
output_fname = 'result.csv'
fname_wrong  = 'dataset_wrong.csv'
fname_correct  = 'dataset_correct.csv'

df_wrong = pd.read_csv(fname_wrong, index_col='id')
df_correct = pd.read_csv(fname_correct, index_col='id')

df_wrong

Unnamed: 0_level_0,style,code,label,processed
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
6715963594578028578,Aligned with opening delimiter.,"foo = long_function_name(var_one, var_two,\r\n...",wrong,False
3213575530546295585,Add 4 spaces (an extra level of indentation) t...,"def long_function_name(\r\n var_one, va...",wrong,False
4555523396418567934,Hanging indents should add a level.,"foo = long_function_name(\r\nvar_one, var_two,...",wrong,False
3731108434884656287,No extra indentation (same level of indent).,if (this_is_one_thing and\r\nthat_is_another_t...,wrong,False
-7275606690904798960,Add some extra indentation on the conditional ...,if (this_is_one_thing\r\n and that_is_anot...,wrong,False
-5897263164556083888,The closing brace/bracket/parenthesis on multi...,"my_list = [\r\n 1, 2, 3,\r\n 4, 5, 6,\r\n]",wrong,False
-5897263164556083888,The closing brace/bracket/parenthesis on multi...,result = some_function_that_takes_arguments(\r...,wrong,False


In [12]:
# Discard all the processed examples
df_wrong = df_wrong[~df_wrong['processed']]
df_wrong

Unnamed: 0_level_0,style,code,label,processed
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
6715963594578028578,Aligned with opening delimiter.,"foo = long_function_name(var_one, var_two,\r\n...",wrong,False
3213575530546295585,Add 4 spaces (an extra level of indentation) t...,"def long_function_name(\r\n var_one, va...",wrong,False
4555523396418567934,Hanging indents should add a level.,"foo = long_function_name(\r\nvar_one, var_two,...",wrong,False
3731108434884656287,No extra indentation (same level of indent).,if (this_is_one_thing and\r\nthat_is_another_t...,wrong,False
-7275606690904798960,Add some extra indentation on the conditional ...,if (this_is_one_thing\r\n and that_is_anot...,wrong,False
-5897263164556083888,The closing brace/bracket/parenthesis on multi...,"my_list = [\r\n 1, 2, 3,\r\n 4, 5, 6,\r\n]",wrong,False
-5897263164556083888,The closing brace/bracket/parenthesis on multi...,result = some_function_that_takes_arguments(\r...,wrong,False


In [20]:
df_correct

Unnamed: 0_level_0,style,code,aspects,label,processed
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
6715963594578028578,Aligned with opening delimiter.,"foo = long_function_name(var_one, var_two,\r\n...",n/fs,correct,True
3213575530546295585,Add 4 spaces (an extra level of indentation) t...,"def long_function_name(\r\n var_one, va...",n/fs,correct,True
4555523396418567934,Hanging indents should add a level.,"foo = long_function_name(\r\n var_one, var_...",n/fs,correct,True
3731108434884656287,No extra indentation (same level of indent).,if (this_is_one_thing and\r\n that_is_anoth...,n/st,correct,True
-7275606690904798960,Add some extra indentation on the conditional ...,if (this_is_one_thing\r\n and that_is_a...,n,correct,True
-5897263164556083888,The closing brace/bracket/parenthesis on multi...,"my_list = [\r\n 1, 2, 3,\r\n 4, 5, 6,\r\n]",n/cl,correct,True
-5897263164556083888,The closing brace/bracket/parenthesis on multi...,result = some_function_that_takes_arguments(\r...,n/fs,correct,True


In [19]:
df = df_wrong.join(df_correct, how='left', rsuffix='_correct')
df = df[['style', 'code', 'code_correct', 'label', 'processed']]

N_items = len(df.index)
N_before_join = len(df_wrong.index)

if N_before_join - N_items != 0:
    print(f'Warning: {N_before_join - N_items} items discarded because they didn\'t have corresponding correct snippets.')

df



Unnamed: 0_level_0,style,code,code_correct,label,processed
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
6715963594578028578,Aligned with opening delimiter.,"foo = long_function_name(var_one, var_two,\r\n...","foo = long_function_name(var_one, var_two,\r\n...",wrong,False
3213575530546295585,Add 4 spaces (an extra level of indentation) t...,"def long_function_name(\r\n var_one, va...","def long_function_name(\r\n var_one, va...",wrong,False
4555523396418567934,Hanging indents should add a level.,"foo = long_function_name(\r\nvar_one, var_two,...","foo = long_function_name(\r\n var_one, var_...",wrong,False
3731108434884656287,No extra indentation (same level of indent).,if (this_is_one_thing and\r\nthat_is_another_t...,if (this_is_one_thing and\r\n that_is_anoth...,wrong,False
-7275606690904798960,Add some extra indentation on the conditional ...,if (this_is_one_thing\r\n and that_is_anot...,if (this_is_one_thing\r\n and that_is_a...,wrong,False
-5897263164556083888,The closing brace/bracket/parenthesis on multi...,"my_list = [\r\n 1, 2, 3,\r\n 4, 5, 6,\r\n]","my_list = [\r\n 1, 2, 3,\r\n 4, 5, 6,\r\n]",wrong,False
-5897263164556083888,The closing brace/bracket/parenthesis on multi...,"my_list = [\r\n 1, 2, 3,\r\n 4, 5, 6,\r\n]",result = some_function_that_takes_arguments(\r...,wrong,False
-5897263164556083888,The closing brace/bracket/parenthesis on multi...,result = some_function_that_takes_arguments(\r...,"my_list = [\r\n 1, 2, 3,\r\n 4, 5, 6,\r\n]",wrong,False
-5897263164556083888,The closing brace/bracket/parenthesis on multi...,result = some_function_that_takes_arguments(\r...,result = some_function_that_takes_arguments(\r...,wrong,False


In [9]:
for i, sample in unprocessed_df.iterrows():
    print(f'Sample #{i}: {sample.style}', end='\n' * 2)
    
    code_snippets = get_claude_snippets(sample, verbose=True)
    
    # Ask the user whether the generated snippets are satisfactory
    for snippet in code_snippets:
        print(snippet.strip())
        print('-' * 20) # for better readability

    print("Mark the sample as processed? y/n: ")
    processed_input  = input().lower()
    processed_answer = processed_input == 'y'

    df.loc[i, 'processed'] = processed_answer
    save_snippets(sample.style, code_snippets, sample.label, processed_answer)

Sample #3: Hanging indents should add a level.

PROMPT:

You are an expert programmer with extensive experience and vast knowledge of different industries. You will help me create a dataset for correcting a code style according to the PEP 8 Python guideline.

You will be given an example of a stylistically correct code snippet, and your task is to generate a total of 10 similar snippets. When generating snippets, you must strictly adhere to the following instructions:

<instructions>
1. Snippets are valid pieces of Python code that must be identical to the provided <example> in the <style_rule>, but not in all <other_aspects> (all relevant data are provided to you below in the corresponding XML sections).
2. Generated snippets must be different in <other_aspects> both from the provided <example> and from each other. You may change the <example> code in the aspects from the <other_aspects> block **only**, so don't be overly creative.
3. Pay special attention to identation: some <style_r

 y


Sample #5: Add some extra indentation on the conditional continuation line.

PROMPT:

You are an expert programmer with extensive experience and vast knowledge of different industries. You will help me create a dataset for correcting a code style according to the PEP 8 Python guideline.

You will be given an example of a stylistically correct code snippet, and your task is to generate a total of 10 similar snippets. When generating snippets, you must strictly adhere to the following instructions:

<instructions>
1. Snippets are valid pieces of Python code that must be identical to the provided <example> in the <style_rule>, but not in all <other_aspects> (all relevant data are provided to you below in the corresponding XML sections).
2. Generated snippets must be different in <other_aspects> both from the provided <example> and from each other. You may change the <example> code in the aspects from the <other_aspects> block **only**, so don't be overly creative.
3. Pay special attention

 y


Sample #7: The closing brace/bracket/parenthesis on multiline constructs must be lined up under the first character of the line that starts the multiline construct

PROMPT:

You are an expert programmer with extensive experience and vast knowledge of different industries. You will help me create a dataset for correcting a code style according to the PEP 8 Python guideline.

You will be given an example of a stylistically correct code snippet, and your task is to generate a total of 10 similar snippets. When generating snippets, you must strictly adhere to the following instructions:

<instructions>
1. Snippets are valid pieces of Python code that must be identical to the provided <example> in the <style_rule>, but not in all <other_aspects> (all relevant data are provided to you below in the corresponding XML sections).
2. Generated snippets must be different in <other_aspects> both from the provided <example> and from each other. You may change the <example> code in the aspects from t

 y


In [11]:
df.to_csv(input_fname, mode='w', index=True, quoting=csv.QUOTE_ALL)