In [1]:
import pandas as pd
import re
import json
import csv

from claude_prompt import generate_correct_samples

In [2]:
def get_claude_snippets(sample, verbose=False):
    """
    Prompt Claude with one dataset row and return the code snippets it generated.

    Parameters
    ----------
    sample
        Row of the dataset dataframe (or any object) exposing the ``style``,
        ``code`` and ``aspects`` attributes. ``aspects`` is a '/'-separated
        string of aspect abbreviations, e.g. ``'n/fs'``.
    verbose : bool
        Passed through unchanged to ``generate_correct_samples()``.

    Returns
    -------
    list of str
        The pieces of the ``<result>`` section of the response, split on the
        "--" + newline separator. Whitespace-only pieces are dropped, so the
        list is empty when the result section contains no code.

    Raises
    ------
    ValueError
        If the response text has no ``<result>...</result>`` section.
    """
    # Parse the aspects as the list of abbreviations that generate_correct_samples() function expects
    aspects = sample.aspects.split('/')

    response = generate_correct_samples(prompt_rule=sample.style, prompt_code=sample.code,
                                        aspects_list=aspects, verbose=verbose)
    response_text = response[0].text

    # Locate the result section of Claude's response. re.search() returns None when
    # the tags are missing, so check explicitly instead of failing on .group().
    result_match = re.search(r'<result>(.*)</result>', response_text, re.DOTALL)
    if result_match is None:
        raise ValueError('Claude response does not contain a <result>...</result> section')

    # Split the result section into individual code snippets, skipping empty pieces
    pieces = result_match.group(1).strip().split('--\n')
    return [piece for piece in pieces if piece.strip()]

In [3]:
# CSV file paths used by this notebook:
# the dataset is loaded from input_fname, generated snippets are appended to output_fname
input_fname = 'dataset_correct.csv'
output_fname = 'result.csv'

def save_snippets(style, snippets, label, processed):
    """
    Append one CSV row per snippet to the file named by ``output_fname``.

    Each row is ``[style, snippet, label, processed]`` and every field is
    quoted. No rows are written when ``snippets`` is empty.

    The file is opened as UTF-8 explicitly, so non-ASCII characters in a
    snippet do not depend on the platform's default encoding.
    """
    with open(output_fname, 'a', newline='', encoding='utf-8') as csvfile:
        csv_writer = csv.writer(csvfile, quoting=csv.QUOTE_ALL)
        csv_writer.writerows([style, snippet, label, processed] for snippet in snippets)

In [4]:
# Load the dataset of examples from the input CSV and display it
df = pd.read_csv(filepath_or_buffer=input_fname)
df

Unnamed: 0,style,code,aspects,label,processed
0,Aligned with opening delimiter.,"foo = long_function_name(var_one, var_two,\r\n...",n/fs,correct,True
1,Add 4 spaces (an extra level of indentation) t...,"def long_function_name(\r\n var_one, va...",n/fs,correct,True
2,Hanging indents should add a level.,"foo = long_function_name(\r\n var_one, var_...",n/fs,correct,True
3,No extra indentation (same level of indent).,if (this_is_one_thing and\r\n that_is_anoth...,n,correct,False
4,Add some extra indentation on the conditional ...,if (this_is_one_thing\r\n and that_is_a...,n,correct,True
5,The closing brace/bracket/parenthesis on multi...,"my_list = [\r\n 1, 2, 3,\r\n 4, 5, 6,\r\n]",n/cl,correct,True
6,The closing brace/bracket/parenthesis on multi...,result = some_function_that_takes_arguments(\r...,n/fs,correct,True


In [5]:
# Keep only the rows whose 'processed' flag is False
unprocessed_df = df.loc[~df['processed']]
unprocessed_df

Unnamed: 0,style,code,aspects,label,processed
3,No extra indentation (same level of indent).,if (this_is_one_thing and\r\n that_is_anoth...,n,correct,False


In [6]:
# Generate snippets for every unprocessed sample and let the user review them
for i, sample in unprocessed_df.iterrows():
    print(f'Sample #{i}: {sample.style}', end='\n' * 2)

    code_snippets = get_claude_snippets(sample, verbose=True)

    # Show the generated snippets so the user can judge whether they are satisfactory
    for snippet in code_snippets:
        print(snippet.strip())
        print('-' * 20) # for better readability

    # Surrounding whitespace is ignored so that an answer such as 'y ' still counts as yes;
    # anything other than 'y' (case-insensitive) leaves the sample unprocessed
    processed_input = input("Mark the sample as processed? y/n: ").strip().lower()
    processed_answer = processed_input == 'y'

    # i is the row label yielded by iterrows(), used here to update that row of df
    df.loc[i, 'processed'] = processed_answer
    save_snippets(sample.style, code_snippets, sample.label, processed_answer)

Sample #3: No extra indentation (same level of indent).

PROMPT:

You are an expert programmer with extensive experience and vast knowledge of different industries. You will help me create a dataset for correcting a code style according to the PEP 8 Python guideline.

You will be given an example of a stylistically correct code snippet, and your task is to generate a total of 10 similar snippets. When generating snippets, you must strictly adhere to the following instructions:

<instructions>
1. Snippets are valid pieces of Python code that must be identical to the provided <example> in the <style_rule>, but not in all <other_aspects> (all relevant data are provided to you below in the corresponding XML sections).
2. Generated snippets must be different in <other_aspects> both from the provided <example> and from each other. You may change the <example> code in the aspects from the <other_aspects> block **only**, so don't be overly creative.
3. Pay special attention to identation: some

 y


In [7]:
# Overwrite the input CSV with the current dataframe (no index column, all fields quoted)
df.to_csv(
    input_fname,
    mode='w',
    index=False,
    quoting=csv.QUOTE_ALL,
)