In [1]:
import pandas as pd
import re
import json
import csv

from gpt_prompt import GPTClient

In [2]:
def extract_gpt_snippets(response_text):
    # Extract the result section
    result_section = re.search(r'<result>(.*)</result>', response_text, re.DOTALL).group(1)
    # Split the result section into individual code snippets
    code_snippets = result_section.strip().split('--\n')

    return code_snippets

def prompt_user_feedback(snippets):
    """
    Prompts a user via the standard input to ask if he is satisfied with the generated snippets.
    
    If the user types in "y", "yes" or just presses enter, 
        the function returns (True, [], "") tuple
    If instead the user types in the comma-separated list of digits like "1,2,4-6,9", and specific issue to be corrected (optionally),
        the function returns (False, [1, 2, 4, 5, 6, 9], "The code snippets are too similar to each other") tuple
    """
    # Step 1: Print each snippet with a number and separator
    for i, snippet in enumerate(snippets, start=1):
        print(f"{i}.\n{snippet.strip()}")
        print('-' * 20)  # Separator for better readability

    # Step 2: Prompt user for feedback
    user_input = input("Are you satisfied with the generated snippets? (y/yes/n/no): ").strip().lower()

    # Step 3: Check user input for satisfaction
    if user_input in ['y', 'yes']:
        return True, [], ""

    # Step 4: Check if user wants to regenerate any of the samples
    user_input = input("Do you want to regenerate any of the snippets? If not, type n/no or hit Enter, " + \
                       "otherwise specify a comma-separated list of values (e.g. 1,2,4-6,9)").strip().lower()
    if user_input in ['', 'n', 'no']:
        return False, [], ""
        
    try:
        numbers = []
        for part in user_input.split(','):
            if '-' in part:
                start, end = map(int, part.split('-'))
                numbers.extend(range(start, end + 1))
            else:
                numbers.append(int(part))
        
        # Validate numbers as indices of the snippets list
        valid_indices = range(1, len(snippets) + 1)
        print(numbers, valid_indices)
        invalid_numbers = [num for num in numbers if num not in valid_indices]

        if invalid_numbers:
            print(f"Invalid snippet numbers: {invalid_numbers}. Please provide valid indices.")
            return prompt_user_feedback(snippets)  # Retry prompting if input is invalid

        # If the indices are correct, finally ask the user for specifying an issue with the snippets they marked incorrect
        issue = input("Please specify is there's specific issue you want to be fixed in the inputs you selected. Otherwise, press enter.").strip()
        return False, sorted(set(numbers)), issue  # Return unique sorted list of numbers
    except ValueError:
        print("Invalid input format. Please provide either 'y', 'yes', or a comma-separated list of numbers.")
        return prompt_user_feedback(snippets)  # Retry prompting if input is invalid

In [3]:
params = {
    "model": 'gpt-4o-2024-05-13',	# 'gpt-4-turbo-2024-04-09',
    "temperature": 1,
    "verbose": True
}

In [4]:
# CSV output file path
output_fname = 'result_gpt.csv'
input_fname  = 'dataset_correct.csv'

def save_snippets(style, snippets, label, processed_snippets_ids):
    with open(output_fname, 'a', newline='', encoding='utf-8') as csvfile:
        csv_writer = csv.writer(csvfile, quoting=csv.QUOTE_ALL)

        for i, snippet in enumerate(snippets):
            is_processed = i+1 in processed_snippets_ids
            csv_writer.writerow([style, snippet, label, is_processed])

In [5]:
df = pd.read_csv(input_fname)
df

Unnamed: 0,style,code,aspects,label,processed
0,Aligned with opening delimiter.,"foo = long_function_name(var_one, var_two,\r\n...",n/fs,correct,True
1,Add 4 spaces (an extra level of indentation) t...,"def long_function_name(\r\n var_one, va...",n/fs,correct,True
2,Hanging indents should add a level.,"foo = long_function_name(\r\n var_one, var_...",n/fs,correct,True
3,No extra indentation (same level of indent).,if (this_is_one_thing and\r\n that_is_anoth...,n,correct,True
4,Add some extra indentation on the conditional ...,if (this_is_one_thing\r\n and that_is_a...,n,correct,True
5,The closing brace/bracket/parenthesis on multi...,"my_list = [\r\n 1, 2, 3,\r\n 4, 5, 6,\r\n]",n/cl,correct,True
6,The closing brace/bracket/parenthesis on multi...,result = some_function_that_takes_arguments(\r...,n/fs,correct,True
7,Backslashes may still be appropriate at times....,with open('/path/to/some/file/you/want/to/read...,n,correct,True
8,Displayed formulas always break before binary ...,income = (gross_wages\r\n + taxable_i...,exp,correct,True
9,Imports should usually be on separate lines:,import os\r\nimport sys,m,correct,True


In [6]:
# Discard all the processed examples
unprocessed_df = df[~df['processed']]
unprocessed_df

Unnamed: 0,style,code,aspects,label,processed
24,However it does not make sense to have a trail...,"FILES = [\r\n 'setup.cfg',\r\n 'tox.ini'...",n/cl,correct,False
31,Be consistent in return statements. Either all...,def foo(x):\r\n if x >= 0:\r\n retur...,n/fs/exp,correct,False


In [7]:
N_TO_PROCESS = 10
unprocessed_df = unprocessed_df.head(N_TO_PROCESS)

In [8]:
client = GPTClient()

for i, sample in unprocessed_df.iterrows():
    print(f'Sample #{i}: {sample.style}', end='\n' * 2)
    
    # Parse the aspects as the list of abbreviations that generate_correct_samples() function expects
    aspects = sample.aspects.split('/')

    # Prompt GPT and get its generated snippets based on the example from the sample row in the dataset.csv dataframe
    response = client.generate_correct_samples(prompt_rule=sample.style, prompt_code=sample.code,
                                               aspects_list=aspects, **params)
    response_text = response.choices[0].message.content
    if params['verbose']:
        print(response_text)
    
    code_snippets = extract_gpt_snippets(response_text)
    
    # Prompt user if he is satisfied with the generated snippets
    processed_answer, snippets_ids_to_regenerate, code_issue = prompt_user_feedback(code_snippets)
    unprocessed_snippets_ids = [i for i in range(1, 10 + 1)]
    
    # If not, ask GPT to regenerate wrong snippets
    while(snippets_ids_to_regenerate):
        snippets_to_regenerate = [code_snippets[i-1] for i in snippets_ids_to_regenerate]
        
        new_snippets_response = client.regenerate_correct_samples(snippets_to_regenerate, issue=code_issue, **params)
        new_snippets = extract_gpt_snippets(new_snippets_response.choices[0].message.content)

        # Overwrite wrong snippets with the corrected ones
        for i, new_snippet in zip(snippets_ids_to_regenerate, new_snippets):
            code_snippets[i-1] = new_snippet

        # In the user doesn't want to continue, mark the snippets they were dissatisfied with as unprocessed
        unprocessed_snippets_ids = snippets_ids_to_regenerate
        # Repeat until no wrong snippets left or the user doesn't want to continue
        processed_answer, snippets_ids_to_regenerate, code_issue = prompt_user_feedback(code_snippets)

    # Mark unprocessed_snippets_ids as unprocessed if the user is still dissatisfied with them:
    if processed_answer == False:
        processed_snippets_ids = [i for i in range(1, 10 + 1) if i not in unprocessed_snippets_ids]
    else:
        processed_snippets_ids = [i for i in range(1, 10 + 1)]
    save_snippets(sample.style, code_snippets, sample.label, processed_snippets_ids)
    
    df.loc[i, 'processed'] = processed_answer
    client.clear_history()

Sample #24: However it does not make sense to have a trailing comma on the same line as the closing delimiter (except in the case of singleton tuples):


USER:
 
You are an expert programmer with extensive experience and vast knowledge of different industries. You will help me create a dataset for training an ML model to correct a code style according to the PEP 8 Python guideline.

You will be given an example of a stylistically correct code snippet, and your task is to generate a total of 10 similar snippets. When generating snippets, you must strictly adhere to the following instructions:

<instructions>
1. Snippets are valid pieces of Python code that must be identical to the provided <example> in the <style_rule>, but not in all <other_aspects> (all relevant data are provided to you below in the corresponding XML sections). Provided example may contain multiple independent snippets, but your task is to generate one snippet at a time.
2. Generated snippets must be different in <oth

Are you satisfied with the generated snippets? (y/yes/n/no):  1-10
Do you want to regenerate any of the snippets? If not, type n/no or hit Enter, otherwise specify a comma-separated list of values (e.g. 1,2,4-6,9) Your snippets contain two separate cases: collection definition and function call. Edit the snipperts to contain EITHER collection definition (5 snippets) OR function call (5 snippets).


Invalid input format. Please provide either 'y', 'yes', or a comma-separated list of numbers.
1.
FILE_NAMES = [
    'config.yml',
    'pytest.ini',
    ]
setup_files(FILE_NAMES,
            er_raise=True,
            )
--------------------
2.
CONFIG_FILES = (
    'program.conf',
    'environment.ini',
    )
load_settings(CONFIG_FILES,
              autoload=False,
              )
--------------------
3.
resource_paths = [
    'resources.txt',
    'data.json',
    ]
init_resources(resource_paths,
               ignore_errors=False,
               )
--------------------
4.
LOG_FILES = [
    'logfile.log',
    'debug.log',
    ]
start_logging(LOG_FILES,
              verbose=True,
              )
--------------------
5.
templateFiles = [
    'template.html',
    'index.html',
    ]
generateHtml(templateFiles,
             debug=False,
             )
--------------------
6.
script_paths = [
    'script.sh',
    'deploy.bash',
    ]
run_scripts(script_paths,
            fail_on_error=True,


Are you satisfied with the generated snippets? (y/yes/n/no):  n
Do you want to regenerate any of the snippets? If not, type n/no or hit Enter, otherwise specify a comma-separated list of values (e.g. 1,2,4-6,9) 1-10


[1, 2, 3, 4, 5, 6, 7, 8, 9, 10] range(1, 11)


Please specify is there's specific issue you want to be fixed in the inputs you selected. Otherwise, press enter. Your snippets contain two separate cases: collection definition and function call. Edit the snipperts to contain EITHER collection definition (5 snippets) OR function call (5 snippets).



USER:
 
There's a problem with the following snippets you generated:
<wrong_snippets>
FILE_NAMES = [
    'config.yml',
    'pytest.ini',
    ]
setup_files(FILE_NAMES,
            er_raise=True,
            )

--
CONFIG_FILES = (
    'program.conf',
    'environment.ini',
    )
load_settings(CONFIG_FILES,
              autoload=False,
              )

--
resource_paths = [
    'resources.txt',
    'data.json',
    ]
init_resources(resource_paths,
               ignore_errors=False,
               )


--
LOG_FILES = [
    'logfile.log',
    'debug.log',
    ]
start_logging(LOG_FILES,
              verbose=True,
              )

--
templateFiles = [
    'template.html',
    'index.html',
    ]
generateHtml(templateFiles,
             debug=False,
             )

--
script_paths = [
    'script.sh',
    'deploy.bash',
    ]
run_scripts(script_paths,
            fail_on_error=True,
            )

--
CONF_PATHS = (
    'main.conf',
    'backup.ini',
    )
initialize_configs(CONF_PATHS,
    

Are you satisfied with the generated snippets? (y/yes/n/no):  n
Do you want to regenerate any of the snippets? If not, type n/no or hit Enter, otherwise specify a comma-separated list of values (e.g. 1,2,4-6,9) 1-10


[1, 2, 3, 4, 5, 6, 7, 8, 9, 10] range(1, 11)


Please specify is there's specific issue you want to be fixed in the inputs you selected. Otherwise, press enter. you forgot about the trailing comma



USER:
 
There's a problem with the following snippets you generated:
<wrong_snippets>
FILE_NAMES = [
    'config.yml',
    'pytest.ini'
    ]
setup_files(FILE_NAMES, er_raise=True)

--
CONFIG_FILES = (
    'program.conf',
    'environment.ini'
    )
load_settings(CONFIG_FILES, autoload=False)

--
resource_paths = [
    'resources.txt',
    'data.json'
    ]
init_resources(resource_paths, ignore_errors=False)

--
LOG_FILES = [
    'logfile.log',
    'debug.log'
    ]
start_logging(LOG_FILES, verbose=True)

--
templateFiles = [
    'template.html',
    'index.html'
    ]
generateHtml(templateFiles, debug=False)

--
script_paths = [
    'script.sh',
    'deploy.bash'
    ]
run_scripts(script_paths, fail_on_error=True)

--
CONF_PATHS = (
    'main.conf',
    'backup.ini'
    )
initialize_configs(CONF_PATHS, overwrite=True)

--
data_files = [
    'data1.csv',
    'data2.csv'
    ]

--
document_files = [
    'doc1.pdf',
    'doc2.docx'
    ]

--
scripts = (
    'install.sh',
    'update.bas

Are you satisfied with the generated snippets? (y/yes/n/no):  y


Sample #31: Be consistent in return statements. Either all return statements in a function should return an expression, or none of them should. If any return statement returns an expression, any return statements where no value is returned should explicitly state this as return None, and an explicit return statement should be present at the end of the function (if reachable):


USER:
 
You are an expert programmer with extensive experience and vast knowledge of different industries. You will help me create a dataset for training an ML model to correct a code style according to the PEP 8 Python guideline.

You will be given an example of a stylistically correct code snippet, and your task is to generate a total of 10 similar snippets. When generating snippets, you must strictly adhere to the following instructions:

<instructions>
1. Snippets are valid pieces of Python code that must be identical to the provided <example> in the <style_rule>, but not in all <other_aspects> (all relevant

Are you satisfied with the generated snippets? (y/yes/n/no):  n
Do you want to regenerate any of the snippets? If not, type n/no or hit Enter, otherwise specify a comma-separated list of values (e.g. 1,2,4-6,9) 1-10


[1, 2, 3, 4, 5, 6, 7, 8, 9, 10] range(1, 11)


Please specify is there's specific issue you want to be fixed in the inputs you selected. Otherwise, press enter. Remove the ```python tag. Leave out only one function per snippet.



USER:
 
There's a problem with the following snippets you generated:
<wrong_snippets>
```python
def calculate_y(z):
    if z > 5:
        return z * 2
    else:
        return None
        
def process_value(a, b):
    if a <= b:
        return a + b
    else:
        return None
```

--
```python
def check_negative(n):
    if n < 0:
        return -n
    else:
        return None
        
def absolute_value(x):
    if x >= 0:
        return x
    else:
        return None
```

--
```python
def power_of_two(num):
    if num > 10:
        return num ** 2
    else:
        return None
        
def find_square(x):
    if x == 0:
        return None
    return x * x
```

--
```python
def validate_input(i, j, k):
    if i < j:
        return i
    elif j > k:
        return j
    else:
        return None
        
def compare_values(a):
    if a == 5:
        return None
    return a * 2
```

--
```python
def positive_check(val):
    if val > 0:
        return val * 3
    else:
        ret

Are you satisfied with the generated snippets? (y/yes/n/no):  y


In [9]:
# Mark the processed samples
df.to_csv(input_fname, mode='w', index=False, quoting=csv.QUOTE_ALL)

In [10]:
df

Unnamed: 0,style,code,aspects,label,processed
0,Aligned with opening delimiter.,"foo = long_function_name(var_one, var_two,\r\n...",n/fs,correct,True
1,Add 4 spaces (an extra level of indentation) t...,"def long_function_name(\r\n var_one, va...",n/fs,correct,True
2,Hanging indents should add a level.,"foo = long_function_name(\r\n var_one, var_...",n/fs,correct,True
3,No extra indentation (same level of indent).,if (this_is_one_thing and\r\n that_is_anoth...,n,correct,True
4,Add some extra indentation on the conditional ...,if (this_is_one_thing\r\n and that_is_a...,n,correct,True
5,The closing brace/bracket/parenthesis on multi...,"my_list = [\r\n 1, 2, 3,\r\n 4, 5, 6,\r\n]",n/cl,correct,True
6,The closing brace/bracket/parenthesis on multi...,result = some_function_that_takes_arguments(\r...,n/fs,correct,True
7,Backslashes may still be appropriate at times....,with open('/path/to/some/file/you/want/to/read...,n,correct,True
8,Displayed formulas always break before binary ...,income = (gross_wages\r\n + taxable_i...,exp,correct,True
9,Imports should usually be on separate lines:,import os\r\nimport sys,m,correct,True


In [11]:
df.loc[24, 'processed']

False

In [12]:
for i, sample in unprocessed_df.iterrows():
    print(i)

24
31


In [13]:
for i, sample in unprocessed_df.iterrows():
    print(df.loc[i, 'processed'])

False
False
