In [11]:
import pandas as pd

data_all = pd.read_csv('aarest_base_generated_instructions.csv')

In [12]:
import re
import json
import random
import demjson3
import validators
from pprint import pprint

regions = [
    "us-east-2", "us-east-1", "us-west-1", "us-west-2", "af-south-1", 
    "ap-east-1", "ap-south-2", "ap-southeast-3", "ap-southeast-4", 
    "ap-south-1", "ap-northeast-3", "ap-northeast-2", "ap-southeast-1", 
    "ap-southeast-2", "ap-northeast-1", "ca-central-1", "ca-west-1", 
    "eu-central-1", "eu-west-1", "eu-west-2", "eu-south-1", "eu-west-3", 
    "eu-south-2", "eu-north-1", "eu-central-2", "il-central-1", 
    "me-south-1", "me-central-1", "sa-east-1", "us-gov-east-1", 
    "us-gov-west-1"
]
methods = ['get', 'post', 'put', 'delete', 'patch']

tokens_to_add = set(["<URLSTART>", "<URLEND>", "<PAYLOADSTART>", "<PAYLOADEND>"])
final_dataset = {"instructions": [], "api_calls": []}
method_errors = 0
url_errors = 0
json_errors = 0
method_counts = {
    mth: 0 for mth in methods
}

def parse_method(curl_command, i):
    method: str = re.findall(r'--request ([a-zA-Z]*)', curl_command)[0].lower()

    if method not in methods:
        return False, f"{i} Invalid method: {method}"
    
    return True, method

def parse_url(curl_command, i):
    url: str = re.findall(r'--url ([^\s]*)', curl_command)[0]
    url = url.strip("'\"\\")

    # one off fixes
    url = re.sub(r'\/%7Bregion%7D', random.choice(regions), url)

    valid = validators.url(url)
    if not valid:
        return False, f"{i} Invalid url: {url}"
    
    # remove the base url, leaving only the endpoint
    endpoint = re.sub(r'http[s]?:\/\/[^\/\s]*', '', url)

    if len(endpoint) == 0:
        return False, f"{i} No endpoint found in url: {url}"
    
    return True, endpoint

def parse_payload(curl_command, method, i):
    data: list[str] = re.findall(r"--data [\\']*({.*})[\\']*", curl_command)

    if len(data) > 0:
        data = data[0]
        try:
            data = re.sub(r'\\\\"', r'\"', data)
            data = demjson3.decode(data)
            data = json.dumps(data)
        except Exception as e:
            return False, f"{i} issue loading json: {str(e)} {data}"
        return True, data
    else:
        if method in ['post', 'put', 'patch']:
            return False, f"{i} No payload for method {method}"
        return True, ""
    
with open("parsing_errors.txt", "w") as f:
    for i in range(data_all.shape[0]):
        row = data_all.iloc[i]

        try:
            if pd.isna(row['instructions']):
                # then we are looking at part of those original 13200 that weren't batched and didn't split by quotes
                instruction: str = row['instruction']
                instruction = instruction.split("\"")[1].split("\"")[0]
                curl_command: str = row['api_call']
            else:
                # then we are looking at the batched instructions which were already split by quotes
                instruction: str = row['instructions']
                curl_command: str = row['api_calls']

            # verify that the bot didn't go meta with the instruction
            if "curl" in instruction:
                continue

            # parse the curl command into the correct tokenized form
            curl_command = curl_command.split("curl")[1]
            tokenized_command = []

            method_success, method_result = parse_method(curl_command, i)
            if method_success:
                method_counts[method_result] += 1
                tokenized_command.append(f"<{method_result.upper()}>")
            else:
                f.write(method_result)
                method_errors += 1
                continue

            url_success, url_result = parse_url(curl_command, i)
            if url_success:
                tokenized_command.append(f"<ENDPOINTSTART>{url_result}<ENDPOINTEND>")
            else:
                f.write(url_result)
                url_errors += 1
                continue

            payload_success, payload_result = parse_payload(curl_command, method_result, i)
            if payload_success:
                tokenized_command.append(f"<PAYLOADSTART>{payload_result}<PAYLOADEND>")
            else:
                f.write(payload_result)
                json_errors += 1
                continue

            tokenized_command = "".join(tokenized_command)
            final_dataset["instructions"].append(instruction)
            final_dataset["api_calls"].append(tokenized_command)
        except Exception as e:
            print(str(e))
            break

print(f"{url_errors} url errors and {json_errors} json errors for a total of {url_errors + json_errors}.\nThat plus other errors resulted in a final dataset size of {len(final_dataset['instructions'])}.\nThe distribution of methods is:")
pprint(method_counts)

1365 url errors and 10696 json errors for a total of 12061.
That plus other errors resulted in a final dataset size of 120358.
The distribution of methods is:
{'delete': 24346, 'get': 23601, 'patch': 23237, 'post': 37962, 'put': 23273}


In [13]:
final_df = pd.DataFrame(final_dataset)
final_df.to_csv("rest_base_model_dataset_tokenized_endpoints_allpayload.csv", index=False)