In [None]:
import random
import pandas as pd

# Reference input used as the template for every generated record
input_data = {
    "telephone": "11999998888"
}

# Expected output for the reference input above
output_data = {
    "ddd": "11",
    "telephone": "999998888"
}

# Jolt spec
jolt_spec = [
    {
        "operation": "modify-overwrite-beta",
        "spec": {
            "ddd": "=substring(@(1,telephone),0,2)",
            "sizeTelephone": "=size(@(1,telephone))",
            "telephone": "=substring(@(1,telephone),2,@(1,sizeTelephone))"
        }
    },
    {
        "operation": "remove",
        "spec": {
            "sizeTelephone": ""
        }
    }
]


def _split_area_code(telephone):
    """Build the expected output for one telephone string.

    Follows the substring bounds written in `jolt_spec`: the first two
    characters become "ddd" and everything from index 2 on stays in "telephone".
    """
    return {"ddd": telephone[:2], "telephone": telephone[2:]}


# Generate 100 more records with variations
area_code = input_data["telephone"][:2]
subscriber_digits = input_data["telephone"][2:]
records = []
for _ in range(100):
    # Shuffle the digits after the area code; the area code itself is kept
    randomized_telephone = area_code + ''.join(random.sample(subscriber_digits, len(subscriber_digits)))

    # The expected output is derived from the randomized input so that each
    # record's "output" actually corresponds to its "input"
    new_record = {
        "input": {"telephone": randomized_telephone},
        "output": _split_area_code(randomized_telephone),
        "spec": jolt_spec,
        "context": "Separating area code from a phone number with randomized telephone number"
    }

    records.append(new_record)

# Convert records to a pandas DataFrame
df = pd.DataFrame(records)

# Save DataFrame to an Excel file
df.to_excel("generated_records.xlsx", index=False)

print("Generated 100 records with variations. Saved to generated_records.xlsx")


Generated 100 records with variations. Saved to generated_records.xlsx


In [None]:
import random
import pandas as pd

# Reference input used as the template for every generated record
input_data = {
    "cpf": "123.456.789-10",
    "cnpj": "11.222.333/0001-10"
}

# Expected output for the reference input above
output_data = {
    "cpf": "12345678910",
    "cnpj": "11222333000110"
}

# Jolt spec
jolt_spec = [
    {
        "operation": "modify-overwrite-beta",
        "spec": {
            "cpf": "=split('[.-]',@(1,cpf))",
            "cnpj": "=split('[./-]',@(1,cnpj))"
        }
    },
    {
        "operation": "modify-overwrite-beta",
        "spec": {
            "cpf": "=join('',@(1,cpf))",
            "cnpj": "=join('',@(1,cnpj))"
        }
    }
]


def _shuffle_digits(formatted):
    """Return `formatted` with its digits shuffled and every non-digit left in place."""
    digits = [ch for ch in formatted if ch.isdigit()]
    random.shuffle(digits)
    remaining = iter(digits)
    return ''.join(next(remaining) if ch.isdigit() else ch for ch in formatted)


def _strip_separators(value, separators):
    """Return `value` without any character listed in `separators`."""
    return ''.join(ch for ch in value if ch not in separators)


# Generate 100 more records with variations
records = []
for _ in range(100):
    # Only the digits vary; the punctuation keeps its position so the inputs
    # still look like a CPF / CNPJ
    randomized_cpf = _shuffle_digits(input_data["cpf"])
    randomized_cnpj = _shuffle_digits(input_data["cnpj"])

    # Expected output is computed from the randomized input, removing the same
    # separator characters that the split patterns in jolt_spec list
    new_record = {
        "input": {"cpf": randomized_cpf, "cnpj": randomized_cnpj},
        "output": {
            "cpf": _strip_separators(randomized_cpf, ".-"),
            "cnpj": _strip_separators(randomized_cnpj, "./-")
        },
        "spec": jolt_spec,
        "context": "Removing special characters from CPF and CNPJ with randomized variations"
    }

    records.append(new_record)

# Convert records to a pandas DataFrame
df = pd.DataFrame(records)

# Save DataFrame to an Excel file
df.to_excel("generated_records.xlsx", index=False)

print("Generated 100 records with variations. Saved to generated_records.xlsx")


Generated 100 records with variations. Saved to generated_records.xlsx


In [None]:
import random
import pandas as pd

# Reference input
input_data = {
    "products": [
        {"id": 1},
        {"id": 2},
        {"id": 1}
    ]
}

# Expected output for the reference input above
output_data = {
    "products": [
        {"id": "1"},
        {"id": "2"}
    ]
}

# Jolt spec
jolt_spec = [
    {
        "operation": "shift",
        "spec": {
            "products": {
                "*": {
                    "id": {
                        "*": "ids.&[]"
                    }
                }
            }
        }
    },
    {
        "operation": "shift",
        "spec": {
            "ids": {
                "*": {
                    "$": "products[].id"
                }
            }
        }
    }
]

# Generate 100 more records with variations
records = []
for _ in range(100):
    # Three ids drawn from {1, 2}, so one value is always duplicated
    id_values = [1, 2, random.choice([1, 2])]
    random.shuffle(id_values)
    products = [{"id": id_val} for id_val in id_values]

    # Each distinct id appears once in the output, as a string (it passes
    # through a key name in the spec). The order follows first appearance in
    # the shuffled input rather than being fixed to 1, 2.
    # NOTE(review): assumes Jolt keeps first-seen key order - confirm.
    distinct_ids = list(dict.fromkeys(id_values))
    expected_output = {"products": [{"id": str(id_val)} for id_val in distinct_ids]}

    # Create a new record with the randomized data
    new_record = {
        "input": {"products": products},
        "output": expected_output,
        "spec": jolt_spec,
        "context": "Eliminating duplicate values with variations"
    }

    records.append(new_record)

# Convert records to a pandas DataFrame
df = pd.DataFrame(records)

# Save DataFrame to an Excel file
df.to_excel("generated_records.xlsx", index=False)

print("Generated 100 records with variations. Saved to generated_records.xlsx")


Generated 100 records with variations. Saved to generated_records.xlsx


In [3]:
import random
import pandas as pd

def generate_random_products():
    """Return two product dicts.

    Ids are 1 and 2, names are "Product A" and "Product B", and each "value"
    is a random integer between 1 and 100 inclusive.
    """
    return [
        {
            "id": index + 1,
            "name": f"Product {chr(65 + index)}",  # 65 is ord("A")
            "value": random.randint(1, 100),
        }
        for index in range(2)
    ]

# Jolt spec
jolt_spec = [
    {
        "operation": "shift",
        "spec": {
            "products": {
                "*": {
                    "*": "products[#2].&",
                    "value": ["products[#2].&", "values[]"]
                }
            }
        }
    },
    {
        "operation": "modify-overwrite-beta",
        "spec": {
            "totalValue": "=doubleSum(@(1,values))"
        }
    }
]

# Generate 100 records with variations
records = []
for _ in range(100):
    # Generate random products with different values
    products = generate_random_products()

    # The shift copies each "value" into a top-level "values" list
    values = [product["value"] for product in products]

    # The expected output lists everything the spec writes: the copied
    # products, the collected values and the total, not the total alone.
    # NOTE(review): totalValue is a float because doubleSum yields a double - confirm.
    expected_output = {
        "products": products,
        "values": values,
        "totalValue": float(sum(values))
    }

    # Create a new record with the randomized data
    new_record = {
        "input": {"products": products},
        "output": expected_output,
        "spec": jolt_spec,
        "context": "Adding numeric values with variations"
    }

    records.append(new_record)

# Convert records to a pandas DataFrame
df = pd.DataFrame(records)

# Save DataFrame to an Excel file
df.to_excel("generated_records.xlsx", index=False)

print("Generated 100 records with variations. Saved to generated_records.xlsx")


Generated 100 records with variations. Saved to generated_records.xlsx


In [5]:
import random
import pandas as pd

# Jolt spec
jolt_spec = [
    {
        "operation": "modify-overwrite-beta",
        "spec": {
            "inverseValue": "=divide(1, @(1,value2))",
            "finalValue": "=divideAndRound(2, @(1,value1), @(1,inverseValue))"
        }
    }
]

# Generate 100 records with variations
records = []
for _ in range(100):
    # Generate random input values (value2 starts at 1, so no division by zero)
    value1 = random.randint(1, 20)
    value2 = random.randint(1, 10)

    # Calculate output values based on input.
    # The leading 2 in divideAndRound is the rounding precision, not a factor,
    # so finalValue is value1 / inverseValue (i.e. value1 * value2) to 2 decimals.
    inverse_value = 1 / value2
    final_value = round(value1 / inverse_value, 2)

    # Create a new record with the randomized data
    new_record = {
        "input": {"value1": value1, "value2": value2},
        "output": {"value1": value1, "value2": value2, "inverseValue": inverse_value, "finalValue": final_value},
        "spec": jolt_spec,
        "context": "Multiplying 2 numeric values"
    }

    records.append(new_record)

# Convert records to a pandas DataFrame
df = pd.DataFrame(records)

# Save DataFrame to an Excel file
df.to_excel("generated_records.xlsx", index=False)

print("Generated 100 records with variations. Saved to generated_records.xlsx")


Generated 100 records with variations. Saved to generated_records.xlsx


In [8]:
import json
import random
import string
import pandas as pd

def generate_random_email():
    """Return a five-lowercase-letter local part joined to one of four fixed mail domains."""
    provider = random.choice(["gmail.com", "outlook.com", "yahoo.com", "hotmail.com"])
    local_part = ''.join(random.choices(string.ascii_lowercase, k=5))
    return f"{local_part}@{provider}"

def generate_random_clients():
    """Return one to three client dicts, each with a random five-letter "name" and a random "email"."""
    how_many = random.randint(1, 3)
    return [
        {
            "name": ''.join(random.choices(string.ascii_lowercase, k=5)),
            "email": generate_random_email(),
        }
        for _ in range(how_many)
    ]

def generate_record():
    """Build one training row for the "filter by field content" example.

    Returns:
        dict with JSON-encoded "input", "output" and "spec" strings plus a
        plain-text "context".
    """
    input_clients = generate_random_clients()

    # Keep only the clients whose email matches the "*@gmail.com" key of the spec
    gmail_clients = [client for client in input_clients if client["email"].endswith("@gmail.com")]

    # The spec is the same for every record
    spec = [{
        "operation": "shift",
        "spec": {
            "clients": {
                "*": {
                    "email": {
                        "*\\@gmail.com": {
                            "@2": "gmail[]"
                        }
                    }
                }
            }
        }
    }]

    # The spec writes matching clients to "gmail[]", so the output key is
    # "gmail", not "clients".
    # NOTE(review): when nothing matches, shift is assumed to produce null - confirm.
    expected_output = {"gmail": gmail_clients} if gmail_clients else None

    return {
        "input": json.dumps({"clients": input_clients}),
        "output": json.dumps(expected_output),
        "spec": json.dumps(spec),
        "context": "Applying a filter on the content of a field"
    }

# Build the requested number of rows, one generate_record() call each
num_records = 100
records = []
for _ in range(num_records):
    records.append(generate_record())

# One DataFrame row per record
df = pd.DataFrame(records)

# Write the rows to an Excel workbook (no index column)
output_file = "generated_records.xlsx"
df.to_excel(output_file, index=False)

print(f"Records saved to {output_file}")


Records saved to generated_records.xlsx


In [1]:
import json
import random
import string
import pandas as pd

def generate_random_keys():
    """Return one to four distinct lowercase letters in random order."""
    key_count = random.randint(1, 4)
    return random.sample(string.ascii_lowercase, key_count)

def generate_random_values():
    """Return a random string of five lowercase letters."""
    letters = random.choices(string.ascii_lowercase, k=5)
    return ''.join(letters)

def generate_random_clients():
    """Return one to three dicts, each mapping freshly drawn random keys to random values."""
    how_many = random.randint(1, 3)
    clients = []
    for _ in range(how_many):
        # Keys are drawn first, then one value per key in key order
        clients.append({key: generate_random_values() for key in generate_random_keys()})
    return clients

def generate_record():
    """Build one training row for the "default values within a list" example.

    Returns:
        dict with JSON-encoded "input", "output" and "spec" strings plus a
        plain-text "context".
    """
    input_data = {"list": generate_random_clients()}

    # Expected output: every list element gains "e": "e" only if it has no "e"
    # key yet. The random keys can include "e"; that existing value is kept
    # rather than overwritten, as a default is only applied to missing fields.
    output_data = []
    for client in input_data["list"]:
        with_default = dict(client)
        with_default.setdefault("e", "e")
        output_data.append(with_default)

    # The spec is the same for every record
    spec = [{
        "operation": "default",
        "spec": {
            "list[]": {
                "*": {"e": "e"}
            }
        }
    }]

    return {
        "input": json.dumps(input_data),
        "output": json.dumps({"list": output_data}),
        "spec": json.dumps(spec),
        "context": "Including default values within a list"
    }

# Build the requested number of rows, one generate_record() call each
num_records = 100
records = []
for _ in range(num_records):
    records.append(generate_record())

# One DataFrame row per record
df = pd.DataFrame(records)

# Write the rows to an Excel workbook (no index column)
output_file = "generated_records_different_pairs.xlsx"
df.to_excel(output_file, index=False)

print(f"Records saved to {output_file}")


Records saved to generated_records_different_pairs.xlsx


In [17]:
import json
import random
import string
import pandas as pd

def generate_random_data():
    """Return an (input, output) pair that shares one random five-letter user name.

    The input nests it under body.userName and the output under data.user_name.
    """
    name = ''.join(random.choices(string.ascii_lowercase, k=5))
    return {"body": {"userName": name}}, {"data": {"user_name": name}}

def generate_spec_and_context():
    """Return the fixed shift spec (body.userName -> data.user_name) and its description."""
    mapping = {"body": {"userName": "data.user_name"}}
    description = "Shift: Used to change the structure of a JSON, keeping the values contained in that same JSON."
    return {"operation": "shift", "spec": mapping}, description

# Build the rows; spec and context are re-created for each one
num_rows = 100
rows = []
for _ in range(num_rows):
    input_data, output_data = generate_random_data()
    spec, context = generate_spec_and_context()
    rows.append(dict(input=input_data, output=output_data, spec=spec, context=context))

# One DataFrame row per record, columns in a fixed order
df = pd.DataFrame(rows, columns=["input", "output", "spec", "context"])

# Write the rows to an Excel workbook (no index column)
output_file = "generated_training_data.xlsx"
df.to_excel(output_file, index=False)

print(f"Training data saved to {output_file}")


Training data saved to generated_training_data.xlsx


In [28]:
import random
import string
import pandas as pd

def generate_random_username():
    """Return a random username made of five lowercase ASCII letters."""
    letters = random.choices(string.ascii_lowercase, k=5)
    return ''.join(letters)

# Fixed spec and description shared by every row
spec = {
    "operation": "default",
    "spec": {"body": {"email": "default@email.com"}},
}
context = "Default: Used to add new fields or objects in a JSON, if they don't already exist."

# Build the rows: the output is the input body plus the default email
num_rows = 100
rows = []
for _ in range(num_rows):
    username = generate_random_username()
    input_data = {"body": {"userName": username}}
    output_data = {"body": {**input_data["body"], "email": "default@email.com"}}
    rows.append(dict(input=input_data, output=output_data, spec=spec, context=context))

# One DataFrame row per record, columns in a fixed order
df = pd.DataFrame(rows, columns=["input", "output", "spec", "context"])

# Write the rows to an Excel workbook (no index column)
output_file = "generated_training_data.xlsx"
df.to_excel(output_file, index=False)

print(f"Training data saved to {output_file}")


Training data saved to generated_training_data.xlsx


In [31]:
import json
import random
import string
import pandas as pd

def generate_random_username():
    """Return a random username made of five lowercase ASCII letters."""
    letters = random.choices(string.ascii_lowercase, k=5)
    return ''.join(letters)

# Fixed spec and description shared by every row
spec = {
    "operation": "remove",
    "spec": {"body": {"email": ""}},
}
context = "Remove: Used to remove fields or objects from a JSON."

# Build the rows: the output is the input body without its email field
num_rows = 100
rows = []
for _ in range(num_rows):
    username = generate_random_username()
    input_data = {"body": {"userName": username, "email": "default@email.com"}}
    output_data = {"body": {field: value for field, value in input_data["body"].items() if field != "email"}}
    rows.append(dict(input=input_data, output=output_data, spec=spec, context=context))

# One DataFrame row per record, columns in a fixed order
df = pd.DataFrame(rows, columns=["input", "output", "spec", "context"])

# Write the rows to an Excel workbook (no index column)
output_file = "generated_training_data.xlsx"
df.to_excel(output_file, index=False)

print(f"Training data saved to {output_file}")


Training data saved to generated_training_data.xlsx


In [36]:
import json
import random
import string
import pandas as pd

def generate_random_field_name():
    """Return a random lowercase string of five to ten letters."""
    length = random.randint(5, 10)
    return ''.join(random.choices(string.ascii_lowercase, k=length))

def generate_random_input():
    """Return {"employee": fields} with up to three to five random lowercase key/value pairs.

    A repeated random field name overwrites the earlier entry.
    """
    fields = {}
    for _ in range(random.randint(3, 5)):
        # Field name is drawn before its value
        field_name = generate_random_field_name()
        value_length = random.randint(5, 10)
        fields[field_name] = ''.join(random.choices(string.ascii_lowercase, k=value_length))
    return {"employee": fields}

def generate_output(input_data):
    """Return a copy of the input whose "employee" fields are ordered by key."""
    ordered_fields = dict(sorted(input_data["employee"].items()))
    return {"employee": ordered_fields}

# Fixed spec and description shared by every row
spec = {"operation": "sort"}
context = "Sort: Used to sort fields and objects in a JSON in alphabetical order."

# Build the rows: the output is the input with its fields sorted
num_rows = 100
rows = []
for _ in range(num_rows):
    input_data = generate_random_input()
    output_data = generate_output(input_data)
    rows.append(dict(input=input_data, output=output_data, spec=spec, context=context))

# One DataFrame row per record, columns in a fixed order
df = pd.DataFrame(rows, columns=["input", "output", "spec", "context"])

# Write the rows to an Excel workbook (no index column)
output_file = "generated_training_data.xlsx"
df.to_excel(output_file, index=False)

print(f"Training data saved to {output_file}")


Training data saved to generated_training_data.xlsx


In [38]:
import json
import random
import string
import pandas as pd

def generate_random_values():
    """Return a random (name, id, value) triple.

    name is 5-10 ASCII letters; id is 5-10 alphanumerics, a dash and one
    uppercase letter; value is an integer between 1 and 100 inclusive.
    """
    letters = string.ascii_uppercase + string.ascii_lowercase

    name_length = random.randint(5, 10)
    name = ''.join(random.choices(letters, k=name_length))

    id_length = random.randint(5, 10)
    id_prefix = ''.join(random.choices(string.digits + letters, k=id_length))
    id_suffix = random.choices(string.ascii_uppercase, k=1)[0]

    value = random.randint(1, 100)
    return name, f"{id_prefix}-{id_suffix}", value

def generate_random_input():
    """Return {"products": {...}} holding one random name / id / value product."""
    product = dict(zip(("name", "id", "value"), generate_random_values()))
    return {"products": product}

def generate_output(input_data):
    """Return the input with its "products" object wrapped in a one-element list.

    The wrapped product is the same object as in the input, not a copy.
    """
    product = input_data["products"]
    return {"products": [product]}

# Fixed spec and description shared by every row
spec = {
    "operation": "cardinality",
    "spec": {"products": "MANY"},
}
context = "Cardinality: Used to transform simple fields and objects into arrays and vice versa."

# Build the rows: the output wraps the input product in a list
num_rows = 100
rows = []
for _ in range(num_rows):
    input_data = generate_random_input()
    output_data = generate_output(input_data)
    rows.append(dict(input=input_data, output=output_data, spec=spec, context=context))

# One DataFrame row per record, columns in a fixed order
df = pd.DataFrame(rows, columns=["input", "output", "spec", "context"])

# Write the rows to an Excel workbook (no index column)
output_file = "generated_training_data.xlsx"
df.to_excel(output_file, index=False)

print(f"Training data saved to {output_file}")


Training data saved to generated_training_data.xlsx


In [43]:
import json
import random
import string
import pandas as pd

def generate_random_name():
    """Return a (first_name, last_name) pair, each three to eight random ASCII letters."""
    letters = string.ascii_uppercase + string.ascii_lowercase
    parts = []
    for _ in range(2):
        length = random.randint(3, 8)
        parts.append(''.join(random.choices(letters, k=length)))
    first_name, last_name = parts
    return first_name, last_name

def generate_random_input():
    """Return {"data": {"firstName": ..., "lastName": ...}} with random names."""
    names = generate_random_name()
    return {"data": dict(zip(("firstName", "lastName"), names))}

def generate_output(input_data):
    """Return a new {"data": ...} dict with "fullName" set to "<firstName> <lastName>".

    The input's "data" dict is copied, not modified.
    """
    person = input_data["data"]
    joined = f'{person["firstName"]} {person["lastName"]}'
    return {"data": {**person, "fullName": joined}}

# Fixed spec and description shared by every row
spec = {
    "operation": "modify-overwrite-beta",
    "spec": {
        "data": {"fullName": "=concat(@(1,firstName),' ',@(1,lastName))"},
    },
}
context = "Modify-overwrite-beta: Used to override values and apply functions to a JSON."

# Build the rows: the output adds the concatenated full name
num_rows = 100
rows = []
for _ in range(num_rows):
    input_data = generate_random_input()
    output_data = generate_output(input_data)
    rows.append(dict(input=input_data, output=output_data, spec=spec, context=context))

# One DataFrame row per record, columns in a fixed order
df = pd.DataFrame(rows, columns=["input", "output", "spec", "context"])

# Write the rows to an Excel workbook (no index column)
output_file = "generated_training_data.xlsx"
df.to_excel(output_file, index=False)

print(f"Training data saved to {output_file}")


Training data saved to generated_training_data.xlsx


In [46]:
import json
import random
import string
import pandas as pd

def generate_random_client():
    """Return {"client": {...}} with a random name, age (as a string), marital status and country."""
    name_length = random.randint(5, 15)
    profile = {
        "name": ''.join(random.choices(string.ascii_uppercase + string.ascii_lowercase, k=name_length)),
        "age": str(random.randint(18, 80)),
        "maritalStatus": random.choice(["Single", "Married", "Divorced", "Widowed"]),
        "country": random.choice(["Brazil", "USA", "Canada", "UK", "Australia"]),
    }
    return {"client": profile}

def generate_output(input_data):
    """Return the expected output for one client.

    name, age and maritalStatus are copied unchanged, because the shift spec in
    this cell maps each of them to the same path under "client". "country" is
    replaced by "citizenship": "Brazilian" for "Brazil", "Foreigner" otherwise.

    Args:
        input_data: dict shaped like {"client": {"name", "age", "maritalStatus", "country"}}.

    Returns:
        dict shaped like {"client": {"name", "age", "maritalStatus", "citizenship"}}.
    """
    client = input_data["client"]
    citizenship = "Brazilian" if client["country"] == "Brazil" else "Foreigner"
    return {
        "client": {
            "name": client["name"],
            "age": client["age"],
            "maritalStatus": client["maritalStatus"],
            "citizenship": citizenship,
        }
    }

# Fixed spec and description shared by every row
spec = {
    "operation": "shift",
    "spec": {
        "client": {
            "name": "client.name",
            "age": "client.age",
            "maritalStatus": "client.maritalStatus",
            "country": {
                "Brazil": {"#Brazilian": "client.citizenship"},
                "*": {"#Foreigner": "client.citizenship"},
            },
        }
    },
}
context = "IF - ELSE simple with JOLT: If it is 'Brazil', we complete the 'citizenship' field with the 'Brazilian' value. If the 'country' value is a country other than 'Brazil', we complete the 'citizenship' field with the 'Foreigner' value."

# Build the rows from random clients and their expected outputs
num_rows = 100
rows = []
for _ in range(num_rows):
    input_data = generate_random_client()
    output_data = generate_output(input_data)
    rows.append(dict(input=input_data, output=output_data, spec=spec, context=context))

# One DataFrame row per record, columns in a fixed order
df = pd.DataFrame(rows, columns=["input", "output", "spec", "context"])

# Write the rows to an Excel workbook (no index column)
output_file = "generated_training_data.xlsx"
df.to_excel(output_file, index=False)

print(f"Training data saved to {output_file}")


Training data saved to generated_training_data.xlsx


In [47]:
import json
import random
import string
import pandas as pd

def generate_random_name():
    """Return a random string of five to fifteen ASCII letters (mixed case)."""
    length = random.randint(5, 15)
    letters = string.ascii_uppercase + string.ascii_lowercase
    return ''.join(random.choices(letters, k=length))

def generate_random_email():
    """Return "<name>@<domain>.<ext>" with random 5-10 letter name and domain and ext in com/net/org."""
    pieces = []
    for _ in range(2):  # first the local part, then the domain
        length = random.randint(5, 10)
        pieces.append(''.join(random.choices(string.ascii_lowercase, k=length)))
    extension = random.choice(["com", "net", "org"])
    return pieces[0] + "@" + pieces[1] + "." + extension

def generate_random_client():
    """Return a dict with a random "name" and a random "email" (name is drawn first)."""
    client_name = generate_random_name()
    client_email = generate_random_email()
    return {"name": client_name, "email": client_email}

# Fixed spec and description shared by every row
spec = {
    "operation": "shift",
    "spec": {"name": "client.&", "email": "client.&"},
}
context = "Wildcard &: It uses the content of what is declared in the LHS to compose the structure of the output JSON, without the need to make this content explicit in the transformation. Usage: RHS Operation: shift"

# Build the rows: the output nests the unchanged input under "client"
num_rows = 100
rows = []
for _ in range(num_rows):
    input_data = generate_random_client()
    rows.append(dict(input=input_data, output={"client": input_data}, spec=spec, context=context))

# One DataFrame row per record, columns in a fixed order
df = pd.DataFrame(rows, columns=["input", "output", "spec", "context"])

# Write the rows to an Excel workbook (no index column)
output_file = "generated_training_data.xlsx"
df.to_excel(output_file, index=False)

print(f"Training data saved to {output_file}")


Training data saved to generated_training_data.xlsx


In [50]:
import json
import random
import string
import pandas as pd

# Predefined list of meaningful field names
field_names = ["name", "email", "phone_number", "address", "birth_date", "occupation", "country", "city", "zip_code", "company"]

def generate_random_customer():
    """Return a customer dict with three to seven distinct fields.

    Field names are drawn without replacement from `field_names`, so the
    result always has exactly the drawn number of fields (drawing with
    replacement could repeat a name and silently yield fewer). Each value is
    5-15 random lowercase letters or digits.
    """
    num_fields = random.randint(3, 7)  # Random number of fields
    fields = {}
    for field_name in random.sample(field_names, num_fields):
        value_length = random.randint(5, 15)
        fields[field_name] = ''.join(random.choices(string.ascii_lowercase + string.digits, k=value_length))
    return fields

def wrap_in_customer(output_data):
    """Return a new dict holding `output_data` (not copied) under the "customer" key."""
    wrapped = dict(customer=output_data)
    return wrapped

# Fixed spec and description shared by every row
spec = {
    "operation": "shift",
    "spec": {"*": "customer.&"},
}
context = "* (asterisk) References all fields and objects in a JSON without having to make their names explicit in the transformation. Usage: LHS Operations: shift, remove, cardinality, modify-default-beta and modify-overwrite-beta"

# Build the rows: the output is a copy of the input nested under "customer"
num_rows = 100
rows = []
for _ in range(num_rows):
    input_data = generate_random_customer()
    output_data = dict(input_data)
    rows.append(dict(input=input_data, output=wrap_in_customer(output_data), spec=spec, context=context))

# One DataFrame row per record, columns in a fixed order
df = pd.DataFrame(rows, columns=["input", "output", "spec", "context"])

# Write the rows to an Excel workbook (no index column)
output_file = "generated_training_data.xlsx"
df.to_excel(output_file, index=False)

print(f"Training data saved to {output_file}")


Training data saved to generated_training_data.xlsx


In [52]:
import json
import random
import string
import pandas as pd

def generate_random_product():
    """Return {"key": ..., "value": ...}.

    key is 3-8 random lowercase letters; value is 5-10 random uppercase
    letters or digits.
    """
    key_length = random.randint(3, 8)
    key = ''.join(random.choices(string.ascii_lowercase, k=key_length))
    value_length = random.randint(5, 10)
    value = ''.join(random.choices(string.ascii_uppercase + string.digits, k=value_length))
    return dict(key=key, value=value)

# Fixed spec and description shared by every row
spec = {
    "operation": "shift",
    "spec": {
        "value": "product.@(1,key)"
    }
}
# NOTE(review): the doubled double-quotes inside this literal are adjacent
# string literals in Python, so they concatenate and the resulting text has no
# quote characters around @ (1, key), key, value and @value. Left unchanged.
context = "@' References the value of a field or object contained in the input JSON, but has different effects depending on its usage. Usage: LHS and RHS Operations: shift (LHS and RHS), modify-overwrite-beta (RHS) and modify-overwrite-beta (RHS). Shift example: In ""@ (1, key)"" we are taking the value of the ""key"" field to be used as the name of the field that will receive the value of the ""value"" field (""@value""). The use of @ in both LHS and RHS involves declaring the level at which we are seeking information and counting levels from level 1 onwards. In this case, the ""key"" field is on the same level as the ""value"" field, so we use the number 1. The usage of @ in LHS follows the same way as in RHS."

# Generate 100 rows
num_rows = 100
rows = []
for _ in range(num_rows):
    input_data = generate_random_product()
    # "product.@(1,key)" names the output field after the input's "key" value,
    # so that random key is the field name rather than a fixed "code"
    output_data = {"product": {input_data["key"]: input_data["value"]}}
    rows.append({"input": input_data, "output": output_data, "spec": spec, "context": context})

# Convert rows to DataFrame
df = pd.DataFrame(rows, columns=["input", "output", "spec", "context"])

# Save DataFrame to Excel
output_file = "generated_training_data.xlsx"
df.to_excel(output_file, index=False)

print(f"Training data saved to {output_file}")


Training data saved to generated_training_data.xlsx


In [57]:
import json
import random
import string
import pandas as pd

# Predefined list of meaningful keys
meaningful_keys = ["name", "value", "category", "weight", "size", "color", "quantity", "brand"]

def generate_random_product():
    """Return {"product": fields} with three to seven distinct fields.

    Keys are drawn without replacement from `meaningful_keys`, so the product
    always has exactly the drawn number of fields (drawing with replacement
    could repeat a key and silently yield fewer). Each value comes from
    generate_random_value(key).
    """
    num_fields = random.randint(3, 7)  # Random number of fields
    fields = {}
    for key in random.sample(meaningful_keys, num_fields):
        fields[key] = generate_random_value(key)
    return {"product": fields}

def generate_random_value(key):
    """Return a random value whose shape depends on `key`.

    "value" and "weight" give a float rounded to two decimals (5-100 and
    0.5-50); "category" gives 3-6 uppercase letters; "name" gives 5-15
    letters; any other key gives 5-15 letters or digits.
    """
    letters = string.ascii_uppercase + string.ascii_lowercase

    # Numeric keys
    if key == "value":
        return round(random.uniform(5.0, 100.0), 2)
    if key == "weight":
        return round(random.uniform(0.5, 50.0), 2)

    # Text keys: pick the alphabet and length bounds, then draw once
    if key == "name":
        alphabet, bounds = letters, (5, 15)
    elif key == "category":
        alphabet, bounds = string.ascii_uppercase, (3, 6)
    else:
        alphabet, bounds = letters + string.digits, (5, 15)
    length = random.randint(*bounds)
    return ''.join(random.choices(alphabet, k=length))

# Fixed spec and description shared by every row
spec = {
    "operation": "shift",
    "spec": {
        "product": {
            "*": {
                "$": "product[]"
            }
        }
    }
}
context = """$
References the name of a field or object contained in the input JSON to be used as the value of a field or object in the output JSON.Usage: LHS
Operations: shift
 """

# Generate 100 rows
num_rows = 100
rows = []
for _ in range(num_rows):
    input_data = generate_random_product()
    # The spec writes every field name to "product[]", so the expected output
    # is the list of names under a "product" key rather than a bare list
    output_data = {"product": list(input_data["product"].keys())}
    rows.append({"input": input_data, "output": output_data, "spec": spec, "context": context})

# Convert rows to DataFrame
df = pd.DataFrame(rows, columns=["input", "output", "spec", "context"])

# Save DataFrame to Excel
output_file = "generated_training_data.xlsx"
df.to_excel(output_file, index=False)

print(f"Training data saved to {output_file}")


Training data saved to generated_training_data.xlsx


In [56]:
import json
import random
import string
import pandas as pd

# Predefined list of meaningful keys
meaningful_keys = ["name", "value", "weight", "size", "color", "quantity", "brand"]

def generate_random_product():
    """Return {"product": fields} with two to four distinct fields.

    Keys are drawn without replacement from `meaningful_keys`, so the product
    always has exactly the drawn number of fields (drawing with replacement
    could repeat a key and silently yield fewer). Each value comes from
    generate_random_value(key).
    """
    num_fields = random.randint(2, 4)  # Random number of fields excluding category
    fields = {}
    for key in random.sample(meaningful_keys, num_fields):
        fields[key] = generate_random_value(key)
    return {"product": fields}

def generate_random_value(key):
    """Return a random value whose shape depends on `key`.

    "value" and "weight" give a float rounded to two decimals (5-100 and
    0.5-50); "name" gives 5-15 letters; any other key gives 5-15 letters or
    digits.
    """
    # Numeric keys
    if key == "value":
        return round(random.uniform(5.0, 100.0), 2)
    if key == "weight":
        return round(random.uniform(0.5, 50.0), 2)

    # Text keys: only "name" excludes digits
    alphabet = string.ascii_uppercase + string.ascii_lowercase
    if key != "name":
        alphabet = alphabet + string.digits
    length = random.randint(5, 15)
    return ''.join(random.choices(alphabet, k=length))

# Fixed spec and description shared by every row
spec = {
    "operation": "shift",
    "spec": {
        "product": {
            "*": "product.&",
            "#DEFAULT-CATEGORY": "product.category"
        }
    }
}
context = "If used in LHS, it has the function of entering values manually in the output JSON. In RHS, on the other hand, it is applicable only to create lists and has the function of grouping certain content of the input JSON within the list to be created."

# Generate 100 rows
num_rows = 100
rows = []
for _ in range(num_rows):
    input_data = generate_random_product()
    # Every target path in the spec starts with "product.", so the expected
    # output keeps the "product" wrapper: the copied fields plus the constant
    # category (built as a new dict, leaving the input untouched)
    output_data = {"product": {**input_data["product"], "category": "DEFAULT-CATEGORY"}}
    rows.append({"input": input_data, "output": output_data, "spec": spec, "context": context})

# Convert rows to DataFrame
df = pd.DataFrame(rows, columns=["input", "output", "spec", "context"])

# Save DataFrame to Excel
output_file = "generated_training_data.xlsx"
df.to_excel(output_file, index=False)

print(f"Training data saved to {output_file}")


Training data saved to generated_training_data.xlsx


In [59]:
import json
import random
import string
import pandas as pd

def generate_random_product():
    """Return a random product with a sibling manufacturer.

    Shape: {"product": {"name", "price"}, "manufacturer"}. name is 5-10
    letters, price is a float in 5-100 rounded to two decimals, manufacturer
    is 5-15 letters.
    """
    letters = string.ascii_uppercase + string.ascii_lowercase

    # Draw order: name, price, manufacturer
    name_length = random.randint(5, 10)
    name = ''.join(random.choices(letters, k=name_length))
    price = round(random.uniform(5, 100), 2)
    maker_length = random.randint(5, 15)
    maker = ''.join(random.choices(letters, k=maker_length))

    return {"product": {"name": name, "price": price}, "manufacturer": maker}

# Fixed spec and description shared by every row
spec = {
    "operation": "modify-default-beta",
    "spec": {
        "product": {
            "company": "@(2,manufacturer)"
        }
    }
}
context = """
modify-default-beta and modify-overwrite-beta examples: used to fetch field from above hierarchy
"""

# Generate 100 rows
num_rows = 100
rows = []
for _ in range(num_rows):
    input_data = generate_random_product()
    # Build the output from fresh dicts. A shallow copy of input_data shares
    # the nested "product" dict, so adding "company" to the copy would also
    # add it to the recorded input.
    output_data = {
        **input_data,
        "product": {**input_data["product"], "company": input_data["manufacturer"]},
    }
    rows.append({"input": input_data, "output": output_data, "spec": spec, "context": context})

# Convert rows to DataFrame
df = pd.DataFrame(rows, columns=["input", "output", "spec", "context"])

# Save DataFrame to Excel
output_file = "generated_training_data.xlsx"
df.to_excel(output_file, index=False)

print(f"Training data saved to {output_file}")


Training data saved to generated_training_data.xlsx


In [61]:
import json
import random
import string
import pandas as pd

# Function to generate random input-output pair
def generate_random_data():
    """Create one random customer sample for the pipe (``|``) shift example.

    Returns:
        tuple[dict, dict]: ``(input_data, output_data)``. Both hold a single
        ``"customer"`` object with the same values; the input stores the name
        under ``"fullName"`` and the output stores it under ``"name"``.
        The name is 5-15 ASCII letters and the email is 5-10 lowercase
        letters followed by ``"@example.com"``.
    """
    name_length = random.randint(5, 15)
    name_alphabet = string.ascii_uppercase + string.ascii_lowercase
    full_name = ''.join(random.choices(name_alphabet, k=name_length))

    local_length = random.randint(5, 10)
    local_part = ''.join(random.choices(string.ascii_lowercase, k=local_length))
    email = local_part + "@example.com"

    input_data = {"customer": {"fullName": full_name, "email": email}}
    output_data = {"customer": {"name": full_name, "email": email}}
    return input_data, output_data

# Collect the samples; each entry is an (input, output) tuple
num_rows = 100
rows = []
for _ in range(num_rows):
    rows.append(generate_random_data())

# Shift spec and explanatory context shared by all rows
spec = {
    "operation": "shift",
    "spec": {
        "customer": {
            "fullName|customerName": "customer.name",
            "email": "customer.&"
        }
    }
}
context = """
| (pipe) allows referencing multiple fields or objects of an input JSON so that, regardless of the name of the field or object, its value is allocated to the same destination in the output JSON.
"""

# Tabulate the pairs, then attach the JSON-encoded spec and the context to every row
df = pd.DataFrame(rows, columns=["input", "output"]).assign(
    spec=json.dumps(spec),
    context=context,
)

# Export the table to an Excel workbook without the index column
output_file = "generated_training_data.xlsx"
df.to_excel(excel_writer=output_file, index=False)

print(f"Training data saved to {output_file}")


Training data saved to generated_training_data.xlsx


In [11]:
import pandas as pd
import json

import random  # used below for random.randint; this cell's own import lines omit it

# Jolt transformation specification
jolt_spec = [
    {
        "operation": "modify-overwrite-beta",
        "spec": {
            "numScores": "=size(@(1,scores))",
            "firstScore": "=firstElement(@(1,scores))",
            "lastScore": "=lastElement(@(1,scores))",
            "scoreAtMidPoint": "=elementAt(@(1,scores),2)",
            "sortedScores": "=sort(@(1,scores))"
        }
    }
]

# Generate 100 records
records = []
for _ in range(100):
    # Generate random scores list with 5 elements
    scores = [random.randint(1, 10) for _ in range(5)]

    # Create input dictionary
    input_data = {"scores": scores}

    # Expected result of the spec, computed in Python (Jolt itself is not invoked).
    # firstElement/lastElement are positional (first and last list items), not
    # min/max, and elementAt(...,2) selects the item at zero-based index 2.
    output_data = {
        "scores": scores,
        "numScores": len(scores),
        "firstScore": scores[0],
        "lastScore": scores[-1],
        "scoreAtMidPoint": scores[2],
        "sortedScores": sorted(scores)
    }

    records.append((input_data, output_data, jolt_spec))

# Tabulate the (input, output, spec) triples
df = pd.DataFrame(data=records, columns=["input", "output", "spec"])

# Export the table to an Excel workbook without the index column
df.to_excel(excel_writer="jolt_records.xlsx", index=False)


In [2]:
pip install faker


Collecting faker
  Downloading Faker-25.2.0-py3-none-any.whl (1.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m17.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: faker
Successfully installed faker-25.2.0


In [13]:
import pandas as pd
import random

# Jolt transformation specification
# NOTE(review): "sumLongData" uses =intSum, the same function as "sumIntData" — confirm this is intended.
jolt_spec = [
    {
        "operation": "modify-overwrite-beta",
        "spec": {
            "sumIntData": "=intSum(@(1,intData))",
            "sumLongData": "=intSum(@(1,intData))",
            "sumDoubleData": "=doubleSum(@(1,doubleData))",
            "avgIntData": "=avg(@(1,intData))",
            "avgDoubleData": "=avg(@(1,doubleData))",
            "sortedIntScores": "=sort(@(1,intData))",
            "minAB": "=min(@(1,a),@(1,b))",
            "maxAB": "=max(@(1,a),@(1,b))",
            "abs": "=abs(@(1,negative))",
            "aDivB": "=divide(@(1,a),@(1,b))",
            "aDivC": "=divide(@(1,a),@(1,c))",
            "aDivCRounded4": "=divideAndRound(4,@(1,a),@(1,c))"
        }
    }
]

# Build 100 (input, expected output, spec) triples
records = []
for _ in range(100):
    # Random operands: two 3-element lists, three integers in [1, 10], one value in [-10, 0]
    int_data = [random.randint(1, 10) for _ in range(3)]
    double_data = [random.uniform(0, 10) for _ in range(3)]
    a, b, c = (random.randint(1, 10) for _ in range(3))
    negative = random.uniform(-10, 0)

    # Input document handed to the spec
    input_data = {
        "intData": int_data,
        "doubleData": double_data,
        "a": a,
        "b": b,
        "c": c,
        "negative": negative
    }

    # Expected output, computed in Python (Jolt itself is not invoked):
    # the input fields followed by one derived field per spec entry
    int_total = sum(int_data)
    double_total = sum(double_data)
    a_over_c = a / c
    output_data = {
        **input_data,
        "sumIntData": int_total,
        "sumLongData": int_total,
        "sumDoubleData": double_total,
        "avgIntData": int_total / len(int_data),
        "avgDoubleData": double_total / len(double_data),
        "sortedIntScores": sorted(int_data),
        "minAB": min(a, b),
        "maxAB": max(a, b),
        "abs": abs(negative),
        "aDivB": a / b,
        "aDivC": a_over_c,
        "aDivCRounded4": round(a_over_c, 4)
    }

    records.append((input_data, output_data, jolt_spec))

# Tabulate the (input, output, spec) triples
df = pd.DataFrame(data=records, columns=["input", "output", "spec"])

# Export the table to an Excel workbook without the index column
df.to_excel(excel_writer="jolt_math_records.xlsx", index=False)


In [16]:
import pandas as pd
import random
import numpy as np

# Jolt transformation specification
# NOTE(review): the spec writes an "_id" key, but the expected output built below
# never contains "_id" — confirm which of the two is intended.
jolt_spec = [
    {
        "operation": "modify-overwrite-beta",
        "spec": {
            "happy": "=toBoolean",
            "meh": ["=toBoolean", False],
            "answer": "=toString",
            "statistics": {
                "*": {
                    "min": ["=toInteger", None],  # Default value set to None
                    "max": ["=toInteger", None],
                    "avg": ["=toDouble", None],
                    "_id": "UNKNOWN"
                }
            }
        }
    }
]

# Build 100 (input, expected output, spec) triples
records = []
for _ in range(100):
    # Random scalar values
    happy = random.choice([True, False])
    meh = random.choice([True, False])
    answer = random.randint(1, 100)

    # Between 1 and 5 statistics entries; "id" is always present,
    # while "min", "max" and "avg" are each included with probability 0.5
    statistics = []
    for _ in range(random.randint(1, 5)):
        stat = {"id": random.choice(["A", "B", "C", "D"])}
        include_min = random.random() < 0.5
        if include_min:
            stat["min"] = random.randint(1, 10)
        include_max = random.random() < 0.5
        if include_max:
            stat["max"] = random.randint(1, 10)
        include_avg = random.random() < 0.5
        if include_avg:
            stat["avg"] = round(random.uniform(1, 10), 1)
        statistics.append(stat)

    # Input document: booleans are given as lowercase strings, the answer as an int
    input_data = {
        "happy": "true" if happy else "false",
        "meh": "true" if meh else "false",
        "answer": answer,
        "statistics": statistics
    }

    # Expected output, computed in Python (Jolt itself is not invoked):
    # real booleans, the answer as a string, and every statistics entry
    # normalised to the same four keys (absent ones become None)
    output_data = {
        "happy": happy,
        "meh": meh,
        "answer": str(answer),
        "statistics": [
            {field: s.get(field) for field in ("id", "min", "max", "avg")}
            for s in statistics
        ]
    }

    records.append((input_data, output_data, jolt_spec))

# Tabulate the (input, output, spec) triples
df = pd.DataFrame(data=records, columns=["input", "output", "spec"])

# Export the table to an Excel workbook without the index column
df.to_excel(excel_writer="jolt_type_conversion_records.xlsx", index=False)


In [18]:
import pandas as pd
import random
import string

# Jolt transformation specification
jolt_spec = [
    {
        "operation": "modify-default-beta",
        "spec": {
            "y": "=join(',',@(1,x))",
            "z": "=join(' ',@(1,x))",
            "small_toUpper": "=toUpper(@(1,small))",
            "BIG_toLower": "=toLower(@(1,BIG))",
            "people": {
                "*": {
                    "fullName": "=concat(@(1,firstName),' ',@(1,lastName))",
                    "address?": {
                        "state": "Texas"
                    }
                }
            }
        }
    }
]

# Function to generate random strings
def random_string(length):
    """Return a string of `length` random ASCII letters (upper and lower case)."""
    letters = string.ascii_letters
    return ''.join(random.choice(letters) for _ in range(length))

# Function to generate random names
def random_name():
    """Return a ``(first_name, last_name)`` tuple picked from two fixed lists."""
    first_names = ["John", "Jane", "Bob", "Alice", "Tom", "Lucy", "Mike", "Emma"]
    last_names = ["Smith", "Doe", "Brown", "Johnson", "White", "Jones", "Taylor", "Lee"]
    return random.choice(first_names), random.choice(last_names)

# Function to generate random people data
def random_people_data(num_people):
    """Return a list of `num_people` person dicts.

    Every person has "firstName" and "lastName". With probability 0.5 a person
    also gets an "address" dict whose "state" is None or one of four state names.
    """
    people = []
    for _ in range(num_people):
        first_name, last_name = random_name()
        person = {"firstName": first_name, "lastName": last_name}
        if random.random() < 0.5:
            person["address"] = {"state": random.choice([None, "New York", "California", "Florida", "Texas"])}
        people.append(person)
    return people

# Generate 100 records
records = []
for _ in range(100):
    # Generate random values
    x = [random.choice([random.randint(1, 10), random_string(2)]) for _ in range(random.randint(3, 6))]
    small = random_string(5).lower()
    BIG = random_string(5).upper()

    # Generate random people data
    people = random_people_data(random.randint(1, 3))

    # Create input dictionary
    input_data = {
        "x": x,
        "small": small,
        "BIG": BIG,
        "people": people
    }

    # Expected output, computed in Python (Jolt itself is not invoked)
    transformed_people = []
    for person in people:
        transformed = {
            "firstName": person["firstName"],
            "lastName": person["lastName"]
        }
        # "address?" only applies when the input person already has an address,
        # so people without one must not gain an address in the output.
        if "address" in person:
            # Copy before filling the default: the input dict must keep its
            # original (possibly null) state instead of being overwritten.
            address = dict(person["address"])
            if address.get("state") is None:
                address["state"] = "Texas"
            transformed["address"] = address
        transformed["fullName"] = f"{person['firstName']} {person['lastName']}"
        transformed_people.append(transformed)

    output_data = {
        "x": x,
        "small": small,
        "BIG": BIG,
        "people": transformed_people,
        "y": ",".join(map(str, x)),
        "z": " ".join(map(str, x)),
        "small_toUpper": small.upper(),
        "BIG_toLower": BIG.lower()
    }

    records.append((input_data, output_data, jolt_spec))

# Tabulate the (input, output, spec) triples
df = pd.DataFrame(data=records, columns=["input", "output", "spec"])

# Export the table to an Excel workbook without the index column
df.to_excel(excel_writer="jolt_string_concatenation_records.xlsx", index=False)


In [23]:
import pandas as pd

# Read the workbook that contains the misplaced columns
df = pd.read_excel("joltData.xlsx")

# Exchange the 'output' and 'spec' values for index labels 14 through 40.
# .loc slices by label and includes both endpoints; converting the right-hand
# side to a plain array stops pandas from re-aligning the columns by name.
# NOTE(review): this range was previously described as "rows 16 to 41
# (index 15 to 40)", which does not match the 14:40 slice — confirm the range.
df.loc[14:40, ['output', 'spec']] = df.loc[14:40, ['spec', 'output']].to_numpy()

# Write the corrected table to a separate workbook
df.to_excel(excel_writer="joltTrainingData.xlsx", index=False)


In [25]:
import pandas as pd

def ensure_spec_is_list(spec):
    """Wrap a spec string in square brackets unless it is already bracketed.

    Args:
        spec: Cell value from the 'spec' column. Expected to be a string, but
            any value is accepted.

    Returns:
        The original string when, ignoring surrounding whitespace, it starts
        with '[' and ends with ']'; otherwise the string wrapped as "[<spec>]".
        Non-string values (e.g. None, or the NaN pandas uses for empty cells)
        are returned unchanged instead of raising AttributeError on .strip().
    """
    if not isinstance(spec, str):
        # Nothing to wrap: leave missing / non-text cells exactly as they are
        return spec

    trimmed = spec.strip()
    if trimmed.startswith('[') and trimmed.endswith(']'):
        return spec  # It is already a list

    # It's not a list, wrap it in square brackets
    return f"[{spec}]"

# Read the training workbook
df = pd.read_excel("joltTrainingData.xlsx")

# Run every value of the 'spec' column through ensure_spec_is_list
df['spec'] = df['spec'].map(ensure_spec_is_list)

# Write the result back to the same workbook it was read from (overwrites it)
df.to_excel(excel_writer="joltTrainingData.xlsx", index=False)

