In [3]:
import json
import os
from typing import Dict, List

# ==========================================
# CONFIG
# ==========================================
# Destination JSON file for the generated Phase-1 samples.
OUTPUT_FILE = "sql_ast/phase1_simple_select.json"

# ==========================================
# SCHEMA (SOURCE OF TRUTH)
# ==========================================
# Maps each table name to the list of its column names. The generator below
# emits one sample per (table, column, template) combination.
SCHEMA: Dict[str, List[str]] = {
    "employees": ["id", "name", "salary", "department", "location"],
    "orders": ["id", "amount", "date", "customer_id"],
    "students": ["id", "name", "year", "cgpa", "branch"]
}

# ==========================================
# NL TEMPLATES (LOGICAL ONLY)
# ==========================================
# str.format templates for the natural-language side of a sample; the
# generator fills in the {column} and {table} placeholders.
NL_TEMPLATES = [
    "show {column} from {table}",
    "get {column} from {table}",
    "list {column} from {table}",
    "display {column} from {table}"
]

# ==========================================
# DETERMINISTIC GENERATOR (FAST & SAFE)
# ==========================================
def generate_phase1_dataset(
    schema: Dict[str, List[str]],
    templates: List[str]
) -> List[Dict]:
    """Enumerate every Phase-1 ``SELECT <COLUMN> FROM <TABLE>`` sample.

    Args:
        schema: Mapping of table name to its column names.
        templates: ``str.format`` templates that receive ``column`` and
            ``table`` keyword arguments.

    Returns:
        One dict per (table, column, template) combination, ordered by table,
        then column, then template. Each dict holds the rendered ``nl_query``,
        a fresh ``input_tokens`` list and the ``schema_bindings`` that resolve
        the ``<TABLE>`` / ``<COLUMN>`` placeholders.
    """
    return [
        {
            "nl_query": phrase.format(column=col_name, table=table_name),
            # The list literal is evaluated per sample, so no two samples
            # share a token list.
            "input_tokens": [
                "<START>", "SELECT", "<COLUMN>", "FROM", "<TABLE>", "<END>"
            ],
            "schema_bindings": {
                "<TABLE>": table_name,
                "<COLUMN>": f"{table_name}.{col_name}"
            }
        }
        for table_name, col_names in schema.items()
        for col_name in col_names
        for phrase in templates
    ]

# ==========================================
# MAIN
# ==========================================
if __name__ == "__main__":
    # Create the directory OUTPUT_FILE actually points into, so that editing
    # the config constant cannot leave the write below targeting a folder
    # that was never created.
    output_dir = os.path.dirname(OUTPUT_FILE)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    dataset = generate_phase1_dataset(SCHEMA, NL_TEMPLATES)

    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(dataset, f, indent=2)

    # Status messages (emoji repaired from mis-decoded UTF-8).
    print(f"✅ Phase-1 dataset generated: {len(dataset)} samples")
    print(f"📁 Saved to: {OUTPUT_FILE}")


✅ Phase-1 dataset generated: 56 samples
📁 Saved to: sql_ast/phase1_simple_select.json


In [1]:
import json
import os
from typing import Dict, List

# ==========================================
# CONFIG
# ==========================================
# Destination JSON file for the generated Phase-2 samples.
OUTPUT_FILE = "sql_ast/phase2_select_where.json"

# ==========================================
# SCHEMA (SOURCE OF TRUTH)
# ==========================================
# table name -> {"numeric": [...], "text": [...]}. The generator selects from
# both buckets, filters numeric columns with every operator in NUMERIC_OPS and
# filters text columns with "=" only.
SCHEMA: Dict[str, Dict[str, List[str]]] = {
    "employees": {
        "numeric": ["id", "salary"],
        "text": ["name", "department", "location"]
    },
    "orders": {
        "numeric": ["id", "amount", "customer_id"],
        "text": []
    },
    "students": {
        "numeric": ["id", "year", "cgpa"],
        "text": ["name", "branch"]
    },
    "customers": {
        "numeric": ["id", "order_id"],
        "text": ["name", "city"]
    }
}

# ==========================================
# SAFE VALUES
# ==========================================
# Literal values per text column name. The generator skips text columns that
# have no entry here and currently reads only the first value of each list.
TEXT_VALUES = {
    "department": ["IT", "HR", "Sales"],
    "location": ["Mumbai", "Delhi"],
    "branch": ["CS", "IT"],
    "city": ["Mumbai", "Pune"],
    "name": ["Alice", "Bob"]
}

# Comparison literals and operators for numeric filters. The generator
# currently uses only NUMERIC_VALUES[0], paired with every operator.
NUMERIC_VALUES = [10, 50, 100, 500, 1000]
NUMERIC_OPS = [">", "<", ">=", "<=", "="]

# ==========================================
# NL TEMPLATES
# ==========================================
# str.format templates taking sel_col / table / where_col / op / val.
# The generator uses index 0 for numeric filters and index 1 for text filters;
# index 2 is not referenced by it.
NL_TEMPLATES = [
    "show {sel_col} from {table} where {where_col} {op} {val}",
    "get {sel_col} from {table} where {where_col} {op} {val}",
    "list {sel_col} from {table} where {where_col} {op} {val}"
]

# ==========================================
# GENERATOR
# ==========================================
def generate_phase2_dataset(schema):
    """Enumerate the Phase-2 ``SELECT ... FROM ... WHERE ...`` samples.

    For each table, first every (select column, numeric filter column,
    operator) combination is emitted using ``NUMERIC_VALUES[0]`` and
    ``NL_TEMPLATES[0]``; then every (select column, text filter column) pair
    is emitted with ``=``, the first entry of ``TEXT_VALUES[column]`` and
    ``NL_TEMPLATES[1]``. Text columns missing from ``TEXT_VALUES`` are skipped.

    Args:
        schema: Mapping of table name to ``{"numeric": [...], "text": [...]}``.

    Returns:
        List of sample dicts in the order described above.
    """
    samples = []

    for table, typed_cols in schema.items():
        numeric_cols = typed_cols["numeric"]
        text_cols = typed_cols["text"]

        # ---------- numeric WHERE ----------
        for select_col in numeric_cols + text_cols:
            for filter_col in numeric_cols:
                for operator in NUMERIC_OPS:
                    threshold = NUMERIC_VALUES[0]
                    phrase = NL_TEMPLATES[0].format(
                        sel_col=select_col,
                        table=table,
                        where_col=filter_col,
                        op=operator,
                        val=threshold,
                    )
                    samples.append(
                        _where_sample(
                            phrase, table, select_col, filter_col,
                            operator, threshold,
                        )
                    )

        # ---------- text WHERE ----------
        for select_col in numeric_cols + text_cols:
            for filter_col in text_cols:
                if filter_col in TEXT_VALUES:
                    literal = TEXT_VALUES[filter_col][0]
                    phrase = NL_TEMPLATES[1].format(
                        sel_col=select_col,
                        table=table,
                        where_col=filter_col,
                        op="=",
                        val=literal,
                    )
                    samples.append(
                        _where_sample(
                            phrase, table, select_col, filter_col,
                            "=", literal,
                        )
                    )

    return samples


def _where_sample(nl_text, table, select_col, filter_col, operator, value):
    """Assemble one Phase-2 sample dict (fresh token list on every call)."""
    return {
        "nl_query": nl_text,
        "input_tokens": [
            "<START>",
            "SELECT",
            "<COLUMN>",
            "FROM",
            "<TABLE>",
            "WHERE",
            "<COLUMN>",
            operator,
            "<VALUE>",
            "<END>",
        ],
        "schema_bindings": {
            "<TABLE>": table,
            # Positional: [select column, filter column].
            "<COLUMN>": [
                f"{table}.{select_col}",
                f"{table}.{filter_col}",
            ],
            "<VALUE>": value,
        },
    }

# ==========================================
# MAIN
# ==========================================
if __name__ == "__main__":
    # Create the directory OUTPUT_FILE actually points into, so that editing
    # the config constant cannot leave the write below targeting a folder
    # that was never created.
    output_dir = os.path.dirname(OUTPUT_FILE)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    dataset = generate_phase2_dataset(SCHEMA)

    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(dataset, f, indent=2)

    # Status messages (emoji repaired from mis-decoded UTF-8).
    print(f"✅ Phase-2 dataset generated: {len(dataset)} samples")
    print(f"📁 Saved to: {OUTPUT_FILE}")

✅ Phase-2 dataset generated: 243 samples
📁 Saved to: sql_ast/phase2_select_where.json


In [2]:
"""
Phase-3 Dataset Generator
Covers:
1️⃣ Aggregation only
2️⃣ Aggregation + GROUP BY
3️⃣ Aggregation + HAVING
4️⃣ Aggregation + GROUP BY + HAVING

Output: sql_ast/phase3_groupby_having.json
"""

import random
import json
import os
from typing import Dict, List, Any

# =====================================================
# SCHEMA
# =====================================================
# Phase-3 schema: table name -> column names bucketed by type
# ("numeric", "text", "date"). Every table carries all three keys, possibly
# with an empty list; the builders index these keys directly.
SCHEMA = {
    "employees": {
        "numeric": ["id", "salary"],
        "text": ["name", "department", "location"],
        "date": []
    },
    "orders": {
        "numeric": ["id", "amount", "customer_id"],
        "text": [],
        "date": ["date"]
    },
    "students": {
        "numeric": ["id", "year", "cgpa"],
        "text": ["name", "branch"],
        "date": []
    },
    "customers": {
        "numeric": ["id", "order_id"],
        "text": ["name", "city"],
        "date": []
    }
}

# Aggregate functions offered per column type. build_agg_only looks up the
# list for whichever type it picked; the GROUP BY / HAVING builders read only
# the "numeric" entry.
AGG_RULES = {
    "numeric": ["SUM", "AVG", "MIN", "MAX", "COUNT"],
    "text": ["COUNT"],
    "date": ["MIN", "MAX", "COUNT"]
}

# =====================================================
# TOKEN STRUCTURES (MODEL TARGET)
# =====================================================
# Target token sequences, one per query shape. Placeholders in angle brackets
# (<AGG>, <COLUMN>, <TABLE>, <VALUE>) are resolved through each sample's
# "schema_bindings"; the remaining entries are literal tokens.

# SELECT <AGG> <COLUMN> FROM <TABLE>
TOKENS_AGG_ONLY = [
    "<START>", "SELECT", "<AGG>", "<COLUMN>", "FROM", "<TABLE>", "<END>"
]

# ... GROUP_BY <COLUMN>
TOKENS_GROUP_BY = [
    "<START>", "SELECT", "<AGG>", "<COLUMN>", "FROM", "<TABLE>",
    "GROUP_BY", "<COLUMN>", "<END>"
]

# ... HAVING <AGG> <COLUMN> > <VALUE>  (the comparison operator is fixed to ">")
TOKENS_HAVING = [
    "<START>", "SELECT", "<AGG>", "<COLUMN>", "FROM", "<TABLE>",
    "HAVING", "<AGG>", "<COLUMN>", ">", "<VALUE>", "<END>"
]

# ... GROUP_BY <COLUMN> HAVING <AGG> <COLUMN> > <VALUE>
TOKENS_GROUP_BY_HAVING = [
    "<START>", "SELECT", "<AGG>", "<COLUMN>", "FROM", "<TABLE>",
    "GROUP_BY", "<COLUMN>",
    "HAVING", "<AGG>", "<COLUMN>", ">", "<VALUE>", "<END>"
]

# =====================================================
# HELPERS
# =====================================================
def random_table():
    """Return the name of one table drawn at random from ``SCHEMA``."""
    table_names = list(SCHEMA)
    return random.choice(table_names)

def pick_column(table: str, col_type: str):
    """Return a random column of ``col_type`` from ``table``.

    Args:
        table: Key into ``SCHEMA``.
        col_type: Type bucket inside that table (e.g. ``"numeric"``).

    Raises:
        IndexError: If the selected bucket is empty (from ``random.choice``).
    """
    candidates = SCHEMA[table][col_type]
    return random.choice(candidates)

def random_value():
    """Return a random integer between 10 and 100000, both ends included."""
    lowest, highest = 10, 100000
    # randrange excludes its stop value, hence the +1 to keep 100000 reachable.
    return random.randrange(lowest, highest + 1)

# =====================================================
# SAMPLE BUILDERS
# =====================================================
def build_agg_only():
    """Build one aggregation-only sample (``SELECT <AGG> <COLUMN> FROM <TABLE>``).

    A random table is chosen, then a random column type among those that have
    at least one column in that table, then an aggregate allowed for that
    type by ``AGG_RULES`` and finally a column of that type.

    Returns:
        Dict with ``nl_query``, ``input_tokens`` and ``schema_bindings``.
    """
    table = random_table()
    # Only type buckets that actually contain columns are eligible.
    populated_types = [kind for kind, cols in SCHEMA[table].items() if cols]
    col_type = random.choice(populated_types)
    agg = random.choice(AGG_RULES[col_type])
    col = pick_column(table, col_type)

    return {
        "nl_query": f"show {agg.lower()} {col} from {table}",
        # Copy the template: handing out the module-level list itself would
        # let a mutation of one sample corrupt every other sample.
        "input_tokens": list(TOKENS_AGG_ONLY),
        "schema_bindings": {
            "<TABLE>": table,
            "<AGG>": agg,
            "<COLUMN>": f"{table}.{col}"
        }
    }

def build_group_by():
    """Build one aggregation + GROUP BY sample for a random table.

    The grouping column comes from the table's text and date columns; the
    aggregated column is numeric and the aggregate is drawn from
    ``AGG_RULES["numeric"]``.

    Returns:
        A sample dict (``nl_query`` / ``input_tokens`` / ``schema_bindings``),
        or ``None`` when the chosen table has no text/date column to group by
        or no numeric column to aggregate.
    """
    table = random_table()
    group_candidates = SCHEMA[table]["text"] + SCHEMA[table]["date"]
    # Skip tables that cannot supply both roles instead of raising IndexError
    # from random.choice on an empty list.
    if not group_candidates or not SCHEMA[table]["numeric"]:
        return None

    group_col = random.choice(group_candidates)
    agg_col = pick_column(table, "numeric")
    agg = random.choice(AGG_RULES["numeric"])

    return {
        "nl_query": f"show {agg.lower()} {agg_col} by {group_col} from {table}",
        # Copy so that samples never alias the module-level template list.
        "input_tokens": list(TOKENS_GROUP_BY),
        "schema_bindings": {
            "<TABLE>": table,
            "<AGG>": agg,
            "<COLUMN>": {
                "select": f"{table}.{agg_col}",
                "group_by": f"{table}.{group_col}"
            }
        }
    }

def build_having():
    """Build one aggregation + HAVING sample (no GROUP BY) for a random table.

    The same numeric column and aggregate appear in both the SELECT and the
    HAVING part; the comparison is always ``>`` against a random integer.

    Returns:
        A sample dict (``nl_query`` / ``input_tokens`` / ``schema_bindings``),
        or ``None`` when the chosen table has no numeric column.
    """
    table = random_table()
    # Avoid an IndexError from random.choice on a table without numerics;
    # the dataset generator already skips falsy samples.
    if not SCHEMA[table]["numeric"]:
        return None

    agg_col = pick_column(table, "numeric")
    agg = random.choice(AGG_RULES["numeric"])
    value = random_value()

    return {
        "nl_query": f"show {agg.lower()} {agg_col} from {table} having {agg.lower()} {agg_col} > {value}",
        # Copy so that samples never alias the module-level template list.
        "input_tokens": list(TOKENS_HAVING),
        "schema_bindings": {
            "<TABLE>": table,
            "<AGG>": agg,
            "<COLUMN>": {
                "select": f"{table}.{agg_col}",
                "having": f"{table}.{agg_col}"
            },
            "<VALUE>": value
        }
    }

def build_group_by_having():
    """Build one aggregation + GROUP BY + HAVING sample for a random table.

    The grouping column comes from the table's text and date columns. The
    same numeric column and aggregate are used in SELECT and HAVING, and the
    HAVING comparison is always ``>`` against a random integer.

    Returns:
        A sample dict (``nl_query`` / ``input_tokens`` / ``schema_bindings``),
        or ``None`` when the chosen table has no text/date column to group by
        or no numeric column to aggregate.
    """
    table = random_table()
    group_candidates = SCHEMA[table]["text"] + SCHEMA[table]["date"]
    # Skip tables that cannot supply both roles instead of raising IndexError
    # from random.choice on an empty list.
    if not group_candidates or not SCHEMA[table]["numeric"]:
        return None

    group_col = random.choice(group_candidates)
    agg_col = pick_column(table, "numeric")
    agg = random.choice(AGG_RULES["numeric"])
    value = random_value()

    return {
        "nl_query": (
            f"show {agg.lower()} {agg_col} by {group_col} "
            f"from {table} having {agg.lower()} {agg_col} > {value}"
        ),
        # Copy so that samples never alias the module-level template list.
        "input_tokens": list(TOKENS_GROUP_BY_HAVING),
        "schema_bindings": {
            "<TABLE>": table,
            "<AGG>": agg,
            "<COLUMN>": {
                "select": f"{table}.{agg_col}",
                "group_by": f"{table}.{group_col}",
                "having": f"{table}.{agg_col}"
            },
            "<VALUE>": value
        }
    }

# =====================================================
# DATASET GENERATOR
# =====================================================
def generate_phase3(n: int):
    """Collect ``n`` Phase-3 samples by repeatedly calling a random builder.

    Each attempt picks one of the four sample builders at random; a falsy
    result (a builder signalling "not applicable") is discarded and another
    attempt is made.

    Args:
        n: Number of samples to collect.

    Returns:
        List of sample dicts; empty when ``n`` is not positive.
    """
    candidates = (
        build_agg_only,
        build_group_by,
        build_having,
        build_group_by_having,
    )

    collected = []
    while True:
        if len(collected) >= n:
            return collected
        builder = random.choice(candidates)
        produced = builder()
        if not produced:
            continue
        collected.append(produced)

# =====================================================
# MAIN
# =====================================================
if __name__ == "__main__":
    # Fixed seed so that "Restart Kernel -> Run All" regenerates the exact
    # same dataset instead of a different random draw on every run.
    random.seed(42)

    # Single source for the destination path; the directory is derived from it.
    output_file = "sql_ast/phase3_groupby_having.json"
    os.makedirs(os.path.dirname(output_file), exist_ok=True)

    samples = generate_phase3(2000)

    # Explicit UTF-8, as the Phase-1 and Phase-2 writers in this notebook do,
    # rather than the platform-dependent default encoding.
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(samples, f, indent=2)

    # Status message (emoji repaired from mis-decoded UTF-8).
    print(f"✅ Phase-3 dataset generated: {len(samples)} samples")

✅ Phase-3 dataset generated: 2000 samples


In [4]:
"""
Phase-4 Dataset Generator — JOINs
Covers:
1️⃣ JOIN only
2️⃣ JOIN + WHERE
3️⃣ JOIN + AGG
4️⃣ JOIN + AGG + GROUP BY

NL queries NEVER mention join types explicitly.
"""

import random
import json
import os

# =====================================================
# SCHEMA (Phase-4 compatible)
# =====================================================
# Phase-4 schema: table name -> {"numeric": [...], "text": [...]}.
# random_numeric / random_text draw columns from these two buckets.
# NOTE(review): "manager_id" and "order_date" are listed under "text" —
# confirm that classification is intended.
SCHEMA = {
    "employees": {
        "numeric": ["emp_id", "salary"],
        "text": ["first_name", "department", "location"]
    },
    "departments": {
        "numeric": ["dept_id"],
        "text": ["dept_name", "manager_id"]
    },
    "customers": {
        "numeric": ["id", "age"],
        "text": ["last_name"]
    },
    "orders": {
        "numeric": ["order_id", "customer_id", "total_amount"],
        "text": ["order_date"]
    },
    "products": {
        "numeric": ["product_id", "unit_price"],
        "text": ["product_name", "category"]
    },
    "inventory": {
        "numeric": ["stock_id", "product_id", "quantity"],
        "text": ["warehouse_location"]
    }
}

# =====================================================
# JOIN DEFINITIONS (RULE SOURCE)
# =====================================================
# Joinable table pairs. Each entry carries:
#   "left" / "right" : table names (keys of SCHEMA)
#   "on"             : (left-table column, right-table column), fully qualified
#   "nl_templates"   : natural-language phrasings that imply this join
JOINS = [
    {
        "left": "employees",
        "right": "departments",
        "on": ("employees.department", "departments.dept_name"),
        "nl_templates": [
            "employees with their department details",
            "employees and their departments",
            "department information for employees"
        ]
    },
    {
        "left": "customers",
        "right": "orders",
        "on": ("customers.id", "orders.customer_id"),
        "nl_templates": [
            "customers and their orders",
            "orders placed by customers",
            "customer order details"
        ]
    },
    {
        "left": "products",
        "right": "inventory",
        "on": ("products.product_id", "inventory.product_id"),
        "nl_templates": [
            "products and their stock",
            "inventory details for products",
            "product availability information"
        ]
    }
]

# =====================================================
# TOKEN STRUCTURES (MODEL TARGET)
# =====================================================
# Target token sequences, one per query shape. Angle-bracket placeholders are
# resolved through each sample's "schema_bindings"; the two <COLUMN> tokens
# after "ON" are the join columns (there is no explicit "=" token).

# SELECT <COLUMN> FROM <TABLE> JOIN <TABLE> ON <COLUMN> <COLUMN>
TOKENS_JOIN_ONLY = [
    "<START>", "SELECT", "<COLUMN>",
    "FROM", "<TABLE>",
    "JOIN", "<TABLE>", "ON", "<COLUMN>", "<COLUMN>",
    "<END>"
]

# ... WHERE <COLUMN> > <VALUE>  (the comparison operator is fixed to ">")
TOKENS_JOIN_WHERE = [
    "<START>", "SELECT", "<COLUMN>",
    "FROM", "<TABLE>",
    "JOIN", "<TABLE>", "ON", "<COLUMN>", "<COLUMN>",
    "WHERE", "<COLUMN>", ">", "<VALUE>",
    "<END>"
]

# SELECT <AGG> <COLUMN> FROM <TABLE> JOIN <TABLE> ON <COLUMN> <COLUMN>
TOKENS_JOIN_AGG = [
    "<START>", "SELECT", "<AGG>", "<COLUMN>",
    "FROM", "<TABLE>",
    "JOIN", "<TABLE>", "ON", "<COLUMN>", "<COLUMN>",
    "<END>"
]

# ... GROUP_BY <COLUMN>
TOKENS_JOIN_AGG_GROUP = [
    "<START>", "SELECT", "<AGG>", "<COLUMN>",
    "FROM", "<TABLE>",
    "JOIN", "<TABLE>", "ON", "<COLUMN>", "<COLUMN>",
    "GROUP_BY", "<COLUMN>",
    "<END>"
]

# =====================================================
# HELPERS
# =====================================================
def random_join():
    """Return one join definition drawn at random from ``JOINS``."""
    chosen = random.choice(JOINS)
    return chosen

def random_numeric(table):
    """Return a random column name from ``table``'s ``"numeric"`` bucket."""
    numeric_cols = SCHEMA[table]["numeric"]
    return random.choice(numeric_cols)

def random_text(table):
    """Return a random column name from ``table``'s ``"text"`` bucket."""
    text_cols = SCHEMA[table]["text"]
    return random.choice(text_cols)

def random_value():
    """Return a random integer between 10 and 100000, both ends included."""
    lowest, highest = 10, 100000
    # randrange excludes its stop value, hence the +1 to keep 100000 reachable.
    return random.randrange(lowest, highest + 1)

# =====================================================
# SAMPLE BUILDERS
# =====================================================
def build_join_only():
    """Build one plain JOIN sample selecting a text column of the left table.

    Returns:
        Dict with ``nl_query`` (one of the join's NL templates),
        ``input_tokens`` and ``schema_bindings``. The ``<COLUMN>`` binding is
        positional: [select column, left join column, right join column].
    """
    j = random_join()
    col = random_text(j["left"])
    left_on, right_on = j["on"][0], j["on"][1]

    return {
        "nl_query": random.choice(j["nl_templates"]),
        # Copy the template: handing out the module-level list itself would
        # let a mutation of one sample corrupt every other sample.
        "input_tokens": list(TOKENS_JOIN_ONLY),
        "schema_bindings": {
            "<TABLE>": [j["left"], j["right"]],
            "<COLUMN>": [
                f"{j['left']}.{col}",
                left_on,
                right_on
            ]
        }
    }

def build_join_where():
    """Build one JOIN + WHERE sample filtering on a right-table numeric column.

    The same right-table column is both selected and filtered with
    ``> <VALUE>``.

    Returns:
        Dict with ``nl_query``, ``input_tokens`` and ``schema_bindings``. The
        ``<COLUMN>`` binding is positional: [select column, left join column,
        right join column, where column].
    """
    j = random_join()
    col = random_numeric(j["right"])
    value = random_value()
    qualified_col = f"{j['right']}.{col}"

    return {
        "nl_query": (
            f"{random.choice(j['nl_templates'])} "
            f"where {col} greater than {value}"
        ),
        # Copy so that samples never alias the module-level template list.
        "input_tokens": list(TOKENS_JOIN_WHERE),
        "schema_bindings": {
            "<TABLE>": [j["left"], j["right"]],
            "<COLUMN>": [
                qualified_col,
                j["on"][0],
                j["on"][1],
                qualified_col
            ],
            "<VALUE>": value
        }
    }

def build_join_agg():
    """Build one JOIN + aggregate sample over a right-table numeric column.

    The aggregate is drawn from SUM / AVG / COUNT.

    Returns:
        Dict with ``nl_query``, ``input_tokens`` and ``schema_bindings``. The
        ``<COLUMN>`` binding is positional: [aggregated column, left join
        column, right join column].
    """
    j = random_join()
    agg_col = random_numeric(j["right"])
    agg = random.choice(["SUM", "AVG", "COUNT"])

    return {
        "nl_query": (
            f"show {agg.lower()} {agg_col} "
            f"for {random.choice(j['nl_templates'])}"
        ),
        # Copy so that samples never alias the module-level template list.
        "input_tokens": list(TOKENS_JOIN_AGG),
        "schema_bindings": {
            "<TABLE>": [j["left"], j["right"]],
            "<AGG>": agg,
            "<COLUMN>": [
                f"{j['right']}.{agg_col}",
                j["on"][0],
                j["on"][1]
            ]
        }
    }

def build_join_agg_group():
    """Build one JOIN + aggregate + GROUP BY sample.

    A right-table numeric column is aggregated (SUM / AVG / COUNT) and grouped
    by a left-table text column.

    Returns:
        Dict with ``nl_query``, ``input_tokens`` and ``schema_bindings``; here
        ``<COLUMN>`` is a dict keyed by role (``select``, ``group_by``,
        ``join_left``, ``join_right``).
    """
    j = random_join()
    agg_col = random_numeric(j["right"])
    group_col = random_text(j["left"])
    agg = random.choice(["SUM", "AVG", "COUNT"])

    return {
        "nl_query": (
            f"show {agg.lower()} {agg_col} by {group_col} "
            f"for {random.choice(j['nl_templates'])}"
        ),
        # Copy so that samples never alias the module-level template list.
        "input_tokens": list(TOKENS_JOIN_AGG_GROUP),
        "schema_bindings": {
            "<TABLE>": [j["left"], j["right"]],
            "<AGG>": agg,
            "<COLUMN>": {
                "select": f"{j['right']}.{agg_col}",
                "group_by": f"{j['left']}.{group_col}",
                "join_left": j["on"][0],
                "join_right": j["on"][1]
            }
        }
    }

# =====================================================
# DATASET GENERATOR
# =====================================================
def generate_phase4(n: int):
    """Collect ``n`` Phase-4 JOIN samples, each from a randomly chosen builder.

    Args:
        n: Number of samples to produce.

    Returns:
        List of sample dicts; empty when ``n`` is not positive.
    """
    candidates = (
        build_join_only,
        build_join_where,
        build_join_agg,
        build_join_agg_group,
    )

    collected = []
    while True:
        if len(collected) >= n:
            return collected
        builder = random.choice(candidates)
        collected.append(builder())

# =====================================================
# MAIN
# =====================================================
if __name__ == "__main__":
    # Fixed seed so that "Restart Kernel -> Run All" regenerates the exact
    # same dataset instead of a different random draw on every run.
    random.seed(42)

    # Single source for the destination path; the directory is derived from it.
    output_file = "sql_ast/phase4_join.json"
    os.makedirs(os.path.dirname(output_file), exist_ok=True)

    samples = generate_phase4(2000)

    # Explicit UTF-8, as the Phase-1 and Phase-2 writers in this notebook do,
    # rather than the platform-dependent default encoding.
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(samples, f, indent=2)

    # Status message (emoji repaired from mis-decoded UTF-8).
    print(f"✅ Phase-4 JOIN dataset generated: {len(samples)} samples")

✅ Phase-4 JOIN dataset generated: 2000 samples


In [1]:
"""
Phase-4 Dataset Generator — JOINs (EXTENDED)
Covers:
1️⃣ INNER JOIN
2️⃣ LEFT JOIN
3️⃣ RIGHT JOIN
4️⃣ JOIN + WHERE
5️⃣ JOIN + GROUP BY
6️⃣ JOIN + GROUP BY + HAVING

NL queries NEVER mention join types explicitly.
JOIN semantics are implied by intent.
"""

import random
import json
import os

# =====================================================
# SCHEMA (Phase-4 compatible)
# =====================================================
# Phase-4 schema: table name -> {"numeric": [...], "text": [...]}.
# random_numeric / random_text draw columns from these two buckets.
# NOTE(review): "manager_id" and "order_date" are listed under "text" —
# confirm that classification is intended.
SCHEMA = {
    "employees": {
        "numeric": ["emp_id", "salary"],
        "text": ["first_name", "department", "location"]
    },
    "departments": {
        "numeric": ["dept_id"],
        "text": ["dept_name", "manager_id"]
    },
    "customers": {
        "numeric": ["id", "age"],
        "text": ["last_name"]
    },
    "orders": {
        "numeric": ["order_id", "customer_id", "total_amount"],
        "text": ["order_date"]
    },
    "products": {
        "numeric": ["product_id", "unit_price"],
        "text": ["product_name", "category"]
    },
    "inventory": {
        "numeric": ["stock_id", "product_id", "quantity"],
        "text": ["warehouse_location"]
    }
}

# =====================================================
# JOIN DEFINITIONS (WITH PK–FK + JOIN TYPE)
# =====================================================
# Joinable table pairs. Each entry carries:
#   "left" / "right"  : table names (keys of SCHEMA)
#   "join_type"       : "INNER" / "LEFT" / "RIGHT"; builders append " JOIN"
#   "primary_key"     : fully qualified key column on the referenced side
#   "foreign_key"     : fully qualified referencing column
#   "on"              : (left-table column, right-table column); not read by
#                       the builders visible in this cell, which use the
#                       foreign_key / primary_key fields instead
#   "nl_templates"    : natural-language phrasings that imply this join
# NOTE(review): the first entry names departments.dept_name as primary key
# although departments also has dept_id — confirm.
JOINS = [
    {
        "left": "employees",
        "right": "departments",
        "join_type": "INNER",
        "primary_key": "departments.dept_name",
        "foreign_key": "employees.department",
        "on": ("employees.department", "departments.dept_name"),
        "nl_templates": [
            "employees with their department details",
            "employees and their departments",
            "department information for employees"
        ]
    },
    {
        "left": "customers",
        "right": "orders",
        "join_type": "LEFT",
        "primary_key": "customers.id",
        "foreign_key": "orders.customer_id",
        "on": ("customers.id", "orders.customer_id"),
        "nl_templates": [
            "customers and their orders",
            "orders placed by customers",
            "customer order details"
        ]
    },
    {
        "left": "products",
        "right": "inventory",
        "join_type": "RIGHT",
        "primary_key": "products.product_id",
        "foreign_key": "inventory.product_id",
        "on": ("products.product_id", "inventory.product_id"),
        "nl_templates": [
            "products and their stock",
            "inventory details for products",
            "product availability information"
        ]
    }
]

# =====================================================
# TOKEN STRUCTURES
# =====================================================
# Target token sequences, one per query shape. Angle-bracket placeholders
# (<COLUMN>, <TABLE>, <JOIN_TYPE>, <AGG>, <VALUE>) are resolved through each
# sample's "schema_bindings"; the two <COLUMN> tokens after "ON" are the join
# columns (there is no explicit "=" token).

# SELECT <COLUMN> FROM <TABLE> <JOIN_TYPE> <TABLE> ON <COLUMN> <COLUMN>
TOKENS_JOIN = [
    "<START>", "SELECT", "<COLUMN>",
    "FROM", "<TABLE>",
    "<JOIN_TYPE>", "<TABLE>",
    "ON", "<COLUMN>", "<COLUMN>",
    "<END>"
]

# ... WHERE <COLUMN> > <VALUE>  (four <COLUMN> slots in total)
TOKENS_JOIN_WHERE = [
    "<START>", "SELECT", "<COLUMN>",
    "FROM", "<TABLE>",
    "<JOIN_TYPE>", "<TABLE>",
    "ON", "<COLUMN>", "<COLUMN>",
    "WHERE", "<COLUMN>", ">", "<VALUE>",
    "<END>"
]

# SELECT <AGG> <COLUMN> ... GROUP_BY <COLUMN>
TOKENS_JOIN_GROUP = [
    "<START>", "SELECT", "<AGG>", "<COLUMN>",
    "FROM", "<TABLE>",
    "<JOIN_TYPE>", "<TABLE>",
    "ON", "<COLUMN>", "<COLUMN>",
    "GROUP_BY", "<COLUMN>",
    "<END>"
]

# ... GROUP_BY <COLUMN> HAVING <AGG> <COLUMN> > <VALUE>
TOKENS_JOIN_GROUP_HAVING = [
    "<START>", "SELECT", "<AGG>", "<COLUMN>",
    "FROM", "<TABLE>",
    "<JOIN_TYPE>", "<TABLE>",
    "ON", "<COLUMN>", "<COLUMN>",
    "GROUP_BY", "<COLUMN>",
    "HAVING", "<AGG>", "<COLUMN>", ">", "<VALUE>",
    "<END>"
]

# =====================================================
# HELPERS
# =====================================================
def random_join():
    """Return one join definition drawn at random from ``JOINS``."""
    chosen = random.choice(JOINS)
    return chosen

def random_numeric(table):
    """Return a random column name from ``table``'s ``"numeric"`` bucket."""
    numeric_cols = SCHEMA[table]["numeric"]
    return random.choice(numeric_cols)

def random_text(table):
    """Return a random column name from ``table``'s ``"text"`` bucket."""
    text_cols = SCHEMA[table]["text"]
    return random.choice(text_cols)

def random_value():
    """Return a random integer between 10 and 100000, both ends included."""
    lowest, highest = 10, 100000
    # randrange excludes its stop value, hence the +1 to keep 100000 reachable.
    return random.randrange(lowest, highest + 1)

# =====================================================
# BUILDERS
# =====================================================
def base_join_bindings(j):
    """Return the bindings shared by list-style JOIN samples.

    Args:
        j: A join definition with ``left``, ``right``, ``join_type``,
            ``foreign_key`` and ``primary_key`` entries.

    Returns:
        A new dict binding ``<TABLE>`` to [left, right], ``<JOIN_TYPE>`` to
        ``"<join_type> JOIN"`` and ``<COLUMN>`` to [foreign key, primary key].
        Callers extend or replace the ``<COLUMN>`` list as needed.
    """
    tables = [j["left"], j["right"]]
    join_keyword = j["join_type"] + " JOIN"
    on_columns = [j["foreign_key"], j["primary_key"]]

    bindings = {}
    bindings["<TABLE>"] = tables
    bindings["<JOIN_TYPE>"] = join_keyword
    bindings["<COLUMN>"] = on_columns
    return bindings

def build_inner_or_outer_join():
    """Build one plain JOIN sample selecting a text column of the left table.

    The join type (INNER / LEFT / RIGHT) comes from the chosen join
    definition and is exposed through the ``<JOIN_TYPE>`` binding.

    Returns:
        Dict with ``nl_query`` (one of the join's NL templates),
        ``input_tokens`` and ``schema_bindings``. The ``<COLUMN>`` binding is
        positional: [select column, foreign key, primary key].
    """
    j = random_join()
    col = random_text(j["left"])

    bindings = base_join_bindings(j)
    # Replace the base [fk, pk] list so the select column comes first,
    # matching the order of the <COLUMN> slots in TOKENS_JOIN.
    bindings["<COLUMN>"] = [
        f"{j['left']}.{col}",
        j["foreign_key"],
        j["primary_key"]
    ]

    return {
        "nl_query": random.choice(j["nl_templates"]),
        # Copy the template: handing out the module-level list itself would
        # let a mutation of one sample corrupt every other sample.
        "input_tokens": list(TOKENS_JOIN),
        "schema_bindings": bindings
    }

def build_join_where():
    """Build one JOIN + WHERE sample filtering on a right-table numeric column.

    The same right-table column is both selected and filtered with
    ``> <VALUE>``.

    Returns:
        Dict with ``nl_query``, ``input_tokens`` and ``schema_bindings``. The
        ``<COLUMN>`` binding is positional and has one entry per ``<COLUMN>``
        slot of ``TOKENS_JOIN_WHERE``: [select column, foreign key, primary
        key, where column].
    """
    j = random_join()
    col = random_numeric(j["right"])
    value = random_value()
    qualified_col = f"{j['right']}.{col}"

    bindings = base_join_bindings(j)
    # TOKENS_JOIN_WHERE contains four <COLUMN> slots (SELECT, two after ON,
    # WHERE). Appending only the filter column to the base [fk, pk] list left
    # the SELECT slot unbound and shifted every later binding by one.
    bindings["<COLUMN>"] = [
        qualified_col,
        j["foreign_key"],
        j["primary_key"],
        qualified_col
    ]
    bindings["<VALUE>"] = value

    return {
        "nl_query": f"{random.choice(j['nl_templates'])} where {col} is greater than {value}",
        # Copy so that samples never alias the module-level template list.
        "input_tokens": list(TOKENS_JOIN_WHERE),
        "schema_bindings": bindings
    }

def build_join_groupby():
    """Build one JOIN + aggregate + GROUP BY sample.

    A right-table numeric column is aggregated (SUM / AVG / COUNT) and grouped
    by a left-table text column.

    Returns:
        Dict with ``nl_query``, ``input_tokens`` and ``schema_bindings``; here
        ``<COLUMN>`` is a dict keyed by role. ``join_left`` carries the join's
        foreign key and ``join_right`` its primary key.
    """
    j = random_join()
    agg = random.choice(["SUM", "AVG", "COUNT"])
    agg_col = random_numeric(j["right"])
    group_col = random_text(j["left"])

    return {
        "nl_query": f"show {agg.lower()} {agg_col} by {group_col} for {random.choice(j['nl_templates'])}",
        # Copy so that samples never alias the module-level template list.
        "input_tokens": list(TOKENS_JOIN_GROUP),
        "schema_bindings": {
            "<TABLE>": [j["left"], j["right"]],
            "<JOIN_TYPE>": f"{j['join_type']} JOIN",
            "<AGG>": agg,
            "<COLUMN>": {
                "select": f"{j['right']}.{agg_col}",
                "group_by": f"{j['left']}.{group_col}",
                "join_left": j["foreign_key"],
                "join_right": j["primary_key"]
            }
        }
    }

def build_join_groupby_having():
    """Build one JOIN + aggregate + GROUP BY + HAVING sample.

    A right-table numeric column is aggregated (SUM / AVG / COUNT), grouped by
    a left-table text column and filtered with ``HAVING <AGG> <COLUMN> >
    <VALUE>`` on that same aggregated column.

    Returns:
        Dict with ``nl_query``, ``input_tokens`` and ``schema_bindings``; here
        ``<COLUMN>`` is a dict keyed by role. ``join_left`` carries the join's
        foreign key and ``join_right`` its primary key.
    """
    j = random_join()
    agg = random.choice(["SUM", "AVG", "COUNT"])
    agg_col = random_numeric(j["right"])
    group_col = random_text(j["left"])
    value = random_value()
    qualified_agg_col = f"{j['right']}.{agg_col}"

    return {
        "nl_query": f"show {group_col} with {agg.lower()} {agg_col} above {value} for {random.choice(j['nl_templates'])}",
        # Copy so that samples never alias the module-level template list.
        "input_tokens": list(TOKENS_JOIN_GROUP_HAVING),
        "schema_bindings": {
            "<TABLE>": [j["left"], j["right"]],
            "<JOIN_TYPE>": f"{j['join_type']} JOIN",
            "<AGG>": agg,
            "<COLUMN>": {
                "select": qualified_agg_col,
                "group_by": f"{j['left']}.{group_col}",
                "having": qualified_agg_col,
                "join_left": j["foreign_key"],
                "join_right": j["primary_key"]
            },
            "<VALUE>": value
        }
    }

# =====================================================
# DATASET GENERATOR
# =====================================================
def generate_phase4(n: int):
    """Collect ``n`` extended Phase-4 JOIN samples from random builders.

    Args:
        n: Number of samples to produce.

    Returns:
        List of sample dicts; empty when ``n`` is not positive.
    """
    candidates = (
        build_inner_or_outer_join,
        build_join_where,
        build_join_groupby,
        build_join_groupby_having,
    )

    collected = []
    while True:
        if len(collected) >= n:
            return collected
        builder = random.choice(candidates)
        collected.append(builder())

# =====================================================
# MAIN
# =====================================================
if __name__ == "__main__":
    # Fixed seed so that "Restart Kernel -> Run All" regenerates the exact
    # same dataset instead of a different random draw on every run.
    random.seed(42)

    # Single source for the destination path; the directory is derived from it.
    output_file = "sql_ast/phase4.5_join.json"
    os.makedirs(os.path.dirname(output_file), exist_ok=True)

    samples = generate_phase4(2000)

    # Explicit UTF-8, as the Phase-1 and Phase-2 writers in this notebook do,
    # rather than the platform-dependent default encoding.
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(samples, f, indent=2)

    # Status message (emoji repaired from mis-decoded UTF-8).
    print(f"✅ Phase-4.5 JOIN dataset generated: {len(samples)} samples")


✅ Phase-4.5 JOIN dataset generated: 2000 samples
