In [None]:
import os
import json
import pandas as pd
from typing import List, Dict

In [None]:
# Create the directory that holds the downloaded and processed dataset files.
# exist_ok=True keeps this cell safe to re-run when the directory already exists.
os.makedirs("datasets", exist_ok=True)

In [None]:
# ====== 1. Download the Spider dataset ======
# Saves the train/dev examples and the table schemas under datasets/.
# NOTE(review): confirm these raw URLs still resolve. `wget -O` creates the
# output file up front, so a failed download can leave an empty file behind
# and the error only surfaces later when the JSON is parsed.
!wget -O datasets/spider_train.json https://raw.githubusercontent.com/taoyds/spider/master/spider/train.json
!wget -O datasets/spider_dev.json https://raw.githubusercontent.com/taoyds/spider/master/spider/dev.json
!wget -O datasets/spider_tables.json https://raw.githubusercontent.com/taoyds/spider/master/spider/tables.json


In [None]:
def load_spider_dataset(path: str, tables_path: str) -> List[Dict]:
    """Load Spider examples and pair each one with its database schema text.

    Args:
        path: JSON file holding a list of examples. Each example must provide
            the keys "db_id", "question" and "query".
        tables_path: JSON file holding a list of database descriptions. Each
            description must provide "db_id", "table_names_original" (list of
            table names) and "column_names_original" (list of
            [table_index, column_name] pairs; a negative table_index marks
            the "*" pseudo-column that belongs to no table).

    Returns:
        One dict per example with the keys "schema", "question" and
        "sql_query". "schema" contains one "Table: <name>" line followed by a
        "Columns: <c1>, <c2>, ..." line per table; the per-table entries are
        separated by a single space.

    Raises:
        KeyError: if an example references a db_id that is absent from
            tables_path, or if a required key is missing.
    """
    with open(path, "r", encoding="utf-8") as f:
        examples = json.load(f)
    with open(tables_path, "r", encoding="utf-8") as f:
        databases = json.load(f)

    # The schema text depends only on the database, so render it once per
    # db_id instead of once per example.
    schema_by_db = {}
    for db in databases:
        table_names = db["table_names_original"]
        columns_by_table = [[] for _ in table_names]
        for table_idx, column_name in db["column_names_original"]:
            # Skip the "*" pseudo-column (index -1) and any out-of-range index.
            if 0 <= table_idx < len(table_names):
                columns_by_table[table_idx].append(column_name)
        schema_by_db[db["db_id"]] = " ".join(
            f"Table: {name}\nColumns: {', '.join(cols)}"
            for name, cols in zip(table_names, columns_by_table)
        )

    return [
        {
            "schema": schema_by_db[item["db_id"]],
            "question": item["question"],
            "sql_query": item["query"],
        }
        for item in examples
    ]


# Load both Spider splits; they share the same table-schema file.
spider_train = load_spider_dataset(
    path="datasets/spider_train.json",
    tables_path="datasets/spider_tables.json",
)
spider_dev = load_spider_dataset(
    path="datasets/spider_dev.json",
    tables_path="datasets/spider_tables.json",
)

In [None]:
# ====== 2. Download WikiSQL ======
# Saves the train/dev JSON Lines files under datasets/.
# NOTE(review): confirm these raw URLs still resolve. `wget -O` creates the
# output file up front, so a failed download can leave an empty file behind
# and the error only surfaces later when the file is parsed.
!wget -O datasets/wikisql_train.jsonl https://raw.githubusercontent.com/salesforce/WikiSQL/master/data/train.jsonl
!wget -O datasets/wikisql_dev.jsonl https://raw.githubusercontent.com/salesforce/WikiSQL/master/data/dev.jsonl

In [None]:
def load_wikisql_dataset(path: str) -> List[Dict]:
    """Load WikiSQL-style examples from a JSON Lines file.

    Args:
        path: JSONL file. Each non-blank line is a JSON object providing
            "table_id", "question" and a "sql" object with "col_names"
            (list of column-name strings) and "human_readable" (SQL text).

    Returns:
        One dict per example with the keys "schema", "question" and
        "sql_query".

    Raises:
        KeyError: if a record lacks one of the keys listed above.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    # NOTE(review): the upstream WikiSQL release appears to keep column headers
    # in separate *.tables.jsonl files and to store "sql" as sel/agg/conds
    # indices -- confirm the input really carries "col_names" and
    # "human_readable" before relying on this loader.
    processed_data = []
    # Stream the file inside a context manager so the handle is always closed.
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            # Tolerate blank lines (e.g. extra trailing newlines).
            if not line.strip():
                continue
            item = json.loads(line)
            schema_text = f"Table: {item['table_id']}\nColumns: {', '.join(item['sql']['col_names'])}"

            processed_data.append({
                "schema": schema_text,
                "question": item["question"],
                "sql_query": item["sql"]["human_readable"]
            })

    return processed_data


# Load both WikiSQL splits from the downloaded JSON Lines files.
wikisql_train = load_wikisql_dataset(
    path="datasets/wikisql_train.jsonl",
)
wikisql_dev = load_wikisql_dataset(
    path="datasets/wikisql_dev.jsonl",
)

In [None]:
# ====== 3. Download the BIRD dataset ======
# Saves the train/dev JSON files under datasets/.
# NOTE(review): confirm this repository and these raw URLs are the intended
# BIRD source. `wget -O` creates the output file up front, so a failed
# download can leave an empty file behind and the error only surfaces later
# when the JSON is parsed.
!wget -O datasets/bird_train.json https://raw.githubusercontent.com/megagonlabs/BIRD/main/data/train.json
!wget -O datasets/bird_dev.json https://raw.githubusercontent.com/megagonlabs/BIRD/main/data/dev.json

In [None]:
def load_bird_dataset(path: str) -> List[Dict]:
    """Read a BIRD JSON file and convert it to schema/question/SQL records.

    Args:
        path: JSON file whose top level is iterated record by record. Each
            record must provide "table", "columns" (an iterable of strings),
            "query_text" and "sql".

    Returns:
        One dict per record with the keys "schema", "question" and
        "sql_query", in input order.
    """
    # NOTE(review): presumably these keys match the downloaded files --
    # verify against an actual record before relying on this loader.
    with open(path, "r", encoding="utf-8") as source:
        records = json.load(source)

    def to_example(record: Dict) -> Dict:
        # Render the single-table schema header, then copy question and SQL.
        table_name = record["table"]
        column_list = ", ".join(record["columns"])
        return {
            "schema": f"Table: {table_name}\nColumns: {column_list}",
            "question": record["query_text"],
            "sql_query": record["sql"],
        }

    return [to_example(record) for record in records]


# Load both BIRD splits from the downloaded JSON files.
bird_train = load_bird_dataset(
    path="datasets/bird_train.json",
)
bird_dev = load_bird_dataset(
    path="datasets/bird_dev.json",
)

In [None]:
# ====== Save the processed data ======
# Concatenate the three sources per split (Spider, WikiSQL, BIRD, in that
# order) and write each split as a single JSON array indented by 4 spaces.
with open("datasets/processed_train.json", "w", encoding="utf-8") as f:
    f.write(json.dumps(spider_train + wikisql_train + bird_train, indent=4))

with open("datasets/processed_dev.json", "w", encoding="utf-8") as f:
    f.write(json.dumps(spider_dev + wikisql_dev + bird_dev, indent=4))

# Completion message (kept verbatim; it is user-facing output).
print("✅ Датасеты загружены и обработаны!")
