# SQLite to Parquet

This notebook implements a fully automated, RAM-safe strategy to migrate a large SQLite database from Google Drive to a Hugging Face Parquet dataset.

### Features
- **Smart Source Selection**: Uses `drive.mount` if you know the file path, OR `gdown` if you just have a shared link.
- **Fast Access**: Downloads/Copies to Colab VM (never to your local PC).
- **Smart Detection**: Automatically finds the largest table to convert.
- **DuckDB**: Fast, low-memory conversion.

In [None]:
!pip install duckdb huggingface_hub gdown

In [None]:
import os
import shutil
import json
import duckdb
import gdown
from google.colab import drive
from huggingface_hub import HfApi, login

# ==========================================
# CONFIGURATION
# ==========================================
# In-script defaults; every key can be overridden by an uploaded JSON file.
config = dict(
    source_url="",
    google_drive_path="",
    hf_repo_id="YOUR_USERNAME/dataset-name",
    hf_token="hf_YOUR_WRITE_TOKEN",
)

# Optional overrides file, looked up in the current working directory.
config_path = "migration_config.json"
if not os.path.exists(config_path):
    print("Using in-script placeholders.")
else:
    print(f"Found {config_path}, loading values...")
    with open(config_path, 'r') as fh:
        overrides = json.load(fh)
    # Values from the file win over the defaults above.
    config.update(overrides)

print(f"Target Repo: {config['hf_repo_id']}")

## 1. Setup & Ingestion
Automatically chooses between a Google Drive mount and a `gdown` download, based on the config.

In [None]:
# Ingestion: put a copy of the SQLite file on the Colab VM as `local_db`.
#   Strategy 1 copies it from a mounted Google Drive path.
#   Strategy 2 downloads it from a shared Drive link with gdown.
# Raises FileNotFoundError when neither strategy leaves a file at `local_db`.
local_db = "local_input.db"
source_path = config.get("google_drive_path", "")
source_url = config.get("source_url", "")

# STRATEGY 1: CHECK GOOGLE DRIVE PATH
# Skipped when the path is empty, is not under the Drive mount point, or
# still contains the "YOUR_DB_FILE" placeholder.
if source_path and "/content/drive" in source_path and "YOUR_DB_FILE" not in source_path:
    print("Strategy: Google Drive System Mount")
    if not os.path.exists('/content/drive'):
        drive.mount('/content/drive')

    if os.path.exists(source_path):
        print("Copying database from Drive to Local VM... (Please wait)")
        shutil.copy(source_path, local_db)
        print("Copy complete.")
    else:
        print(f"File not found at {source_path}. Falling back to URL...")

# STRATEGY 2: CHECK GDOWN URL
# Only runs when no local copy exists yet (i.e. Strategy 1 did not produce one).
if not os.path.exists(local_db) and source_url and "drive.google.com" in source_url:
    print("Strategy: Gdown (Direct Download to Colab)")
    # Extract the file ID from ".../d/<id>/..." style links. The ID is cut at
    # the first '/', '?' or '#' so that a link such as ".../d/<id>?usp=sharing"
    # does not leak its query string into the download URL.
    file_id = None
    if '/d/' in source_url:
        id_segment = source_url.split('/d/')[1]
        for separator in ('/', '?', '#'):
            id_segment = id_segment.split(separator)[0]
        file_id = id_segment or None

    if file_id:
        url = f'https://drive.google.com/uc?id={file_id}'
        gdown.download(url, local_db, quiet=False)
    else:
        # No "/d/<id>" segment: let gdown work the ID out of the link itself.
        gdown.download(source_url, local_db, quiet=False, fuzzy=True)

    if os.path.exists(local_db):
        print("Download complete.")
    else:
        print("Download failed.")

if not os.path.exists(local_db):
    raise FileNotFoundError("Check your config! Could not find DB via Path OR URL.")

## 2. Smart Table Detection
Scans the database and selects the table with the most rows automatically.

In [None]:
# Table detection: list the user tables in `local_db`, count the rows of each,
# and keep the name of the largest one in `best_table`.
# Raises ValueError when no table has at least one readable row.
best_table = None
max_rows = 0

con = duckdb.connect(local_db)
try:
    tables = con.execute(
        "SELECT name FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%';"
    ).fetchall()
    table_names = [t[0] for t in tables]

    print(f"Found tables: {table_names}")

    # Check row counts to find the "Main" table
    for table in table_names:
        try:
            # Quote the identifier (doubling embedded quotes) so names with
            # spaces, hyphens or reserved words do not break the query.
            quoted_table = '"' + table.replace('"', '""') + '"'
            count = con.execute(f"SELECT COUNT(*) FROM {quoted_table}").fetchone()[0]
            print(f"   - Table '{table}': {count} rows")
            if count > max_rows:
                max_rows = count
                best_table = table
        except Exception as e:
            # Best effort: an unreadable table is reported and skipped.
            print(f"   - Error reading '{table}': {e}")
finally:
    # Always release the database handle, even if the catalog query fails.
    con.close()

if not best_table:
    raise ValueError("No data found in any table!")

print(f"Auto-selected largest table: '{best_table}'")

## 3. Conversion & Upload
Converts the selected table to Parquet and uploads it to Hugging Face.

In [None]:
# Conversion: export every row of `best_table` from `local_db` into a
# Snappy-compressed Parquet file named by `parquet_file`.
parquet_file = "train.parquet"

# Quote the identifier (doubling embedded quotes) so table names with spaces,
# hyphens or reserved words are accepted by the COPY statement.
quoted_table = '"' + best_table.replace('"', '""') + '"'

con = duckdb.connect(local_db)
try:
    print(f"Converting '{best_table}' to Parquet...")
    con.execute(f"""
        COPY (SELECT * FROM {quoted_table})
        TO '{parquet_file}'
        (FORMAT 'PARQUET', CODEC 'SNAPPY');
    """)
finally:
    # Close the connection even when the export fails.
    con.close()
print("Conversion Complete.")

# Upload: authenticate with Hugging Face, make sure the private dataset repo
# exists, then push `parquet_file` to it as data/train.parquet.
print("Uploading to Hugging Face...")

# A missing/empty token or an untouched placeholder means "ask interactively".
# "YOUR_WRITE_TOKEN" is listed explicitly because the in-script default
# ("hf_YOUR_WRITE_TOKEN") does not contain "YOUR_TOKEN".
hf_token = config.get("hf_token") or ""
placeholder_markers = ("OPTIONAL", "YOUR_TOKEN", "YOUR_WRITE_TOKEN")
token_is_placeholder = (
    not hf_token.strip()
    or any(marker in hf_token for marker in placeholder_markers)
)

if token_is_placeholder:
    print("Logging in interactively (Token not in config)...")
    login()
else:
    login(token=hf_token)

api = HfApi()
repo_id = config["hf_repo_id"]

print(f"Ensuring repo {repo_id} exists...")
# exist_ok=True makes re-running this cell safe when the repo is already there.
api.create_repo(repo_id=repo_id, repo_type="dataset", private=True, exist_ok=True)

print(f"Uploading {parquet_file}...")
try:
    api.upload_file(
        path_or_fileobj=parquet_file,
        path_in_repo="data/train.parquet",
        repo_id=repo_id,
        repo_type="dataset"
    )
    print("Upload Success!")
    print(f"DONE! Dataset is live at: https://huggingface.co/datasets/{repo_id}")
except Exception as e:
    # Best effort: report the failure instead of aborting the notebook cell.
    print(f"Upload failed: {e}")