# Export Iris Dataset to DuckDB

In [7]:
import numpy as np
import pandas as pd

from sklearn import datasets
from sklearn.preprocessing import StandardScaler

import duckdb

### Config

In [8]:
# Run configuration.
# ori_workload: when False, the table name gets a suffix giving `workload` in
# millions of rows; when True, the plain 'iris' name is kept.
ori_workload = False
table_name = 'iris'

if not ori_workload:
    workload = 20_000_000
    # True division yields a float (20.0 here); its '.' is replaced by '_',
    # so the resulting name is 'iris_20_0'.
    table_name = "_".join([table_name, str(workload / 1_000_000).replace(".", "_")])

table_name

'iris_20_0'

### Load iris data

In [9]:
# Names later used as the DataFrame columns for the four iris feature values.
iris_cols = [
    'sepal_length',
    'sepal_width',
    'petal_length',
    'petal_width',
]

# Load the iris dataset bundled with scikit-learn (features in .data, labels in .target).
iris = datasets.load_iris()

In [10]:
# Standardize the feature matrix with StandardScaler, wrap it in a DataFrame
# with the named feature columns, and append the class labels as 'label'.
scaler = StandardScaler()

iris_data = pd.DataFrame(
    scaler.fit_transform(iris.data),
    columns=iris_cols,
).assign(label=iris.target)

iris_data.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,label
0,-0.900681,1.019004,-1.340227,-1.315444,0
1,-1.143017,-0.131979,-1.340227,-1.315444,0
2,-1.385353,0.328414,-1.397064,-1.315444,0
3,-1.506521,0.098217,-1.283389,-1.315444,0
4,-1.021849,1.249201,-1.340227,-1.315444,0


In [11]:
if not ori_workload:
    # Inflate the dataset to `workload` rows by sampling with replacement.
    # A fixed random_state makes the draw reproducible, so a fresh
    # Restart & Run All generates the same rows every time.
    # NOTE: this rebinds iris_data; re-running only this cell resamples the
    # already-resampled frame rather than the original scaled data.
    iris_data = iris_data.sample(n=workload, replace=True, random_state=42)

In [12]:
# Disabled: would write iris_data without its 'label' column to 'dump.csv',
# omitting the header row.
# iris_data.drop(columns=['label']).to_csv('dump.csv', header=False)

### Connect to the DB

In [18]:
# Disabled: would open a DuckDB connection to the database file at
# "../test_4096.db" and bind it to `con`.
# con = duckdb.connect("../test_4096.db")

### Create the iris table and populate

In [19]:
# Disabled DuckDB export: drop and recreate the table named by `table_name`,
# fill it from the iris_data DataFrame, show its contents, then close `con`.
# NOTE(review): every statement in this cell is commented out, so any table
# output displayed for it presumably comes from an earlier run - confirm
# before relying on it. Re-enabling this cell also requires a live `con`.
# con.sql(f"DROP TABLE IF EXISTS {table_name}")

# con.sql(f"CREATE TABLE {table_name} (sepal_length float, sepal_width float, petal_length float, petal_width float, label integer)")

# con.sql(f"INSERT INTO {table_name} SELECT * FROM iris_data")

# con.sql(f"SELECT * FROM {table_name}").show()

# con.close()

┌──────────────┬─────────────┬──────────────┬──────────────┬───────┐
│ sepal_length │ sepal_width │ petal_length │ petal_width  │ label │
│    float     │    float    │    float     │    float     │ int32 │
├──────────────┼─────────────┼──────────────┼──────────────┼───────┤
│    0.6745011 │  0.32841405 │    0.4217337 │    0.3957741 │     1 │
│   0.31099755 │   -0.592373 │   0.13754657 │   0.13250974 │     1 │
│  -0.53717756 │   1.9397914 │    -1.397064 │   -1.0521799 │     0 │
│    0.5533333 │   0.7888076 │    1.0469455 │    1.5804638 │     2 │
│   -0.9006812 │   0.5586108 │   -1.1697142 │   -0.9205477 │     0 │
│   -0.9006812 │   1.7095946 │   -1.0560393 │   -1.0521799 │     0 │
│    0.5533333 │  -1.7433568 │   0.36489627 │   0.13250974 │     1 │
│   0.31099755 │   -0.592373 │   0.53540856 │ 0.0008775479 │     1 │
│    -1.021849 │   0.7888076 │   -1.2833891 │   -1.3154444 │     0 │
│   -0.7795133 │   1.0190043 │   -1.2833891 │   -1.3154444 │     0 │
│        ·     │       ·     │    

In [21]:
# Write iris_data to '../out.csv' with a header row and without the index column.
iris_data.to_csv(
    path_or_buf='../out.csv',
    index=False,
    header=True,
)