# Data Dictionaries and Schema

Examples of extracting a data dictionary or schema from a dataset held in a pandas DataFrame, using several different methods.

In [1]:
import json

import pandas as pd
import pandavro as pda
import pyarrow as pa
import pyarrow.parquet as pq
from sklearn import datasets

# fetch the iris dataset bundled with scikit-learn
iris = datasets.load_iris()

# wrap the measurements in a pandas dataframe, one named column per feature
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
df.head(n=5)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [2]:
# describe the dataframe as a JSON Table Schema (field names/types plus primary key)
pd.io.json.build_table_schema(data=df)

{'fields': [{'name': 'index', 'type': 'integer'},
  {'name': 'sepal length (cm)', 'type': 'number'},
  {'name': 'sepal width (cm)', 'type': 'number'},
  {'name': 'petal length (cm)', 'type': 'number'},
  {'name': 'petal width (cm)', 'type': 'number'}],
 'primaryKey': ['index'],
 'pandas_version': '0.20.0'}

In [3]:
# use pandavro schema inference for quick schema outline
# Newer pandavro releases expose the helper publicly as `schema_infer`, while
# older ones only provide the private `__schema_infer`. Try the public name
# first and fall back to the legacy one so the cell runs on either version.
infer_schema = getattr(pda, "schema_infer", None) or getattr(pda, "__schema_infer")
schema = infer_schema(df=df, times_as_micros=True)

# indent from json dumps helps make formatting more legible
print(json.dumps(schema, indent=4))

{
    "type": "record",
    "name": "Root",
    "fields": [
        {
            "name": "sepal length (cm)",
            "type": [
                "null",
                "double"
            ]
        },
        {
            "name": "sepal width (cm)",
            "type": [
                "null",
                "double"
            ]
        },
        {
            "name": "petal length (cm)",
            "type": [
                "null",
                "double"
            ]
        },
        {
            "name": "petal width (cm)",
            "type": [
                "null",
                "double"
            ]
        }
    ]
}


In [4]:
# convert the dataframe into an Arrow table ...
table = pa.Table.from_pandas(df=df)

# ... and display the schema Arrow derived for it
table.schema

sepal length (cm): double
sepal width (cm): double
petal length (cm): double
petal width (cm): double
-- schema metadata --
pandas: '{"index_columns": [{"kind": "range", "name": null, "start": 0, "' + 796

In [5]:
# show the schema-level metadata mapping in full (the schema repr truncates its 'pandas' entry)
table.schema.metadata

{b'pandas': b'{"index_columns": [{"kind": "range", "name": null, "start": 0, "stop": 150, "step": 1}], "column_indexes": [{"name": null, "field_name": null, "pandas_type": "unicode", "numpy_type": "object", "metadata": {"encoding": "UTF-8"}}], "columns": [{"name": "sepal length (cm)", "field_name": "sepal length (cm)", "pandas_type": "float64", "numpy_type": "float64", "metadata": null}, {"name": "sepal width (cm)", "field_name": "sepal width (cm)", "pandas_type": "float64", "numpy_type": "float64", "metadata": null}, {"name": "petal length (cm)", "field_name": "petal length (cm)", "pandas_type": "float64", "numpy_type": "float64", "metadata": null}, {"name": "petal width (cm)", "field_name": "petal width (cm)", "pandas_type": "float64", "numpy_type": "float64", "metadata": null}], "creator": {"library": "pyarrow", "version": "4.0.0"}, "pandas_version": "1.2.4"}'}

In [6]:
# write the dataframe out as parquet, then reopen the file with pyarrow
# and display its file-level metadata (column/row counts, format version, ...)
parquet_path = "iris.parquet"
df.to_parquet(path=parquet_path)
parquet_file = pq.ParquetFile(parquet_path)
parquet_file.metadata

<pyarrow._parquet.FileMetaData object at 0x000001D36C1ABFA8>
  created_by: parquet-cpp-arrow version 4.0.0
  num_columns: 4
  num_rows: 150
  num_row_groups: 1
  format_version: 1.0
  serialized_size: 3154