In [1]:
import time

import requests

# Example ingestion from URL

https://github.com/remram44/coronavirus-data/blob/historical/tests-by-zcta.historical.csv

## Step 1: Get direct link to file

You have to use the direct link to a file in a supported format (CSV, Excel, SPSS, ...), not a page in which the file is embedded. We click the "raw" button on GitHub and get:

In [2]:
# Direct ("raw") link to the CSV itself, not the GitHub page that embeds it
url = (
    "https://raw.githubusercontent.com/remram44/coronavirus-data/"
    "historical/tests-by-zcta.historical.csv"
)

## Step 2: Assemble some metadata

In [3]:
# Title and free-text description sent along with the registration request
name = "NYC Coronavirus (COVID-19) data"
description = (
    "Data are assembled by the NYC Department of Health and Mental Hygiene "
    "(DOHMH) Incident Command System for COVID-19 Response "
    "(Surveillance and Epidemiology Branch in collaboration with "
    "Public Information Office Branch). "
    "You can view these data on the Department of Health's website. "
    "Note that data are being collected in real-time and are preliminary "
    "and subject to change as COVID-19 response continues."
)

## Step 3: Issue POST request to Datamart to register the dataset

In [4]:
# Register the dataset by URL: only the metadata and the link (`address`)
# are sent in the form data, no file content.
response = requests.post(
    'https://auctus.vida-nyu.org/api/v1/upload',
    data={
        "name": name,
        "description": description,
        "address": url,
    },
    # requests has no default timeout; without one a stalled server hangs the cell
    timeout=60,
)
response.raise_for_status()
# Parse the JSON body once and reuse it for both the id and the displayed output
upload_result = response.json()
dataset_id = upload_result['id']
upload_result

{'id': 'datamart.url.9a47f900c06943ecb731d6213a5b883c'}

## Step 4: Wait for profiling to end, get result

Here profiling is almost instant, but it can take longer if the data is big, addresses need to be resolved, locations need to be clustered, etc.

In [5]:
# Poll the metadata endpoint until the dataset is reported as 'indexed'.
# The loop is bounded (30 attempts, 2 s apart) so the cell always terminates;
# if indexing has not finished by then, the last response is displayed as-is.
# NOTE(review): if the API has a terminal failure status, break on it too —
# confirm the possible status values against the API documentation.
for _ in range(30):
    response = requests.get(
        'https://auctus.vida-nyu.org/api/v1/metadata/' + dataset_id,
        timeout=30,  # requests has no default timeout
    )
    response.raise_for_status()
    metadata_result = response.json()
    if metadata_result.get('status') == 'indexed':
        break
    time.sleep(2)
metadata_result

{'id': 'datamart.url.9a47f900c06943ecb731d6213a5b883c',
 'status': 'indexed',
 'metadata': {'name': 'NYC Coronavirus (COVID-19) data',
  'source': 'upload',
  'description': "Data are assembled by the NYC Department of Health and Mental Hygiene (DOHMH) Incident Command System for COVID-19 Response (Surveillance and Epidemiology Branch in collaboration with Public Information Office Branch). You can view these data on the Department of Health's website. Note that data are being collected in real-time and are preliminary and subject to change as COVID-19 response continues.",
  'date': '2020-06-12T00:00:08.011577Z',
  'size': 398039,
  'nb_rows': 8332,
  'nb_profiled_rows': 8332,
  'columns': [{'name': 'date',
    'structural_type': 'http://schema.org/Text',
    'semantic_types': ['http://schema.org/Enumeration',
     'http://schema.org/DateTime'],
    'num_distinct_values': 47,
    'mean': 1587918818.918867,
    'stddev': 1185682.3387396699,
    'coverage': [{'range': {'gte': 1585758976

# Example ingestion by file upload

## Step 1: Get file in supported format

Get a file in a supported file format (Excel, SPSS, ... will be converted to CSV for you).

In [6]:
# Open the CSV in binary mode; the handle stays open because later cells
# pass it to requests and rewind it for reuse.
dataset = open('tests-by-zcta.historical.csv', mode='rb')

## Step 2: Assemble some metadata

In [7]:
# Same title and description as in the URL example, sent with the upload request
name = "NYC Coronavirus (COVID-19) data"
description = " ".join([
    "Data are assembled by the NYC Department of Health and Mental Hygiene"
    " (DOHMH) Incident Command System for COVID-19 Response (Surveillance"
    " and Epidemiology Branch in collaboration with Public Information"
    " Office Branch).",
    "You can view these data on the Department of Health's website.",
    "Note that data are being collected in real-time and are preliminary"
    " and subject to change as COVID-19 response continues.",
])

## Step 3: Issue POST request to Datamart to register the dataset

In [8]:
# Register the dataset by uploading the file itself: metadata goes in the
# form data, the open file handle goes in the multipart 'file' field.
response = requests.post(
    'https://auctus.vida-nyu.org/api/v1/upload',
    data={
        "name": name,
        "description": description,
    },
    files={'file': dataset},
    # requests has no default timeout; without one a stalled server hangs the cell
    timeout=60,
)
response.raise_for_status()
# Parse the JSON body once and reuse it for both the id and the displayed output
upload_result = response.json()
dataset_id = upload_result['id']
upload_result

{'id': 'datamart.upload.177d01411ce744be99bf47cc705cf6cf'}

## Step 4: Wait for profiling to end, get result

In [9]:
# Poll the metadata endpoint until the uploaded dataset is reported as
# 'indexed'. The loop is bounded (30 attempts, 2 s apart) so the cell always
# terminates; if indexing has not finished by then, the last response is
# displayed as-is.
# NOTE(review): if the API has a terminal failure status, break on it too —
# confirm the possible status values against the API documentation.
for _ in range(30):
    response = requests.get(
        'https://auctus.vida-nyu.org/api/v1/metadata/' + dataset_id,
        timeout=30,  # requests has no default timeout
    )
    response.raise_for_status()
    metadata_result = response.json()
    if metadata_result.get('status') == 'indexed':
        break
    time.sleep(2)
metadata_result

{'id': 'datamart.upload.177d01411ce744be99bf47cc705cf6cf',
 'status': 'indexed',
 'metadata': {'filename': 'tests-by-zcta.historical.csv',
  'name': 'NYC Coronavirus (COVID-19) data',
  'source': 'upload',
  'description': "Data are assembled by the NYC Department of Health and Mental Hygiene (DOHMH) Incident Command System for COVID-19 Response (Surveillance and Epidemiology Branch in collaboration with Public Information Office Branch). You can view these data on the Department of Health's website. Note that data are being collected in real-time and are preliminary and subject to change as COVID-19 response continues.",
  'size': 398039,
  'nb_rows': 8332,
  'nb_profiled_rows': 8332,
  'columns': [{'name': 'date',
    'structural_type': 'http://schema.org/Text',
    'semantic_types': ['http://schema.org/Enumeration',
     'http://schema.org/DateTime'],
    'num_distinct_values': 47,
    'mean': 1587918818.918867,
    'stddev': 1185682.3387396699,
    'coverage': [{'range': {'gte': 15

# Bonus: profile a file without adding it to the index

In [10]:
# The earlier requests.post() call read the file to its end, so move the
# position back to the start before sending the same handle again.
dataset.seek(0)

In [11]:
# Send the file to the profile endpoint; the profile comes back directly in
# the response body, as the displayed output shows.
response = requests.post(
    'https://auctus.vida-nyu.org/api/v1/profile',
    files={'data': dataset},
    # requests has no default timeout. The profile is returned in this same
    # response, so allow a generous wait rather than hanging forever.
    timeout=300,
)
response.raise_for_status()
response.json()

{'size': 398039,
 'nb_rows': 8332,
 'nb_profiled_rows': 8332,
 'columns': [{'name': 'date',
   'structural_type': 'http://schema.org/Text',
   'semantic_types': ['http://schema.org/Enumeration',
    'http://schema.org/DateTime'],
   'num_distinct_values': 47,
   'mean': 1587918818.918867,
   'stddev': 1185682.3387396699,
   'coverage': [{'range': {'gte': 1585758976.0, 'lte': 1587061760.0}},
    {'range': {'gte': 1587148032.0, 'lte': 1588446848.0}},
    {'range': {'gte': 1588526848.0, 'lte': 1589912064.0}}],
   'temporal_resolution': 'day'},
  {'name': 'MODZCTA',
   'structural_type': 'http://schema.org/Integer',
   'semantic_types': [],
   'unclean_values_ratio': 0.005640902544407105,
   'num_distinct_values': 179,
   'mean': 10828.056125528063,
   'stddev': 1500.0918206015112,
   'coverage': [{'range': {'gte': 10005.0, 'lte': 10471.0}},
    {'range': {'gte': 11104.0, 'lte': 11436.0}}]},
  {'name': 'Positive',
   'structural_type': 'http://schema.org/Integer',
   'semantic_types': [],


## You can also use our library locally

In [12]:
# Install the profiler library into the environment used by this kernel
# NOTE(review): consider pinning a version for reproducibility — confirm which release this notebook was run with
%pip install datamart-profiler

In [13]:
import datamart_profiler

In [14]:
# Profile the CSV with the installed datamart_profiler library and display the result
local_profile = datamart_profiler.process_dataset('tests-by-zcta.historical.csv')
local_profile

{'size': 398039,
 'nb_rows': 8332,
 'nb_profiled_rows': 8332,
 'columns': [{'name': 'date',
   'structural_type': 'http://schema.org/Text',
   'semantic_types': ['http://schema.org/Enumeration',
    'http://schema.org/DateTime'],
   'num_distinct_values': 47,
   'mean': 1587918818.918867,
   'stddev': 1185682.3387396699,
   'coverage': [{'range': {'gte': 1585758976.0, 'lte': 1587061760.0}},
    {'range': {'gte': 1587148032.0, 'lte': 1588446848.0}},
    {'range': {'gte': 1588526848.0, 'lte': 1589912064.0}}],
   'temporal_resolution': 'day'},
  {'name': 'MODZCTA',
   'structural_type': 'http://schema.org/Integer',
   'semantic_types': [],
   'unclean_values_ratio': 0.005640902544407105,
   'num_distinct_values': 179,
   'mean': 10828.056125528063,
   'stddev': 1500.0918206015112,
   'coverage': [{'range': {'gte': 10005.0, 'lte': 10471.0}},
    {'range': {'gte': 11104.0, 'lte': 11436.0}}]},
  {'name': 'Positive',
   'structural_type': 'http://schema.org/Integer',
   'semantic_types': [],
