In [1]:
import json

# Load the keyword arguments for pysocrata from the JSON file under ../auth.
with open("../auth/nyc-open-data.json", "r") as auth_file:
    nyc_auth = json.loads(auth_file.read())

import pysocrata
# Every entry of the auth file is forwarded to get_datasets as a keyword argument.
nyc_datasets = pysocrata.get_datasets(**nyc_auth)

# Drop every entry whose resource type is 'story'.
nyc_datasets = [entry for entry in nyc_datasets if entry['resource']['type'] != 'story']

# Translation table from the catalog's resource types to the labels used below.
volcab_map = {
    'dataset': 'table',
    'href': 'link',
    'map': 'geospatial dataset',
    'file': 'blob',
}
# One label per remaining entry; a resource type missing from volcab_map raises KeyError.
nyc_types = [volcab_map[entry['resource']['type']] for entry in nyc_datasets]

# Resource ids, in the same order as nyc_datasets and nyc_types.
nyc_endpoints = [entry['resource']['id'] for entry in nyc_datasets]

In [4]:
import numpy as np
# Flag the entries labelled 'table', then turn the flags into index positions.
is_table = [label == 'table' for label in nyc_types]
table_indices = np.nonzero(is_table)

# Pick the matching ids and metadata records out of the parallel lists.
table_endpoints = np.array(nyc_endpoints)[table_indices]
table_datasets = np.array(nyc_datasets)[table_indices]

In [8]:
# Inspect the first table record to see which metadata fields it carries.
table_datasets[0]

{'classification': {'categories': [],
  'domain_category': 'City Government',
  'domain_metadata': [{'key': 'Update_Automation', 'value': 'Yes'},
   {'key': 'Update_Update-Frequency', 'value': 'Daily'},
   {'key': 'Dataset-Information_Agency',
    'value': 'Department of Citywide Administrative Services (DCAS)'}],
  'domain_tags': [],
  'tags': []},
 'link': 'https://data.cityofnewyork.us/City-Government/City-Store-The-Official-Store-of-the-City-of-New-Y/mqdy-gu73',
 'metadata': {'domain': 'data.cityofnewyork.us'},
 'permalink': 'https://data.cityofnewyork.us/d/mqdy-gu73',
 'resource': {'attribution': 'Department of Citywide Administrative Services (DCAS)',
  'columns_description': ['', '', '', '', '', '', '', '', ''],
  'columns_field_name': ['size',
   'color',
   'product_name',
   'citystore_exclusive',
   'unit_price',
   'item_number',
   'description',
   'category_name',
   'style'],
  'columns_name': ['Size',
   'Color',
   'Product Name',
   'CityStore Exclusive',
   'Unit Pr

Tabular datasets will always be resolvable to the `CSV` filetype. The slug will be:

`https://data.cityofnewyork.us/api/views/<ENDPOINT_ID>/rows.csv?accessType=DOWNLOAD`

In [10]:
# Example CSV download URL, built around the first table endpoint id.
ex_slug = (
    "https://data.cityofnewyork.us/api/views/"
    + table_endpoints[0]
    + "/rows.csv?accessType=DOWNLOAD"
)

In [11]:
# Display the example download URL.
ex_slug

'https://data.cityofnewyork.us/api/views/mqdy-gu73/rows.csv?accessType=DOWNLOAD'

To verify that this is true, can we send a `HEAD` request to each of these slugs in turn and check that what comes back is a "request denied, `HEAD` is not supported" response (and not some other error)?

In [12]:
import requests
# Send a HEAD request to the example URL and show the response headers.
head_response = requests.head(ex_slug)
head_response.headers

{'X-Socrata-Region': 'aws-us-east-1-fedramp-prod', 'X-Error-Message': 'HEAD is not supported', 'Access-Control-Allow-Origin': '*', 'Cache-Control': 'private, no-cache, must-revalidate', 'X-Error-Code': 'invalid_request', 'Age': '0', 'X-Socrata-RequestId': 'aw6msqrhrs1esjw9vft453ipa', 'Connection': 'keep-alive', 'Date': 'Sun, 22 Jan 2017 22:46:06 GMT', 'Server': 'nginx'}

In [13]:
# True when the server answers the HEAD request with "HEAD is not supported".
# .get() returns None for a missing header, so a response without
# X-Error-Message makes this check False instead of raising KeyError.
requests.head(ex_slug).headers.get('X-Error-Message') == 'HEAD is not supported'

True

In [14]:
# Same HEAD probe against a made-up path, to compare with the real endpoint's headers.
gibberish_url = "https://data.cityofnewyork.us/api/views/gibberish"
requests.head(gibberish_url).headers

{'X-Socrata-Region': 'aws-us-east-1-fedramp-prod', 'X-Error-Message': 'HEAD is not supported', 'Access-Control-Allow-Origin': '*', 'Cache-Control': 'private, no-cache, must-revalidate', 'X-Error-Code': 'invalid_request', 'Age': '0', 'X-Socrata-RequestId': '4amy9sq2vrro0pezr6ycmfn99', 'Connection': 'keep-alive', 'Date': 'Sun, 22 Jan 2017 22:46:48 GMT', 'Server': 'nginx'}

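No! A made-up endpoint gets exactly the same `HEAD is not supported` response, so this check cannot tell a valid slug from an invalid one. We'll just have to take it on faith for now.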

In [15]:
# Empty container for the table glossary.
# NOTE(review): this name is re-bound to a list further down, and no visible
# cell uses the dict in between. Confirm this cell is still needed.
nyc_table_datasets = {}

In [21]:
# Number of column field names listed in the first table's metadata.
len(table_datasets[0]['resource']['columns_field_name'])

9

In [25]:
# The first table's resource id, read straight from its metadata record.
table_datasets[0]['resource']['id']

'mqdy-gu73'

In [26]:
# Example /d/<id> URL for the first table, the same form as the record's 'permalink'.
first_table_id = table_datasets[0]['resource']['id']
ex_endpoint_uri = "https://data.cityofnewyork.us/d/" + first_table_id

In [27]:
# Display the example /d/<id> URL.
ex_endpoint_uri

'https://data.cityofnewyork.us/d/mqdy-gu73'

One more thing that we can get without downloading the dataset is the number of rows. Or can we?

In [30]:
from bs4 import BeautifulSoup
# Download the page and parse its HTML with the "html.parser" backend.
page_html = requests.get(ex_endpoint_uri).text
soup = BeautifulSoup(page_html, "html.parser")

In [35]:
# Look for <span class="pager-label"> elements in the parsed page.
# find_all is the documented method name; findAll is only a legacy alias for it.
soup.find_all('span', {'class': 'pager-label'})

[]

In [36]:
# List every <span> in the parsed page to see what the static HTML contains.
# find_all is the documented method name; findAll is only a legacy alias for it.
soup.find_all('span')

[<span class="site-name"></span>,
 <span></span>,
 <span class="socrata-icon-search collapsible-search-toggle" onclick="toggleCollapsibleSearch(this)" role="button" title="Search">
 </span>,
 <span class="socrata-icon-search" title="Search"></span>,
 <span class="searchbox-label">Search</span>,
 <span></span>,
 <span class="monosocial">twitterbird</span>,
 <span class="monosocial">tumblr</span>,
 <span></span>,
 <span><span>Access Data</span><span class="socrata-icon-arrow-down"></span></span>,
 <span>Access Data</span>,
 <span class="socrata-icon-arrow-down"></span>,
 <span><span>Developers</span><span class="socrata-icon-arrow-down"></span></span>,
 <span>Developers</span>,
 <span class="socrata-icon-arrow-down"></span>,
 <span><span>About</span><span class="socrata-icon-arrow-down"></span></span>,
 <span>About</span>,
 <span class="socrata-icon-arrow-down"></span>,
 <span><span>NYC</span><span class="socrata-icon-arrow-down"></span></span>,
 <span>NYC</span>,
 <span class="socrata-i

Examining `soup` we see that it looks like all of the actual information on the page (besides boilerplate) is rendered at runtime by a script. Accessing the full render would require making use of Selenium, which is...not something I want to do.

Well. Since we can't get the number of rows for any other dataset anyway, it looks like we're basically stuck. We can only transmit the basic information.

In [41]:
# Fresh, empty list that will hold one glossary entry per table dataset.
nyc_table_datasets = list()

In [42]:
# Start from an empty list inside this cell: appending to whatever list already
# exists would duplicate every entry each time the cell is re-run.
nyc_table_datasets = []
for dataset in table_datasets:
    endpoint = dataset['resource']['id']
    # CSV download URL for this endpoint id.
    slug = "https://data.cityofnewyork.us/api/views/" + endpoint + "/rows.csv?accessType=DOWNLOAD"
    # 'dataset' is stored as the literal '.' for every entry.
    nyc_table_datasets.append({'endpoint': endpoint, 'resource': slug, 'dataset': '.'})

In [45]:
# Preview the first five glossary entries.
nyc_table_datasets[:5]

[{'dataset': '.',
  'endpoint': 'mqdy-gu73',
  'resource': 'https://data.cityofnewyork.us/api/views/mqdy-gu73/rows.csv?accessType=DOWNLOAD'},
 {'dataset': '.',
  'endpoint': 'mdcw-n682',
  'resource': 'https://data.cityofnewyork.us/api/views/mdcw-n682/rows.csv?accessType=DOWNLOAD'},
 {'dataset': '.',
  'endpoint': 'i296-73x5',
  'resource': 'https://data.cityofnewyork.us/api/views/i296-73x5/rows.csv?accessType=DOWNLOAD'},
 {'dataset': '.',
  'endpoint': 'nyis-y4yr',
  'resource': 'https://data.cityofnewyork.us/api/views/nyis-y4yr/rows.csv?accessType=DOWNLOAD'},
 {'dataset': '.',
  'endpoint': 'eabe-havv',
  'resource': 'https://data.cityofnewyork.us/api/views/eabe-havv/rows.csv?accessType=DOWNLOAD'}]

In [49]:
# List the current contents of the glossaries directory.
%ls "../data/nyc/glossaries/"

 Volume in drive C is SSD_80GB
 Volume Serial Number is 9279-00B2

 Directory of C:\Users\Alex\Desktop\urban-data-concordance\data\nyc\glossaries

01/22/2017  06:20 PM    <DIR>          .
01/22/2017  06:20 PM    <DIR>          ..
01/22/2017  03:15 PM            35,462 nyc-link-resources.json
               1 File(s)         35,462 bytes
               2 Dir(s)   2,174,885,888 bytes free


In [51]:
# Serialize the glossary entries as JSON with 4-space indentation, then write the text out.
glossary_json = json.dumps(nyc_table_datasets, indent=4)
with open("../data/nyc/glossaries/nyc-table-datasets.json", "w") as fp:
    fp.write(glossary_json)