In [1]:
# Download the full 'DOB Job Application Filings' dataset. This file is about 1GB (XXXMB gzipped).
# To avoid downloading the full file into memory first, the streaming option is used (based on
# https://stackoverflow.com/questions/16694907/download-large-file-in-python-with-requests)

import gzip
import os

import requests

url = 'https://data.cityofnewyork.us/api/views/ic3t-wcy2/rows.tsv?accessType=DOWNLOAD'
outfile = './ic3t-wcy2.tsv.gz'
# Download into a sibling '.part' file first so that a failed or interrupted
# transfer never leaves a truncated archive under the final file name.
partfile = outfile + '.part'

# stream=True lets the response body be consumed chunk by chunk instead of
# being held in memory as a whole. The timeout is (connect, read) in seconds;
# without one, requests waits indefinitely on a stalled connection.
with requests.get(url, stream=True, timeout=(10, 300)) as r:
    # Raise on HTTP error responses instead of saving an error page.
    r.raise_for_status()
    # gzip.open(..., 'wb') compresses the bytes written to it, so the file on
    # disk is a gzip archive of the response body.
    with gzip.open(partfile, 'wb') as f:
        # 1 MiB chunks keep memory use bounded while avoiding the per-chunk
        # loop overhead of very small reads on a file of this size.
        for chunk in r.iter_content(chunk_size=1024 * 1024):
            f.write(chunk)

# Only reached when the transfer finished without raising. os.replace
# overwrites an existing file at the destination, e.g. from an earlier run.
os.replace(partfile, outfile)

In [2]:
# Verify that the download was successful. Print dataset columns and number of rows.
# This example makes use of the streaming option to avoid loading the full data frame
# into memory.

from openclean.pipeline import stream

# Open the downloaded file through openclean's streaming pipeline.
df = stream(outfile)

# Print a two-line header, then every column name quoted and indented.
print('Schema\n------')
column_line = "  '{}'".format
for col in df.columns:
    print(column_line(col))

# Ask the stream for its row count and report it after a blank line.
row_total = df.count()
print('\n{} rows.'.format(row_total))

Schema
------
  'Job #'
  'Doc #'
  'Borough'
  'House #'
  'Street Name'
  'Block'
  'Lot'
  'Bin #'
  'Job Type'
  'Job Status'
  'Job Status Descrp'
  'Latest Action Date'
  'Building Type'
  'Community - Board'
  'Cluster'
  'Landmarked'
  'Adult Estab'
  'Loft Board'
  'City Owned'
  'Little e'
  'PC Filed'
  'eFiling Filed'
  'Plumbing'
  'Mechanical'
  'Boiler'
  'Fuel Burning'
  'Fuel Storage'
  'Standpipe'
  'Sprinkler'
  'Fire Alarm'
  'Equipment'
  'Fire Suppression'
  'Curb Cut'
  'Other'
  'Other Description'
  'Applicant's First Name'
  'Applicant's Last Name'
  'Applicant Professional Title'
  'Applicant License #'
  'Professional Cert'
  'Pre- Filing Date'
  'Paid'
  'Fully Paid'
  'Assigned'
  'Approved'
  'Fully Permitted'
  'Initial Cost'
  'Total Est. Fee'
  'Fee Status'
  'Existing Zoning Sqft'
  'Proposed Zoning Sqft'
  'Horizontal Enlrgmt'
  'Vertical Enlrgmt'
  'Enlargement SQ Footage'
  'Street Frontage'
  'ExistingNo. of Stories'
  'Proposed No. of Stories'
  