In [1]:
# Open the 311.tsv dataset as a data stream. The resulting stream object
# `ds` is the starting point for every example in the cells below.

from openclean.data.load import stream

ds = stream('../data/311.tsv')

In [2]:
# List the names of all columns in the dataset (shown as the cell output).

ds.columns

['closed_date',
 'descriptor',
 'borough',
 'complaint_type',
 'incident_zip',
 'school_zip',
 'longitude',
 'park_facility_name',
 'unique_key',
 'school_phone_number',
 'school_region',
 'school_state',
 'resolution_action_updated_date',
 'community_board',
 'school_number',
 'bridge_highway_direction',
 'status',
 'school_code',
 'city',
 'latitude',
 'intersection_street_2',
 'school_city',
 'intersection_street_1',
 'x_coordinate_state_plane_',
 'school_address',
 'facility_type',
 'y_coordinate_state_plane_',
 'agency_name',
 'agency',
 'address_type',
 'school_name',
 'location_type',
 'park_borough',
 'location/latitude',
 'location/longitude',
 'created_date',
 'cross_street_2',
 'cross_street_1',
 'street_name',
 'incident_address',
 'landmark',
 'school_or_citywide_complaint',
 'taxi_pick_up_location',
 'ferry_direction',
 'due_date',
 'bridge_highway_name']

In [3]:
# Count the number of rows in the stream (shown as the cell output).

ds.count()

999

In [4]:
# Get frequency counts for the distinct values in column 'borough'. The
# result is a Counter that maps each value to its number of occurrences.

ds.distinct('borough')

Counter({'MANHATTAN': 332,
         'QUEENS': 212,
         'BRONX': 112,
         'BROOKLYN': 231,
         'Unspecified': 50,
         'STATEN ISLAND': 62})

In [5]:
# Show the values for columns 'borough' and 'city' for the first 10 rows
# in the dataset. head() is called without an argument here; the output
# below contains ten rows (0-9).

ds.select('borough', 'city').head()

Unnamed: 0,borough,city
0,MANHATTAN,NEW YORK
1,QUEENS,FLUSHING
2,QUEENS,EAST ELMHURST
3,BRONX,BRONX
4,BROOKLYN,BROOKLYN
5,Unspecified,
6,BROOKLYN,BROOKLYN
7,BROOKLYN,BROOKLYN
8,BROOKLYN,BROOKLYN
9,MANHATTAN,NEW YORK


In [6]:
# A more complex example: keep only the columns 'borough' and 'city',
# drop every row where 'borough' equals 'city', and then print frequency
# counts for the distinct (borough, city) pairs in the remaining rows.

from openclean.function.eval.base import Col

(
    ds
    .select('borough', 'city')
    .filter(Col('borough') != Col('city'))
    .distinct()
)

Counter({('MANHATTAN', 'NEW YORK'): 304,
         ('QUEENS', 'FLUSHING'): 21,
         ('QUEENS', 'EAST ELMHURST'): 10,
         ('Unspecified', ''): 50,
         ('QUEENS', 'QUEENS VILLAGE'): 6,
         ('QUEENS', 'MASPETH'): 10,
         ('MANHATTAN', ''): 19,
         ('QUEENS', 'RICHMOND HILL'): 6,
         ('QUEENS', 'KEW GARDENS'): 3,
         ('QUEENS', 'FOREST HILLS'): 8,
         ('BROOKLYN', ''): 8,
         ('QUEENS', 'ROSEDALE'): 4,
         ('QUEENS', 'WOODSIDE'): 7,
         ('QUEENS', ''): 15,
         ('STATEN ISLAND', ''): 3,
         ('QUEENS', 'CORONA'): 6,
         ('QUEENS', 'JAMAICA'): 25,
         ('QUEENS', 'COLLEGE POINT'): 3,
         ('QUEENS', 'RIDGEWOOD'): 6,
         ('QUEENS', 'JACKSON HEIGHTS'): 5,
         ('QUEENS', 'LITTLE NECK'): 3,
         ('QUEENS', 'OAKLAND GARDENS'): 2,
         ('QUEENS', 'OZONE PARK'): 5,
         ('QUEENS', 'NY'): 2,
         ('MANHATTAN', 'NY'): 7,
         ('QUEENS', 'FRESH MEADOWS'): 2,
         ('QUEENS', 'MIDDLE VILLAGE

In [7]:
# Walk over the first ten rows of the filtered data stream (rows where
# 'borough' differs from 'city') and print each row identifier together
# with the row values.

from openclean.function.eval.base import Col

for rowid, row in (
    ds
    .select('borough', 'city')
    .filter(Col('borough') != Col('city'))
    .limit(10)
    .iterrows()
):
    print('{} {}'.format(rowid, row))

0 ['MANHATTAN', 'NEW YORK']
1 ['QUEENS', 'FLUSHING']
2 ['QUEENS', 'EAST ELMHURST']
5 ['Unspecified', '']
9 ['MANHATTAN', 'NEW YORK']
11 ['MANHATTAN', 'NEW YORK']
15 ['QUEENS', 'FLUSHING']
18 ['QUEENS', 'QUEENS VILLAGE']
19 ['MANHATTAN', 'NEW YORK']
22 ['QUEENS', 'MASPETH']


In [8]:
# Write the 'city' values (neighborhood names) of all rows whose borough
# is 'QUEENS' to a file. Note that duplicates are not removed at this
# time.

(
    ds
    .select('borough', 'city')
    .filter(Col('borough') == 'QUEENS')
    .select('city')
    .write('../data/queens.tsv')
)