In [2]:
# Download the full 'NYC Parking Violations Issued - Fiscal Year 2014' dataset.
# Note that the downloaded full dataset file is about 380 MB in size! Use the
# alternative data file with 10,000 rows that is included in the repository if
# you do not want to download the full data file.

import gzip
import humanfriendly
import os
from openclean.data.source.socrata import Socrata

dataset = Socrata().dataset("k397-673e")

# By default, this example uses a small sample of the full dataset that
# is included in the 'data' subfolder within this repository.
#datafile = './data/jt7v-77mi.tsv.gz'

# Remove the comment for this line if you want to use the full dataset.
datafile = './k397-673e.tsv.gz'


# Download file only if it does not exist already.
if not os.path.isfile(datafile):
    with gzip.open(datafile, 'wb') as f:
        print('Downloading ...\n')
        dataset.write(f)


fsize = humanfriendly.format_size(os.stat(datafile).st_size)
print("Using '{}' in file {} of size {}".format(dataset.name, datafile, fsize))

Downloading ...

Using 'Citywide Payroll Data (Fiscal Year)' in file ./k397-673e.tsv.gz of size 89.62 MB


In [3]:
# Due to the size of the full dataset file, we make use of openclean's
# stream operator to avoid having to load the dataset into main-memory.

from openclean.pipeline import stream

ds_full = stream(datafile)

In [4]:
# Count number of records in the datasets.

print(f'{ds_full.count():,} rows.')

3,923,290 rows.


In [5]:
# Print the first ten rows of the dataset to get a first
# idea of the content.

ds_full.head()

Unnamed: 0,Fiscal Year,Payroll Number,Agency Name,Last Name,First Name,Mid Init,Agency Start Date,Work Location Borough,Title Description,Leave Status as of June 30,Base Salary,Pay Basis,Regular Hours,Regular Gross Paid,OT Hours,Total OT Paid,Total Other Pay
0,2020,17,OFFICE OF EMERGENCY MANAGEMENT,BEREZIN,MIKHAIL,,08/10/2015,BROOKLYN,EMERGENCY PREPAREDNESS MANAGER,ACTIVE,86005.0,per Annum,1820,84698.21,0.0,0.0,0.0
1,2020,17,OFFICE OF EMERGENCY MANAGEMENT,GEAGER,VERONICA,M,09/12/2016,BROOKLYN,EMERGENCY PREPAREDNESS MANAGER,ACTIVE,86005.0,per Annum,1820,84698.21,0.0,0.0,0.0
2,2020,17,OFFICE OF EMERGENCY MANAGEMENT,RAMANI,SHRADDHA,,02/22/2016,BROOKLYN,EMERGENCY PREPAREDNESS MANAGER,ACTIVE,86005.0,per Annum,1820,84698.21,0.0,0.0,0.0
3,2020,17,OFFICE OF EMERGENCY MANAGEMENT,ROTTA,JONATHAN,D,09/16/2013,BROOKLYN,EMERGENCY PREPAREDNESS MANAGER,ACTIVE,86005.0,per Annum,1820,84698.21,0.0,0.0,0.0
4,2020,17,OFFICE OF EMERGENCY MANAGEMENT,WILSON II,ROBERT,P,04/30/2018,BROOKLYN,EMERGENCY PREPAREDNESS MANAGER,ACTIVE,86005.0,per Annum,1820,84698.21,0.0,0.0,0.0
5,2020,17,OFFICE OF EMERGENCY MANAGEMENT,WASHINGTON,MORIAH,A,03/18/2019,BROOKLYN,EMERGENCY PREPAREDNESS MANAGER,ACTIVE,86005.0,per Annum,1820,87900.95,0.0,0.0,-3202.74
6,2020,17,OFFICE OF EMERGENCY MANAGEMENT,VAZQUEZ,MARGARET,,09/29/2008,BROOKLYN,EMERGENCY PREPAREDNESS MANAGER,ACTIVE,94415.0,per Annum,1820,84312.72,0.0,0.0,0.0
7,2020,17,OFFICE OF EMERGENCY MANAGEMENT,KRAWCZYK,AMANDA,N,05/15/2017,BROOKLYN,EMERGENCY PREPAREDNESS MANAGER,ACTIVE,86005.0,per Annum,1820,83976.54,0.0,0.0,0.0
8,2020,17,OFFICE OF EMERGENCY MANAGEMENT,MURRELL,JALEESA,S,12/01/2014,BROOKLYN,EMERGENCY PREPAREDNESS MANAGER,ACTIVE,86005.0,per Annum,1820,83877.36,0.0,0.0,0.0
9,2020,17,OFFICE OF EMERGENCY MANAGEMENT,DE LOS SANTOS,JANIRA,,06/05/2017,BROOKLYN,EMERGENCY PREPAREDNESS SPECIALIST,ACTIVE,67676.0,per Annum,1820,66647.77,348.5,16572.64,144.15


In [7]:
# Create a view on a subset of columns in the dataset.
# Choose the attributes that we are interested in.
COLUMNS = [
    'Fiscal Year',
    'Payroll Number',
    'Agency Name',
#     'Last Name',
#     'First Name',
#     'Mid Init',
    'Agency Start Date',
    'Work Location Borough',
    'Title Description',
    'Leave Status as of June 30',
    'Base Salary',
    'Pay Basis',
    'Regular Hours',
    'Regular Gross Paid',
    'OT Hours',
    'Total OT Paid',
    'Total Other Pay'
]

ds = ds_full.select(columns=COLUMNS)

### Data Profiling

Data profiling is an important first step in many data analytics efforts. Profiling helps users to gain an understanding of the data properties and to uncover data quality flaws. openclean supports a variety of different data profiling operators that can also be used to generate metadata about the data at hand.

We can use the default column profiler to compute basic statistics such as the number of distinct values, missing values, etc. for each of the columns in our dataset. In the example shown below we use a random sample of 1000 rows for profiling. The result is a list of profiling results (dictionaries). A summary of the results can then be accessed as a data frame using the stats() method.

In [8]:
# Profile the resulting dataset view using the default data profiler.

from openclean.profiling.column import DefaultColumnProfiler

profiles = ds.profile(default_profiler=DefaultColumnProfiler)

In [9]:
# Print overview of profiling results.

profiles.stats()

Unnamed: 0,total,empty,distinct,uniqueness,entropy
Fiscal Year,3923290,0,7,2e-06,2.805614
Payroll Number,3923290,1745440,157,7.2e-05,4.286506
Agency Name,3923290,0,165,4.2e-05,4.365925
Agency Start Date,3923290,63,14933,0.003806,11.097847
Work Location Borough,3923290,506226,22,6e-06,1.507244
Title Description,3923290,84,1802,0.000459,6.207524
Leave Status as of June 30,3923290,0,5,1e-06,0.710495
Base Salary,3923290,0,98573,0.025125,10.22424
Pay Basis,3923290,0,4,1e-06,1.405552
Regular Hours,3923290,0,86101,0.021946,4.617157


In [10]:
# Print the most frequent data type for each column.

print('Schema\n------')
for col in ds.columns:
    p = profiles.column(col)
    print("  '{}' ({})".format(col, p['datatypes']['distinct'].most_common(1)[0][0]))

Schema
------
  'Fiscal Year' (int)
  'Payroll Number' (int)
  'Agency Name' (str)
  'Agency Start Date' (date)
  'Work Location Borough' (str)
  'Title Description' (str)
  'Leave Status as of June 30' (str)
  'Base Salary' (float)
  'Pay Basis' (str)
  'Regular Hours' (float)
  'Regular Gross Paid' (float)
  'OT Hours' (float)
  'Total OT Paid' (float)
  'Total Other Pay' (float)


In [11]:
# Print the minimum and maximum value for column 'Fiscal Year'

profiles.minmax('Fiscal Year')

Unnamed: 0,min,max
int,2014,2020


In [13]:
# Print the minimum and maximum value for column 'Payroll Number'

profiles.minmax('Payroll Number')

Unnamed: 0,min,max
int,2,996


In [14]:
# Print the minimum and maximum value for column 'Agency Start Date'

profiles.minmax('Agency Start Date')

Unnamed: 0,min,max
date,1901-01-01,9999-12-31 00:00:00


In [23]:
# Print the minimum and maximum value for column 'Base Salary'

profiles.minmax('Base Salary')

Unnamed: 0,min,max
float,0.01,414707.0


In [18]:
# Print the minimum and maximum value for column 'Regular Hours'

profiles.minmax('Regular Hours')

Unnamed: 0,min,max
int,-1260.0,4160.0
float,-730.43,4171.43


In [19]:
# Print the minimum and maximum value for column 'Regular Gross Paid'

profiles.minmax('Regular Gross Paid')

Unnamed: 0,min,max
float,-76223.05,672308.86


In [21]:
# Print the minimum and maximum value for column 'OT Hours'

profiles.minmax('OT Hours')

Unnamed: 0,min,max
int,-209.0,3147.0
float,-66.5,3347.5


In [25]:
# Print the minimum and maximum value for column 'Total OT Paid'

profiles.minmax('Total OT Paid')

Unnamed: 0,min,max
float,-26493.88,237389.73


In [26]:
# Print the minimum and maximum value for column 'Total Other Pay'

profiles.minmax('Total Other Pay')

Unnamed: 0,min,max
float,-281595.04,650000.0


In [27]:
# Print the most frequent values in column 'Agency Name'

profiles.column('Agency Name').get('topValues')

[('DEPT OF ED PEDAGOGICAL', 758360),
 ('DEPT OF ED PER SESSION TEACHER', 608565),
 ('POLICE DEPARTMENT', 367745),
 ('DEPT OF ED PARA PROFESSIONALS', 245259),
 ('BOARD OF ELECTION POLL WORKERS', 235235),
 ('DEPT OF ED HRLY SUPPORT STAFF', 164165),
 ('FIRE DEPARTMENT', 128819),
 ('DEPT OF PARKS & RECREATION', 117212),
 ('DEPARTMENT OF EDUCATION ADMIN', 110936),
 ('HRA/DEPT OF SOCIAL SERVICES', 104331)]

In [28]:
# Print the most frequent values in column 'Work Location Borough'

profiles.column('Work Location Borough').get('topValues')

[('MANHATTAN', 2394979),
 ('QUEENS', 379695),
 ('BROOKLYN', 323565),
 ('BRONX', 177881),
 ('OTHER', 83688),
 ('RICHMOND', 46156),
 ('WESTCHESTER', 3417),
 ('ULSTER', 1953),
 ('Manhattan', 1622),
 ('Bronx', 935)]

In [30]:
# Print the most frequent values in column 'OT Hours'

profiles.column('OT Hours').get('topValues')

[('0', 2923200),
 ('1', 8563),
 ('2', 6718),
 ('8', 6652),
 ('4', 5161),
 ('3', 4871),
 ('5', 4064),
 ('7', 3842),
 ('6', 3408),
 ('16', 2956)]

### Outliers and Anomalies in Dataset Columns
One common profiling task is the detection of outliers (anomalies) in datasets. In our example we focus on outlier values in individual columns. 

#### Agency Name

In [32]:
# Get set of distinct values for column 'Agency Name'. Print the
# values in decreasing order of frequency.

an = ds.distinct('Agency Name')
for rank, val in enumerate(an.most_common()):
    an, freq = val
    print(f'{rank + 1:<3} {an}  {freq:>10,}')

1   DEPT OF ED PEDAGOGICAL     758,360
2   DEPT OF ED PER SESSION TEACHER     608,565
3   POLICE DEPARTMENT     367,745
4   DEPT OF ED PARA PROFESSIONALS     245,259
5   BOARD OF ELECTION POLL WORKERS     235,235
6   DEPT OF ED HRLY SUPPORT STAFF     164,165
7   FIRE DEPARTMENT     128,819
8   DEPT OF PARKS & RECREATION     117,212
9   DEPARTMENT OF EDUCATION ADMIN     110,936
10  HRA/DEPT OF SOCIAL SERVICES     104,331
11  NYC HOUSING AUTHORITY      95,132
12  DEPARTMENT OF SANITATION      92,755
13  DEPARTMENT OF CORRECTION      91,843
14  DEPT OF ED PER DIEM TEACHERS      90,784
15  Police Department      55,619
16  DEPT OF HEALTH/MENTAL HYGIENE      54,291
17  ADMIN FOR CHILDREN'S SVCS      53,268
18  DEPT OF ENVIRONMENT PROTECTION      48,751
19  DEPARTMENT OF TRANSPORTATION      42,801
20  COMMUNITY COLLEGE (MANHATTAN)      41,594
21  COMMUNITY COLLEGE (LAGUARDIA)      32,068
22  COMMUNITY COLLEGE (KINGSBORO)      31,582
23  COMMUNITY COLLEGE (QUEENSBORO)      26,474
24  COMMUNIT

#### Agency Start Date

In [35]:
# Print the ten most frequent values for the 'Agency Start Date' column.

agency_start_dates = ds.distinct('Agency Start Date')

for rank, val in enumerate(agency_start_dates.most_common()):
    dt, freq = val
    print(f'{rank + 1:<3} {dt:>8}  {freq:>10,}')

print('\nTotal number of distinct values is {}'.format(len(agency_start_dates)))

1   01/01/2010      86,137
2   09/03/2013      45,631
3   08/30/2007      44,324
4   09/04/2012      40,349
5   09/02/2014      40,170
6   08/31/2006      39,040
7   09/08/2015      37,706
8   09/06/2005      36,351
9   09/02/2003      35,441
10  09/07/2004      34,948
11  08/28/2008      34,886
12  09/04/2001      34,510
13  12/31/1999      33,716
14  09/03/2002      30,509
15  09/06/2016      29,847
16  09/07/1999      28,957
17  09/05/2000      28,016
18  01/01/2016      27,906
19  09/02/1997      27,646
20  01/02/2012      27,645
21  09/06/2011      26,874
22  09/08/1998      26,081
23  09/07/2010      21,880
24  09/05/2017      20,378
25  01/01/2013      20,263
26  01/01/2014      18,805
27  09/03/1996      17,356
28  01/01/2011      16,136
29  06/23/1996      14,116
30  09/08/2009      14,027
31  01/02/2017      11,902
32  09/04/2018      11,749
33  01/01/2015      11,150
34  01/01/2018      10,893
35  07/11/2005      10,363
36  09/08/2016      10,021
37  07/06/2011      10,020
3

2164 11/10/2003         312
2165 09/19/1994         312
2166 10/01/1987         312
2167 03/01/1994         312
2168 03/01/2002         312
2169 12/05/2013         312
2170 10/10/1995         311
2171 04/09/2019         311
2172 08/10/2009         311
2173 04/07/1986         311
2174 09/06/2001         311
2175 01/16/2002         311
2176 11/10/2015         311
2177 02/08/1988         310
2178 10/05/1993         310
2179 09/22/1999         310
2180 09/25/1997         310
2181 09/13/2011         309
2182 10/28/1985         309
2183 11/22/1993         309
2184 04/20/2010         309
2185 12/04/2013         308
2186 12/17/1999         308
2187 03/21/1994         308
2188 12/12/1988         308
2189 10/16/2008         308
2190 02/27/2015         308
2191 02/19/2018         307
2192 07/31/2006         307
2193 02/26/2019         307
2194 09/06/1984         307
2195 11/01/2006         307
2196 12/26/2013         307
2197 08/16/2011         306
2198 04/05/2010         306
2199 02/16/2010     

4496 04/10/2018         129
4497 07/07/2019         128
4498 12/06/2011         128
4499 05/16/1994         128
4500 03/25/2008         128
4501 12/19/1997         128
4502 12/11/2009         128
4503 08/24/2018         128
4504 01/16/1996         128
4505 05/14/2015         128
4506 10/20/2009         128
4507 11/29/2017         128
4508 01/09/2003         128
4509 06/21/2016         128
4510 07/28/2013         128
4511 09/15/2004         128
4512 11/01/2009         128
4513 10/24/1996         128
4514 02/13/1992         128
4515 09/27/2005         128
4516 10/11/1996         128
4517 10/14/1987         128
4518 04/23/1993         128
4519 02/03/1993         128
4520 11/08/1989         128
4521 02/15/2008         128
4522 01/15/1998         128
4523 02/08/2007         128
4524 01/10/2003         128
4525 04/20/2007         128
4526 02/16/2017         128
4527 09/04/1993         128
4528 07/24/1995         127
4529 11/08/2006         127
4530 12/10/1996         127
4531 04/18/1983     

6356 02/28/1991          75
6357 11/19/1985          75
6358 03/20/1991          75
6359 02/12/2002          75
6360 03/17/2004          75
6361 01/16/2003          75
6362 05/25/1995          75
6363 06/11/2008          75
6364 03/05/1985          75
6365 02/22/2007          75
6366 08/31/2011          75
6367 03/27/2009          75
6368 02/08/1992          75
6369 01/22/1985          75
6370 07/08/2012          75
6371 09/24/1989          75
6372 02/09/1997          75
6373 03/27/2013          75
6374 05/24/2018          74
6375 02/18/1998          74
6376 06/06/2013          74
6377 05/18/2010          74
6378 03/03/2009          74
6379 11/21/2006          74
6380 08/12/2016          74
6381 05/17/1982          74
6382 01/10/1983          74
6383 08/02/1982          74
6384 12/10/1995          74
6385 04/29/1996          74
6386 09/25/2019          74
6387 12/18/2012          74
6388 10/24/1980          74
6389 10/17/2019          74
6390 09/17/2019          74
6391 02/10/2008     

8870 04/14/1992          35
8871 04/29/2018          35
8872 04/30/2000          35
8873 08/04/1996          35
8874 01/06/1987          35
8875 10/26/1995          35
8876 08/28/1985          35
8877 08/28/2002          35
8878 05/29/2006          35
8879 04/29/2007          35
8880 07/09/1986          35
8881 12/24/1990          35
8882 08/16/1995          35
8883 07/23/1996          35
8884 12/18/1988          35
8885 04/15/2007          35
8886 11/22/2019          35
8887 06/22/2003          35
8888 06/03/1993          35
8889 10/26/1986          35
8890 07/02/2017          35
8891 10/27/1982          35
8892 04/22/1982          35
8893 04/04/1977          35
8894 07/05/1977          35
8895 10/23/1978          35
8896 08/14/2005          35
8897 10/12/2018          35
8898 07/03/2011          35
8899 08/09/1998          35
8900 05/31/1998          35
8901 06/07/1982          35
8902 11/20/2011          35
8903 06/23/1994          35
8904 07/28/1993          35
8905 06/01/1986     

10897 12/05/1973          16
10898 10/31/1993          16
10899 12/14/1983          16
10900 02/16/2003          16
10901 05/31/1997          16
10902 03/28/1980          16
10903 08/02/2008          16
10904 08/23/2007          16
10905 08/19/2003          16
10906 01/04/2014          16
10907 05/24/2008          16
10908 10/12/1997          16
10909 09/22/2017          16
10910 03/02/1991          16
10911 09/13/1997          16
10912 05/23/2015          16
10913 07/26/1995          16
10914 07/08/1987          16
10915 04/23/1975          16
10916 08/26/2003          16
10917 07/29/1986          16
10918 08/06/1985          16
10919 12/22/1989          16
10920 01/29/1968          16
10921 06/10/1986          16
10922 05/29/1979          16
10923 07/31/1997          16
10924 06/19/1998          16
10925 01/18/1978          16
10926 03/24/1982          16
10927 10/11/1977          16
10928 12/14/1982          16
10929 02/03/1982          16
10930 02/06/1975          16
10931 05/15/19

13287 09/08/1954           6
13288 01/25/1971           6
13289 06/09/1966           6
13290 09/20/1966           6
13291 09/04/1967           6
13292 10/26/1977           6
13293 09/03/1966           6
13294 02/18/1971           6
13295 11/22/1972           6
13296 09/09/1959           6
13297 02/07/1974           6
13298 12/20/1972           6
13299 10/03/1967           6
13300 09/13/1961           6
13301 02/03/1966           6
13302 10/28/1975           6
13303 03/30/1970           6
13304 07/21/1981           6
13305 12/08/1985           6
13306 03/15/1987           6
13307 05/24/1992           6
13308 11/09/1986           6
13309 08/29/2004           6
13310 07/03/1981           6
13311 08/07/1971           6
13312 06/02/2001           6
13313 06/22/1963           6
13314 08/18/1981           6
13315 09/23/1990           6
13316 08/02/1977           6
13317 08/14/1974           6
13318 04/21/2000           6
13319 06/21/1994           6
13320 03/25/1971           6
13321 06/15/19

In [36]:
# Using the default settings yields two outliers.

from openclean.profiling.anomalies.sklearn import DBSCANOutliers

DBSCANOutliers().find(agency_start_dates)

['', '01/01/2010']

In [37]:
# If we change the eps parameter (maximum distance between two samples for one to be considered
# as in the neighborhood of the other) we can find even more potential outliers (including one that
# we had not seen before).

DBSCANOutliers(eps=0.05).find(agency_start_dates)

['',
 '09/08/2015',
 '01/02/2012',
 '01/01/2013',
 '09/06/2016',
 '09/05/2017',
 '09/02/2003',
 '08/28/2008',
 '09/03/1996',
 '01/01/2011',
 '08/30/2007',
 '09/02/1997',
 '09/07/2010',
 '01/01/2014',
 '01/01/2010',
 '09/06/2011',
 '08/31/2006',
 '09/03/2013']

#### Work Location Borough

In [38]:
# Get set of distinct values for column 'Work Location Borough'. Print the
# values in decreasing order of frequency.

wlb = ds.distinct('Work Location Borough')
for rank, val in enumerate(wlb.most_common()):
    wlb, freq = val
    print(f'{rank + 1:<3} {wlb}  {freq:>10,}')

1   MANHATTAN   2,394,979
2        506,226
3   QUEENS     379,695
4   BROOKLYN     323,565
5   BRONX     177,881
6   OTHER      83,688
7   RICHMOND      46,156
8   WESTCHESTER       3,417
9   ULSTER       1,953
10  Manhattan       1,622
11  Bronx         935
12  SULLIVAN         822
13  Queens         660
14  DELAWARE         551
15  NASSAU         245
16  PUTNAM         243
17  SCHOHARIE         175
18  DUTCHESS         140
19  Richmond         112
20  ALBANY          95
21  GREENE          61
22  WASHINGTON DC          47
23  ORANGE          22


#### Leave Status as of June 30

In [39]:
# Get set of distinct values for column 'Leave Status as of June 30'. Print the
# values in decreasing order of frequency.

ls = ds.distinct('Leave Status as of June 30')
for rank, val in enumerate(ls.most_common()):
    ls, freq = val
    print(f'{rank + 1:<3} {ls}  {freq:>10,}')

1   ACTIVE   3,355,483
2   CEASED     485,414
3   ON LEAVE      42,401
4   SEASONAL      33,451
5   ON SEPARATION LEAVE       6,541
