# Group 11: DOB Job Application Filings - Data Profiling and Data Cleaning
Team members: Peng-Yuan Chen (pc2973), Chun-Yen Liou (cyl625), Tsung-Lin Yang (ty2065)

In the following we perform the data profiling and data cleaning on the dataset of [DOB Job Application Filings](https://data.cityofnewyork.us/Housing-Development/DOB-Job-Application-Filings/ic3t-wcy2).

This dataset includes all the job applications submitted to the Department of Buildings (DOB) through the Borough Offices, through eFiling, or through the HUB that have a "Latest Action Date" since January 1, 2000.

The dataset consists of over 1.77 million rows and the data file is about 1 GB in size. The dataset is available for download via the Socrata Open Data API (SODA).

In [27]:
# Since we are using Google Colab, we have to first install openclean library.
!pip install openclean_notebook
!pip install openclean
!pip install openclean_geo



You should consider upgrading via the '/Library/Frameworks/Python.framework/Versions/3.9/bin/python3 -m pip install --upgrade pip' command.[0m


You should consider upgrading via the '/Library/Frameworks/Python.framework/Versions/3.9/bin/python3 -m pip install --upgrade pip' command.[0m


You should consider upgrading via the '/Library/Frameworks/Python.framework/Versions/3.9/bin/python3 -m pip install --upgrade pip' command.[0m


In [28]:
# Import necessary libraries
import os
import requests
from openclean.pipeline import stream
import pandas as pd


# Download the 'DOB Job Application Filings' CSV once and cache it locally.
csvPath = './bty7-2jhb.csv'
csvPath_new = './bty7-2jhb_raw.csv'
if not os.path.isfile(csvPath):
  csvUrl = "https://data.cityofnewyork.us/resource/bty7-2jhb.csv"
  req = requests.get(csvUrl)
  # Fail loudly on an HTTP error instead of caching an error page as the CSV.
  req.raise_for_status()
  url_content = req.content
  # The context manager closes the file even if the write raises.
  with open(csvPath, 'wb') as outfile:
    outfile.write(url_content)

# Keep only the first 113 rows and store them as the working copy that the
# rest of the notebook reads through csvPath_new.
ds_Full = pd.read_csv(csvPath, nrows=113)
ds_Full.to_csv(csvPath_new, encoding='utf-8', index=False)

# Data Profiling

Let's first do some preliminary profiling on the dataset so that we can gain some insight about the data.

In [29]:
# Do the preliminary profiling
from openclean.profiling.column import DefaultColumnProfiler

ds_Full = stream(csvPath_new)
profiles = ds_Full.profile(default_profiler=DefaultColumnProfiler)

In [30]:
# Take a look at the column names in this dataset
ds_Full.columns

['borough',
 'bin__',
 'house__',
 'street_name',
 'job__',
 'job_doc___',
 'job_type',
 'self_cert',
 'block',
 'lot',
 'community_board',
 'zip_code',
 'bldg_type',
 'residential',
 'special_district_1',
 'special_district_2',
 'work_type',
 'permit_status',
 'filing_status',
 'permit_type',
 'permit_sequence__',
 'permit_subtype',
 'oil_gas',
 'site_fill',
 'filing_date',
 'issuance_date',
 'expiration_date',
 'job_start_date',
 'permittee_s_first_name',
 'permittee_s_last_name',
 'permittee_s_business_name',
 'permittee_s_phone__',
 'permittee_s_license_type',
 'permittee_s_license__',
 'act_as_superintendent',
 'permittee_s_other_title',
 'hic_license',
 'site_safety_mgr_s_first_name',
 'site_safety_mgr_s_last_name',
 'site_safety_mgr_business_name',
 'superintendent_first___last_name',
 'superintendent_business_name',
 'owner_s_business_type',
 'non_profit',
 'owner_s_business_name',
 'owner_s_first_name',
 'owner_s_last_name',
 'owner_s_house__',
 'owner_s_house_street_name',
 '

In [31]:
# Take a look at the first 10 rows
ds_Full.head()

Unnamed: 0,borough,bin__,house__,street_name,job__,job_doc___,job_type,self_cert,block,lot,...,state,owner_s_zip_code,owner_s_phone__,dobrundate,permit_si_no,gis_latitude,gis_longitude,gis_council_district,gis_census_tract,gis_nta_name
0,MANHATTAN,1077287,1230,6TH AVENUE,123725807,1,A2,Y,1264,5,...,NY,10111,2127150300,12/12/2020 00:00:00,3554580,40.758977,-73.981089,4,96,Midtown-Midtown South
1,STATEN ISLAND,5113169,715,OCEAN TERRACE,500876037,1,A2,Y,683,1,...,NY,11101,7184728000,12/12/2020 00:00:00,3719150,40.608512,-74.102067,50,177,Todt Hill-Emerson Hill-Heartland Village-Light...
2,BROOKLYN,3253458,9952,3 AVE,321963014,1,DM,N,6133,56,...,NY,11234,3478661439,06/18/2020 00:00:00,3765458,40.613341,-74.035582,43,5602,Bay Ridge
3,BROOKLYN,3117942,179,LOTT STREET,322006618,1,DM,N,5136,58,...,NY,11205,7184146042,06/18/2020 00:00:00,3765459,40.645537,-73.954034,40,792,Erasmus
4,BROOKLYN,3210296,2917,AVENUE N,321996970,1,DM,N,7665,4,...,NY,11210,3474928492,06/18/2020 00:00:00,3765460,40.617141,-73.945805,45,746,Flatlands
5,BROOKLYN,3055183,245,FRANKLIN AVENUE,340735789,1,A3,Y,1927,6,...,NY,11205,9142786038,06/18/2020 00:00:00,3765461,40.691216,-73.957363,33,235,Clinton Hill
6,BROOKLYN,3169308,338,BAY 10 STREET,340734904,1,A2,Y,6460,310,...,NY,11228,9175821295,06/18/2020 00:00:00,3765462,40.604804,-74.01519,43,168,Bath Beach
7,MANHATTAN,1079152,333,W 17TH STREET,103651709,1,A2,Y,741,10,...,NY,11101,7183495590,06/18/2020 00:00:00,3765463,40.742129,-74.002198,3,83,Hudson Yards-Chelsea-Flatiron-Union Square
8,BROOKLYN,3184453,1855,E 26 ST,340734085,1,A2,Y,6832,64,...,NY,11229,9292383678,06/18/2020 00:00:00,3765464,40.605739,-73.946743,48,562,Madison
9,BROOKLYN,3132159,1864,60TH STREET,340733969,1,A2,Y,5519,34,...,NY,11204,7184156051,06/18/2020 00:00:00,3765465,40.621221,-73.985944,44,244,Borough Park


In [32]:
# See how many rows are there in this dataset
ds_Full.count()

113

In [33]:
# Output the profiling result
profiles

[{'column': 'borough',
  'stats': {'totalValueCount': 113,
   'emptyValueCount': 0,
   'datatypes': defaultdict(collections.Counter,
               {'total': Counter({'str': 113}),
                'distinct': Counter({'str': 5})}),
   'minmaxValues': {'str': {'minimum': 'BRONX', 'maximum': 'STATEN ISLAND'}},
   'distinctValueCount': 5,
   'entropy': 1.6502552511522224,
   'topValues': [('MANHATTAN', 52),
    ('BROOKLYN', 44),
    ('BRONX', 11),
    ('STATEN ISLAND', 3),
    ('QUEENS', 3)]}},
 {'column': 'bin__',
  'stats': {'totalValueCount': 113,
   'emptyValueCount': 0,
   'datatypes': defaultdict(collections.Counter,
               {'total': Counter({'int': 113}),
                'distinct': Counter({'int': 104})}),
   'minmaxValues': {'int': {'minimum': 1002675, 'maximum': 5171693}},
   'distinctValueCount': 104,
   'entropy': 6.636507391599586,
   'topValues': [('3000519', 4),
    ('1016889', 3),
    ('1008507', 2),
    ('2015241', 2),
    ('3337913', 2),
    ('1044930', 2),
    (

In [34]:
profiles.stats()

Unnamed: 0,total,empty,distinct,uniqueness,entropy
borough,113,0,5,0.044248,1.650255
bin__,113,0,104,0.920354,6.636507
house__,113,0,100,0.884956,6.559031
street_name,113,0,93,0.823009,6.417438
job__,113,0,108,0.955752,6.713984
job_doc___,113,0,3,0.026549,0.249498
job_type,113,0,5,0.044248,1.192197
self_cert,113,0,2,0.017699,0.937574
block,113,0,99,0.876106,6.534651
lot,113,0,57,0.504425,5.301555


In [35]:
# Detect which column has empty value and its amount
profiles.stats()['empty']

borough                               0
bin__                                 0
house__                               0
street_name                           0
job__                                 0
job_doc___                            0
job_type                              0
self_cert                             0
block                                 0
lot                                   0
community_board                       0
zip_code                              0
bldg_type                             0
residential                          47
special_district_1                   89
special_district_2                  113
work_type                            10
permit_status                         0
filing_status                         0
permit_type                           0
permit_sequence__                     0
permit_subtype                       25
oil_gas                             113
site_fill                            16
filing_date                           0


In [36]:
# Check the inconsistent datatype
# Now we can investigate the outliers issue in this dataset
profiles.multitype_columns().types()

Unnamed: 0,int,str
house__,97,3
owner_s_house__,78,9
owner_s_house_street_name,1,84


# Data Cleaning

In [37]:
from openclean.operator.transform.update import update
from openclean.function.eval.base import Col
from openclean.function.eval.datatype import IsDatetime
from openclean.function.eval.null import IsEmpty
from openclean.function.eval.null import IsNotEmpty
from openclean.function.eval.datatype import IsInt
from openclean.operator.transform.filter import filter
from openclean.function.eval.datatype import IsFloat
from openclean.function.eval.logic import And

## Remove rows with empty/problematic values that are not possible to recover

There are columns in this dataset that have empty or wrong values. Normally, we would try to recover the missing values. However, the values in some columns cannot be inferred from other values. In that case, we can only remove those rows.

In [38]:
ds_Update = ds_Full\
    .filter(predicate=IsNotEmpty("permittee_s_first_name"))\
    .filter(predicate=IsNotEmpty("owner_s_house__"))

## Data Standardization

In some cases, different values may actually represent the same thing. For example, "5th Avenue" and "Fifth AVE" both refer to "5th AVE". Thus, we need to standardize the data.

In [39]:
# Street name is an example of a column that needs to be standardized.
# Run the lines below to see the clustering of the street names.


from openclean.cluster.key import KeyCollision
from openclean_geo.address.usstreet import USStreetNameKey

street_names = ds_Update.update('street_name', str.upper).distinct('street_name')
clusters = KeyCollision(func=USStreetNameKey(), threads=3).clusters(street_names)

def print_k_clusters(clusters, k=5):
    """Print a summary line followed by the k largest clusters.

    Clusters are ranked by their number of entries (largest first). Every
    entry of a printed cluster is shown together with its count; an
    empty-string entry is displayed as '' so that it stays visible.
    """
    ranked = sorted(clusters, key=len, reverse=True)
    total_values = sum(map(len, ranked))
    print('Total number of clusters is {} with {} values'.format(len(ranked), total_values))
    # max(k, 0) keeps a negative k from slicing entries off the tail.
    for rank, cluster in enumerate(ranked[:max(k, 0)], start=1):
        print('\nCluster {}'.format(rank))
        for value, count in cluster.items():
            label = "''" if value == '' else value
            print(f'  {label} (x {count})')
print_k_clusters(clusters)


Total number of clusters is 4 with 9 values

Cluster 1
  3 AVE (x 1)
  THIRD AVENUE (x 1)
  3RD AVENUE (x 1)

Cluster 2
  7 AVENUE (x 1)
  7TH AVE (x 1)

Cluster 3
  WEST 75TH STREET (x 2)
  WEST 75 STREET (x 1)

Cluster 4
  RALPH AVENUE (x 2)
  RALPH AVE (x 1)


In [40]:
# Standardize the "Street Name"
from openclean_geo.address.usstreet import StandardizeUSStreetName
ds_Update = ds_Update.update(columns="street_name", func=StandardizeUSStreetName(characters='upper'))

In [41]:
# Fix "Community - Board" so the column has consistent format
def fixCommunityBoard(num):
  """Normalize a community-board code to a consistent three-character form.

  Args:
    num: The raw code as a string.

  Returns:
    - ``'<d>--'`` when the value is one or three characters long and parses
      to an integer between 1 and 5 (e.g. ``'3'`` and ``'003'`` -> ``'3--'``).
    - ``num`` unchanged when it is three characters long and parses to an
      integer of at least 100.
    - ``'---'`` for everything else (wrong length, non-numeric, zero,
      negative, or a three-character value below 100).
  """
  unknown = '---'
  if len(num) not in (1, 3):
    return unknown
  try:
    value = int(num)
  except ValueError:
    # Only a failed integer parse is expected here; anything else should surface.
    return unknown
  # Zero and negative numbers (e.g. '0', '-12') are rejected rather than
  # being turned into codes such as '0--' or '-12--'.
  if 1 <= value <= 5:
    return str(value) + '--'
  if len(num) == 3 and value >= 100:
    return num
  return unknown

# NOTE(review): the result is bound to ds_Test, which is not referenced again
# in the visible notebook; later cells continue from ds_Update, not ds_Test.
# Confirm whether this was meant to be assigned back to ds_Update.
ds_Test = ds_Update.update(columns="community_board", func=fixCommunityBoard)

In [42]:
# There are some rows that use "X" for positive representation and empty for negative representation.
# We decided to standardize them to "Y" and "N" where "Y" for positive and "N" for negative.
def fixYN(x):
  """Map a positive marker to 'Y' and everything else to 'N'.

  'X', 'Y' and 'YES' count as positive. The comparison ignores case and
  surrounding whitespace, so 'y', 'Yes' and ' X ' are also positive.
  Non-string values (e.g. None) are treated as negative.
  """
  if isinstance(x, str) and x.strip().upper() in ('X', 'Y', 'YES'):
    return 'Y'
  return 'N'
ds_Update = ds_Update\
  .update(columns="residential", func=fixYN)

# The column "self_cert" has a similar problem.
def insertN(x):
  """Return x when it is already 'Y' or 'N'; otherwise return 'N'."""
  return x if x in ('Y', 'N') else 'N'
ds_Update = ds_Update.update(columns="self_cert", func=insertN)

# Some columns have empty values where the value should be "NONE".
def insertNone(x):
    """Replace an empty string with 'NONE'; return any other value unchanged.

    Without the final return, every non-empty value would be replaced by
    Python's None and the existing column contents would be lost.
    """
    if x == '':
        return 'NONE'
    return x
ds_Update = ds_Update.update(columns="site_fill", func=insertNone)
    

## Fix characters

There are some odd typos in the dataset. For example, in numeric values, a "0" is sometimes written as the letter "O". We also want to deal with those problems.

In [43]:
# Fix the characters in "Block" and "Lot" that actually represent "0"
def fixNum(num):
  """Rebuild a numeric string, treating 'O', '.' and '-' as the digit 0.

  A value that consists only of digits is returned as is. Otherwise digits
  are kept, each 'O', '.' or '-' becomes '0', and every other character is
  dropped.
  """
  if num.isdigit():
    return num
  zero_like = ('O', '.', '-')
  return "".join(
    ch if ch.isdigit() else '0'
    for ch in num
    if ch.isdigit() or ch in zero_like
  )

ds_Update = ds_Update.update(columns="block", func=fixNum)
ds_Update = ds_Update.update(columns="lot", func=fixNum)

# Correct misspelled city name
In the "city" column, some city names are misspelled. Take BROOKLYN for example: some values might look like BROKKLYN, BROOLKYN, etc. Therefore, we use soundex() to find the misspelled city names and replace them with the matching city name.

In [44]:
ds_Update.select('city').distinct()

Counter({'NEW YORK': 40,
         'L.I.C.': 3,
         'BROOKLYN': 37,
         'LIC': 4,
         'ROCKAWAY POINT': 1,
         'MANHATTAN': 3,
         'JAMAICA': 2,
         'STATEN ISLAND': 1,
         'NEW HYDE PARK': 1,
         'FAR ROCKAWAY': 1,
         'NY': 2,
         'JERSEY CITY': 1,
         'BRONX': 6,
         'LIC NY': 1,
         'ROSLYN': 3,
         'LAS VEGAS': 1,
         'NEW  YORK': 1,
         'BK': 2,
         'LONG ISLAND CIT': 1,
         'SECAUCUS': 1,
         'QUEENS': 1})

In [45]:
from openclean.function.eval.base import Col, Eval
from openclean.function.eval.logic import And
from openclean.function.value.phonetic import Soundex, soundex

ds_Update = ds_Update.update('city', str.upper)

In [46]:
# Fix the name of Brooklyn
def fixBrooklyn(name):
    """Return "BROOKLYN" when soundex(name) equals soundex("BROOKLYN").

    Any other name is returned unchanged. Every name whose soundex value
    matches is rewritten, not only misspellings of the target.
    """
    canonical = "BROOKLYN"
    return canonical if soundex(name) == soundex(canonical) else name

ds_Update = ds_Update.update(columns="city", func=fixBrooklyn)

In [47]:
# Fix the name of Long Island City
def fixLongIslandCity(name):
  """Return "LONG ISLAND CITY" when soundex(name) equals soundex("LONG ISLAND CITY").

  Any other name is returned unchanged. Every name whose soundex value
  matches is rewritten, not only misspellings of the target.
  """
  canonical = "LONG ISLAND CITY"
  return canonical if soundex(name) == soundex(canonical) else name

ds_Update = ds_Update.update(columns="city", func=fixLongIslandCity)

In [48]:
#Fix the name of Bronx
def fixBronx(name):
  """Return "BRONX" when soundex(name) equals soundex("BRONX").

  Any other name is returned unchanged. Every name whose soundex value
  matches is rewritten, not only misspellings of the target.
  """
  canonical = "BRONX"
  return canonical if soundex(name) == soundex(canonical) else name

ds_Update = ds_Update.update(columns="city", func=fixBronx)

In [49]:
#Fix the name of Manhattan
def fixManhattan(name):
  """Return "MANHATTAN" when soundex(name) equals soundex("MANHATTAN").

  Any other name is returned unchanged. Every name whose soundex value
  matches is rewritten, not only misspellings of the target.
  """
  canonical = "MANHATTAN"
  return canonical if soundex(name) == soundex(canonical) else name

ds_Update = ds_Update.update(columns="city", func=fixManhattan)

In [50]:
#Fix the name of New York
def fixNewYork(name):
    """Return "NEW YORK" when soundex(name) equals soundex("NEW YORK").

    Any other name is returned unchanged. Every name whose soundex value
    matches is rewritten, not only misspellings of the target.
    """
    canonical = "NEW YORK"
    return canonical if soundex(name) == soundex(canonical) else name

ds_Update = ds_Update.update(columns="city", func=fixNewYork)    

In [51]:
def fixFlushing(name):
    """Return "FLUSHING" when soundex(name) equals soundex("FLUSHING").

    Any other name is returned unchanged. Every name whose soundex value
    matches is rewritten, not only misspellings of the target.
    """
    canonical = "FLUSHING"
    return canonical if soundex(name) == soundex(canonical) else name
ds_Update = ds_Update.update(columns="city", func=fixFlushing)

In [52]:
ds_Update = ds_Update.to_df()
ds_Update.to_csv('./bty7-2jhb_program_modify.csv', encoding='utf-8', index=False)