# Group 11: DOB Job Application Filings - Data Profiling and Data Cleaning
Team members: Peng-Yuan Chen (pc2973), Chun-Yen Liou (cyl625), Tsung-Lin Yang (ty2065)

In the following we perform the data profiling and data cleaning on the dataset of [DOB Job Application Filings](https://data.cityofnewyork.us/Housing-Development/DOB-Job-Application-Filings/ic3t-wcy2).

This dataset includes all the job applications submitted to the Department of Buildings (DOB) through the Borough Offices, through eFiling, or through the HUB. It covers applications whose "Latest Action Date" is on or after January 1, 2000.

The dataset consists of over 1.77 million rows and the data file is about 1 GB in size. The dataset is available for download via the Socrata Open Data API (SODA).

In [27]:
# Since we are using Google Colab, we have to first install openclean library.
!pip install openclean_notebook
!pip install openclean
!pip install openclean_geo



You should consider upgrading via the '/Library/Frameworks/Python.framework/Versions/3.9/bin/python3 -m pip install --upgrade pip' command.[0m


You should consider upgrading via the '/Library/Frameworks/Python.framework/Versions/3.9/bin/python3 -m pip install --upgrade pip' command.[0m


You should consider upgrading via the '/Library/Frameworks/Python.framework/Versions/3.9/bin/python3 -m pip install --upgrade pip' command.[0m


In [28]:
# Import necessary libraries
import os
import requests
from openclean.pipeline import stream
import pandas as pd


# Download the 'DOB Job Application Filings' dataset unless a local copy exists.
# NOTE(review): the resource id used below (dm9a-ab7w) differs from the id in the
# dataset link given in the introduction (ic3t-wcy2) -- confirm the right dataset.
# NOTE(review): a plain SODA resource URL may return only a default page of rows
# unless a $limit is supplied -- confirm before treating this as the full dataset.
csvPath = './dm9a-ab7w.csv'
csvPath_new = './dm9a-ab7w_raw.csv'
if not os.path.isfile(csvPath):
    csvUrl = "https://data.cityofnewyork.us/resource/dm9a-ab7w.csv"
    req = requests.get(csvUrl, timeout=60)
    # Stop on an HTTP error so an error page is never cached as the CSV
    # (the isfile() check above would otherwise reuse it on every later run).
    req.raise_for_status()
    url_content = req.content
    # The context manager closes the file even if the write fails.
    with open(csvPath, 'wb') as outfile:
        outfile.write(url_content)

# Keep only the first 12 rows as the working sample and save it under csvPath_new.
ds_Full = pd.read_csv(csvPath, nrows=12)
ds_Full.to_csv(csvPath_new, encoding='utf-8', index=False)

# Data Profiling

Let's first do some preliminary profiling on the dataset so that we can gain some insight about the data.

In [29]:
# Do the preliminary profiling
from openclean.profiling.column import DefaultColumnProfiler

# Re-open the sample CSV written above as an openclean stream; this rebinds
# ds_Full, which previously held the pandas DataFrame.
ds_Full = stream(csvPath_new)
# Profile the stream, passing DefaultColumnProfiler as the default profiler.
profiles = ds_Full.profile(default_profiler=DefaultColumnProfiler)

In [30]:
# Take a look at the column names in this dataset
ds_Full.columns

['job_filing_number',
 'job_number',
 'filing_number',
 'filing_date',
 'filing_type',
 'filing_status',
 'job_status',
 'house_number',
 'street_name',
 'borough',
 'zip_code',
 'block',
 'lot',
 'bin',
 'community_board',
 'joint_venture_work',
 'building_use_type',
 'applicant_first_name',
 'applicant_last_name',
 'license_type',
 'license_number',
 'firm_name',
 'firm_number',
 'firm_address',
 'city',
 'state',
 'zip',
 'general_liability_company',
 'general_liability_policy',
 'general_liability_expiration',
 'worker_comp_company_name',
 'worker_comp_policy',
 'worker_comp_expiration_date',
 'disability_company_name',
 'disability_policy',
 'disability_expiration_date',
 'owner_first_name',
 'owner_last_name',
 'title',
 'business_name',
 'owner_address',
 'owner_city',
 'owner_state',
 'owner_zip',
 'owner_type',
 'auth_rep_first_name',
 'auth_rep_last_name',
 'auth_rep_owner_relation',
 'coo_related',
 'const_bis_job_number',
 'removal_of_vio_or_owner',
 'svc_work_notify_utilit

In [31]:
# Take a look at the first 10 rows
ds_Full.head()

Unnamed: 0,job_filing_number,job_number,filing_number,filing_date,filing_type,filing_status,job_status,house_number,street_name,borough,...,amount_paid,amount_due,payment_method,gis_latitude,gis_longitude,gis_council_district,gis_census_tract,gis_bin,gis_bbl,gis_nta_name
0,B00083823-P1,B00083823,P1,2021-06-02T00:00:00.000,PAA,Approved,Job in Process,2438,EAST 7 STREET,BROOKLYN,...,40,0,,40.590828,-73.962756,48,370,3195471,3072030014,Brighton Beach
1,B00143086-P1,B00143086,P1,2021-04-27T00:00:00.000,PAA,Approved,Job in Process,601,OCEAN VIEW AVENUE,BROOKLYN,...,40,0,,40.579607,-73.962071,48,364,3244470,3086660610,Brighton Beach
2,B00262288-P1,B00262288,P1,2021-06-02T00:00:00.000,PAA,Approved,Job in Process,84,WITHERS STREET,BROOKLYN,...,40,0,,40.716973,-73.948873,34,501,3068248,3027420015,East Williamsburg
3,B00304119-P3,B00304119,P3,2021-06-02T00:00:00.000,PAA,Approved,Job in Process,85,3 STREET,BROOKLYN,...,40,0,,40.677481,-73.992963,39,77,3007911,3004610058,Carroll Gardens-Columbia Street-Red Hook
4,B00304755-P2,B00304755,P2,2021-06-02T00:00:00.000,PAA,Approved,Job in Process,251,WALLABOUT STREET,BROOKLYN,...,40,0,,40.701158,-73.949433,33,507,3000000,3022490037,Bedford
5,B00088313-P1,B00088313,P1,2021-07-27T00:00:00.000,PAA,Approved,Job in Process,1578,59 STREET,BROOKLYN,...,40,0,,40.625819,-73.991988,44,242,3000000,3055090039,Borough Park
6,B00364423-P1,B00364423,P1,2021-06-02T00:00:00.000,PAA,Approved,Job in Process,89,MOFFAT STREET,BROOKLYN,...,40,0,,40.685366,-73.908557,37,403,3079933,3034390051,Bushwick South
7,B00364423-P2,B00364423,P2,2021-06-02T00:00:00.000,PAA,Approved,Job in Process,89,MOFFAT STREET,BROOKLYN,...,40,0,,40.685366,-73.908557,37,403,3079933,3034390051,Bushwick South
8,B00371529-P1,B00371529,P1,2021-06-02T00:00:00.000,PAA,Approved,Job in Process,899,PACIFIC STREET,BROOKLYN,...,40,0,,40.680517,-73.965536,35,203,3027478,3011220045,Prospect Heights
9,B00377661-P1,B00377661,P1,2021-04-27T00:00:00.000,PAA,Approved,Job in Process,1567,63 STREET,BROOKLYN,...,40,0,,40.623667,-73.994542,38,250,3259616,3055300050,Bensonhurst West


In [32]:
# See how many rows are there in this dataset
ds_Full.count()

12

In [33]:
# Output the profiling result
profiles

[{'column': 'job_filing_number',
  'stats': {'totalValueCount': 12,
   'emptyValueCount': 0,
   'datatypes': defaultdict(collections.Counter,
               {'total': Counter({'str': 12}),
                'distinct': Counter({'str': 12})}),
   'minmaxValues': {'str': {'minimum': 'B00083823-P1',
     'maximum': 'B00458761-P1'}},
   'distinctValueCount': 12,
   'entropy': 3.584962500721157,
   'topValues': [('B00083823-P1', 1),
    ('B00143086-P1', 1),
    ('B00262288-P1', 1),
    ('B00304119-P3', 1),
    ('B00304755-P2', 1),
    ('B00088313-P1', 1),
    ('B00364423-P1', 1),
    ('B00364423-P2', 1),
    ('B00371529-P1', 1),
    ('B00377661-P1', 1)]}},
 {'column': 'job_number',
  'stats': {'totalValueCount': 12,
   'emptyValueCount': 0,
   'datatypes': defaultdict(collections.Counter,
               {'total': Counter({'str': 12}),
                'distinct': Counter({'str': 11})}),
   'minmaxValues': {'str': {'minimum': 'B00083823', 'maximum': 'B00458761'}},
   'distinctValueCount': 11,
 

In [34]:
# Tabulate the per-column profile: total, empty and distinct counts plus uniqueness and entropy
profiles.stats()

Unnamed: 0,total,empty,distinct,uniqueness,entropy
job_filing_number,12,0,12,1.000000,3.584963
job_number,12,0,11,0.916667,3.418296
filing_number,12,0,3,0.250000,1.040852
filing_date,12,0,3,0.250000,1.280672
filing_type,12,0,1,0.083333,0.000000
...,...,...,...,...,...
gis_council_district,12,0,9,0.750000,3.084963
gis_census_tract,12,0,11,0.916667,3.418296
gis_bin,12,0,10,0.833333,3.251629
gis_bbl,12,0,11,0.916667,3.418296


In [35]:
# Detect which column has empty value and its amount
profiles.stats()['empty']

job_filing_number       0
job_number              0
filing_number           0
filing_date             0
filing_type             0
                       ..
gis_council_district    0
gis_census_tract        0
gis_bin                 0
gis_bbl                 0
gis_nta_name            0
Name: empty, Length: 84, dtype: int64

In [36]:
# Check the inconsistent datatype
# Now we can investigate the outliers issue in this dataset
profiles.multitype_columns().types()

Unnamed: 0,int,str
worker_comp_policy,1,9


# Data Cleaning

In [37]:
from openclean.operator.transform.update import update
from openclean.function.eval.base import Col
from openclean.function.eval.datatype import IsDatetime
from openclean.function.eval.null import IsEmpty
from openclean.function.eval.null import IsNotEmpty
from openclean.function.eval.datatype import IsInt
from openclean.operator.transform.filter import filter
from openclean.function.eval.datatype import IsFloat
from openclean.function.eval.logic import And

## Remove rows with empty/problematic values that are not possible to recover

There are columns in this dataset that have empty or wrong values. Normally, we would try to recover the missing values. Yet, values in some columns simply cannot be inferred from other values. In this case, we can only choose to remove those rows.

In [38]:
# Start the cleaning steps from the profiled stream (plain rebinding, nothing is filtered).
# NOTE(review): the section text describes removing rows, but this cell removes
# none -- confirm whether a filter step is missing here.
ds_Update = ds_Full

## Data Standardization

In some cases, different values may actually represent the same thing. For example, "5th Avenue" and "Fifth AVE" both refer to "5th AVE". Thus, we need to standardize the data.

In [39]:
# Street name is an example of a column that needs to be standardized.
# The lines below cluster the street names so that the variants can be inspected.


from openclean.cluster.key import KeyCollision
from openclean_geo.address.usstreet import USStreetNameKey

# Upper-case the street names and collect the distinct values of that column.
street_names = ds_Update.update('street_name', str.upper).distinct('street_name')
# Cluster the distinct values with KeyCollision, using USStreetNameKey as the key
# function and 3 threads.
clusters = KeyCollision(func=USStreetNameKey(), threads=3).clusters(street_names)

def print_k_clusters(clusters, k=5):
    """Print a summary line and then the k largest clusters.

    Args:
        clusters: Iterable of mappings from value to count, one per cluster.
        k: Maximum number of clusters to print, largest first.
    """
    ranked = sorted(clusters, key=len, reverse=True)
    total_values = sum(len(group) for group in ranked)
    print('Total number of clusters is {} with {} values'.format(len(ranked), total_values))
    # max(k, 0) keeps a negative k from slicing off the tail instead of printing nothing.
    for position, group in enumerate(ranked[:max(k, 0)], start=1):
        print('\nCluster {}'.format(position))
        for value, count in group.items():
            # Show the empty string as a visible pair of quotes.
            label = "''" if value == '' else value
            print(f'  {label} (x {count})')
print_k_clusters(clusters)


Total number of clusters is 0 with 0 values


In [40]:
# Standardize the "Street Name"
from openclean_geo.address.usstreet import StandardizeUSStreetName
# Rewrite street_name with StandardizeUSStreetName; characters='upper' is passed
# through (presumably upper-cases the result -- confirm against openclean_geo docs).
ds_Update = ds_Update.update(columns="street_name", func=StandardizeUSStreetName(characters='upper'))

In [41]:
# Fix "Community - Board" so the column has consistent format
def fixCommunityBoard(num):
    """Normalize a community-board value to a consistent 3-character code.

    Rules:
      * Only inputs of length 1 or 3 that parse as an integer are considered;
        anything else yields '---'.
      * A value from 1 to 5 becomes '<value>--' (e.g. '3' or '003' -> '3--').
      * A 3-character value of 100 or more is returned unchanged.
      * Every other value yields '---'.

    Zero and negative numbers are rejected so the result is never a malformed
    code such as '0--' or '-12--'.

    Args:
        num: The raw community-board string.

    Returns:
        The normalized code, or '---' when the value cannot be interpreted.
    """
    unknown = '---'
    if len(num) not in (1, 3):
        return unknown
    try:
        value = int(num)
    except (TypeError, ValueError):
        # Not an integer: only conversion errors are treated as "unknown", so
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        return unknown
    if 1 <= value <= 5:
        return str(value) + '--'
    if len(num) == 3 and value >= 100:
        return num
    return unknown

# Apply fixCommunityBoard to the community_board column.
# NOTE(review): the result is bound to ds_Test, which is not used again in the
# visible notebook, so ds_Update (the stream that gets exported) does not receive
# this fix -- confirm whether ds_Update was intended.
ds_Test = ds_Update.update(columns="community_board", func=fixCommunityBoard)

In [42]:
# There are some rows that use "X" for positive representation and empty for negative representation.
# We decided to standardize them to "Y" and "N" where "Y" for positive and "N" for negative.
def fixYN(x):
    """Map a positive marker ('X', 'Y' or 'YES') to 'Y' and anything else to 'N'."""
    positive_markers = ('X', 'Y', 'YES')
    return 'Y' if x in positive_markers else 'N'


# The column "Cluster" has similar problem.
def insertN(x):
    """Return x when it is already 'Y' or 'N'; otherwise fall back to 'N'."""
    if x == 'Y' or x == 'N':
        return x
    return 'N'


# Some columns have empty values where the value should be "NONE".
def insertNone(x):
    """Replace an empty string with 'NONE' and return any other value unchanged.

    The non-empty case must return x explicitly; without it the function
    returns None and would wipe every populated value in the column.
    """
    if x == '':
        return 'NONE'
    return x

    

## Fix characters

There are some odd typos in the dataset. For example, in numeric values, a "0" is sometimes written as the letter "O". We also want to deal with those problems.

In [43]:
# Fix the characters in "Block" and "Lot" that actually represent "0"
def fixNum(num):
    """Return num reduced to digits, reading 'O', '.' and '-' as '0'.

    A string that is already all digits is returned as is. Otherwise digits are
    kept, the characters 'O', '.' and '-' become '0', and every other character
    is dropped.
    """
    if num.isdigit():
        return num
    zero_like = ('O', '.', '-')
    return ''.join(
        ch if ch.isdigit() else '0'
        for ch in num
        if ch.isdigit() or ch in zero_like
    )

# Run fixNum over the "block" and "lot" columns, one update per column.
ds_Update = ds_Update.update(columns="block", func=fixNum)
ds_Update = ds_Update.update(columns="lot", func=fixNum)

# Correct misspelled city name
In the "City" column, some city names are misspelled. Take BROOKLYN for example: some values might look like BROKKLYN, BROOLKYN, etc. Therefore, we use soundex() to find the misspelled city names and replace them with the matching city name.

In [44]:
# List the distinct city values together with how often each one occurs
ds_Update.select('city').distinct()

Counter({'RICHMOND HILL': 2,
         'LONG ISLAND CITY': 1,
         'BROOKLYN': 4,
         'STATEN ISLAND': 1,
         'FLUSHING': 2,
         'OZONE PARK': 1,
         'BRONX': 1})

In [45]:
from openclean.function.eval.base import Col, Eval
from openclean.function.eval.logic import And
from openclean.function.value.phonetic import Soundex, soundex

# Upper-case every city value; the fix functions that follow compare against
# and write upper-case city names.
ds_Update = ds_Update.update('city', str.upper)

In [46]:
# Fix the name of Brooklyn
def fixBrooklyn(name):
    """Return "BROOKLYN" if name shares its Soundex code, else name unchanged.

    NOTE(review): every value with a matching Soundex code is rewritten, not
    only misspellings -- confirm no legitimate city name collides.
    """
    matches_target = soundex(name) == soundex("BROOKLYN")
    return "BROOKLYN" if matches_target else name

ds_Update = ds_Update.update(columns="city", func=fixBrooklyn)

In [47]:
# Fix the name of Long Island City
def fixLongIslandCity(name):
    """Return "LONG ISLAND CITY" if name shares its Soundex code, else name unchanged.

    NOTE(review): every value with a matching Soundex code is rewritten, not
    only misspellings -- confirm no legitimate city name collides.
    """
    matches_target = soundex(name) == soundex("LONG ISLAND CITY")
    return "LONG ISLAND CITY" if matches_target else name

ds_Update = ds_Update.update(columns="city", func=fixLongIslandCity)

In [48]:
# Fix the name of Bronx
def fixBronx(name):
    """Return "BRONX" if name shares its Soundex code, else name unchanged.

    NOTE(review): every value with a matching Soundex code is rewritten, not
    only misspellings -- confirm no legitimate city name collides.
    """
    matches_target = soundex(name) == soundex("BRONX")
    return "BRONX" if matches_target else name

ds_Update = ds_Update.update(columns="city", func=fixBronx)

In [49]:
# Fix the name of Manhattan
def fixManhattan(name):
    """Return "MANHATTAN" if name shares its Soundex code, else name unchanged.

    NOTE(review): every value with a matching Soundex code is rewritten, not
    only misspellings -- confirm no legitimate city name collides.
    """
    matches_target = soundex(name) == soundex("MANHATTAN")
    return "MANHATTAN" if matches_target else name

ds_Update = ds_Update.update(columns="city", func=fixManhattan)

In [50]:
# Fix the name of New York
def fixNewYork(name):
    """Return "NEW YORK" if name shares its Soundex code, else name unchanged.

    NOTE(review): every value with a matching Soundex code is rewritten, not
    only misspellings -- confirm no legitimate city name collides.
    """
    matches_target = soundex(name) == soundex("NEW YORK")
    return "NEW YORK" if matches_target else name

ds_Update = ds_Update.update(columns="city", func=fixNewYork)

In [51]:
def fixFlushing(name):
    """Return "FLUSHING" if name shares its Soundex code, else name unchanged.

    NOTE(review): every value with a matching Soundex code is rewritten, not
    only misspellings -- confirm no legitimate city name collides.
    """
    matches_target = soundex(name) == soundex("FLUSHING")
    return "FLUSHING" if matches_target else name

ds_Update = ds_Update.update(columns="city", func=fixFlushing)

In [52]:
# Convert the cleaned stream with to_df() (ds_Update is rebound to the result)
# and write it to a UTF-8 CSV file without the index column.
ds_Update = ds_Update.to_df()
ds_Update.to_csv('./dm9a-ab7w_program_modify.csv', encoding='utf-8', index=False)