# Group 11: DOB Job Application Filings - Data Profiling and Data Cleaning
Team members: Peng-Yuan Chen (pc2973), Chun-Yen Liou (cyl625), Tsung-Lin Yang (ty2065)

In this notebook we perform data profiling and data cleaning on the [DOB Job Application Filings](https://data.cityofnewyork.us/Housing-Development/DOB-Job-Application-Filings/ic3t-wcy2) dataset.

This dataset includes all the job applications submitted to the Department of Buildings (DOB) through the Borough Offices, through eFiling, or through the HUB that have a "Latest Action Date" on or after January 1, 2000.

The dataset consists of over 1.77 million rows and the data file is about 1 GB in size. The dataset is available for download via the Socrata Open Data API (SODA).

In [1]:
# Since we are using Google Colab, we have to first install openclean library.
!pip install openclean_notebook
!pip install openclean
!pip install openclean_geo



You should consider upgrading via the '/Library/Frameworks/Python.framework/Versions/3.9/bin/python3 -m pip install --upgrade pip' command.


You should consider upgrading via the '/Library/Frameworks/Python.framework/Versions/3.9/bin/python3 -m pip install --upgrade pip' command.


You should consider upgrading via the '/Library/Frameworks/Python.framework/Versions/3.9/bin/python3 -m pip install --upgrade pip' command.


In [2]:
# Import necessary libraries
import os
import requests
from openclean.pipeline import stream
import pandas as pd


# Download the full 'DOB Job Application Filings' dataset, unless a copy is
# already cached at csvPath.
csvPath = './8fei-z6rz.csv'
csvPath_new = './8fei-z6rz_raw.csv'
if not os.path.isfile(csvPath):
    csvUrl = "https://data.cityofnewyork.us/api/views/8fei-z6rz/rows.csv"
    req = requests.get(csvUrl)
    # Fail loudly on an HTTP error instead of caching an error page as the CSV.
    req.raise_for_status()
    # The context manager closes the file even if the write fails part-way.
    with open(csvPath, 'wb') as outfile:
        outfile.write(req.content)

# Keep only the first two rows as a small working sample and store it under
# csvPath_new, the path the profiling cell streams from.
ds_Full = pd.read_csv(csvPath, nrows=2)
ds_Full.to_csv(csvPath_new, encoding='utf-8', index=False)

# Data Profiling

Let's first do some preliminary profiling on the dataset so that we can gain some insight about the data.

In [3]:
# Do the preliminary profiling
from openclean.profiling.column import DefaultColumnProfiler

# Re-open the sample CSV with openclean's stream(); this rebinds ds_Full,
# which until now held the pandas DataFrame created in the download cell.
ds_Full = stream(csvPath_new)
# Profile the columns with DefaultColumnProfiler; the result is inspected below.
profiles = ds_Full.profile(default_profiler=DefaultColumnProfiler)

In [4]:
# Take a look at the column names in this dataset
ds_Full.columns

['Record ID',
 'Business Name',
 'Industry',
 'Certificate Number',
 'Inspection Date',
 'Inspection Result',
 'Test Type',
 'Approved',
 'Condemned',
 'Confiscated',
 'Notes',
 'Building Number',
 'Street',
 'City',
 'State',
 'Zip',
 'Unit',
 'Longitude',
 'Latitude']

In [5]:
# Take a look at the first 10 rows
ds_Full.head()

Unnamed: 0,Record ID,Business Name,Industry,Certificate Number,Inspection Date,Inspection Result,Test Type,Approved,Condemned,Confiscated,Notes,Building Number,Street,City,State,Zip,Unit,Longitude,Latitude
0,32262-2020-ENFO,1750 WALTON LAUNDROMAT CORP,Laundries,A0012847,12/14/2020,Pass,SCALE TO 661 LBS-02,1,0,0,,1750,WALTON AVE,BRONX,NY,10453,,-77.51985087626232,40.11248699222675
1,16390-2019-ENFO,"THE TJX COMPANIES, INC.",Megastore - 821,03088329,03/19/2019,Pass,SCANNER-69,9,1,0,,410,GATEWAY DR,BROOKLYN,NY,11239,,-73.87705635618526,40.652876989285126


In [6]:
# See how many rows are there in this dataset
ds_Full.count()

2

In [7]:
# Output the profiling result
profiles

[{'column': 'Record ID',
  'stats': {'totalValueCount': 2,
   'emptyValueCount': 0,
   'datatypes': defaultdict(collections.Counter,
               {'total': Counter({'str': 2}),
                'distinct': Counter({'str': 2})}),
   'minmaxValues': {'str': {'minimum': '16390-2019-ENFO',
     'maximum': '32262-2020-ENFO'}},
   'distinctValueCount': 2,
   'entropy': 1.0,
   'topValues': [('32262-2020-ENFO', 1), ('16390-2019-ENFO', 1)]}},
 {'column': 'Business Name',
  'stats': {'totalValueCount': 2,
   'emptyValueCount': 0,
   'datatypes': defaultdict(collections.Counter,
               {'total': Counter({'str': 2}),
                'distinct': Counter({'str': 2})}),
   'minmaxValues': {'str': {'minimum': '1750 WALTON LAUNDROMAT CORP',
     'maximum': 'THE TJX COMPANIES, INC.'}},
   'distinctValueCount': 2,
   'entropy': 1.0,
   'topValues': [('1750 WALTON LAUNDROMAT CORP', 1),
    ('THE TJX COMPANIES, INC.', 1)]}},
 {'column': 'Industry',
  'stats': {'totalValueCount': 2,
   'emptyValue

In [8]:
profiles.stats()

Unnamed: 0,total,empty,distinct,uniqueness,entropy
Record ID,2,0,2,1.0,1.0
Business Name,2,0,2,1.0,1.0
Industry,2,0,2,1.0,1.0
Certificate Number,2,0,2,1.0,1.0
Inspection Date,2,0,2,1.0,1.0
Inspection Result,2,0,1,0.5,0.0
Test Type,2,0,2,1.0,1.0
Approved,2,0,2,1.0,1.0
Condemned,2,0,2,1.0,1.0
Confiscated,2,0,1,0.5,0.0


In [9]:
# Detect which column has empty value and its amount
profiles.stats()['empty']

Record ID             0
Business Name         0
Industry              0
Certificate Number    0
Inspection Date       0
Inspection Result     0
Test Type             0
Approved              0
Condemned             0
Confiscated           0
Notes                 2
Building Number       0
Street                0
City                  0
State                 0
Zip                   0
Unit                  2
Longitude             0
Latitude              0
Name: empty, dtype: int64

In [10]:
# Check the inconsistent datatype
# Now we can investigate the outliers issue in this dataset
profiles.multitype_columns().types()

Unnamed: 0,int,str
Certificate Number,1,1


# Data Cleaning

In [11]:
from openclean.operator.transform.update import update
from openclean.function.eval.base import Col
from openclean.function.eval.datatype import IsDatetime
from openclean.function.eval.null import IsEmpty
from openclean.function.eval.null import IsNotEmpty
from openclean.function.eval.datatype import IsInt
from openclean.operator.transform.filter import filter
from openclean.function.eval.datatype import IsFloat
from openclean.function.eval.logic import And

## Remove rows with empty/problematic values that are not possible to recover

There are columns in this dataset that have empty or wrong values. Normally, we would try to recover the missing values. Yet, values in some columns simply cannot be inferred from other values. In this case, we can only choose to remove those rows.

In [12]:
# NOTE(review): no rows are actually removed in this cell; ds_Update is just
# another name for the ds_Full stream and is what the later cells update.
ds_Update = ds_Full

## Data Standardization

In some cases, different values may actually represent the same thing. For example, "5th Avenue" and "Fifth AVE" both refer to "5th AVE". Thus, we need to standardize the data.

In [13]:
# Street name is an example that needs to be standardized.
# Uncomment the lines below and see the clustering of the street names.

'''
from openclean.cluster.key import KeyCollision
from openclean_geo.address.usstreet import USStreetNameKey

street_names = ds_Update.update('street_name', str.upper).distinct('street_name')
clusters = KeyCollision(func=USStreetNameKey(), threads=3).clusters(street_names)

def print_k_clusters(clusters, k=5):
    clusters = sorted(clusters, key=lambda x: len(x), reverse=True)
    val_count = sum([len(c) for c in clusters])
    print('Total number of clusters is {} with {} values'.format(len(clusters), val_count))
    for i in range(min(k, len(clusters))):
        print('\nCluster {}'.format(i + 1))
        for key, cnt in clusters[i].items():
            if key == '':
                key = "''"
            print(f'  {key} (x {cnt})')
print_k_clusters(clusters)
'''

'\nfrom openclean.cluster.key import KeyCollision\nfrom openclean_geo.address.usstreet import USStreetNameKey\n\nstreet_names = ds_Update.update(\'street_name\', str.upper).distinct(\'street_name\')\nclusters = KeyCollision(func=USStreetNameKey(), threads=3).clusters(street_names)\n\ndef print_k_clusters(clusters, k=5):\n    clusters = sorted(clusters, key=lambda x: len(x), reverse=True)\n    val_count = sum([len(c) for c in clusters])\n    print(\'Total number of clusters is {} with {} values\'.format(len(clusters), val_count))\n    for i in range(min(k, len(clusters))):\n        print(\'\nCluster {}\'.format(i + 1))\n        for key, cnt in clusters[i].items():\n            if key == \'\':\n                key = "\'\'"\n            print(f\'  {key} (x {cnt})\')\nprint_k_clusters(clusters)\n'

In [14]:
# Standardize the "Street Name"
from openclean_geo.address.usstreet import StandardizeUSStreetName
#ds_Update = ds_Update.update(columns="street_name", func=StandardizeUSStreetName(characters='upper'))

In [15]:
# Fix "Community - Board" so the column has consistent format
def fixCommunityBoard(num):
    """Normalize a "Community - Board" value to a consistent 3-character form.

    Rules (``'---'`` marks a value that cannot be fixed):

    * Only inputs of length 1 or 3 that ``int()`` can parse are accepted;
      everything else yields ``'---'``.
    * Length 1: a value <= 5 is padded to ``num + '--'``; larger values
      yield ``'---'``.
    * Length 3: a value <= 5 becomes ``str(int(num)) + '--'`` (leading zeros
      dropped), a value below 100 yields ``'---'``, and any other value is
      returned unchanged.

    Args:
        num: The raw column value (expected to be a string).

    Returns:
        The normalized string, or ``'---'`` for unrecoverable input.
    """
    invalid = '---'
    length = len(num)
    if length not in (1, 3):
        return invalid
    try:
        value = int(num)
    except (TypeError, ValueError):
        # Only conversion failures mean "bad value"; a bare except would also
        # hide unrelated errors such as KeyboardInterrupt.
        return invalid
    if length == 1:
        return num + '--' if value <= 5 else invalid
    if value <= 5:
        return str(value) + '--'
    if value < 100:
        return invalid
    return num

# NOTE(review): the result is bound to ds_Test, not ds_Update, so this fix is
# not part of the data that the visible later cells update and export — confirm intent.
ds_Test = ds_Update.update(columns="community_board", func=fixCommunityBoard)

In [16]:
# There are some rows that use "X" for positive representation and empty for negative representation.
# We decided to standardize them to "Y" and "N" where "Y" for positive and "N" for negative.
def fixYN(x):
    """Collapse a flag value to 'Y' or 'N'.

    'X', 'Y' and 'YES' count as positive and give 'Y'; every other value
    (including the empty string) gives 'N'.
    """
    positive_markers = ('X', 'Y', 'YES')
    return 'Y' if x in positive_markers else 'N'


# The column "Cluster" has similar problem.
def insertN(x):
    """Return ``x`` when it is already 'Y' or 'N'; otherwise return 'N'."""
    return x if x in ('Y', 'N') else 'N'


# Some columns have empty values where the value should be "NONE".
def insertNone(x):
    """Replace an empty string with the placeholder 'NONE'.

    Any other value is returned unchanged. Without the final ``return x`` the
    function would fall off the end and return None for every non-empty
    value, erasing valid data when used as a column update function.
    """
    if x == '':
        return 'NONE'
    return x

    

## Fix characters

There are some odd typos in the dataset. For example, in numeric values, what should be "0" is written as "O". We also want to deal with those problems.

In [17]:
# Fix the characters in "Block" and "Lot" that actually represent "0"
def fixNum(num):
    """Reduce ``num`` to digits, treating 'O', '.' and '-' as zeros.

    A string that already consists only of digits is returned as is.
    Otherwise each 'O', '.' and '-' is replaced by '0' and every remaining
    non-digit character is dropped.
    """
    if num.isdigit():
        return num
    # Map the zero look-alikes to '0' first, then keep only digit characters.
    zero_like = str.maketrans('O.-', '000')
    return ''.join(filter(str.isdigit, num.translate(zero_like)))

#ds_Update = ds_Update.update(columns="block", func=fixNum)
#ds_Update = ds_Update.update(columns="lot", func=fixNum)

# Correct misspelled city name
In the "City" column, some city names are misspelled. Take BROOKLYN for example: some values might look like BROKKLYN, BROOLKYN, etc. Therefore, we use soundex() to find the misspelled city names and replace them with the matching correct city name.

In [18]:
ds_Update.select('City').distinct()

Counter({'BRONX': 1, 'BROOKLYN': 1})

In [19]:
from openclean.function.eval.base import Col, Eval
from openclean.function.eval.logic import And
from openclean.function.value.phonetic import Soundex, soundex

# Upper-case every value in the 'City' column before the per-city fixes below.
ds_Update = ds_Update.update('City', str.upper)

In [20]:
# Fix the name of Brooklyn
def fixBrooklyn(name):
    """Return "BROOKLYN" if ``name`` has the same soundex code as "BROOKLYN".

    Any other value is returned unchanged.
    """
    canonical = "BROOKLYN"
    return canonical if soundex(name) == soundex(canonical) else name


# Apply the correction to every value of the "City" column.
ds_Update = ds_Update.update(columns="City", func=fixBrooklyn)

In [21]:
# Fix the name of Long Island City
def fixLongIslandCity(name):
    """Return "LONG ISLAND CITY" if ``name`` shares its soundex code.

    Any other value is returned unchanged.
    """
    canonical = "LONG ISLAND CITY"
    return canonical if soundex(name) == soundex(canonical) else name


# Apply the correction to every value of the "City" column.
ds_Update = ds_Update.update(columns="City", func=fixLongIslandCity)

In [22]:
#Fix the name of Bronx
def fixBronx(name):
    """Return "BRONX" if ``name`` has the same soundex code as "BRONX".

    Any other value is returned unchanged.
    """
    canonical = "BRONX"
    return canonical if soundex(name) == soundex(canonical) else name


# Apply the correction to every value of the "City" column.
ds_Update = ds_Update.update(columns="City", func=fixBronx)

In [23]:
# Fix the name of Manhattan
def fixManhattan(name):
    """Return "MANHATTAN" if ``name`` has the same soundex code as "MANHATTAN".

    Any other value is returned unchanged.
    """
    canonical = "MANHATTAN"
    return canonical if soundex(name) == soundex(canonical) else name


# Apply the correction to every value of the "City" column.
ds_Update = ds_Update.update(columns="City", func=fixManhattan)

In [24]:
#Fix the name of New York
def fixNewYork(name):
    """Return "NEW YORK" if ``name`` has the same soundex code as "NEW YORK".

    Any other value is returned unchanged.
    """
    canonical = "NEW YORK"
    return canonical if soundex(name) == soundex(canonical) else name


# Apply the correction to every value of the "City" column.
ds_Update = ds_Update.update(columns="City", func=fixNewYork)

In [25]:
def fixFlushing(name):
    """Return "FLUSHING" if ``name`` has the same soundex code as "FLUSHING".

    Any other value is returned unchanged.
    """
    canonical = "FLUSHING"
    return canonical if soundex(name) == soundex(canonical) else name


# Apply the correction to every value of the "City" column.
ds_Update = ds_Update.update(columns="City", func=fixFlushing)

In [26]:
# Convert the updated stream with to_df() (presumably into a pandas DataFrame —
# confirm against openclean) and write the result to CSV without an index column.
ds_Update = ds_Update.to_df()
ds_Update.to_csv('./8fei-z6rz_program_modify.csv', encoding='utf-8', index=False)