# Final Project - Data Profiling and Data Cleaning (DOB Cellular Antenna Filings)
Team members: Peng-Yuan Chen (pc2973), Chun-Yen Liou (cyl625), Tsung-Lin Yang (ty2065)



In [1]:
# Since we are using Google Colab, we have to first install openclean library.
!pip install openclean_notebook
!pip install openclean
!pip install openclean_geo



You should consider upgrading via the '/Library/Frameworks/Python.framework/Versions/3.9/bin/python3 -m pip install --upgrade pip' command.[0m


You should consider upgrading via the '/Library/Frameworks/Python.framework/Versions/3.9/bin/python3 -m pip install --upgrade pip' command.[0m


You should consider upgrading via the '/Library/Frameworks/Python.framework/Versions/3.9/bin/python3 -m pip install --upgrade pip' command.[0m


In [2]:
# Import necessary libraries
import os
import requests
import random
from openclean.pipeline import stream
import pandas as pd


# Download the 'DOB Cellular Antenna Filings' dataset (NYC Open Data id iz2q-9x8d)
# unless a local copy already exists.
csvPath = './iz2q-9x8d.csv'
csvPath_new = './iz2q-9x8d_raw.csv'
if not os.path.isfile(csvPath):
  csvUrl = "https://data.cityofnewyork.us/api/views/iz2q-9x8d/rows.csv"
  # A timeout keeps the cell from hanging forever on a stalled connection.
  req = requests.get(csvUrl, timeout=60)
  # Fail loudly on HTTP errors instead of saving an error page as the CSV file.
  req.raise_for_status()
  url_content = req.content
  # The context manager closes the file even if the write fails.
  with open(csvPath, 'wb') as outfile:
    outfile.write(url_content)

# NOTE: nrows=1 keeps only the first data row, so every profiling result below
# describes a one-row sample. Raise or remove nrows to profile more of the data.
ds_Full = pd.read_csv(csvPath, nrows=1)
# Write the sample to the path that the profiling cell streams from.
ds_Full.to_csv(csvPath_new, encoding='utf-8', index=False)

# Data Profiling

Let's first do some preliminary profiling on the dataset so that we can gain some insight about the data.

In [3]:
# Do the preliminary profiling.
# NOTE: ds_Full is rebound here. It was the pandas DataFrame read in the download cell;
# from this point on it is the object returned by stream() for the file at csvPath_new.
from openclean.profiling.column import DefaultColumnProfiler
ds_Full = stream(csvPath_new)
# Profile the streamed data, passing DefaultColumnProfiler as the default profiler.
profiles = ds_Full.profile(default_profiler=DefaultColumnProfiler)

In [4]:
# Take a look at the column names in this dataset
ds_Full.columns

['Job #',
 'Doc #',
 'Borough',
 'House #',
 'Street Name',
 'Block',
 'Lot',
 'Bin #',
 'Job Type',
 'Job Status',
 'Job Status Descrp',
 'Latest Action Date',
 'Building Type',
 'Community - Board',
 'Landmarked',
 'Little e',
 'PC Filed',
 'Other',
 'Other Description',
 "Applicant's First Name",
 "Applicant's Last Name",
 'Applicant Professional Title',
 'Applicant License #',
 'Professional Cert',
 'Pre- Filing Date',
 'Paid',
 'Fully Paid',
 'Assigned',
 'Approved',
 'Fully Permitted',
 'Initial Cost',
 'Total Est. Fee',
 'Fee Status',
 'Existing Occupancy',
 'Proposed Occupancy',
 'Zoning Distr 1',
 'Zoning Distr 2',
 'Zoning Distr 3',
 'Special Distr 1',
 'Special Distr 2 ',
 'Owner Type',
 'Owner Type Description',
 'Non-Profit',
 "Owner's First Name",
 "Owner's Last Name",
 "Owner's Business Name",
 "Owner's  House #",
 "Owner's  House Street",
 'City',
 'State',
 'Zip Code',
 "Owner's  Phone #",
 'First Permit  Date',
 'Job Description',
 'DOBRunDate']

In [5]:
# Take a look at the first 10 rows
ds_Full.head()

Unnamed: 0,Job #,Doc #,Borough,House #,Street Name,Block,Lot,Bin #,Job Type,Job Status,...,Owner's Business Name,Owner's House #,Owner's House Street,City,State,Zip Code,Owner's Phone #,First Permit Date,Job Description,DOBRunDate
0,321410745,1,3,96,SCHERMERHORN STREET,271,47,3002707,A3,R,...,DOUGLAS ELLIMAN PROPERTY MANAGEM,96,SCHERMERHORN STREET,BROOKLYN,NY,11201,7188474820,08/16/2016 12:00:00 AM,INSTALLATION OF MICROWAVE DISH ON ROOF. ALL IN...,08/17/2016 00:00:00


In [6]:
# See how many rows are there in this dataset
ds_Full.count()

1

In [7]:
# Output the profiling result
profiles

[{'column': 'Job #',
  'stats': {'totalValueCount': 1,
   'emptyValueCount': 0,
   'datatypes': defaultdict(collections.Counter,
               {'total': Counter({'int': 1}),
                'distinct': Counter({'int': 1})}),
   'minmaxValues': {'int': {'minimum': 321410745, 'maximum': 321410745}},
   'distinctValueCount': 1,
   'entropy': 0.0,
   'topValues': [('321410745', 1)]}},
 {'column': 'Doc #',
  'stats': {'totalValueCount': 1,
   'emptyValueCount': 0,
   'datatypes': defaultdict(collections.Counter,
               {'total': Counter({'int': 1}),
                'distinct': Counter({'int': 1})}),
   'minmaxValues': {'int': {'minimum': 1, 'maximum': 1}},
   'distinctValueCount': 1,
   'entropy': 0.0,
   'topValues': [('1', 1)]}},
 {'column': 'Borough',
  'stats': {'totalValueCount': 1,
   'emptyValueCount': 0,
   'datatypes': defaultdict(collections.Counter,
               {'total': Counter({'int': 1}),
                'distinct': Counter({'int': 1})}),
   'minmaxValues': {'int':

In [8]:
# Show the per-column summary table (total, empty, distinct, uniqueness, entropy)
profiles.stats()

Unnamed: 0,total,empty,distinct,uniqueness,entropy
Job #,1,0,1,1.0,0.0
Doc #,1,0,1,1.0,0.0
Borough,1,0,1,1.0,0.0
House #,1,0,1,1.0,0.0
Street Name,1,0,1,1.0,0.0
Block,1,0,1,1.0,0.0
Lot,1,0,1,1.0,0.0
Bin #,1,0,1,1.0,0.0
Job Type,1,0,1,1.0,0.0
Job Status,1,0,1,1.0,0.0


In [9]:
# Detect which column has empty value and its amount
profiles.stats()['empty']

Job #                           0
Doc #                           0
Borough                         0
House #                         0
Street Name                     0
Block                           0
Lot                             0
Bin #                           0
Job Type                        0
Job Status                      0
Job Status Descrp               0
Latest Action Date              0
Building Type                   0
Community - Board               0
Landmarked                      1
Little e                        1
PC Filed                        0
Other                           0
Other Description               0
Applicant's First Name          0
Applicant's Last Name           0
Applicant Professional Title    0
Applicant License #             0
Professional Cert               1
Pre- Filing Date                0
Paid                            0
Fully Paid                      0
Assigned                        0
Approved                        0
Fully Permitte

In [10]:
# Check for inconsistent datatypes: show the types reported for the columns
# that the profile flags as multi-type (presumably columns holding values of
# more than one detected datatype — confirm against the openclean docs).
profiles.multitype_columns().types()

# Data Cleaning

In [11]:
from openclean.operator.transform.update import update
from openclean.function.eval.base import Col
from openclean.function.eval.datatype import IsDatetime
from openclean.function.eval.null import IsEmpty
from openclean.function.eval.null import IsNotEmpty
from openclean.function.eval.datatype import IsInt
from openclean.operator.transform.filter import filter
from openclean.function.eval.datatype import IsFloat
from openclean.function.eval.logic import And
from openclean.function.eval.datatype import Datetime
from datetime import datetime
from openclean.function.eval.datatype import Str

## Remove rows with empty/problematic values that are not possible to recover

There are columns in this dataset that have empty or wrong values. Normally, we would try to recover the missing values. Yet, the values in some columns simply cannot be inferred from other values. In that case, we can only choose to remove those rows.

In [12]:
# Start the cleaning steps from the full stream. No rows are filtered out here:
# ds_Update is simply a second name for the same object as ds_Full.
ds_Update = ds_Full

## Data Standardization

In some cases, different values may actually represent the same thing. For example, "5th Avenue" and "Fifth AVE" both refer to "5th AVE". Thus, we need to standardize the data.

In [13]:
# Street name is an example that needs to be standardized.
# Cluster the distinct, upper-cased street names by their USStreetNameKey and inspect the result.


from openclean.cluster.key import KeyCollision
from openclean_geo.address.usstreet import USStreetNameKey

# Upper-case the street names first, then collect the distinct values.
street_names = ds_Update.update('Street Name', str.upper).distinct('Street Name')
# Group values that collide on the same USStreetNameKey (run with threads=3).
clusters = KeyCollision(func=USStreetNameKey(), threads=3).clusters(street_names)

def print_k_clusters(clusters, k=5):
    """Print a summary line followed by the `k` largest clusters.

    Args:
        clusters: iterable of mappings from a value to its count.
        k: maximum number of clusters to print, largest first.

    Each cluster is printed under a "Cluster <i>" header with one line per
    value; an empty-string value is shown as '' so it stays visible.
    """
    ranked = sorted(clusters, key=len, reverse=True)
    total_values = sum(len(group) for group in ranked)
    print('Total number of clusters is {} with {} values'.format(len(ranked), total_values))
    # max(k, 0) keeps a negative k from slicing off the tail instead of printing nothing.
    for position, group in enumerate(ranked[:max(k, 0)], start=1):
        print('\nCluster {}'.format(position))
        for value, occurrences in group.items():
            label = "''" if value == '' else value
            print(f'  {label} (x {occurrences})')
# Print the largest street-name clusters (at most the default k=5).
print_k_clusters(clusters)

Total number of clusters is 0 with 0 values


In [14]:
# Standardize the "street_name"
from openclean_geo.address.usstreet import StandardizeUSStreetName
ds_Update = ds_Update.update(columns="Street Name", func=StandardizeUSStreetName(characters='upper'))

In [15]:
# Fix "Community - Board" so the column has consistent format
def fixCommunityBoard(num):
  """Normalize a "Community - Board" value to a three-character string.

  Rules (unchanged from the original implementation):
    * length 1: a number <= 5 becomes "<digit>--"; anything else becomes "---".
    * length 3: a number <= 5 becomes "<number>--", a number below 100
      becomes "---", and any other number is returned unchanged.
    * every other length, and any value that cannot be parsed as an integer,
      becomes "---".

  Args:
    num: the raw column value (a string).

  Returns:
    The normalized value as described above.
  """
  if len(num) not in (1, 3):
    return '---'
  try:
    board = int(num)
    if len(num) == 1:
      return num + '--' if board <= 5 else '---'
    if board <= 5:
      return str(board) + '--'
    if board < 100:
      return '---'
    return num
  except (TypeError, ValueError):
    # Only conversion problems are treated as "unknown". A bare `except:` would
    # also swallow KeyboardInterrupt/SystemExit and hide unrelated bugs.
    return '---'

# NOTE(review): the result is stored in ds_Test rather than ds_Update, and ds_Test is
# not referenced by the later cells shown in this notebook, so the exported CSV does
# not contain this fix — confirm whether that is intentional.
ds_Test = ds_Update.update(columns="Community - Board", func=fixCommunityBoard)

In [16]:
# We decided to standardize them to "Y" and "N" where "Y" for positive and "N" for negative.
def fixYN(x):
  """Map the marker 'X' to 'N' and leave every other value untouched.

  The original fell off the end of the function for any value other than 'X'
  and therefore returned None, which replaced those cells with None. Returning
  the input keeps the existing data intact.

  NOTE(review): 'X' is mapped to 'N' as in the original code. If 'X' is meant
  to mark a selected/positive entry, this should probably be 'Y' — confirm
  against the dataset's data dictionary.

  Args:
    x: the raw column value.

  Returns:
    'N' when x == 'X', otherwise x unchanged.
  """
  if x == 'X':
    return 'N'
  return x
# Apply fixYN to every value of the "Other" column.
ds_Update = ds_Update.update(columns="Other", func=fixYN)



def insertN(x):
  """Return x when it is already 'Y' or 'N'; otherwise return 'N'."""
  return x if x in ('Y', 'N') else 'N'
# Apply insertN to each Y/N flag column in turn (same order as before).
for flag_column in ("Landmarked", "PC Filed", "Professional Cert"):
    ds_Update = ds_Update.update(columns=flag_column, func=insertN)

def insertN_little_e(x):
    """Keep the hazardous-notice marker 'H'; replace every other value with 'N'."""
    return x if x == 'H' else 'N'
# Keep the result in ds_Update (capital "U") so that the "Little e" fix is carried
# through the remaining steps and ends up in the exported CSV. Binding it to the
# differently-cased name ds_update silently dropped the fix.
ds_Update = ds_Update.update(columns="Little e", func=insertN_little_e)


## Fix characters

There are some odd typos in the dataset. For example, in numeric values, a character that should be "0" is sometimes written as "O". We also want to deal with those problems.

In [17]:
# Fix the characters in "Block" and "Lot" that actually represent "0"
def fixNum(num):
  """Clean a numeric string by repairing zero look-alikes.

  A value made up only of digits is returned as is. Otherwise digits are
  kept, each 'O', '.' or '-' is replaced with '0', and every other
  character is dropped.

  Args:
    num: the raw column value (a string).

  Returns:
    The cleaned string (possibly empty).
  """
  if num.isdigit():
    return num
  zero_like = ('O', '.', '-')
  return "".join(
    ch if ch.isdigit() else '0'
    for ch in num
    if ch.isdigit() or ch in zero_like
  )

# Repair zero look-alikes in both numeric identifier columns.
for numeric_column in ("Block", "Lot"):
  ds_Update = ds_Update.update(columns=numeric_column, func=fixNum)

# Correct misspelled city name
In the "City" column, some city names are misspelled. Take BROOKLYN for example: some values might look like BROKKLYN, BROOLKYN, etc. Therefore, we use soundex() to find the misspelled city names and replace them with the matching correct city name.

In [25]:
# Inspect the distinct values of the "City" column in the unmodified data.
# Note that the value shown in the output below is padded with trailing spaces.
ds_Full.select('City').distinct()

Counter({'BROOKLYN       ': 1})

In [19]:
from openclean.function.eval.base import Col, Eval
from openclean.function.eval.logic import And
from openclean.function.value.phonetic import Soundex, soundex

In [20]:
# Fix the name of englewoodcliff
def fixEnglewoodCliff(name):
  """Return "ENGLEWOOD CLIFF" when `name` has the same Soundex key, else `name` unchanged.

  NOTE(review): Soundex keys are short, so a different city that happens to
  share the key (possibly plain "ENGLEWOOD") would be rewritten too — verify
  against the distinct City values before relying on this.
  """
  canonical = "ENGLEWOOD CLIFF"
  return canonical if soundex(name) == soundex(canonical) else name

ds_Update = ds_Update.update(columns="City", func=fixEnglewoodCliff)

In [21]:
#Fix the name of NEW YORK
def fixNewYork(name):
  """Return "NEW YORK" when `name` has the same Soundex key, else `name` unchanged.

  NOTE(review): Soundex keys are short, so a different city that happens to
  share the key (possibly "NEWARK") would be rewritten too — verify against
  the distinct City values before relying on this.
  """
  canonical = "NEW YORK"
  return canonical if soundex(name) == soundex(canonical) else name

ds_Update = ds_Update.update(columns="City", func=fixNewYork)

In [22]:
#Fix the name of LONG ISLAND CITY
def fixLongIslandCity(name):
  """Return "LONG ISLAND CITY" when `name` has the same Soundex key, else `name` unchanged.

  NOTE(review): Soundex keys are short, so other names starting with a
  similar-sounding prefix may share the key and be rewritten too — verify
  against the distinct City values before relying on this.
  """
  canonical = "LONG ISLAND CITY"
  return canonical if soundex(name) == soundex(canonical) else name

ds_Update = ds_Update.update(columns="City", func=fixLongIslandCity)

In [23]:
#Fix the name of ROOSEVELT ISLAND
def fixRooseveltIsland(name):
  """Return "ROOSEVELT ISLAND" when `name` has the same Soundex key, else `name` unchanged.

  NOTE(review): Soundex keys are short, so other names starting with a
  similar-sounding prefix (possibly plain "ROOSEVELT") may share the key and
  be rewritten too — verify against the distinct City values.
  """
  canonical = "ROOSEVELT ISLAND"
  return canonical if soundex(name) == soundex(canonical) else name

ds_Update = ds_Update.update(columns="City", func=fixRooseveltIsland)

In [24]:
# Materialize the cleaned data via to_df() (presumably a pandas DataFrame) and export it as CSV.
# NOTE: ds_Update is rebound to the to_df() result here, so re-running this cell on
# its own would call .to_df() on that result instead of on the openclean stream.
ds_Update = ds_Update.to_df()
# index=False leaves the row index out of the exported file.
ds_Update.to_csv('./iz2q-9x8d_program_modify.csv', encoding='utf-8', index=False)