# Install openclean
Using openclean to clean dataset.

In [None]:
# Please install openclean otherwise please skip this installation
!pip install openclean[full]

In [1]:
%matplotlib inline
import requests
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.signal import savgol_filter
import csv
from openclean.data.load import dataset
from openclean.data.source.socrata import Socrata
from openclean.profiling.dataset import dataset_profile

# Apply the techniques you used for Part 1 and measure their effectiveness

In [2]:
# There are 1838945 rows in origial dataset and 1263421 rows after cleaned.
ori_data_cnt, cleaned_data_cnt = 1838945, 1263421

# remove invalid data type. e.g.
valid_data_cnt = 1263347

precision = cleaned_data_cnt / ori_data_cnt
recall = valid_data_cnt / cleaned_data_cnt

print("Original data cleaning precision is {}".format(precision))
print("Original data cleaning recall is {}".format(recall))

Original data cleaning precision is 0.6870357732286718
Original data cleaning recall is 0.9999414288665457


# Data cleaning target
Improve/refine your techniques to cover the new data and compare its effectiveness with your original approach.

We removed all invalid data include unaccepted data type and blank cell. 

To improve our cleaning stratagy, we are going to fill empty cell by 'Unspecified', convert address under naming convention, standardlizate all datetime fields.

Example，

Collision reason:
    Unattention,
    ...
    
Address: 
    Street: 181 East street
    City: New York
    State: NY
    Zipcode: 10010
    
Date: 2021-12-09 (yyyy-MM-DD)
Time: 16:44:00 (hh:mm:ss)


# Create reference data for the data types you cleaned.


In [3]:
# Add zipcode reference

nyc_borough_zipcode_ref = pd.read_csv('data/ZIP_BOROUGH.csv', dtype=str)

nyc_zipcode_ref = {
    'Staten Islant':[10301,10302,10303,10304,10305,10306,10307,10308,10309,10310,10311,10312,\
                      10314,10301,10302,10303,10304,10305,10306,10307,10308,10309,10310,10311,\
                      10312,10314],
    'Queens':[11004,11101,11102,11103,11104,11105,11106,11109,11351,11354,11355,11356,11357,11358,\
              11359,11360,11361,11362,11363,11364,11365,11366,11367,11368,11369,11370,11371,11372,\
              11373,11374,11375,11377,11378,11379,11385,11411,11412,11413,11414,11415,11416,11417,\
              11418,11419,11420,11421,11422,11423,11426,11427,11428,11429,11430,11432,11433,11434,\
              11435,11436,11691,11692,11693,11694,11697,11101,11102,11103,11004,11104,11105,11106,\
              11109,11351,11354,11355,11356,11357,11358,11359,11360,11361,11362,11363,11364,11365,\
              11366,11367,11368,11369,11370,11371,11372,11373,11374,11375,11377,11378,11379,11385,\
              11411,11412,11413,11414,11415,11416,11417,11418,11419,11420,11421,11422,11423,11426,\
              11427,11428,11429,11430,11432,11433,11434,11435,11436,11691,11692,11693,11694,11697], 
    'Manhattan':[10001,10002,10003,10004,10005,10006,10007,10009,10010,10011,10012,10013,10014,10015,\
                 10016,10017,10018,10019,10020,10021,10022,10023,10024,10025,10026,10027,10028,10029,\
                 10030,10031,10032,10033,10034,10035,10036,10037,10038,10039,10040,10041,10044,10045,\
                 10048,10055,10060,10069,10090,10095,10098,10099,10103,10104,10105,10106,10107,10110,\
                 10111,10112,10115,10118,10119,10120,10121,10122,10123,10128,10151,10152,10153,10154,\
                 10155,10158,10161,10162,10165,10166,10167,10168,10169,10170,10171,10172,10173,10174,\
                 10175,10176,10177,10178,10199,10270,10271,10278,10279,10280,10281,10282,10001,10002,\
                 10003,10004,10005,10006,10007,10009,10010,10011,10012,10013,10014,10015,10016,10017,\
                 10018,10019,10020,10021,10022,10023,10024,10025,10026,10027,10028,10029,10030,10031,\
                 10032,10033,10034,10035,10036,10037,10038,10039,10040,10041,10044,10045,10048,10055,\
                 10060,10069,10090,10095,10098,10099,10103,10104,10105,10106,10107,10110,10111,10112,\
                 10115,10118,10119,10120,10121,10122,10123,10128,10151,10152,10153,10154,10155,10158,\
                 10161,10162,10165,10166,10167,10168,10169,10170,10171,10172,10173,10174,10175,10176,\
                 10177,10178,10199,10270,10271,10278,10279,10280,10281,10282],
    'Bronx':[10451,10452,10453,10454,10455,10456,10457,10458,10459,10460,10461,10462,10463,10464,10465,\
             10466,10467,10468,10469,10470,10471,10472,10473,10474,10475,10451,10452,10453,10454,10455,\
             10456,10457,10458,10459,10460,10461,10462,10463,10464,10465,10466,10467,10468,10469,10470,\
             10471,10472,10473,10474,10475]
    }

print(nyc_zipcode_ref)

{'Staten Islant': [10301, 10302, 10303, 10304, 10305, 10306, 10307, 10308, 10309, 10310, 10311, 10312, 10314, 10301, 10302, 10303, 10304, 10305, 10306, 10307, 10308, 10309, 10310, 10311, 10312, 10314], 'Queens': [11004, 11101, 11102, 11103, 11104, 11105, 11106, 11109, 11351, 11354, 11355, 11356, 11357, 11358, 11359, 11360, 11361, 11362, 11363, 11364, 11365, 11366, 11367, 11368, 11369, 11370, 11371, 11372, 11373, 11374, 11375, 11377, 11378, 11379, 11385, 11411, 11412, 11413, 11414, 11415, 11416, 11417, 11418, 11419, 11420, 11421, 11422, 11423, 11426, 11427, 11428, 11429, 11430, 11432, 11433, 11434, 11435, 11436, 11691, 11692, 11693, 11694, 11697, 11101, 11102, 11103, 11004, 11104, 11105, 11106, 11109, 11351, 11354, 11355, 11356, 11357, 11358, 11359, 11360, 11361, 11362, 11363, 11364, 11365, 11366, 11367, 11368, 11369, 11370, 11371, 11372, 11373, 11374, 11375, 11377, 11378, 11379, 11385, 11411, 11412, 11413, 11414, 11415, 11416, 11417, 11418, 11419, 11420, 11421, 11422, 11423, 11426, 114

## Cleaning column: BOROUGH    


Source dataset:
https://data.cityofnewyork.us/Transportation/Automated-Traffic-Volume-Counts/7ym2-wayt

Column name: Boro

Origial:
Delete it if it was empty or invalid Borough

Improvement:
Fill 'Unspecified' category

In [4]:
# sample data
datafile = "data/Automated_Traffic_Volume_Counts_sample1.csv"

# original data
# datafile = "data/Automated_Traffic_Volume_Counts.csv"
# load the file in memory
ds = dataset(datafile, encoding="utf-16")


In [5]:
from openclean.function.value.null import is_empty
from openclean.operator.transform.update import update

ds = update(ds, ["Boro"], lambda x: "Unspecified" if is_empty(x) else x)

In [6]:
# review dataset profile
profiles = dataset_profile(ds)
profiles.stats()

Unnamed: 0,total,empty,distinct,uniqueness,entropy
RequestID,2999,0,1393,0.464488,9.834483
Boro,2999,0,5,0.001667,2.153459
Yr,2999,0,15,0.005002,3.458017
M,2999,0,12,0.004001,3.329946
D,2999,0,31,0.010337,4.914259
HH,2999,0,24,0.008003,4.580641
MM,2999,0,4,0.001334,1.99896
Vol,2999,0,452,0.150717,7.8297
SegmentID,2999,0,2401,0.8006,11.090162
WktGeom,2999,0,2640,0.880293,11.291368


## Cleaning column: BOROUGH 

Source dataset:
https://data.cityofnewyork.us/Transportation/Traffic-Signal-and-All-Way-Stop-Study-Requests/w76s-c5u4

Column name: Borough

Origial:
Delete it if it was empty or invalid Borough

Improvement:
Fill 'Unspecified' category

In [7]:
# sample data
# datafile = "data/Traffic_Signal_and_All-Way_Stop_Study_Requests_sample1.csv"

# original data
datafile = "data/Traffic_Signal_and_All-Way_Stop_Study_Requests.csv"
# load the file in memory
ds = dataset(datafile, encoding="utf-8")


In [8]:
from openclean.function.value.null import is_empty
from openclean.operator.transform.update import update

ds = update(ds, ["Borough"], lambda x: "Unspecified" if is_empty(x) else x)

In [9]:
# review dataset profile
profiles = dataset_profile(ds)
profiles.stats()

Unnamed: 0,total,empty,distinct,uniqueness,entropy
Id,60130,0,60044,0.99857,15.872474
ReferenceNumber,60130,0,60044,0.99857,15.872474
StudyUnit,60130,0,2,3.3e-05,0.874897
DateCreated,60130,39225,1542,0.073762,10.206735
OldReferenceNumber,60130,43180,16364,0.965428,13.884466
ExternalReferenceNumber,60130,12933,42425,0.898892,15.196084
DateRequested,60130,0,5856,0.097389,11.972782
StatusDescription,60130,0,57,0.000948,2.268256
StudyStatus,60130,0,1,1.7e-05,0.0
StatusDate,60130,283,6304,0.105335,12.123976


## Cleaning column: BOROUGH/ZIPCODE

Source dataset:
https://data.cityofnewyork.us/City-Government/Mobile-Telecommunications-Franchise-Pole-Reservati/tbgj-tdd6

Column name: Borough/Zipcode

Origial:
Delete it if it was empty or invalid Borough

Improvement:
Based on zipcode fill Borough field if this cell was empty

In [10]:
def find_key(input_dict, value):
    return next((k for k, v in input_dict.items() if v == value), None)

In [11]:
# sample data
datafile = "data/Mobile_Telecommunications_Franchise_Pole_Reservation_Locations_sample1.csv"

# original data
# datafile = "data/Mobile_Telecommunications_Franchise_Pole_Reservation_Locations.csv"
# load the file in memory
ds = dataset(datafile, encoding="utf-16")


ds = update(ds, ["Borough", "Zipcode"], lambda x, y: ("Unspecified", y) if is_empty(x) and is_empty(y) else (x, y))
ds = update(ds, ["Borough", "Zipcode"], lambda x, y: (x, y) if not is_empty(x) else (find_key(nyc_zipcode_ref, y), y))

In [12]:
# review dataset profile
profiles = dataset_profile(ds)
profiles.stats()

Unnamed: 0,total,empty,distinct,uniqueness,entropy
Id,2999,0,2999,1.0,11.550266
Reservation Date,2999,0,146,0.048683,5.598481
Franchisee Name,2999,0,12,0.004001,3.445323
Status,2999,0,3,0.001,0.585593
Installation Date,2999,2917,52,0.634146,5.482254
Pole Class,2999,0,2,0.000667,0.799032
Borough,2999,0,5,0.001667,2.200983
X Coord.,2999,0,2989,0.996666,11.543597
Y Coord.,2999,0,2987,0.995999,11.542263
Latitude,2999,0,2992,0.997666,11.545598


## Cleaning column: CONTRIBUTING FACTOR VEHICLE 1(Collision reason)

Source dataset:
https://data.cityofnewyork.us/Public-Safety/Motor-Vehicle-Collisions-Person/f55k-p6yu

Column name: Pedestrain - CONTRIBUTING_FACTOR_1

Origial:
Delete it if it was empty or invalid CONTRIBUTING_FACTOR

Improvement:
Using KNN to find most similar reason cause the collision

In [13]:
from openclean.cluster.knn import knn_clusters
from openclean.function.similarity.base import SimilarityConstraint
from openclean.function.similarity.text import LevenshteinDistance
from openclean.function.value.threshold import GreaterThan
from openclean.function.eval.domain import Lookup


# sample data
datafile = "data/Person_sample11.csv"

# original data
# datafile = "data/Motor_Vehicle_Collisions_-_Person.csv"
# load the file in memory
ds = dataset(datafile, encoding="utf-16")


# cleaning using cluster and mapping
for i in range(1, 3): 
    col_name = "CONTRIBUTING_FACTOR_{}".format(i)

    # edit distance cluster
    clusters = knn_clusters(values=ds[col_name].unique().tolist(),\
                            sim=SimilarityConstraint(func=LevenshteinDistance(), pred=GreaterThan(0.8))
          )

    mapping = {}
    for cluster in clusters: 
        mapping.update(cluster.to_mapping())

    ds = update(ds, col_name, Lookup(columns=[col_name], mapping=mapping, default=col_name))

In [14]:
# review dataset profile
profiles = dataset_profile(ds)
profiles.stats()

Unnamed: 0,total,empty,distinct,uniqueness,entropy
UNIQUE_ID,2999,0,2999,1.0,11.550266
COLLISION_ID,2999,0,1021,0.340447,9.690908
CRASH_DATE,2999,0,102,0.034011,2.634173
CRASH_TIME,2999,0,480,0.160053,8.300347
PERSON_ID,2999,0,2999,1.0,11.550266
PERSON_TYPE,2999,0,3,0.001,0.184039
PERSON_INJURY,2999,0,2,0.000667,0.399919
VEHICLE_ID,2999,153,1538,0.540408,10.412097
PERSON_AGE,2999,648,88,0.037431,5.969127
EJECTION,2999,1430,5,0.003187,0.112124


## Cleaning column: CONTRIBUTING FACTOR VEHICLE 1(Collision reason)

Source dataset:
https://data.cityofnewyork.us/Public-Safety/Motor-Vehicle-Collisions-Vehicles/bm4k-52h4

Column name: Vechicles - CONTRIBUTING_FACTOR_1

Origial:
Delete it if it was empty or invalid CONTRIBUTING_FACTOR

Improvement:
Using KNN to find most similar reason cause the collision

In [15]:
# sample data
datafile = "data/Motor_Vehicle_Collisions_-_Vehicles_sample1.csv"

# original data
# datafile = "data/Motor_Vehicle_Collisions_-_Vehicles.csv"
# load the file in memory
ds = dataset(datafile, encoding="utf-16")


# cleaning using cluster and mapping
for i in range(1, 3): 
    col_name = "CONTRIBUTING_FACTOR_{}".format(i)

    # edit distance cluster
    clusters = knn_clusters(values=ds[col_name].unique().tolist(),\
                            sim=SimilarityConstraint(func=LevenshteinDistance(), pred=GreaterThan(0.8))
          )

    mapping = {}
    for cluster in clusters: 
        mapping.update(cluster.to_mapping())

    ds = update(ds, col_name, Lookup(columns=[col_name], mapping=mapping, default=col_name))

In [16]:
# review dataset profile
profiles = dataset_profile(ds)
profiles.stats()

Unnamed: 0,total,empty,distinct,uniqueness,entropy
UNIQUE_ID,2999,0,2999,1.0,11.550266
COLLISION_ID,2999,0,2595,0.865288,11.273971
CRASH_DATE,2999,0,1182,0.394131,8.256007
CRASH_TIME,2999,0,668,0.222741,8.43313
VEHICLE_ID,2999,0,1282,0.427476,6.099176
STATE_REGISTRATION,2999,102,39,0.013462,1.152376
VEHICLE_TYPE,2999,89,54,0.018557,3.459897
VEHICLE_MAKE,2999,1847,118,0.102431,4.805664
VEHICLE_MODEL,2999,2949,39,0.78,5.101468
VEHICLE_YEAR,2999,1844,37,0.032035,4.350989


## Cleaning column: VEHICLE_OCCUPANTS

Source dataset:
https://data.cityofnewyork.us/Public-Safety/Motor-Vehicle-Collisions-Person/f55k-p6yu

Column name: Vehicles - ["VEHICLE_OCCUPANTS"]

Origial:
Delete it if it was empty or invalid ["VEHICLE_OCCUPANTS"]

Improvement:
fill 0 if these columns are empty

In [17]:
ds = update(ds, ["VEHICLE_OCCUPANTS"], lambda x: 0 if is_empty(x) else x)

In [18]:
# review dataset profile
profiles = dataset_profile(ds)
profiles.stats()

Unnamed: 0,total,empty,distinct,uniqueness,entropy
UNIQUE_ID,2999,0,2999,1.0,11.550266
COLLISION_ID,2999,0,2595,0.865288,11.273971
CRASH_DATE,2999,0,1182,0.394131,8.256007
CRASH_TIME,2999,0,668,0.222741,8.43313
VEHICLE_ID,2999,0,1282,0.427476,6.099176
STATE_REGISTRATION,2999,102,39,0.013462,1.152376
VEHICLE_TYPE,2999,89,54,0.018557,3.459897
VEHICLE_MAKE,2999,1847,118,0.102431,4.805664
VEHICLE_MODEL,2999,2949,39,0.78,5.101468
VEHICLE_YEAR,2999,1844,37,0.032035,4.350989


## Cleaning column: VEHICLE TYPE CODE(Vechicle type)

Source dataset:
https://data.cityofnewyork.us/Transportation/Vehicle-Classification-Counts-2014-2019-/96ay-ea4r

Column name: Veh Class Type

Origial:
Delete it if it was empty or invalid Veh Class Type

Improvement:
Using KNN to find most similar vehicle type

In [19]:
# sample data
datafile = "data/Vehicle_Classification_Counts_sample1.csv"

# original data
# datafile = "data/Vehicle_Classification_Counts__2014-2019_.csv"
# load the file in memory
ds = dataset(datafile, encoding="utf-16")

# cleaning letter case and null

col_name = "Veh Class Type"
ds = update(ds, col_name, str.title)

ds = update(ds, col_name, lambda x: "Unknown" if is_empty(x) else x)

In [20]:
from openclean.cluster.knn import knn_collision_clusters
from openclean.function.similarity.text import JaroWinklerSimilarity

# cleaning using cluster and mapping

col_name = "Veh Class Type"

# edit distance cluster
clusters = knn_collision_clusters(values=ds[col_name].tolist(),\
                                  sim=SimilarityConstraint(func=LevenshteinDistance(), pred=GreaterThan(0.7))
     )

mapping = {}
for cluster in clusters: 
    mapping.update(cluster.to_mapping())

ds = update(ds, col_name, Lookup(columns=[col_name], mapping=mapping, default=col_name))

# common substr cluster
clusters = knn_collision_clusters(values=ds[col_name].tolist(),\
                                  sim=SimilarityConstraint(func=JaroWinklerSimilarity(), pred=GreaterThan(0.9))
     )

mapping = {}
for cluster in clusters: 
    mapping.update(cluster.to_mapping())

ds = update(ds, col_name, Lookup(columns=[col_name], mapping=mapping, default=col_name))

In [21]:
# select top 20 types and only keep these types
# all other types are regarded as unrecognizable values
top_types = ds["Veh Class Type"].value_counts().head(20).keys()

col_name = "Veh Class Type"

ds = update(ds, col_name, lambda x: "Unknown" if x not in top_types else x)

In [22]:
# review dataset profile
profiles = dataset_profile(ds)
profiles.stats()

Unnamed: 0,total,empty,distinct,uniqueness,entropy
ID,2999,0,110,0.036679,6.684061
SegmentID,2999,0,176,0.058686,7.358322
Roadway Name,2999,0,96,0.032011,6.358331
From,2999,0,123,0.041014,6.822644
To,2999,0,122,0.04068,6.806488
Direction,2999,0,5,0.001667,2.024744
Date,2999,0,47,0.015672,5.195482
Veh Class Type,2999,0,7,0.002334,2.807353
12:00-1:00 AM,2999,1467,246,0.160574,4.758128
1:00-2:00AM,2999,1466,212,0.138291,4.442395


## Cleaning column: CRASH DATE/CRASH TIME

Source dataset:
https://data.cityofnewyork.us/City-Government/Parking-Violations-Issued-Fiscal-Year-2022/pvqr-7yc4

Column name: Vehicle Expiration Date

Origial:
Delete it if it was empty or invalid Date

Improvement:
Convert invalid data type to datetime

In [23]:
# sample data
df = pd.read_csv('data/Parking_Violations_Issued_-_Fiscal_Year_2022_sample1.csv', dtype=str, encoding="utf-16")

# original data
# df = pd.read_csv('data/Parking_Violations_Issued_-_Fiscal_Year_2022.csv', dtype=str)
df['Vehicle Expiration Date'] = pd.to_datetime(df['Vehicle Expiration Date'], format='%y%m%d', errors='coerce')


In [24]:
df

Unnamed: 0,Summons Number,Plate ID,Registration State,Plate Type,Issue Date,Violation Code,Vehicle Body Type,Vehicle Make,Issuing Agency,Street Code1,...,Vehicle Color,Unregistered Vehicle?,Vehicle Year,Meter Number,Feet From Curb,Violation Post Code,Violation Description,No Standing or Stopping Violation,Hydrant Violation,Double Parking Violation
0,1457617912,JEB5683,NY,PAS,06/25/2021,40,VAN,FORD,P,63430,...,GY,0,2007,-,3,,,,,
1,1457617924,JAN2986,NY,PAS,06/25/2021,20,SUBN,DODGE,P,13490,...,BLU,0,2007,-,0,,,,,
2,1457622427,FJH6630,TX,PAS,06/17/2021,98,SDN,AUDI,P,79430,...,WHITE,0,0,-,0,,,,,
3,1457638629,RD1Y5N,MO,PAS,06/16/2021,98,SDN,TOYOT,P,53130,...,TAN,0,2001,-,0,,,,,
4,1457639580,T503814C,NY,OMT,07/04/2021,40,TAXI,HONDA,P,81030,...,WHI,0,2020,-,0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2994,1472129763,FJJ9206,NC,PAS,07/09/2021,98,SDN,HONDA,P,23930,...,RED,0,2015,-,0,,,,,
2995,1472130157,LHF1656,PA,PAS,07/03/2021,14,SUBN,HONDA,P,78130,...,BLUE,0,0,-,0,,,,,
2996,1472130200,CRPF005,99,PAS,06/26/2021,98,SDN,MAZDA,P,78330,...,GY,0,0,-,0,,,,,
2997,1472130212,AM01831,CT,PAS,06/26/2021,98,SDN,SUBAR,P,78330,...,BLACK,0,0,-,0,,,,,


## Cleaning column: ON STREET NAME

Source dataset:
https://data.cityofnewyork.us/Transportation/Street-Closures-due-to-construction-activities-by-/i6b5-j7bu

Column name: ONSTREETNAME

Origial:
Delete it if it was empty or invalid ONSTREETNAME

Improvement:
Using KNN to find most similar Street name

In [25]:
# sample data
datafile = "data/Street_Closures_due_to_construction_activities_by_Block_sample1.csv"

# original data
# datafile = "data/Street_Closures_due_to_construction_activities_by_Block.csv"
# load the file in memory
ds = dataset(datafile, encoding="utf-16")

col_name = "ONSTREETNAME"

# edit distance cluster
clusters = knn_clusters(values=ds[col_name].unique().tolist(),\
                        sim=SimilarityConstraint(func=LevenshteinDistance(), pred=GreaterThan(0.8))
      )

mapping = {}
for cluster in clusters: 
    mapping.update(cluster.to_mapping())

ds = update(ds, col_name, Lookup(columns=[col_name], mapping=mapping, default=col_name))

In [26]:
# review dataset profile
profiles = dataset_profile(ds)
profiles.stats()

Unnamed: 0,total,empty,distinct,uniqueness,entropy
SEGMENTID,2999,0,1854,0.618206,10.544037
ONSTREETNAME,2999,0,358,0.119373,6.614724
FROMSTREETNAME,2999,0,685,0.228409,8.28442
TOSTREETNAME,2999,0,746,0.24875,8.567063
BOROUGH_CODE,2999,0,5,0.001667,2.148741
WORK_START_DATE,2999,0,714,0.238079,7.956535
WORK_END_DATE,2999,0,254,0.084695,6.549187
PURPOSE,2999,0,35,0.011671,3.098242


## Cleaning column: ON STREET NAME

Source dataset:
https://data.cityofnewyork.us/Transportation/Street-Closures-due-to-construction-activities-by-/478a-yykk

Column name: ONSTREETNAME

Origial:
Delete it if it was empty or invalid ONSTREETNAME

Improvement:
Using KNN to find most similar Street name

In [27]:
# sample data
datafile = "data/Street_Closures_due_to_construction_activities_by_Intersection_sample1.csv"

# original data
# datafile = "data/Street_Closures_due_to_construction_activities_by_Intersection.csv"
# load the file in memory
ds = dataset(datafile, encoding="utf-16")

col_name = "ONSTREETNAME"

# edit distance cluster
clusters = knn_clusters(values=ds[col_name].unique().tolist(),\
                        sim=SimilarityConstraint(func=LevenshteinDistance(), pred=GreaterThan(0.8))
      )

mapping = {}
for cluster in clusters: 
    mapping.update(cluster.to_mapping())

ds = update(ds, col_name, Lookup(columns=[col_name], mapping=mapping, default=col_name))

In [28]:
# review dataset profile
profiles = dataset_profile(ds)
profiles.stats()

Unnamed: 0,total,empty,distinct,uniqueness,entropy
NODEID,2999,0,1979,0.659887,10.753798
ONSTREETNAME,2999,0,432,0.144048,7.054032
FROMSTREETNAME,2999,0,867,0.289096,9.06227
BOROUGH_CODE,2999,0,5,0.001667,1.715284
WORK_START_DATE,2999,0,1210,0.403468,8.590246
WORK_END_DATE,2999,0,235,0.078359,5.873009
PURPOSE,2999,0,29,0.00967,2.331951


## Cleaning column: ON STREET NAME

Source dataset:
https://data.cityofnewyork.us/Transportation/Parking-Regulation-Locations-and-Signs/xswq-wnv9

Column name: main_st

Origial:
Delete it if it was empty or invalid main_st

Improvement:
Using KNN to find most similar Street name

In [29]:
# sample data
datafile = "data/locations_sample1.csv"

# original data
# datafile = "data/locations.csv"
# load the file in memory
ds = dataset(datafile, encoding="utf-16")

col_name = "main_st"

# edit distance cluster
clusters = knn_clusters(values=ds[col_name].unique().tolist(),\
                        sim=SimilarityConstraint(func=LevenshteinDistance(), pred=GreaterThan(0.7))
      )

mapping = {}
for cluster in clusters: 
    mapping.update(cluster.to_mapping())

ds = update(ds, col_name, Lookup(columns=[col_name], mapping=mapping, default=col_name))

In [30]:
# review dataset profile
profiles = dataset_profile(ds)
profiles.stats()

Unnamed: 0,total,empty,distinct,uniqueness,entropy
boro,2999,0,1,0.000333,0.0
order_no,2999,0,2999,1.0,11.550266
main_st,2999,0,280,0.093364,6.281427
from_st,2999,0,751,0.250417,8.818426
to_st,2999,0,762,0.254085,8.842728
sos,2999,0,4,0.001334,1.998387
