In [1]:
%matplotlib inline
import requests
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.signal import savgol_filter
import csv
from openclean.data.load import dataset
from openclean.data.source.socrata import Socrata
from openclean.profiling.dataset import dataset_profile

# Apply the techniques you used for Part 1 and measure their effectiveness

In [2]:
# There are 1838945 rows in origial dataset and 1263421 rows after cleaned.
ori_data_cnt, cleaned_data_cnt = 1838945, 1263421

# remove invalid data type. e.g.
valid_data_cnt = 1263347

precision = cleaned_data_cnt / ori_data_cnt
recall = valid_data_cnt / cleaned_data_cnt

print(precision, recall)

0.6870357732286718 0.9999414288665457


## Data cleaning target
Improve/refine your techniques to cover the new data and compare its effectiveness with your original approach.

We removed all invalid data include unaccepted data type and blank cell. 

To improve our cleaning stratagy, we are going to fill empty cell by 'Unspecified', convert address under naming convention, standardlizate all datetime fields.

Example，

Collision reason:
    Unattention,
    ...
    
Address: 
    Street: 181 East street
    City: New York
    State: NY
    Zipcode: 10010
    
Date: 2021-12-09 (yyyy-MM-DD)
Time: 16:44:00 (hh:mm:ss)


In [16]:
# Add zipcode reference

nyc_zipcode_ref = {
    'Staten Islant':[10301,10302,10303,10304,10305,10306,10307,10308,10309,10310,10311,10312,\
                      10314,10301,10302,10303,10304,10305,10306,10307,10308,10309,10310,10311,\
                      10312,10314],
    'Queens':[11004,11101,11102,11103,11104,11105,11106,11109,11351,11354,11355,11356,11357,11358,\
              11359,11360,11361,11362,11363,11364,11365,11366,11367,11368,11369,11370,11371,11372,\
              11373,11374,11375,11377,11378,11379,11385,11411,11412,11413,11414,11415,11416,11417,\
              11418,11419,11420,11421,11422,11423,11426,11427,11428,11429,11430,11432,11433,11434,\
              11435,11436,11691,11692,11693,11694,11697,11101,11102,11103,11004,11104,11105,11106,\
              11109,11351,11354,11355,11356,11357,11358,11359,11360,11361,11362,11363,11364,11365,\
              11366,11367,11368,11369,11370,11371,11372,11373,11374,11375,11377,11378,11379,11385,\
              11411,11412,11413,11414,11415,11416,11417,11418,11419,11420,11421,11422,11423,11426,\
              11427,11428,11429,11430,11432,11433,11434,11435,11436,11691,11692,11693,11694,11697], 
    'Manhattan':[10001,10002,10003,10004,10005,10006,10007,10009,10010,10011,10012,10013,10014,10015,\
                 10016,10017,10018,10019,10020,10021,10022,10023,10024,10025,10026,10027,10028,10029,\
                 10030,10031,10032,10033,10034,10035,10036,10037,10038,10039,10040,10041,10044,10045,\
                 10048,10055,10060,10069,10090,10095,10098,10099,10103,10104,10105,10106,10107,10110,\
                 10111,10112,10115,10118,10119,10120,10121,10122,10123,10128,10151,10152,10153,10154,\
                 10155,10158,10161,10162,10165,10166,10167,10168,10169,10170,10171,10172,10173,10174,\
                 10175,10176,10177,10178,10199,10270,10271,10278,10279,10280,10281,10282,10001,10002,\
                 10003,10004,10005,10006,10007,10009,10010,10011,10012,10013,10014,10015,10016,10017,\
                 10018,10019,10020,10021,10022,10023,10024,10025,10026,10027,10028,10029,10030,10031,\
                 10032,10033,10034,10035,10036,10037,10038,10039,10040,10041,10044,10045,10048,10055,\
                 10060,10069,10090,10095,10098,10099,10103,10104,10105,10106,10107,10110,10111,10112,\
                 10115,10118,10119,10120,10121,10122,10123,10128,10151,10152,10153,10154,10155,10158,\
                 10161,10162,10165,10166,10167,10168,10169,10170,10171,10172,10173,10174,10175,10176,\
                 10177,10178,10199,10270,10271,10278,10279,10280,10281,10282],
    'Bronx':[10451,10452,10453,10454,10455,10456,10457,10458,10459,10460,10461,10462,10463,10464,10465,\
             10466,10467,10468,10469,10470,10471,10472,10473,10474,10475,10451,10452,10453,10454,10455,\
             10456,10457,10458,10459,10460,10461,10462,10463,10464,10465,10466,10467,10468,10469,10470,\
             10471,10472,10473,10474,10475]
    }

print(nyc_zipcode_ref)

{'Staten Islant': [10301, 10302, 10303, 10304, 10305, 10306, 10307, 10308, 10309, 10310, 10311, 10312, 10314, 10301, 10302, 10303, 10304, 10305, 10306, 10307, 10308, 10309, 10310, 10311, 10312, 10314], 'Queens': [11004, 11101, 11102, 11103, 11104, 11105, 11106, 11109, 11351, 11354, 11355, 11356, 11357, 11358, 11359, 11360, 11361, 11362, 11363, 11364, 11365, 11366, 11367, 11368, 11369, 11370, 11371, 11372, 11373, 11374, 11375, 11377, 11378, 11379, 11385, 11411, 11412, 11413, 11414, 11415, 11416, 11417, 11418, 11419, 11420, 11421, 11422, 11423, 11426, 11427, 11428, 11429, 11430, 11432, 11433, 11434, 11435, 11436, 11691, 11692, 11693, 11694, 11697, 11101, 11102, 11103, 11004, 11104, 11105, 11106, 11109, 11351, 11354, 11355, 11356, 11357, 11358, 11359, 11360, 11361, 11362, 11363, 11364, 11365, 11366, 11367, 11368, 11369, 11370, 11371, 11372, 11373, 11374, 11375, 11377, 11378, 11379, 11385, 11411, 11412, 11413, 11414, 11415, 11416, 11417, 11418, 11419, 11420, 11421, 11422, 11423, 11426, 114

## Cleaning column: BOROUGH    

Source dataset:
https://data.cityofnewyork.us/Transportation/Automated-Traffic-Volume-Counts/7ym2-wayt

Column name: Boro

Origial:
Delete it if it was empty or invalid Borough

Improvement:
Fill 'Unspecified' category

In [4]:
datafile = "data/Automated_Traffic_Volume_Counts.csv"
# load the file in memory
ds = dataset(datafile)


FileNotFoundError: [Errno 2] No such file or directory: 'data/Automated_Traffic_Volume_Counts.csv'

In [7]:
from openclean.function.value.null import is_empty
from openclean.operator.transform.update import update

ds = update(
    ds, 
    ["Boro"], 
    lambda x: "Unspecified" if is_empty(x) else x
    )

## Cleaning column: BOROUGH 

Source dataset:
https://data.cityofnewyork.us/Transportation/Traffic-Signal-and-All-Way-Stop-Study-Requests/w76s-c5u4

Column name: Borough

Origial:
Delete it if it was empty or invalid Borough

Improvement:
Fill 'Unspecified' category

In [18]:
datafile = "data/Traffic_Signal_and_All-Way_Stop_Study_Requests.csv"
# load the file in memory
ds = dataset(datafile)


UnicodeDecodeError: 'gbk' codec can't decode byte 0xa6 in position 7219: illegal multibyte sequence

In [None]:
from openclean.function.value.null import is_empty
from openclean.operator.transform.update import update

ds = update(
    ds, 
    ["Borough"], 
    lambda x: "Unspecified" if is_empty(x) else x
    )

## Cleaning column: BOROUGH/ZIPCODE

Source dataset:
https://data.cityofnewyork.us/City-Government/Mobile-Telecommunications-Franchise-Pole-Reservati/tbgj-tdd6

Column name: Borough/Zipcode

Origial:
Delete it if it was empty or invalid Borough

Improvement:
Based on zipcode fill Borough field if this cell was empty

In [16]:
def find_key(input_dict, value):
    return next((k for k, v in input_dict.items() if v == value), None)

In [17]:
datafile = "data/Mobile_Telecommunications_Franchise_Pole_Reservation_Locations.csv"
# load the file in memory
ds = dataset(datafile)


ds = update(
    ds, 
    ["Borough", "Zipcode"], 
    lambda x, y: ("Unspecified", y) if is_empty(x) and is_empty(y) else (x, y)
    )
ds = update(
    ds, 
    ["Borough", "Zipcode"], 
    lambda x, y: (x, y) if not is_empty(x) else (find_key(nyc_zipcode_ref, y), y)
    )

UnicodeDecodeError: 'gbk' codec can't decode byte 0x94 in position 8122: illegal multibyte sequence

## Cleaning column: CONTRIBUTING FACTOR VEHICLE 1(Collision reason)

Source dataset:
https://data.cityofnewyork.us/Public-Safety/Motor-Vehicle-Collisions-Person/f55k-p6yu

Column name: Pedestrain - CONTRIBUTING_FACTOR_1

Origial:
Delete it if it was empty or invalid CONTRIBUTING_FACTOR

Improvement:
Using KNN to find most similar reason cause the collision

In [8]:
from openclean.cluster.knn import knn_clusters
from openclean.function.similarity.base import SimilarityConstraint
from openclean.function.similarity.text import LevenshteinDistance
from openclean.function.value.threshold import GreaterThan
from openclean.function.eval.domain import Lookup

datafile = "data/Motor_Vehicle_Collisions_-_Person.csv"
# load the file in memory
ds = dataset(datafile)


# cleaning using cluster and mapping
for i in range(1, 3): 
    col_name = "CONTRIBUTING_FACTOR_{}".format(i)

    # edit distance cluster
    clusters = knn_clusters(
          values=ds[col_name].unique().tolist(),
          sim=SimilarityConstraint(func=LevenshteinDistance(), pred=GreaterThan(0.8))
          )

    mapping = {}
    for cluster in clusters: 
        mapping.update(cluster.to_mapping())

    ds = update(
          ds, 
          col_name, 
          Lookup(columns=[col_name], mapping=mapping, default=col_name)
          )

## Cleaning column: CONTRIBUTING FACTOR VEHICLE 1(Collision reason)

Source dataset:
https://data.cityofnewyork.us/Public-Safety/Motor-Vehicle-Collisions-Vehicles/bm4k-52h4

Column name: Vechicles - CONTRIBUTING_FACTOR_1

Origial:
Delete it if it was empty or invalid CONTRIBUTING_FACTOR

Improvement:
Using KNN to find most similar reason cause the collision

In [None]:

datafile = "data/Motor_Vehicle_Collisions_-_Vehicles.csv"
# load the file in memory
ds = dataset(datafile)


# cleaning using cluster and mapping
for i in range(1, 3): 
    col_name = "CONTRIBUTING_FACTOR_{}".format(i)

    # edit distance cluster
    clusters = knn_clusters(
          values=ds[col_name].unique().tolist(),
          sim=SimilarityConstraint(func=LevenshteinDistance(), pred=GreaterThan(0.8))
          )

    mapping = {}
    for cluster in clusters: 
        mapping.update(cluster.to_mapping())

    ds = update(
          ds, 
          col_name, 
          Lookup(columns=[col_name], mapping=mapping, default=col_name)
          )

## Cleaning column: VEHICLE TYPE CODE(Vechicle type)

Source dataset:
https://data.cityofnewyork.us/Transportation/Vehicle-Classification-Counts-2014-2019-/96ay-ea4r

Column name: Veh Class Type

Origial:
Delete it if it was empty or invalid Veh Class Type

Improvement:
Using KNN to find most similar vehicle type

In [9]:
datafile = "data/Vehicle_Classification_Counts__2014-2019_.csv"
# load the file in memory
ds = dataset(datafile)

# cleaning letter case and null

col_name = "Veh Class Type"
ds = update(
      ds, 
      col_name, 
      str.title
      )

ds = update(
      ds, 
      col_name, 
      lambda x: "Unknown" if is_empty(x) else x
      )

In [10]:
from openclean.cluster.knn import knn_collision_clusters
from openclean.function.similarity.text import JaroWinklerSimilarity

# cleaning using cluster and mapping

col_name = "Veh Class Type"

# edit distance cluster
clusters = knn_collision_clusters(
       values=ds[col_name].tolist(),
       sim=SimilarityConstraint(func=LevenshteinDistance(), pred=GreaterThan(0.7))
     )

mapping = {}
for cluster in clusters: 
    mapping.update(cluster.to_mapping())

ds = update(
      ds, 
      col_name, 
      Lookup(columns=[col_name], mapping=mapping, default=col_name)
      )

# common substr cluster
clusters = knn_collision_clusters(
       values=ds[col_name].tolist(),
       sim=SimilarityConstraint(func=JaroWinklerSimilarity(), pred=GreaterThan(0.9))
     )

mapping = {}
for cluster in clusters: 
    mapping.update(cluster.to_mapping())

ds = update(
      ds, 
      col_name, 
      Lookup(columns=[col_name], mapping=mapping, default=col_name)
      )

In [11]:
# select top 20 types and only keep these types
# all other types are regarded as unrecognizable values
top_types = ds["Veh Class Type"].value_counts().head(20).keys()

col_name = "Veh Class Type"

ds = update(
      ds, 
      col_name, 
      lambda x: "Unknown" if x not in top_types else x
      )

## Cleaning column: CRASH DATE/CRASH TIME

Source dataset:
https://data.cityofnewyork.us/City-Government/Parking-Violations-Issued-Fiscal-Year-2022/pvqr-7yc4

Column name: Vehicle Expiration Date

Origial:
Delete it if it was empty or invalid Date

Improvement:
Convert invalid data type to datetime

In [13]:
df = pd.read_csv('data/Parking_Violations_Issued_-_Fiscal_Year_2022.csv', dtype=str)
df['Vehicle Expiration Date'] = pd.to_datetime(df['Vehicle Expiration Date'], format='%y%m%d', errors='coerce')


## Cleaning column: ON STREET NAME

Source dataset:
https://data.cityofnewyork.us/Transportation/Street-Closures-due-to-construction-activities-by-/i6b5-j7bu

Column name: ONSTREETNAME

Origial:
Delete it if it was empty or invalid ONSTREETNAME

Improvement:
Using KNN to find most similar Street name

In [None]:

datafile = "data/Street_Closures_due_to_construction_activities_by_Block.csv"
# load the file in memory
ds = dataset(datafile)

col_name = "ONSTREETNAME"

# edit distance cluster
clusters = knn_clusters(
      values=ds[col_name].unique().tolist(),
      sim=SimilarityConstraint(func=LevenshteinDistance(), pred=GreaterThan(0.8))
      )

mapping = {}
for cluster in clusters: 
    mapping.update(cluster.to_mapping())

ds = update(
      ds, 
      col_name, 
      Lookup(columns=[col_name], mapping=mapping, default=col_name)
      )

## Cleaning column: ON STREET NAME

Source dataset:
https://data.cityofnewyork.us/Transportation/Street-Closures-due-to-construction-activities-by-/478a-yykk

Column name: ONSTREETNAME

Origial:
Delete it if it was empty or invalid ONSTREETNAME

Improvement:
Using KNN to find most similar Street name

In [None]:
datafile = "data/Street_Closures_due_to_construction_activities_by_Intersection.csv"
# load the file in memory
ds = dataset(datafile)

col_name = "ONSTREETNAME"

# edit distance cluster
clusters = knn_clusters(
      values=ds[col_name].unique().tolist(),
      sim=SimilarityConstraint(func=LevenshteinDistance(), pred=GreaterThan(0.8))
      )

mapping = {}
for cluster in clusters: 
    mapping.update(cluster.to_mapping())

ds = update(
      ds, 
      col_name, 
      Lookup(columns=[col_name], mapping=mapping, default=col_name)
      )

## Cleaning column: ON STREET NAME

Source dataset:
https://data.cityofnewyork.us/Transportation/Parking-Regulation-Locations-and-Signs/xswq-wnv9

Column name: main_st

Origial:
Delete it if it was empty or invalid main_st

Improvement:
Using KNN to find most similar Street name

In [19]:
datafile = "data/locations.csv"
# load the file in memory
ds = dataset(datafile)

col_name = "main_st"

# edit distance cluster
clusters = knn_clusters(
      values=ds[col_name].unique().tolist(),
      sim=SimilarityConstraint(func=LevenshteinDistance(), pred=GreaterThan(0.7))
      )

mapping = {}
for cluster in clusters: 
    mapping.update(cluster.to_mapping())

ds = update(
      ds, 
      col_name, 
      Lookup(columns=[col_name], mapping=mapping, default=col_name)
      )