In [1]:
# Since we are using Google Colab, we have to first install openclean library.
!pip install openclean_notebook
!pip install openclean
!pip install openclean_geo



In [2]:
from openclean.pipeline import stream
from pandas import *
import pandas as pd
# Show every row when a data frame is rendered. The fully qualified option name
# is required: the short pattern "max_rows" matches more than one pandas option
# on recent versions (e.g. "styler.render.max_rows") and raises an OptionError.
pd.set_option("display.max_rows", None)

# Location of the combined raw CSV, relative to the notebook's working directory.
# On Colab, mount the drive (or upload the file) first and point csvPath at the real file.
csvPath = './combine_raw.csv'
# Open the CSV with openclean's stream(); ds_Full is the source for every later step.
ds_Full = stream(csvPath)

In [3]:
# Take a look at the sample data: convert the stream with to_df() and let the
# notebook render the result as this cell's output.
# NOTE(review): the previous cell asks pandas for an unlimited max_rows, so this
# renders every row — consider previewing a slice if the file is large.
ds_Full.to_df()

Unnamed: 0,origin,index,city_name,street_name,block,lot,community_board
0,8fei-z6rz,1,BRONX,WALTON AVE,,,
1,8fei-z6rz,2,BROOKLYN,GATEWAY DR,,,
2,bty7-2jhb,1,NEW YORK,6TH AVENUE,1264.0,5.0,105
3,bty7-2jhb,2,L.I.C.,OCEAN TERRACE,683.0,1.0,502
4,bty7-2jhb,3,BROOKLYN,3 AVE,6133.0,56.0,310
5,bty7-2jhb,4,BROOKLYN,LOTT STREET,5136.0,58.0,317
6,bty7-2jhb,5,BROOKLYN,AVENUE N,7665.0,4.0,314
7,bty7-2jhb,6,BROOKLYN,FRANKLIN AVENUE,1927.0,6.0,303
8,bty7-2jhb,7,BROOKLYN,BAY 10 STREET,6460.0,310.0,311
9,bty7-2jhb,8,LIC,W 17TH STREET,741.0,10.0,104


# Updated Street Name Fixing
This part demonstrates the refined street-name fixing strategy.
The problem of redundant periods and commas is resolved, which improves ***Precision*** from 0.xx to 0.yy. ***Recall*** also improves from 0.xx to 0.yy after this modification.


In [4]:
# Get the clusters of the street names.
from openclean.cluster.key import key_collision
# Collect every street name and group the spellings that collide on the same key.
# minsize=1 keeps single-member clusters, so each clustered value gets an entry below.
streets = ds_Full.to_df()['street_name'].to_list()
clusters = key_collision(values=streets, minsize=1, threads=4)
# Look-up table: raw spelling -> spelling suggested for its cluster.
clusterLookUp = {}
for cluster in clusters:
  standard = cluster.suggestion()
  # Every member of the cluster maps to the same suggested spelling.
  clusterLookUp.update(dict.fromkeys(cluster.keys(), standard))

In [5]:
# Define the fixing function for street names.
def preCleanStreetName(s):
  """Pre-clean one street name before it is handed to the standardizer.

  The value is first replaced by the spelling recorded for it in the
  module-level ``clusterLookUp`` dictionary (values without an entry are kept
  as they are). If the result is a string, every "." and "," in it is then
  replaced by a single space.

  Args:
    s: The raw street name. Non-string values (e.g. missing values) are
      returned unchanged apart from the look-up step.

  Returns:
    The pre-cleaned street name, or the input itself when nothing applies.

  Raises:
    NameError: If ``clusterLookUp`` has not been built yet, i.e. the
      clustering cell was not run before this function is called.
  """
  # Changing the same street names with different appearances into the same name.
  try:
    s = clusterLookUp.get(s, s)
  except TypeError:
    # Unhashable value: it cannot be a key of the look-up, keep it unchanged.
    pass
  # Removing "." and "," so that the following step won't retain them.
  if isinstance(s, str):
    s = s.replace('.', ' ').replace(',', ' ')
  return s

# Step 1: apply preCleanStreetName to the values of the street_name column.
ds_Update = ds_Full.update(columns='street_name', func=preCleanStreetName)

# Step 2: call the built-in standardizing func of Openclean to finish the job
# (configured with characters='upper'), on top of the pre-cleaned values.
from openclean_geo.address.usstreet import StandardizeUSStreetName
ds_Update = ds_Update.update(columns="street_name", func=StandardizeUSStreetName(characters='upper'))

In [6]:
# Inspect the data after the street-name clean-up, to compare with the raw preview above.
ds_Update.to_df()

Unnamed: 0,origin,index,city_name,street_name,block,lot,community_board
0,8fei-z6rz,1,BRONX,WALTON AVE,,,
1,8fei-z6rz,2,BROOKLYN,GATEWAY DR,,,
2,bty7-2jhb,1,NEW YORK,6 AVE,1264.0,5.0,105
3,bty7-2jhb,2,L.I.C.,OCEAN TER,683.0,1.0,502
4,bty7-2jhb,3,BROOKLYN,3 AVE,6133.0,56.0,310
5,bty7-2jhb,4,BROOKLYN,LOTT ST,5136.0,58.0,317
6,bty7-2jhb,5,BROOKLYN,AVENUE N,7665.0,4.0,314
7,bty7-2jhb,6,BROOKLYN,FRANKLIN AVE,1927.0,6.0,303
8,bty7-2jhb,7,BROOKLYN,BAY 10 ST,6460.0,310.0,311
9,bty7-2jhb,8,LIC,WEST 17 ST,741.0,10.0,104


# Updated City Name Fixing
This part demonstrates the refined city-name fixing strategy. The problem of (...) is resolved, which improves Precision from 0.xx to 0.yy. Recall also improves from 0.xx to 0.yy after this modification.

In [7]:
# Levenshtein (edit) distance between two sequences.
def levenshtein(s1, s2):
    """Return the Levenshtein distance between ``s1`` and ``s2``.

    The distance is the minimum number of single-element insertions,
    deletions and substitutions needed to turn one sequence into the other.
    Only two rows of the dynamic-programming table are kept at any time.

    Args:
        s1: First sequence (typically a string).
        s2: Second sequence (typically a string).

    Returns:
        The edit distance as a non-negative integer.
    """
    # The distance is symmetric, so always walk the longer sequence row by row.
    if len(s1) >= len(s2):
        longer, shorter = s1, s2
    else:
        longer, shorter = s2, s1

    # Against an empty sequence, every element of the other one is an insertion.
    if not len(shorter):
        return len(longer)

    prev_costs = list(range(len(shorter) + 1))
    for row_no, ch_long in enumerate(longer, start=1):
        costs = [row_no]
        for col, ch_short in enumerate(shorter):
            mismatch = 0 if ch_long == ch_short else 1
            costs.append(
                min(
                    prev_costs[col + 1] + 1,     # insertion
                    costs[col] + 1,              # deletion
                    prev_costs[col] + mismatch,  # substitution (free on a match)
                )
            )
        prev_costs = costs

    return prev_costs[-1]

# Raw city names, one entry per row of the full data set.
names_list = (
    ds_Full
    .to_df()
    ['city_name']
    .tolist()
)

# Reference spellings that raw city names are matched against.
# The order is significant: when several entries are equally close to a raw
# name, the earliest one in this list is chosen.
# NOTE(review): 'ENGLEWOOD CLIFF' may be intended as 'ENGLEWOOD CLIFFS' — confirm.
good_names = [
    'LONG ISLAND CITY',
    'NEW YORK',
    'FLUSHING',
    'MANHATTAN',
    'ROOSEVELT ISLAND',
    'ENGLEWOOD CLIFF',
    'BROOKLYN',
    'BRONX',
    'ROCKAWAY POINT',
    'JAMAICA',
    'STATEN ISLAND',
    'NEW HYDE PARK',
    'FAR ROCKAWAY',
    'JERSEY CITY',
    'ROSLYN',
    'LAS VEGAS',
    'SECAUCUS',
    'QUEENS',
    'RICHMOND HILL',
    'OZONE PARK',
    'KATONAH',
    'ROSLYN HEIGHTS',
    'GLENDALE',
    'CORAL GABLES',
    'HOUSTON',
    'WOODHAVEN',
    'DOUGLASTON',
    'VASHON',
    'HAWTHORNE',
    'MONROE TWP',
]
# Define a function that returns the best match
def get_closest_match(name, real_names):
    """Return the entry of ``real_names`` with the smallest edit distance to ``name``.

    Distances are computed with ``levenshtein``. When several candidates are
    equally close, the one that appears first in ``real_names`` wins.

    Args:
        name: The value to match.
        real_names: Candidate names, in priority order for ties.

    Returns:
        The closest candidate, or ``None`` when ``real_names`` is empty.
    """
    best_name = None
    best_dist = None
    # Single pass: track the running minimum instead of recomputing it per candidate.
    for real_name in real_names:
        dist = levenshtein(name, real_name)
        # Strict '<' keeps the earliest candidate on ties.
        if best_dist is None or dist < best_dist:
            best_name, best_dist = real_name, dist
    return best_name

# Correct every raw city name, keeping the results in the order of names_list.
# Long Island City abbreviations are mapped by hand; every other value is
# replaced by its nearest entry in good_names. Other abbreviations such as
# "BK" or "NY" get no special handling and go through the same nearest-match step.
# NOTE(review): there is no distance cut-off, so any value — however unrelated —
# is replaced by some entry of good_names.
LIC = ["L.I.C.", "LIC", "LIC NY"]
final_list = [
    "LONG ISLAND CITY" if raw_name in LIC else get_closest_match(raw_name, good_names)
    for raw_name in names_list
]


# for name in names_list: #For abbreviation, we seperate it to a different task
#     name=get_closest_match(name, good_names)
#     final_list.append(name)


In [8]:
# Corrected names already computed, keyed by the raw city value. Re-running this
# cell resets the cache.
_cityNameCache = {}

def correctCityName(city):
  """Return the corrected spelling for one raw city name.

  The result depends only on ``city``: values listed in ``LIC`` become
  "LONG ISLAND CITY", every other value is replaced by its closest entry in
  ``good_names`` (via ``get_closest_match``). Because no shared list is
  consumed, the function can be evaluated any number of times and in any row
  order and always gives the same answer for the same input.

  Args:
    city: The raw city name of the current row.

  Returns:
    The corrected city name.
  """
  if city not in _cityNameCache:
    if city in LIC:
      _cityNameCache[city] = "LONG ISLAND CITY"
    else:
      # Distinct raw names are matched once; repeated rows reuse the cached result.
      _cityNameCache[city] = get_closest_match(city, good_names)
  return _cityNameCache[city]

In [9]:
# Register correctCityName as the update function for the city_name column.
# NOTE(review): ds_Update is rebound to its own result, so re-running this cell
# stacks another city_name update on top of the previous one.
ds_Update = ds_Update.update(columns="city_name", func=correctCityName)

In [10]:
# Convert the updated pipeline into a data frame via to_df().
# NOTE(review): the name ds_Update now refers to the to_df() result instead of the
# pipeline, so this cell presumably cannot be re-run on its own — confirm.
ds_Update = ds_Update.to_df()

In [11]:
# Write the cleaned data to improved.csv in the current working directory.
# NOTE(review): to_csv writes the row index as an extra unnamed first column by
# default — pass index=False if that column is not wanted downstream.
ds_Update.to_csv('./improved.csv')