# Misspellings of Brooklyn

This is an example notebook that uses the SOUNDEX function to identify potential misspellings of the city name BROOKLYN in the NYC DOB Job Application Filings dataset.

In [1]:
# Path of the 'DOB Job Application Filings' dataset file that this example
# reads. Use the 'DOB Job Application Filings - Download' notebook to download
# the dataset before running the cells below.

datafile = './ic3t-wcy2.tsv.gz'

# As an alternative, you can also use the smaller dataset sample that is
# included in the repository. To do so, use the following assignment instead
# of the one above:
#
# datafile = './data/ic3t-wcy2.tsv.gz'

In [2]:
# Open the dataset file as a stream. This example makes use of the streaming
# option that avoids loading the full data frame into memory.
#
# `datafile` is the dataset path assigned earlier in this notebook. The
# resulting `df` is the object on which the select/update/filter/distinct
# chain further below is invoked.

from openclean.pipeline import stream

df = stream(datafile)

In [3]:
# Collect potential misspellings of 'BROOKLYN': entries whose upper-cased city
# name has the same Soundex as 'BROOKLYN' although the name itself is not
# 'BROOKLYN'.

from openclean.function.eval.base import Col, Eval
from openclean.function.eval.logic import And
from openclean.function.value.phonetic import Soundex, soundex

# Note the trailing blank in the column name; it is used as-is in every step.
CITY_COL = 'City '
TARGET_NAME = 'BROOKLYN'

# The two conditions that are combined (via And) in the filter step.
same_soundex = Eval(CITY_COL, Soundex()) == soundex(TARGET_NAME)
different_name = Col(CITY_COL) != TARGET_NAME

brooklyn = (
    df
    .select(CITY_COL)
    .update(CITY_COL, str.upper)  # upper-case names before they are compared
    .filter(And(same_soundex, different_name))
    .distinct()
)

In [4]:
# List the (potential) misspellings with their frequency, most frequent
# first. The rank column is 1-based.

print('RANK\tCOUNT\tNAME')
for rank, (name, freq) in enumerate(brooklyn.most_common(), start=1):
    print('{}.\t{}\t{}'.format(rank, freq, name))

RANK	COUNT	NAME
1.	1059	BRKLYN
2.	749	BROOKYLN
3.	437	BROOKLY
4.	288	BROOKLYN,
5.	231	BROKLYN
6.	162	BRROKLYN
7.	162	BROOKLN
8.	107	BROOKLYLN
9.	84	BROOOKLYN
10.	70	BROOKLNY
11.	56	BROOKLYM
12.	54	BRO0KLYN
13.	49	BROOKKLYN
14.	45	BROOKLYNB
15.	45	BROOKLLYN
16.	35	BROOKLYN NY
17.	35	BROKKLYN
18.	33	BRROOKLYN
19.	29	BROOKLKYN
20.	28	BROOKLYNN
21.	27	BROOKLTN
22.	24	BROOKLYN`
23.	22	BROOKLUN
24.	22	BROOKLYNQ
25.	21	BROOKLINE
26.	20	BROOKLNYN
27.	19	BROOKLYN, NY
28.	13	BROOKLYB
29.	13	BROOKL
30.	12	BROOKLY N
31.	11	BROOKLYKN
32.	11	BERKELEY
33.	11	BROOKJLYN
34.	10	BROOKLYNS
35.	10	BR00KLYN
36.	10	BROOKYLYN
37.	10	BERKLEY
38.	8	BROOKL;YN
39.	8	BROOKLYNM
40.	7	BROOKLYN.
41.	7	BROOKLYTN
42.	7	BROOKLYHN
43.	7	BROOKLIN
44.	7	BROOKLVILLE
45.	6	BROOJLYN
46.	6	BROOKLYN HEIGHT
47.	6	BROOKLYNNY
48.	6	BROOKLOYN
49.	5	BROOKLRN
50.	5	BRIOOKLYN
51.	5	BRIIKLYN
52.	5	BROOKLEN
53.	4	BROOKLYN1
54.	4	BROOKLYN `
55.	4	BORRKLYN
56.	4	BROOKILYN
57.	4	BBROOKLYN
58.	3	BROOKLYBN
59.	3	BROKLY
60.	3	BROOKLYN=
61.	3	