# Missouri Sex Offender Registry

Data acquisition, documentation, carpentry, geocoding, and database loading for Missouri Sex Offender Registry (MSOR) and supporting info.   

In [1]:
# IMPORTS
import geopandas as gpd
import pandas as pd

# import os
# import urllib.request
# import requests
# import shutil
# from pathlib import Path
# from zipfile import ZipFile

import matplotlib.pyplot as plt
from matplotlib import pyplot

import folium

# from shapely.geometry import Point, Polygon

# from geopandas.tools import overlay

from geopy.geocoders import Nominatim # for geocoding

# import random # for obscuring sex offender names

In [2]:
# we need GeoAlchemy2 to run the geodataframe to_postgis method later

In [3]:
pip install GeoAlchemy2


Collecting GeoAlchemy2
  Downloading https://files.pythonhosted.org/packages/df/b4/94b1f707dc89d107ac0a49a1f36a45b8b57812e603951f84bef999df3e3b/GeoAlchemy2-0.10.2-py2.py3-none-any.whl
Installing collected packages: GeoAlchemy2
Successfully installed GeoAlchemy2-0.10.2
Note: you may need to restart the kernel to use updated packages.


In [4]:
# a few more imports specfic to the database process
import geoalchemy2 
import getpass

import psycopg2
import numpy
from psycopg2.extensions import adapt, register_adapter, AsIs

from sqlalchemy import create_engine


In [5]:
# get user password for connecting to the db
mypasswd = getpass.getpass()

········


In [6]:
# set up db connection
conn = psycopg2.connect(database = 'cappsds_psmd39', 
                              user = 'psmd39', 
                              host = 'pgsql.dsa.lan',
                              password = mypasswd)


In [7]:
# establish cursor and read the existing tables
cursor = conn.cursor()

cursor.execute("""SELECT relname FROM pg_class WHERE relkind='r'
                  AND relname !~ '^(pg_|sql_)';""") # "rel" is short for relation.

tables = [i[0] for i in cursor.fetchall()] # A list() of tables.
tables.sort()
tables


['country_borders',
 'gadm_admin_borders',
 'geonames_feature',
 'msorfailedgeocoding',
 'spatial_ref_sys',
 'stlchildcare',
 'stlnonrestrictedresidential',
 'stlnonrestrictedresparcels',
 'stlpubschools',
 'stlpvtschools',
 'stlresparcels',
 'stlrestrictedflat',
 'stlsexoffenders',
 'stlzoning']

## Failed geocoding
There were a lot of sex offender registry entries that failed geocoding. Possible solutions:
- Look for trends in the failed entries and work to resolve
- Try another geocoder  

### Get the entries that failed geocoding out of the database
In the prior notebook, we stored all these records in a dedicated table for easy access.

In [11]:
# test to make sure the load actually worked
# query the table and read data into a geodf 
sql = "select * from msorfailedgeocoding;"
msor_nogeo = pd.read_sql_query(sql, conn)
print(msor_nogeo.shape)
msor_nogeo.head()

(1334, 14)


Unnamed: 0,index,name,address,city,st,zip,county,offense,count,compliant,tier,date_of_birth,full_address,geocode
0,10,"ABDI, IBRAHIM A",3764 CHIPPEWA ST APT 8,SAINT LOUIS,MO,63116,ST LOUIS CITY,SEXUAL MISCONDUCT-3RD,1,Y,1,1981-09-08,"3764 CHIPPEWA ST APT 8,SAINT LOUIS,MO",
1,18,"ABERNATHY, RANDELL L",3866 S SPRING AVE APT 1S,SAINT LOUIS,MO,63116,ST LOUIS CITY,AGG CRIM SEX ASSAULT,2,Y,3,1969-07-30,"3866 S SPRING AVE APT 1S,SAINT LOUIS,MO",
2,40,"ACKLEY, CLIFFORD D",3329 LAWN AVE APT 4,SAINT LOUIS,MO,63139,ST LOUIS CITY,RAPE,1,Y,3,1964-10-25,"3329 LAWN AVE APT 4,SAINT LOUIS,MO",
3,41,"ACKLEY, CLIFFORD D",3329 LAWN AVE APT 4,SAINT LOUIS,MO,63139,ST LOUIS CITY,STATUTORY RAPE-2ND DEGRE,1,Y,3,1964-10-25,"3329 LAWN AVE APT 4,SAINT LOUIS,MO",
4,99,"ADAMS, QUINDARRYL L",4133 CLEVELAND AVE APT 1W,SAINT LOUIS,MO,63110,ST LOUIS CITY,SEXUAL BATTERY,1,Y,1,1990-10-30,"4133 CLEVELAND AVE APT 1W,SAINT LOUIS,MO",


### Set up the geocoder

In [13]:
# set up the geocoder
geolocator = Nominatim(timeout=10, user_agent = "myGeolocator")

In [15]:
# test out the geocoder with a single address
location = geolocator.geocode('120 CATALAN,ST LOUIS,MO')
print(location)
print((location.latitude, location.longitude))

St. Louis Skatium, 120, East Catalan Street, Patch, Saint Louis, Missouri, 63111, United States
(38.5396446, -90.26550765004728)


### THIS LOOKS PROMISING

In [17]:
# geocode_test_slice = msor_nogeo_test.sample(n=20)
geocode_test_slice = msor_nogeo.copy()
geocode_test_slice


Unnamed: 0,index,name,address,city,st,zip,county,offense,count,compliant,tier,date_of_birth,full_address,geocode
0,10,"ABDI, IBRAHIM A",3764 CHIPPEWA ST APT 8,SAINT LOUIS,MO,63116,ST LOUIS CITY,SEXUAL MISCONDUCT-3RD,1,Y,1,1981-09-08,"3764 CHIPPEWA ST APT 8,SAINT LOUIS,MO",
1,18,"ABERNATHY, RANDELL L",3866 S SPRING AVE APT 1S,SAINT LOUIS,MO,63116,ST LOUIS CITY,AGG CRIM SEX ASSAULT,2,Y,3,1969-07-30,"3866 S SPRING AVE APT 1S,SAINT LOUIS,MO",
2,40,"ACKLEY, CLIFFORD D",3329 LAWN AVE APT 4,SAINT LOUIS,MO,63139,ST LOUIS CITY,RAPE,1,Y,3,1964-10-25,"3329 LAWN AVE APT 4,SAINT LOUIS,MO",
3,41,"ACKLEY, CLIFFORD D",3329 LAWN AVE APT 4,SAINT LOUIS,MO,63139,ST LOUIS CITY,STATUTORY RAPE-2ND DEGRE,1,Y,3,1964-10-25,"3329 LAWN AVE APT 4,SAINT LOUIS,MO",
4,99,"ADAMS, QUINDARRYL L",4133 CLEVELAND AVE APT 1W,SAINT LOUIS,MO,63110,ST LOUIS CITY,SEXUAL BATTERY,1,Y,1,1990-10-30,"4133 CLEVELAND AVE APT 1W,SAINT LOUIS,MO",
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1329,26006,"YOUNG, TYRONE F",4962 THEKLA AVE FL 1,SAINT LOUIS,MO,63115,ST LOUIS CITY,DEVIATE SEXUAL ASSAULT,1,Y,3,1965-06-30,"4962 THEKLA AVE FL 1,SAINT LOUIS,MO",
1330,26011,"YOUNG, WILLIAM R",159 SHEPLEY DR APT 2,SAINT LOUIS,MO,63137,ST LOUIS,CHILD MOLESTATION,1,Y,3,1957-11-30,"159 SHEPLEY DR APT 2,SAINT LOUIS,MO",
1331,26100,"ZWEIFEL, JUDITH A",4447 CASTLEMAN AVE APT 1 WEST,SAINT LOUIS,MO,63110,ST LOUIS CITY,CHILD MOLEST-1ST DEGREE,1,Y,3,1968-10-16,"4447 CASTLEMAN AVE APT 1 WEST,SAINT LOUIS,MO",
1332,26101,"ZWEIFEL, JUDITH A",4447 CASTLEMAN AVE APT 1 WEST,SAINT LOUIS,MO,63110,ST LOUIS CITY,STAT RAPE-1ST DEG-PERS UNDER 14,9,Y,3,1968-10-16,"4447 CASTLEMAN AVE APT 1 WEST,SAINT LOUIS,MO",


In [181]:
# geocode_test_slice = geocode_test_slice.reset_index()


In [18]:
# copy the address to a new column and then remove the part that causes geocoding to fail
geocode_test_slice['new_address'] = geocode_test_slice['address']
geocode_test_slice['new_address'] = geocode_test_slice.address.str.split(' APT')

# the split (above) outputs a list of two elements: the part of the address before "APT" and the part after
# we only care about the part before
# convert those list items up into two columns, then keep the valuable column back in the original column
address_split = pd.DataFrame(geocode_test_slice["new_address"].to_list(), columns=['keep', 'trash'])
geocode_test_slice['new_address'] = address_split['keep']

geocode_test_slice[['address','city','st','zip','full_address','new_address']]

Unnamed: 0,address,city,st,zip,full_address,new_address
0,3764 CHIPPEWA ST APT 8,SAINT LOUIS,MO,63116,"3764 CHIPPEWA ST APT 8,SAINT LOUIS,MO",3764 CHIPPEWA ST
1,3866 S SPRING AVE APT 1S,SAINT LOUIS,MO,63116,"3866 S SPRING AVE APT 1S,SAINT LOUIS,MO",3866 S SPRING AVE
2,3329 LAWN AVE APT 4,SAINT LOUIS,MO,63139,"3329 LAWN AVE APT 4,SAINT LOUIS,MO",3329 LAWN AVE
3,3329 LAWN AVE APT 4,SAINT LOUIS,MO,63139,"3329 LAWN AVE APT 4,SAINT LOUIS,MO",3329 LAWN AVE
4,4133 CLEVELAND AVE APT 1W,SAINT LOUIS,MO,63110,"4133 CLEVELAND AVE APT 1W,SAINT LOUIS,MO",4133 CLEVELAND AVE
...,...,...,...,...,...,...
1329,4962 THEKLA AVE FL 1,SAINT LOUIS,MO,63115,"4962 THEKLA AVE FL 1,SAINT LOUIS,MO",4962 THEKLA AVE FL 1
1330,159 SHEPLEY DR APT 2,SAINT LOUIS,MO,63137,"159 SHEPLEY DR APT 2,SAINT LOUIS,MO",159 SHEPLEY DR
1331,4447 CASTLEMAN AVE APT 1 WEST,SAINT LOUIS,MO,63110,"4447 CASTLEMAN AVE APT 1 WEST,SAINT LOUIS,MO",4447 CASTLEMAN AVE
1332,4447 CASTLEMAN AVE APT 1 WEST,SAINT LOUIS,MO,63110,"4447 CASTLEMAN AVE APT 1 WEST,SAINT LOUIS,MO",4447 CASTLEMAN AVE


In [19]:
# merge together street addres, city, and state to create the "full address"
geocode_test_slice['new_address'] = geocode_test_slice.new_address + "," + geocode_test_slice.city + "," + geocode_test_slice.st

# now remove some of the elements that trip up the geocoder, using the comma to avoid unwanted replacements elsewhere
geocode_test_slice['new_address'] = geocode_test_slice.new_address.str.replace(' RD,',',')
geocode_test_slice['new_address'] = geocode_test_slice.new_address.str.replace(' AVE,',',')
geocode_test_slice['new_address'] = geocode_test_slice.new_address.str.replace(' DR,',',')

# geocode_test_slice[['address','city','st','zip','full_address','new_address']]


In [20]:
# send the updated addresses back to the geocoder
geocode_test_slice['geocode'] = geocode_test_slice.new_address.apply(geolocator.geocode)


In [28]:
# check out how many values are still failing geocoding ("isnull == True")
geocode_test_slice['geocode'].isnull().value_counts()

False    903
True     431
Name: geocode, dtype: int64

In [21]:
geocode_test_slice[['address','full_address','new_address','geocode']]


Unnamed: 0,address,full_address,new_address,geocode
0,3764 CHIPPEWA ST APT 8,"3764 CHIPPEWA ST APT 8,SAINT LOUIS,MO","3764 CHIPPEWA ST,SAINT LOUIS,MO","(3764, Chippewa Street, Dutchtown, Saint Louis..."
1,3866 S SPRING AVE APT 1S,"3866 S SPRING AVE APT 1S,SAINT LOUIS,MO","3866 S SPRING,SAINT LOUIS,MO",
2,3329 LAWN AVE APT 4,"3329 LAWN AVE APT 4,SAINT LOUIS,MO","3329 LAWN,SAINT LOUIS,MO","(3329, Lawn Avenue, North Hampton, Saint Louis..."
3,3329 LAWN AVE APT 4,"3329 LAWN AVE APT 4,SAINT LOUIS,MO","3329 LAWN,SAINT LOUIS,MO","(3329, Lawn Avenue, North Hampton, Saint Louis..."
4,4133 CLEVELAND AVE APT 1W,"4133 CLEVELAND AVE APT 1W,SAINT LOUIS,MO","4133 CLEVELAND,SAINT LOUIS,MO","(4133, Cleveland Avenue, Shaw, Saint Louis, Mi..."
...,...,...,...,...
1329,4962 THEKLA AVE FL 1,"4962 THEKLA AVE FL 1,SAINT LOUIS,MO","4962 THEKLA AVE FL 1,SAINT LOUIS,MO",
1330,159 SHEPLEY DR APT 2,"159 SHEPLEY DR APT 2,SAINT LOUIS,MO","159 SHEPLEY,SAINT LOUIS,MO","(159, Shepley Drive, Glasgow Village, Saint Lo..."
1331,4447 CASTLEMAN AVE APT 1 WEST,"4447 CASTLEMAN AVE APT 1 WEST,SAINT LOUIS,MO","4447 CASTLEMAN,SAINT LOUIS,MO","(4447, Castleman Avenue, Southwest Garden, Tow..."
1332,4447 CASTLEMAN AVE APT 1 WEST,"4447 CASTLEMAN AVE APT 1 WEST,SAINT LOUIS,MO","4447 CASTLEMAN,SAINT LOUIS,MO","(4447, Castleman Avenue, Southwest Garden, Tow..."


In [22]:
geocode_test_slice_nona = geocode_test_slice.dropna().copy()
geocode_test_slice_nona.shape

(903, 15)

In [29]:
# get the latitude and longitude values from the geodata column and put them in their own columns for easier plotting
geocode_test_slice_nona['lat'] = [g.latitude for g in geocode_test_slice_nona.geocode]
geocode_test_slice_nona['long'] = [g.longitude for g in geocode_test_slice_nona.geocode]


#### Render a map that shows all the entries we recovered!

In [30]:
# create a base map centered on St. Louis
map_sexoffenders2 = folium.Map(
    location=[38.627003, -90.3],
    tiles='cartodbpositron',
    zoom_start=11,
)

# add a marker for each childcare facility
# label each facility with its name
for i in range(0,len(geocode_test_slice_nona)):
   folium.Marker(
      location=[geocode_test_slice_nona.iloc[i]['lat'], geocode_test_slice_nona.iloc[i]['long']],
      popup=geocode_test_slice_nona.iloc[i]['offense']
   ).add_to(map_sexoffenders2)

# display the map
map_sexoffenders2

In [58]:
# look at how many entries are still failing
nogeo_after_pass_2 = geocode_test_slice[geocode_test_slice['geocode'].isna()].copy()
nogeo_after_pass_2


Unnamed: 0,index,name,address,city,st,zip,county,offense,count,compliant,tier,date_of_birth,full_address,geocode,new_address
1,18,"ABERNATHY, RANDELL L",3866 S SPRING AVE APT 1S,SAINT LOUIS,MO,63116,ST LOUIS CITY,AGG CRIM SEX ASSAULT,2,Y,3,1969-07-30,"3866 S SPRING AVE APT 1S,SAINT LOUIS,MO",,"3866 S SPRING,SAINT LOUIS,MO"
5,109,"ADAMS, WILLIAM J",5340 GRANT ST FL 2ND,SAINT LOUIS,MO,63107,ST LOUIS CITY,STAT SODOMY-1ST DEG-PERS UND 14,1,Y,3,1967-10-26,"5340 GRANT ST FL 2ND,SAINT LOUIS,MO",,"5340 GRANT ST FL 2ND,SAINT LOUIS,MO"
9,212,"ALDRIDGE, SAMUEL A",120 W CATALAN AVE APT 201,ST LOUIS,MO,63111,ST LOUIS CITY,ATTEMPT RAPE,1,Y,3,1964-10-22,"120 W CATALAN AVE APT 201,ST LOUIS,MO",,"120 W CATALAN,ST LOUIS,MO"
10,213,"ALDRIDGE, SAMUEL A",120 W CATALAN AVE APT 201,ST LOUIS,MO,63111,ST LOUIS CITY,CHLD MOLST-2ND DEG-INJRY,1,Y,3,1964-10-22,"120 W CATALAN AVE APT 201,ST LOUIS,MO",,"120 W CATALAN,ST LOUIS,MO"
11,214,"ALDRIDGE, SAMUEL A",120 W CATALAN AVE APT 201,ST LOUIS,MO,63111,ST LOUIS CITY,RAPE,1,Y,3,1964-10-22,"120 W CATALAN AVE APT 201,ST LOUIS,MO",,"120 W CATALAN,ST LOUIS,MO"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1325,25991,"YOUNG, MARVIN",3922 GUSTINE AVE FL 1,SAINT LOUIS,MO,63116,ST LOUIS CITY,CHILD MOLEST-1ST DEGREE,1,Y,3,1955-10-22,"3922 GUSTINE AVE FL 1,SAINT LOUIS,MO",,"3922 GUSTINE AVE FL 1,SAINT LOUIS,MO"
1326,25992,"YOUNG, MARVIN",3922 GUSTINE AVE FL 1,SAINT LOUIS,MO,63116,ST LOUIS CITY,ENDANGER WELFARE CHILD,1,Y,3,1955-10-22,"3922 GUSTINE AVE FL 1,SAINT LOUIS,MO",,"3922 GUSTINE AVE FL 1,SAINT LOUIS,MO"
1327,25993,"YOUNG, MARVIN",3922 GUSTINE AVE FL 1,SAINT LOUIS,MO,63116,ST LOUIS CITY,SEX MISC-1ST-PRIOR CONV/WEAPON,1,Y,3,1955-10-22,"3922 GUSTINE AVE FL 1,SAINT LOUIS,MO",,"3922 GUSTINE AVE FL 1,SAINT LOUIS,MO"
1328,25994,"YOUNG, MARVIN",3922 GUSTINE AVE FL 1,SAINT LOUIS,MO,63116,ST LOUIS CITY,STATUTORY SODOMY-1ST DEG,2,Y,3,1955-10-22,"3922 GUSTINE AVE FL 1,SAINT LOUIS,MO",,"3922 GUSTINE AVE FL 1,SAINT LOUIS,MO"


In [191]:
# one remaining issue is addresses with floors listed e.g. "FL 1" or "FL 2ND"
# another issue is addresses with cardinal direction letters e.g. "N","S","E","W"

In [46]:
# test out the geocoder with a single address
location = geolocator.geocode('5340 GRANT ST,SAINT LOUIS,MO')
print(location)
print((location.latitude, location.longitude))

None


AttributeError: 'NoneType' object has no attribute 'latitude'

In [59]:
nogeo_after_pass_2 = nogeo_after_pass_2.reset_index()


In [176]:
# one remaining issue is addresses with floors listed e.g. "FL 1" or "FL 2ND"
# another issue is addresses with cardinal direction letters e.g. "N","S","E","W"

# copy the address to a new column and then remove the part that causes geocoding to fail

# set up a list containing the string elements we want to remove
to_remove = [' FL ',' APT',' NBR',' RM',' UNIT',' DEPT',' REAR']
tot_ct = 0

# copy the existing addresses to a new column to initialize the target of the for loop
nogeo_after_pass_2['new_address'] = nogeo_after_pass_2['address']
print("Dataframe has",len(nogeo_after_pass_2),"entries")

# loop through all the elements in the list, removing each from the address
for i in to_remove:
    # split() outputs a list of two elements: the part of the address before the match string [i] and the part after
    # we only care about the part before
    nogeo_after_pass_2['split'] = nogeo_after_pass_2['new_address'].str.split(i)
    # convert those list items into two columns in a new (temp) df, then store the usable column back in the original df
    address_split = pd.DataFrame(nogeo_after_pass_2['split'].to_list(), columns=['keep', 'trash'])
    # count how many items we modified
    loop_ct = address_split['trash'].notnull().sum()
    # keep a running total of the modifications we've made
    tot_ct = tot_ct + loop_ct
    # overwrite the "new_address" with the updated value. this can then be used in subsequent loops for new matches.
    nogeo_after_pass_2['new_address'] = address_split['keep']
    print('Removed "',i,'" from address',' (',loop_ct,' entries)',sep='')

print(tot_ct,'total modifications')

nogeo_after_pass_2[['zip','address','full_address','new_address']].head(25)

Dataframe has 431 entries
Removed " FL " from address (173 entries)
Removed " APT" from address (112 entries)
Removed " NBR" from address (3 entries)
Removed " RM" from address (53 entries)
Removed " UNIT" from address (17 entries)
Removed " DEPT" from address (1 entries)
Removed " REAR" from address (2 entries)
361 total modifications


Unnamed: 0,zip,address,full_address,new_address
0,63116,3866 S SPRING AVE APT 1S,"3866 S SPRING AVE APT 1S,SAINT LOUIS,MO",3866 S SPRING AVE
1,63107,5340 GRANT ST FL 2ND,"5340 GRANT ST FL 2ND,SAINT LOUIS,MO",5340 GRANT ST
2,63111,120 W CATALAN AVE APT 201,"120 W CATALAN AVE APT 201,ST LOUIS,MO",120 W CATALAN AVE
3,63111,120 W CATALAN AVE APT 201,"120 W CATALAN AVE APT 201,ST LOUIS,MO",120 W CATALAN AVE
4,63111,120 W CATALAN AVE APT 201,"120 W CATALAN AVE APT 201,ST LOUIS,MO",120 W CATALAN AVE
5,63111,120 W CATALAN AVE APT 201,"120 W CATALAN AVE APT 201,ST LOUIS,MO",120 W CATALAN AVE
6,63111,5001 IDAHO AVE FL 1ST,"5001 IDAHO AVE FL 1ST,SAINT LOUIS,MO",5001 IDAHO AVE
7,63104,1218 SOULARD ST FL 1ST,"1218 SOULARD ST FL 1ST,SAINT LOUIS,MO",1218 SOULARD ST
8,63104,1218 SOULARD ST FL 1ST,"1218 SOULARD ST FL 1ST,SAINT LOUIS,MO",1218 SOULARD ST
9,63107,1420 E LINTON AVE,"1420 E LINTON AVE,SAINT LOUIS,MO",1420 E LINTON AVE


In [134]:
# test out the geocoder with a single address
location = geolocator.geocode('7104 PAGE AVE,SAINT LOUIS,MO 	')
print(location)
print((location.latitude, location.longitude))

7104, Page Avenue, Pagedale, Saint Louis County, Missouri, 63133, United States
(38.676149571428574, -90.31305042857143)


In [179]:
# look at all the items we have not yet modified
nogeo_after_pass_2['match'] = nogeo_after_pass_2.address == nogeo_after_pass_2.new_address
nogeo_after_pass_2.loc[nogeo_after_pass_2['match'] == True][['zip','address','full_address','new_address']].tail(25)

Unnamed: 0,zip,address,full_address,new_address
214,63121,6606 BARR ST,"6606 BARR ST,SAINT LOUIS,MO",6606 BARR ST
215,63121,6606 BARR ST,"6606 BARR ST,SAINT LOUIS,MO",6606 BARR ST
221,63110,4752 BOTANICAL 1ST FL,"4752 BOTANICAL 1ST FL,ST LOUIS,MO",4752 BOTANICAL 1ST FL
232,63128,4229 VALLEY CREST HILLS DR,"4229 VALLEY CREST HILLS DR,SAINT LOUIS,MO",4229 VALLEY CREST HILLS DR
241,63133,30 ARCHWAY MANOR DR LOT 30,"30 ARCHWAY MANOR DR LOT 30,SAINT LOUIS,MO",30 ARCHWAY MANOR DR LOT 30
246,63141,13143 DARTAGNAN CT,"13143 DARTAGNAN CT,SAINT LOUIS,MO",13143 DARTAGNAN CT
248,63121,4705 OAKRIDGE BLVD,"4705 OAKRIDGE BLVD,SAINT LOUIS,MO",4705 OAKRIDGE BLVD
249,63121,4705 OAKRIDGE BLVD,"4705 OAKRIDGE BLVD,SAINT LOUIS,MO",4705 OAKRIDGE BLVD
256,63123,4645 TIEMANN AVE,"4645 TIEMANN AVE,SAINT LOUIS,MO",4645 TIEMANN AVE
259,63102,1621 N FIRST STREET,"1621 N FIRST STREET,ST LOUIS,MO",1621 N FIRST STREET


In [19]:
# merge together street addres, city, and state to create the "full address"
geocode_test_slice['new_address'] = geocode_test_slice.new_address + "," + geocode_test_slice.city + "," + geocode_test_slice.st

# now remove some of the elements that trip up the geocoder, using the comma to avoid unwanted replacements elsewhere
geocode_test_slice['new_address'] = geocode_test_slice.new_address.str.replace(' RD,',',')
geocode_test_slice['new_address'] = geocode_test_slice.new_address.str.replace(' AVE,',',')
geocode_test_slice['new_address'] = geocode_test_slice.new_address.str.replace(' DR,',',')

# geocode_test_slice[['address','city','st','zip','full_address','new_address']]


#### Test for County == St Louis City

In [82]:
nogeo_after_pass_2.county.value_counts()

ST LOUIS CITY    365
ST LOUIS          66
Name: county, dtype: int64

In [84]:
nogeo_after_pass_2.loc[nogeo_after_pass_2['county'] == "ST LOUIS"]

Unnamed: 0,level_0,index,name,address,city,st,zip,county,offense,count,compliant,tier,date_of_birth,full_address,geocode,new_address,new_address_split
12,43,769,"BACH, EDWARD E",9733 CRAYFORD RD NBR H,SAINT LOUIS,MO,63123,ST LOUIS,DEVIATE SEXUAL ASSAULT,1,Y,3,1957-07-14,"9733 CRAYFORD RD NBR H,SAINT LOUIS,MO",,9733 CRAYFORD RD NBR H,[9733 CRAYFORD RD NBR H]
13,44,770,"BACH, EDWARD E",9733 CRAYFORD RD NBR H,SAINT LOUIS,MO,63123,ST LOUIS,SEXUAL ASSAULT,2,Y,3,1957-07-14,"9733 CRAYFORD RD NBR H,SAINT LOUIS,MO",,9733 CRAYFORD RD NBR H,[9733 CRAYFORD RD NBR H]
20,64,1301,"BAXTER, JENNIFER L",9428 EDDIE AND PARK RD,SAINT LOUIS,MO,63126,ST LOUIS,ENDANGERING WELFARE OF A CHILD-1ST DEGREE,3,Y,3,1973-10-30,"9428 EDDIE AND PARK RD,SAINT LOUIS,MO",,9428 EDDIE AND PARK RD,[9428 EDDIE AND PARK RD]
23,76,1647,"BERTHOLF, JASON M",1939 N WARSON RD APT A,SAINT LOUIS,MO,63114,ST LOUIS,POSSESSION OF CHILD PORNOGRAPHY,1,Y,1,1982-04-06,"1939 N WARSON RD APT A,SAINT LOUIS,MO",,1939 N WARSON RD,"[1939 N WARSON RD, A]"
49,142,2859,"BROWN, ROMEL A",1405 DUNN RD RM 242,SAINT LOUIS,MO,63138,ST LOUIS,CHILD MOLEST-1ST DEGREE,1,Y,3,1980-06-13,"1405 DUNN RD RM 242,SAINT LOUIS,MO",,1405 DUNN RD RM 242,[1405 DUNN RD RM 242]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
379,1173,23524,"UNNERSTALL, NICHOLAS E",649 COEUR DE ROYALE DR UNIT B,SAINT LOUIS,MO,63141,ST LOUIS,ATMP ENTICE-ACTOR > 21/CHILD < 15,1,Y,2,1984-10-13,"649 COEUR DE ROYALE DR UNIT B,SAINT LOUIS,MO",,649 COEUR DE ROYALE DR UNIT B,[649 COEUR DE ROYALE DR UNIT B]
388,1186,23779,"VOGEL, CHRISTOPHER A",4121 SPRINGDALE AVE RM 104,SAINT LOUIS,MO,63134,ST LOUIS,SODOMY,1,Y,3,1966-07-31,"4121 SPRINGDALE AVE RM 104,SAINT LOUIS,MO",,4121 SPRINGDALE AVE RM 104,[4121 SPRINGDALE AVE RM 104]
396,1203,24031,"WALLHAUSER, RONALD G",1857 UNION RD APT A,SAINT LOUIS,MO,63125,ST LOUIS,STATUTORY SODOMY-2ND DEG,1,Y,2,1956-04-01,"1857 UNION RD APT A,SAINT LOUIS,MO",,1857 UNION RD,"[1857 UNION RD, A]"
398,1213,24212,"WASHINGTON, DEVONTA L",3325 W MILTON AVE UNIT B,SAINT LOUIS,MO,63114,ST LOUIS,SEX MISCD/ATMP INVL CHLD-1ST OFNS,1,Y,3,1992-06-14,"3325 W MILTON AVE UNIT B,SAINT LOUIS,MO",,3325 W MILTON AVE UNIT B,[3325 W MILTON AVE UNIT B]


Need to remove apartment numbers FIRST, then push through geocoder.
Then take remaining fails, remove AVE, ST, RD, then push through geocoder.

In [117]:
# geocode_test_slice.drop(['new_address'], inplace=True, axis=1)


# NEED TO CLEAN THE DATA MORE. SOME OFFENDERS ARE LISTED MULTIPLE TIMES.
Could remove these duplicates by checking for unique combos from multiple columns  
- name  
- address  
- city
- date of birth

# NEED TO CONSIDER ENTRIES WITHOUT ADDRESSES

Addresses like "Compliant/Pending Registration" and "HOMELESS" will always fail geocoding. We can probably just ignore/drop these, but they need to be explained.

In [193]:
# load this data into a new table so we can easily continue working with it later

# Set up database connection engine
# FORMAT: engine = create_engine('postgresql://user:password@host:5432/')
engine = create_engine('postgresql://psmd39:Mizzou23?@pgsql.dsa.lan:5432/cappsds_psmd39', echo=False)

# GeoDataFrame to PostGIS
msor_nogeo.to_sql(
    con=engine,
    name="msorfailedgeocoding",
    if_exists='replace'
)