In [37]:
import pandas as pd
import re
from typing import Union
from rich.progress import track
import requests
from geopy.geocoders import ArcGIS
import plotly.express as px

In [2]:
df = pd.read_csv('../data/Medical_Examiner_Case_Archive.csv')
print(df.shape)
df.sample(1)

(54215, 27)


Unnamed: 0,Case Number,Date of Incident,Date of Death,Age,Gender,Race,Latino,Manner of Death,Primary Cause,Primary Cause Line A,...,Commissioner District,Incident Address,Incident City,Incident Zip Code,longitude,latitude,location,Residence City,Residence_Zip,OBJECTID
4264,ME2019-01439,01/08/2019 05:00:00 PM,03/27/2019 04:00:00 AM,35.0,Male,White,False,ACCIDENT,COMPLICATIONS OF FALL,,...,,1254 AMBER COURT,WOODSTOCK,60098,,,,Woodstock,60098,51038


In [11]:
54215 - 53488

727

In [3]:
df2 = df[df['Incident Address'].notna()]
df2.shape

(53488, 27)

In [4]:
df2['Incident Address'].value_counts()

Unknown                          92
UNKNOWN                          90
UNKNOWN LOCATION                 11
10000 W O'Hare Ave               10
unknown                          10
                                 ..
623 EAST 111TH STREET             1
11760 S. LAWLER AVENUE            1
3538 W CERMAK RD                  1
10935 S HALSTED  MORGAN PARK      1
130 EAST OAK STREET FLOOR #55     1
Name: Incident Address, Length: 51331, dtype: int64

In [5]:
'unk' in 'UNKNOWN location'.lower()

True

In [6]:
re.sub(r"apt.*|\#.*|.*nh,", "", "wowzer NH, cool thin # 12".lower()).strip()

'cool thin'

In [7]:
car_words = ('auto', 'motor')

def deal_with_commas(x: str) -> str:
    if ',' not in x:
        return x.strip().title()
    parts = x.split(',')
    result = ' '.join([z for z in parts if any(y for y in z if y.isnumeric())])
    return result.strip().title()


def remove_apartment_info(x: str) -> str:
    result = re.sub(r"apt.*|\#.*|.*nh,", "", x)
    return deal_with_commas(result)


def clean_address(row: pd.Series) -> Union[int, str]:
    cause = row.get('Primary Cause Line A')
    a = row.get('Incident Address')
    # removes if motor vehicle cause
    if pd.notna(cause):
        if any(word in cause.lower() for word in car_words):
            return pd.NA
    # handles 'unknown' and variations
    if pd.isna(a) or 'unk' in a.lower():
        return pd.NA
    return remove_apartment_info(a.lower())


In [8]:
df2['cleaned_address'] = df2.apply(lambda row: clean_address(row), axis=1)
df2['cleaned_address'].value_counts(dropna=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['cleaned_address'] = df2.apply(lambda row: clean_address(row), axis=1)


NaN                                          2368
1301 Lee Street                                18
1920 Nerge Rd                                  14
9615 Knox Ave                                  14
345 Dixie Highway                              14
                                             ... 
2719 E. 92Nd St./Formerly The G-Spot Club       1
1023 Argyle                                     1
6631 N Milwaukee Avenue Regency Nh              1
6211 N Greenview 2Fl                            1
I  55 Northbound By Damen Avenue                1
Name: cleaned_address, Length: 47564, dtype: int64

In [9]:
df3 = df2[df2['cleaned_address'].notna()]
df3.shape

(51120, 28)

In [10]:
f"We removed {df.shape[0] - df3.shape[0]} records due to 'null-like' addresses"

"We removed 3095 records due to 'null-like' addresses"

See the benefit of cleaned addresses below:

In [12]:
df3[['Incident Address','cleaned_address']].sample(2)

Unnamed: 0,Incident Address,cleaned_address
13818,1775 DEMPSTER,1775 Dempster
25455,646 E.51st Street (Apt 1E),646 E.51St Street (


In [13]:
df3.columns

Index(['Case Number', 'Date of Incident', 'Date of Death', 'Age', 'Gender',
       'Race', 'Latino', 'Manner of Death', 'Primary Cause',
       'Primary Cause Line A', 'Primary Cause Line B', 'Primary Cause Line C',
       'Secondary Cause', 'Gun Related', 'Opioid Related', 'Cold Related',
       'Heat Related', 'Commissioner District', 'Incident Address',
       'Incident City', 'Incident Zip Code', 'longitude', 'latitude',
       'location', 'Residence City', 'Residence_Zip', 'OBJECTID',
       'cleaned_address'],
      dtype='object')

In [14]:
def make_address(row) -> str:
    street = row['cleaned_address'].strip()
    res_city_valid = pd.notna(row['Residence City'])
    inc_city_valid = pd.notna(row['Incident City'])
    if pd.notna(row['Incident City']):
        city = row['Incident City'].title().strip()
    elif pd.isna(row['Incident City']) and pd.notna(row['Residence City']):
        city = row['Residence City'].title().strip()
    else:
        city = ''
    zip_code = '' if pd.isna(row['Incident Zip Code']) else row['Incident Zip Code'].strip()
    address = f"{street} {city} {zip_code}"
    return address.strip()


def geo_query(row):
    address = make_address(row)
    url = f"https://my-geocoder.herokuapp.com/geocode?address={requests.utils.quote(address)}"
    response = requests.get(url)
    if response.status_code == 200:
        return response.json()
    else:
        raise ValueError("Invalid response")


def self_geocode(row) -> tuple[float, float]:
    address = make_address(row)
    geocoder = ArcGIS()
    coded = geocoder.geocode(address)
    if coded:
        return {
            "latitude": coded.latitude,
            "longitude": coded.longitude,
            "score": coded.raw.get('score')
        }
    else:
        return None

In [15]:
from geopy.extra.rate_limiter import RateLimiter
from tqdm import tqdm
tqdm.pandas()

df3['full_address'] = df3.apply(lambda row: make_address(row), axis=1)
df3['full_address'].head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df3['full_address'] = df3.apply(lambda row: make_address(row), axis=1)


4                 1049 Napleton Avenue
7                 166 N. Lamon Chicago
11         1306 S Kedzie Chicago 60623
12     8695 S Archer Ave Chicago 60638
13    11901 South Loomis Chicago 60643
Name: full_address, dtype: object

In [None]:
geolocator = ArcGIS()
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=0)
df3['geo_location'] = df3['full_address'].progress_apply(geocode)

In [None]:
df3['coded_lat'] = df3['geo_location'].apply(lambda x: x.latitude if pd.notna(x) else None)
df3['coded_long'] = df3['geo_location'].apply(lambda x: x.longitude if pd.notna(x) else None)
df3['coded_score'] = df3['geo_location'].apply(lambda x: x.raw.get('score') if pd.notna(x) else None)

df3.drop('geo_location', axis=1, inplace=True)
print(df3.coded_score.describe())

## Calculate geopy distance from original lat/long

In [None]:
# use if need geocoded data
dff = pd.read_csv('../data/version3.csv')

In [17]:
from geopy import distance

def calc_distance(row):
    if pd.isna(row.latitude) or pd.isna(row.longitude) or pd.isna(row.coded_lat) or pd.isna(row.coded_long):
        return None
    d = distance.distance(
        (row.latitude, row.longitude),
        (row.coded_lat, row.coded_long)
    ).km
    return d

In [None]:
df3['distance'] = df3.apply(lambda row: calc_distance(row), axis=1)

In [18]:
df3.distance.describe().round(2)

count    46164.00
mean        11.86
std        295.27
min          0.00
25%          0.00
50%          0.00
75%          0.02
max      14996.75
Name: distance, dtype: float64

In [57]:
df3.coded_score.describe().round(2)

count    51105.00
mean        98.19
std          3.64
min         70.00
25%         98.53
50%         99.43
75%        100.00
max        100.00
Name: coded_score, dtype: float64

In [None]:
df3.to_csv('../data/version3.csv', index=False)