In [3]:
import pandas as pd
import re
from typing import Union
from rich.progress import track
import requests
from geopy.geocoders import ArcGIS


In [4]:
df = pd.read_csv('../data/Medical_Examiner_Case_Archive.csv')
print(df.shape)
df.sample(2)

(54215, 27)


Unnamed: 0,Case Number,Date of Incident,Date of Death,Age,Gender,Race,Latino,Manner of Death,Primary Cause,Primary Cause Line A,...,Commissioner District,Incident Address,Incident City,Incident Zip Code,longitude,latitude,location,Residence City,Residence_Zip,OBJECTID
47311,ME2020-11845,10/29/2020 06:13:00 PM,10/29/2020 06:41:00 PM,39.0,Male,Black,False,HOMICIDE,MULTIPLE GUNSHOT WOUNDS,,...,1.0,3637 W. GRAND AVENUE,CHICAGO,60651,-87.717719,41.902657,"(41.90265655750123, -87.71771879044832)",Chicago,60624,118313
38435,ME2020-05621,05/02/2020 08:56:00 PM,05/15/2020 04:43:00 AM,62.0,Female,White,False,NATURAL,PNEUMONIA,NOVEL CORONA (COVID-19) VIRAL INFECTION,...,17.0,9300 W. BALLARD RD,DES PLAINES,60016,-87.857217,42.043664,"(42.04366441259753, -87.85721683887542)",Des Plaines,60016,112090


In [5]:
df2 = df[df['latitude'].isna()]
print(df2.shape)
df2.sample(2)

(6747, 27)


Unnamed: 0,Case Number,Date of Incident,Date of Death,Age,Gender,Race,Latino,Manner of Death,Primary Cause,Primary Cause Line A,...,Commissioner District,Incident Address,Incident City,Incident Zip Code,longitude,latitude,location,Residence City,Residence_Zip,OBJECTID
8580,ME2017-04737,10/14/2017 07:57:00 PM,10/17/2017 01:36:00 PM,15.0,Female,White,False,SUICIDE,COMPLICATIONS OF HANGING,,...,,300 NORTH HOWARD,ELMHURST,60126,,,,Elmhurst,60126,52569
180,ME2015-04229,12/09/2009 09:00:00 AM,10/01/2015 08:35:00 PM,20.0,Male,White,True,HOMICIDE,COMPLICATIONS OF GUNSHOT WOUND OF HEAD,,...,,3400 BLOCK OF ARMITAGE,CHICAGO,60626,,,,Chicago,60626,68953


In [6]:
df3 = df2[df2['Incident Address'].notna()]
print(df3.shape)
df3['Incident Address'].value_counts()

(6020, 27)


Unknown                            92
UNKNOWN                            90
UNKNOWN LOCATION                   11
10000 W O'Hare Ave                 10
unknown                            10
                                   ..
101 EASTGATE DR                     1
Interstate 90 Mile Post 78..25      1
1ST & HARRISON AVENUES              1
Sunrise NH, 1601 Green Bay Road     1
14948 S FRANCISCO                   1
Name: Incident Address, Length: 5661, dtype: int64

In [7]:
df3.columns

Index(['Case Number', 'Date of Incident', 'Date of Death', 'Age', 'Gender',
       'Race', 'Latino', 'Manner of Death', 'Primary Cause',
       'Primary Cause Line A', 'Primary Cause Line B', 'Primary Cause Line C',
       'Secondary Cause', 'Gun Related', 'Opioid Related', 'Cold Related',
       'Heat Related', 'Commissioner District', 'Incident Address',
       'Incident City', 'Incident Zip Code', 'longitude', 'latitude',
       'location', 'Residence City', 'Residence_Zip', 'OBJECTID'],
      dtype='object')

In [8]:
'unk' in 'UNKNOWN location'.lower()

True

In [9]:
re.sub(r"apt.*|\#.*|.*nh,", "", "wowzer NH, cool thin # 12".lower()).strip()

'cool thin'

In [10]:
car_words = ('auto', 'motor')

def deal_with_commas(x: str) -> str:
    if ',' not in x:
        return x.strip().title()
    parts = x.split(',')
    result = ' '.join([z for z in parts if any(y for y in z if y.isnumeric())])
    return result.strip().title()


def remove_apartment_info(x: str) -> str:
    result = re.sub(r"apt.*|\#.*|.*nh,", "", x)
    return deal_with_commas(result)


def clean_address(row: pd.Series) -> Union[int, str]:
    cause = row.get('Primary Cause Line A')
    a = row.get('Incident Address')
    # removes if motor vehicle cause
    if pd.notna(cause):
        if any(word in cause.lower() for word in car_words):
            return pd.NA
    # handles 'unknown' and variations
    if pd.isna(a) or 'unk' in a.lower():
        return pd.NA
    return remove_apartment_info(a.lower())


In [11]:
df3['cleaned_address'] = df3.apply(lambda row: clean_address(row), axis=1)
df4 = df3[df3.cleaned_address.notna()]
print(df4.shape)
df4[['Incident Address','cleaned_address']].sample(2)

(4947, 28)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df3['cleaned_address'] = df3.apply(lambda row: clean_address(row), axis=1)


Unnamed: 0,Incident Address,cleaned_address
36174,2773 Skokie Valley Rd,2773 Skokie Valley Rd
2241,Interstate 294 Mile at Marker 19.75,Interstate 294 Mile At Marker 19.75


In [12]:
final_df = df.join(df4.cleaned_address)
print(final_df.shape)
final_df.head()

(54215, 28)


Unnamed: 0,Case Number,Date of Incident,Date of Death,Age,Gender,Race,Latino,Manner of Death,Primary Cause,Primary Cause Line A,...,Incident Address,Incident City,Incident Zip Code,longitude,latitude,location,Residence City,Residence_Zip,OBJECTID,cleaned_address
0,ME2014-01224,10/26/2014 07:36:00 PM,10/26/2014 11:16:00 AM,71.0,Male,Black,False,NATURAL,CORONARY ATHEROSCLEROSIS,,...,UNK,,,,,,,,48793,
1,ME2015-03052,07/02/2015 12:00:00 PM,07/18/2015 05:15:00 PM,46.0,Male,White,False,NATURAL,COMPLICATIONS OF LIVER CIRRHOSIS,HEPATITIS C INFECTION,...,,,,,,,,,48852,
2,ME2015-04003,,,,,,False,,NONHUMAN REMAINS,,...,,,,,,,,,48903,
3,ME2015-03208,,,,,,False,,"NON HUMAN REMAINS, NO DEATH CERTIFICATE ISSUED",,...,,,,,,,,,48889,
4,ME2017-03711,08/10/2017 12:00:00 AM,08/10/2017 12:13:00 PM,71.0,Female,Other,False,NATURAL,ORGANIC CARDIOVASCULAR DISEASE,,...,1049 NAPLETON AVENUE,,,,,,,,48921,1049 Napleton Avenue


In [13]:
final_df.shape

(54215, 28)

In [14]:
len([x for x in final_df['Incident Address'] if pd.notna(x)])

53488

In [15]:
len([x for x in final_df['cleaned_address'] if pd.notna(x)])

4947

In [16]:
final_df.shape[0] - len([x for x in final_df['Incident Address'] if pd.notna(x)])

727

In [17]:
len([x for x in final_df['Incident Address'] if pd.notna(x)]) / final_df.shape[0]

0.9865904270035968

In [20]:
final_df.columns

Index(['Case Number', 'Date of Incident', 'Date of Death', 'Age', 'Gender',
       'Race', 'Latino', 'Manner of Death', 'Primary Cause',
       'Primary Cause Line A', 'Primary Cause Line B', 'Primary Cause Line C',
       'Secondary Cause', 'Gun Related', 'Opioid Related', 'Cold Related',
       'Heat Related', 'Commissioner District', 'Incident Address',
       'Incident City', 'Incident Zip Code', 'longitude', 'latitude',
       'location', 'Residence City', 'Residence_Zip', 'OBJECTID',
       'cleaned_address'],
      dtype='object')

In [21]:
final_df.to_csv('../data/cleaned_addresses_full.csv', index=False)

Wowzer this is interesting!

In [22]:
def make_address(row) -> str:
    street = '' if pd.isna(row['cleaned_address']) else row['cleaned_address'].strip()
    city = '' if pd.isna(row['Incident City']) else row['Incident City'].title().strip()
    zip_code = '' if pd.isna(row['Incident Zip Code']) else row['Incident Zip Code'].strip()
    address = f"{street} {city} {zip_code}"
    return address


def geo_query(row):
    address = make_address(row)
    url = f"https://my-geocoder.herokuapp.com/geocode?address={requests.utils.quote(address)}"
    response = requests.get(url)
    if response.status_code == 200:
        return response.json()
    else:
        raise ValueError("Invalid response")


def self_geocode(row) -> tuple[float, float]:
    address = make_address(row)
    geocoder = ArcGIS()
    coded = geocoder.geocode(address)
    if coded:
        return {
            "latitude": coded.latitude,
            "longitude": coded.longitude,
            "score": coded.raw.get('score')
        }
    else:
        return None


In [25]:
from geopy.extra.rate_limiter import RateLimiter
from tqdm import tqdm
tqdm.pandas()

df4['full_address'] = df4.apply(lambda row: make_address(row), axis=1)


geolocator = ArcGIS()
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=0)
df4['geo_location'] = df4['full_address'].progress_apply(geocode)

df4['new_lat'] = df4['geo_location'].apply(lambda x: x.latitude if pd.notna(x) else None)
df4['new_long'] = df4['geo_location'].apply(lambda x: x.longitude if pd.notna(x) else None)
df4['score'] = df4['geo_location'].apply(lambda x: x.raw.get('score') if pd.notna(x) else None)

print(df4.score.mean())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df4['full_address'] = df4.apply(lambda row: make_address(row), axis=1)
100%|██████████| 4947/4947 [58:13<00:00,  1.42it/s]94.84154735347839

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df4['geo_location'] = df4['full_address'].progress_apply(geocode)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [26]:
#df4.drop('geo_location', axis=1, inplace=True)
print(df4.columns)
print(df4.shape)
df4.sample(2)

Index(['Case Number', 'Date of Incident', 'Date of Death', 'Age', 'Gender',
       'Race', 'Latino', 'Manner of Death', 'Primary Cause',
       'Primary Cause Line A', 'Primary Cause Line B', 'Primary Cause Line C',
       'Secondary Cause', 'Gun Related', 'Opioid Related', 'Cold Related',
       'Heat Related', 'Commissioner District', 'Incident Address',
       'Incident City', 'Incident Zip Code', 'longitude', 'latitude',
       'location', 'Residence City', 'Residence_Zip', 'OBJECTID',
       'cleaned_address', 'full_address', 'geo_location', 'new_lat',
       'new_long', 'score'],
      dtype='object')
(4947, 33)


Unnamed: 0,Case Number,Date of Incident,Date of Death,Age,Gender,Race,Latino,Manner of Death,Primary Cause,Primary Cause Line A,...,location,Residence City,Residence_Zip,OBJECTID,cleaned_address,full_address,geo_location,new_lat,new_long,score
43071,ME2020-05665,05/15/2020 01:05:00 PM,05/15/2020 01:19:00 PM,70.0,Male,Black,False,NATURAL,ORGANIC CARDIOVASCULAR DISEASE,,...,,Chicago,60621,112115,6830 S Carrey,6830 S Carrey Chicago 60636,"(60636, Chicago, Illinois, (41.76773000000003,...",41.76773,-87.66387,80.87
34966,ME2020-05437,05/07/2020 02:48:00 PM,05/08/2020 11:04:00 PM,60.0,Male,White,True,NATURAL,PNEUMONIA,NOVEL CORONA (COVID-19) VIRAL INFECTION,...,,Chicago,60641,111902,4036 West Burley Ave,4036 West Burley Ave Chicago 60641,"(60641, Chicago, Illinois, (41.939910000000054...",41.93991,-87.733965,77.0


In [27]:
df4.to_csv('../data/recovered.csv', index=False)

In [28]:
final_df = df.join(df4.new_lat)
final_df = final_df.join(df4.new_long)
final_df = final_df.join(df4.score)
final_df['cleaned_address'] = final_df.apply(lambda row: clean_address(row), axis=1)
final_df.drop('location', axis=1, inplace=True)
print(final_df.shape)
final_df.sample(2)

(54215, 30)


Unnamed: 0,Case Number,Date of Incident,Date of Death,Age,Gender,Race,Latino,Manner of Death,Primary Cause,Primary Cause Line A,...,Incident Zip Code,longitude,latitude,Residence City,Residence_Zip,OBJECTID,new_lat,new_long,score,cleaned_address
47160,ME2020-15364,11/13/2020 12:00:00 AM,12/22/2020 07:00:00 AM,74.0,Male,Black,False,NATURAL,PNEUMONIA,NOVEL CORONA (COVID-19) VIRAL INFECTION,...,60620,-87.646566,41.742436,Chicago,60620,121821,,,,8328 S. Peoria St.
6499,ME2019-00484,01/30/2019 10:19:00 AM,01/30/2019 10:39:00 AM,23.0,Female,White,False,SUICIDE,"ACETAMINOPHEN, CLONAZEPAM, ALPRAZOLAM, HYDROCO...",,...,60480,-87.859832,41.736874,Willow Springs,60480,60018,,,,320 Forest Avenue


In [29]:
from geopy.extra.rate_limiter import RateLimiter
from tqdm import tqdm
tqdm.pandas()

geolocator = ArcGIS()
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=0)
larger_df = final_df[pd.isna(final_df.new_lat)]
larger_df.shape

(49284, 30)

In [30]:
larger_df['full_address'] = larger_df.apply(lambda row: make_address(row), axis=1)
larger_df['geo_location'] = larger_df['full_address'].progress_apply(geocode)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  larger_df['full_address'] = larger_df.apply(lambda row: make_address(row), axis=1)
100%|██████████| 49284/49284 [9:10:00<00:00,  1.49it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  larger_df['geo_location'] = larger_df['full_address'].progress_apply(geocode)


In [31]:
larger_df['revised_lat'] = larger_df['geo_location'].apply(lambda x: x.latitude if pd.notna(x) else None)
larger_df['revised_long'] = larger_df['geo_location'].apply(lambda x: x.longitude if pd.notna(x) else None)
larger_df['revised_score'] = larger_df['geo_location'].apply(lambda x: x.raw.get('score') if pd.notna(x) else None)

print(larger_df.revised_score.mean())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  larger_df['revised_lat'] = larger_df['geo_location'].apply(lambda x: x.latitude if pd.notna(x) else None)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  larger_df['revised_long'] = larger_df['geo_location'].apply(lambda x: x.longitude if pd.notna(x) else None)
98.57303776411841
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#return

In [32]:
larger_df.drop('geo_location', axis=1, inplace=True)
print(larger_df.columns)
print(larger_df.shape)
larger_df.sample(2)

Index(['Case Number', 'Date of Incident', 'Date of Death', 'Age', 'Gender',
       'Race', 'Latino', 'Manner of Death', 'Primary Cause',
       'Primary Cause Line A', 'Primary Cause Line B', 'Primary Cause Line C',
       'Secondary Cause', 'Gun Related', 'Opioid Related', 'Cold Related',
       'Heat Related', 'Commissioner District', 'Incident Address',
       'Incident City', 'Incident Zip Code', 'longitude', 'latitude',
       'Residence City', 'Residence_Zip', 'OBJECTID', 'new_lat', 'new_long',
       'score', 'cleaned_address', 'full_address', 'revised_lat',
       'revised_long', 'revised_score'],
      dtype='object')
(49284, 34)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


Unnamed: 0,Case Number,Date of Incident,Date of Death,Age,Gender,Race,Latino,Manner of Death,Primary Cause,Primary Cause Line A,...,Residence_Zip,OBJECTID,new_lat,new_long,score,cleaned_address,full_address,revised_lat,revised_long,revised_score
46772,ME2021-03967,04/03/2021 03:00:00 PM,04/14/2021 09:01:00 PM,71.0,Male,White,False,NATURAL,PNEUMONIA,NOVEL CORONA (COVID-19) VIRAL INFECTION,...,60631,126477,,,,6811 W Raven St. Unit 1S,6811 W Raven St. Unit 1S Chicago 60631,41.99266,-87.797978,100.0
7430,ME2014-01981,12/19/2014 05:20:00 PM,12/19/2014 05:51:00 PM,30.0,Male,White,False,ACCIDENT,ACUTE HEROIN TOXICITY,,...,60927,60578,,,,24 East Roosevelt,24 East Roosevelt Chicago 60605,41.867546,-87.626738,98.53


In [33]:
last_df = df.join(df4.new_lat)
last_df = last_df.join(df4.new_long)
last_df = last_df.join(df4.score)
last_df = last_df.join(larger_df.revised_lat)
last_df = last_df.join(larger_df.revised_long)
last_df = last_df.join(larger_df.revised_score)
last_df.drop('location', axis=1, inplace=True)
print(last_df.shape)
last_df.sample(2)

(54215, 32)


Unnamed: 0,Case Number,Date of Incident,Date of Death,Age,Gender,Race,Latino,Manner of Death,Primary Cause,Primary Cause Line A,...,latitude,Residence City,Residence_Zip,OBJECTID,new_lat,new_long,score,revised_lat,revised_long,revised_score
52706,ME2020-12280,10/31/2020 12:00:00 PM,11/08/2020 11:55:00 AM,80.0,Male,White,True,NATURAL,PNEUMONIA,NOVEL CORONA (COVID-19) VIRAL INFECTION,...,41.762533,Chicago,60629,118743,,,,41.762533,-87.718957,98.53
17429,ME2018-02175,05/06/2018 10:39:00 PM,05/06/2018 11:03:00 PM,31.0,Male,Black,False,ACCIDENT,COMBINED DRUG TOXICITY (FENTANYL AND PHENCYCLI...,,...,41.871679,Chicago,60623,68267,,,,41.871507,-87.718379,98.53


In [34]:
last_df.to_csv('../data/final.csv', index=False)

## Create comp_lat/long columns

In [41]:
last_df['comp_lat'] = last_df.apply(lambda row: row.new_lat if pd.notna(row.new_lat) else row.revised_lat, axis=1)
last_df['comp_long'] = last_df.apply(lambda row: row.new_long if pd.notna(row.new_long) else row.revised_long, axis=1)


In [42]:
last_df.sample(2)

Unnamed: 0,Case Number,Date of Incident,Date of Death,Age,Gender,Race,Latino,Manner of Death,Primary Cause,Primary Cause Line A,...,OBJECTID,new_lat,new_long,score,revised_lat,revised_long,revised_score,comp_lat,comp_long,distance
38094,ME2019-05948,12/14/2019 07:00:00 PM,12/14/2019 07:17:00 PM,98.0,Female,Black,False,NATURAL,ORGANIC CARDIOVASCULAR DISEASE,,...,106121,,,,41.740723,-87.613325,98.53,41.740723,-87.613325,
5500,ME2017-03992,08/22/2017 12:33:00 AM,08/26/2017 03:02:00 PM,17.0,Male,White,False,HOMICIDE,COMPLICATIONS OF GUNSHOT WOUND OF HEAD,,...,57144,42.353584,-88.035459,98.51,,,,42.353584,-88.035459,


## Calculate geopy distance from original lat/long

In [43]:
from geopy import distance

def calc_distance(row):
    if pd.isna(row.latitude) or pd.isna(row.longitude) or pd.isna(row.comp_lat) or pd.isna(row.comp_long):
        return None
    d = distance.distance(
        (row.latitude, row.longitude),
        (row.comp_lat, row.comp_long)
    ).km
    return d
    

In [44]:
last_df['distance'] = last_df.apply(lambda row: calc_distance(row), axis=1)

In [47]:
last_df.distance.describe().round(2)

count    47428.00
mean        22.73
std        416.62
min          0.00
25%          0.00
50%          0.00
75%          0.03
max      17473.05
Name: distance, dtype: float64

In [49]:
last_df.score.mean()

94.84154735347839

In [48]:
last_df.to_csv('../data/last.csv', index=False)

In [54]:
df.shape[0] - df[(pd.notna(df['Incident City'])) & (pd.notna(df['Incident Zip Code']))].shape[0]

1461

In [56]:
df.shape[0] - df[pd.notna(df['Incident Address'])].shape[0]

727

In [75]:
city_match = 0
zip_match = 0
for i, row in df.iterrows():
    if pd.isna(row['Incident City']) or pd.isna(row['Residence City']):
        continue
    if pd.isna(row['Incident Zip Code']) or pd.isna(row['Residence_Zip']):
        continue
    if row['Incident City'].lower() == row['Residence City'].lower():
        city_match += 1
    if row['Incident Zip Code'] == row['Residence_Zip']:
        zip_match += 1

In [76]:
city_match

44005

In [77]:
zip_match

37807

In [79]:
44005 / df.shape[0]

0.811675735497556