In [197]:
from __future__ import annotations

import pandas as pd
import seaborn as sns
import plotly.express as px
import plotly.io as pio
import re
from collections import Counter
from scourgify import normalize_address_record
from scourgify.exceptions import UnParseableAddressError

pio.renderers.default = "notebook"
pd.set_option('display.max_columns', None)


df = pd.read_csv("../data/output/finalized.csv", low_memory=False)
cases = pd.read_csv('../data/raw/cases.csv', low_memory=False)
clean = pd.read_csv('../data/processed/clean_cases.csv', low_memory=False)
[c for c in df.columns if not c.endswith("_primary") and not c.endswith("_secondary")][-10:]

['cold_combined',
 'primary_combined',
 'incident_date',
 'death_date',
 'death_county',
 'death_state',
 'death_location',
 'death_location_1',
 'death_address',
 'incident_matches_death']

In [3]:
df.shape

(65299, 197)

In [4]:
df[[c for c in df.columns if not c.endswith("_primary") and not c.endswith("_secondary")]][pd.notna(df.death_address)].head()

Unnamed: 0,OID_,casenumber,age,gender,race,latino,cold_related,heat_related,commissioner_district,residence_city,residence_zip,covid_related,primarycause,manner,secondarycause,gunrelated,opioids,primarycause_linea,primarycause_lineb,primarycause_linec,incident_address,incident_address_city_sub,geocoded_score,geocoded_address,recovered,final_latitude,final_longitude,closest_pharmacy,CFNAME,CFTYPE,CFSUBTYPE,ADDRESS,GNISCODE,SOURCE,Jurisdicti,Community,STATEFP,COUNTYFP,GEOID,INTPTLAT,INTPTLON,LANDUSE,landuse_name,landuse_sub_name,landuse_major_name,death_datetime,death_time,death_year,death_month,death_day,death_week,motel,hot_combined,cold_combined,primary_combined,incident_date,death_date,death_county,death_state,death_location,death_location_1,death_address,incident_matches_death
3378,3379,ME2022-00492,84.0,Female,White,0,0,0,,Minneapolis,55426.0,1.0,PNEUMONIA,NATURAL,"LUNG CANCER, LIVER AND KIDNEY TRANSPLANT, BREA...",0.0,0.0,NOVEL CORONA (COVID-19) VIRAL INFECTION,,,MINNEAPOLIS,1,,Minneapolis,0,,,-1.0,,,,,,,,,,,,,,,,,,2022-01-06 20:35:00,20:35:00,2022.0,1.0,6.0,1.0,0.0,0,0,PNEUMONIANOVEL CORONA (COVID-19) VIRAL INFECTION,01/03/2022 12:00 AM,01/06/2022 08:35 PM,Cook County,IL,Hospital Inpatient Unit,Hospital,2100 PFINGSTEN ROAD GLENVIEW IL 60025,0
3379,3380,ME2022-00491,51.0,Female,White,0,0,0,6.0,Worth,60482.0,,,PENDING,,,,,,,10718 S. NEENAH AVENUE WORTH 60482,0,,,0,41.696773,-87.783007,0.622998,,,,,,,,,17.0,31.0,17031820000.0,41.685737,-87.779291,1111.0,Single-Family Detached,Residential,Urbanized,2022-01-06 18:52:00,18:52:00,2022.0,1.0,6.0,1.0,0.0,0,0,,01/06/2022 06:30 PM,01/06/2022 06:52 PM,,IL,,,10718 S. NEENAH AVENUE WORTH IL 60482,0
3381,3382,ME2022-00489,63.0,Male,Black,0,0,0,,Chicago,60636.0,1.0,ACUTE HYPOXIC RESPIRATORY FAILURE,NATURAL,HYPERTENSION; CORONARY ARTERY DISEASE; CONGEST...,0.0,0.0,PNEUMONIA,NOVEL CORONA (COVID-19) VIRAL INFECTION,,CHICAGO,1,100.0,"Chicago, Illinois",1,41.88425,-87.63245,0.078949,,,,,,,,,,,,,,,,,,2022-01-06 07:09:00,07:09:00,2022.0,1.0,6.0,1.0,0.0,0,0,ACUTE HYPOXIC RESPIRATORY FAILUREPNEUMONIANOVE...,01/04/2022 12:00 AM,01/06/2022 07:09 AM,Cook County,IL,HOSPITAL ICU,Hospital,5841 SOUTH MARYLAND AVENUE CHICAGO IL 60637,0
3382,3383,ME2022-00488,72.0,Female,White,0,0,0,10.0,Chicago,60646.0,1.0,ACUTE HYPOXEMIC RESPIRATORY FAILURE,NATURAL,,0.0,0.0,PNEUMONIA,NOVEL CORONA (COVID-19) VIRAL INFECTION,,5816 N. KILBOURN AVE. CHICAGO 60646,0,,,0,41.98581,-87.740981,0.724441,,,,,,,,,17.0,31.0,17031120000.0,41.991112,-87.740066,1111.0,Single-Family Detached,Residential,Urbanized,2022-01-06 19:42:00,19:42:00,2022.0,1.0,6.0,1.0,0.0,0,0,ACUTE HYPOXEMIC RESPIRATORY FAILUREPNEUMONIANO...,12/20/2021 08:48 PM,01/06/2022 07:42 PM,Cook County,IL,Hospital ICU,Hospital,1775 DEMPSTER STREET PARK RIDGE IL 60646,0
3383,3384,ME2022-00487,95.0,Female,White,0,0,0,15.0,Streamwood,60107.0,1.0,ACUTE RESPIRATORY FAILURE WITH HYPOXIA,NATURAL,ATRIAL FIBRILLATION; HYPERTENSION,0.0,0.0,PNEUMONIA,NOVEL CORONA (COVID-19) VIRAL INFECTION,,53 COOLIDGE COURT UNIT A STREAMWOOD 60107,0,,,0,42.021656,-88.1944,0.678775,,,,,,,,,17.0,31.0,17031800000.0,42.026254,-88.200662,1112.0,Single-Family Attached,Residential,Urbanized,2022-01-05 20:00:00,20:00:00,2022.0,1.0,5.0,1.0,0.0,0,0,ACUTE RESPIRATORY FAILURE WITH HYPOXIAPNEUMONI...,01/04/2022 07:24 PM,01/05/2022 08:00 PM,Cook County,IL,Hospital Inpatient Unit,Hospital,1555 BARRINGTON ROAD HOFFMAN ESTATES IL 60169,0


In [24]:
[c for c in df.columns if c.endswith('_x') or c.endswith('_y')]

[]

In [5]:
last_year = df[df.death_year == 2020]
total_records = last_year.shape[0]
office_count = last_year.covid_related.sum()
our_count = last_year[(last_year.covid_primary == 1) | (last_year.covid_secondary == 1)].shape[0]
print(f"Total records: {total_records}")
print(f"Office records: {office_count}")
print(f"Our records: {our_count}")
print(f"Office percentage: {round((office_count / total_records) * 100, 2)}%")
print(f"Our percentage: {round((our_count / total_records) * 100, 2)}%")


Total records: 16077
Office records: 8355.0
Our records: 8438
Office percentage: 51.97%
Our percentage: 52.48%


In [6]:
death_locations = pd.read_csv("../data/raw/death_locations.csv", low_memory=False)
death_locations.head()

Unnamed: 0,CASENUMBER,INCIDENT_DATE,DEATH_DATE,DEATH_STREET,DEATH_CITY,DEATH_COUNTY,DEATH_STATE,DEATH_ZIP,DEATH_LOCATION,DEATH_LOCATION_1
0,ME2022-00492,01/03/2022 12:00 AM,01/06/2022 08:35 PM,2100 Pfingsten Road,Glenview,Cook County,IL,60025,Hospital Inpatient Unit,Hospital
1,ME2022-00488,12/20/2021 08:48 PM,01/06/2022 07:42 PM,1775 Dempster Street,Park Ridge,Cook County,IL,60646,Hospital ICU,Hospital
2,ME2022-00491,01/06/2022 06:30 PM,01/06/2022 06:52 PM,10718 S. NEENAH AVENUE,Worth,,IL,60482,,
3,ME2022-00476,01/06/2022 05:01 PM,01/06/2022 05:26 PM,723 W GRAND AVE UNIT 301,Chicago,Cook County,IL,60654,FRIEND'S RESIDENCE,Friend's Home
4,ME2022-00475,01/06/2022 05:01 PM,01/06/2022 05:25 PM,723 W GRAND AVE UNIT 301,Chicago,Cook County,IL,60654,RESIDENCE,Residence


In [7]:
len([c for c in df.columns if c.endswith("_primary") or c.endswith("_secondary")])

134

In [8]:
df.closest_pharmacy.describe()

count    65299.000000
mean         1.343795
std         11.704726
min         -1.000000
25%          0.269778
50%          0.487465
75%          0.806819
max        572.601492
Name: closest_pharmacy, dtype: float64

In [157]:
drugs = pd.read_csv("../data/drugs/combined_drugs.csv")
print(drugs.shape)
drugs.head()

(56180, 6)


Unnamed: 0,record_id,drug_name,word_found,similarity_ratio,tags,level
0,ME2022-03836,Alcohol,ethanolism,0.94,drug;eth_alc,primary
1,ME2022-03818,Covid,corona,0.909091,covid,primary
2,ME2022-03813,Covid,corona,0.909091,covid,primary
3,ME2022-03803,Covid,corona,0.909091,covid,primary
4,ME2022-03786,Covid,corona,0.909091,covid,primary


In [158]:
drugs.similarity_ratio.describe()

count    56180.000000
mean         0.963721
std          0.043048
min          0.900000
25%          0.909091
50%          1.000000
75%          1.000000
max          1.000000
Name: similarity_ratio, dtype: float64

In [163]:
tag_counts = Counter((x for d in drugs.tags.values for x in d.split(";"))).most_common()
tag_counts = pd.DataFrame(tag_counts, columns=["tag", "count"])
tag_counts.to_csv("tag_counts.csv", index=False)

In [217]:
duped_addrs = df[df.incident_address.duplicated(keep=False)].sort_values(by='incident_address').incident_address.dropna()
normalized_addrs = []
for i, addr in duped_addrs.iteritems():
    try:
        normal = normalize_address_record(addr)
        normal_string = " ".join(y for y in normal.values() if y)
        normalized_addrs.append((i, normal_string, True, "valid"))
    except UnParseableAddressError as e:
        normalized_addrs.append((i, addr, False, "unparseable"))
    except ValueError as ve:
        normalized_addrs.append((i, addr, False, "invalid"))

normalized_addrs[0]
address_df = pd.DataFrame(normalized_addrs, columns=["index", "normalized_address", "valid", "reason"])
valid_addresses = address_df[address_df.valid == True]
valid_unique_addrs = valid_addresses[valid_addresses.normalized_address.duplicated(keep=False)].normalized_address.nunique()
print(f"{valid_unique_addrs} valid unique addresses")

1401 valid unique addresses


In [232]:
duped_addrs = df[
    df.incident_address.duplicated(keep=False)
].incident_address.dropna()
normalized_addrs = []
for i, addr in duped_addrs.iteritems():
    try:
        normal = normalize_address_record(addr)
        normal_string = " ".join(y for y in normal.values() if y)
        normalized_addrs.append((i, normal_string, True, "valid"))
    except UnParseableAddressError:
        normalized_addrs.append((i, addr, False, "unparseable"))
    except ValueError:
        normalized_addrs.append((i, addr, False, "invalid"))

address_df = pd.DataFrame(
    normalized_addrs, columns=["lookup_index", "normalized_address", "valid", "reason"]
)
valid_addresses = address_df[address_df.valid == True]
valid_unique_indices = valid_addresses[
    valid_addresses.normalized_address.duplicated(keep=False)
].lookup_index
df.index.isin(valid_unique_indices)

array([False, False, False, ..., False, False, False])

In [231]:
df[['incident_address', 'repeat_address']][df.repeat_address == True].head()

Unnamed: 0,incident_address,repeat_address
21,18200 S CICERO AVE COUNTRY CLUB HILLS 60478,True
33,6730 NORTHWEST HIGHWAY CHICAGO 60631,True
34,6730 NORTHWEST HIGHWAY CHICAGO 60631,True
43,6125 S KENWOOD AVE CHICAGO 60637,True
69,4201 SOUTH WABASH AVENUE CHICAGO 60653,True


In [62]:
df.recovered.sum()

7985

In [12]:
df[['final_latitude', 'final_longitude', 'geocoded_score']].describe()

Unnamed: 0,final_latitude,final_longitude,geocoded_score
count,64859.0,64859.0,7985.0
mean,41.839506,-87.740083,94.34577
std,0.181124,0.162753,7.551936
min,36.99654,-91.570947,70.0
25%,41.74769,-87.791694,90.82
50%,41.859363,-87.710238,98.11
75%,41.946636,-87.649182,100.0
max,42.569945,-87.380053,100.0


In [13]:
cases[['latitude', 'longitude']].describe()

Unnamed: 0,latitude,longitude
count,56874.0,56874.0
mean,41.843875,-87.727852
std,0.139809,0.116209
min,41.469732,-88.263178
25%,41.750306,-87.778183
50%,41.85873,-87.707254
75%,41.941603,-87.649747
max,42.153747,-87.524835


In [11]:
df[df.closest_pharmacy > 0].closest_pharmacy.describe() * 0.62137

count    40262.911890
mean         0.845682
std          7.300045
min          0.000059
25%          0.170317
50%          0.305664
75%          0.503423
max        355.797389
Name: closest_pharmacy, dtype: float64