In [1]:
import pandas as pd

# Download location of the CDC drug-overdose death-rate table (CSV export).
url = 'https://data.cdc.gov/api/views/95ax-ymtc/rows.csv?accessType=DOWNLOAD'
# Local file that receives a snapshot of the download.
raw_file_name = "CDC_drug_overdose_deaths_raw_data.csv"

# Read the table straight from the URL, then save a local copy.
# to_csv() is called with its defaults, so the row index is written to the
# snapshot as an unnamed first column.
dodr_data = pd.read_csv(filepath_or_buffer=url)
dodr_data.to_csv(path_or_buf=raw_file_name)
dodr_data.head()


Unnamed: 0,INDICATOR,PANEL,PANEL_NUM,UNIT,UNIT_NUM,STUB_NAME,STUB_NAME_NUM,STUB_LABEL,STUB_LABEL_NUM,YEAR,YEAR_NUM,AGE,AGE_NUM,ESTIMATE,FLAG
0,Drug overdose death rates,All drug overdose deaths,0,"Deaths per 100,000 resident population, age-ad...",1,Total,0,All persons,0.1,1999,1,All ages,1.1,6.1,
1,Drug overdose death rates,All drug overdose deaths,0,"Deaths per 100,000 resident population, age-ad...",1,Total,0,All persons,0.1,2000,2,All ages,1.1,6.2,
2,Drug overdose death rates,All drug overdose deaths,0,"Deaths per 100,000 resident population, age-ad...",1,Total,0,All persons,0.1,2001,3,All ages,1.1,6.8,
3,Drug overdose death rates,All drug overdose deaths,0,"Deaths per 100,000 resident population, age-ad...",1,Total,0,All persons,0.1,2002,4,All ages,1.1,8.2,
4,Drug overdose death rates,All drug overdose deaths,0,"Deaths per 100,000 resident population, age-ad...",1,Total,0,All persons,0.1,2003,5,All ages,1.1,8.9,


In [68]:
# Keep only the rows whose UNIT_NUM is 1 (the age-adjusted rates); this frame
# is the basis for the sex / race breakdown.

is_age_adjusted = dodr_data['UNIT_NUM'].eq(1)
od_race_data = dodr_data.loc[is_age_adjusted]

# Report the subset size next to the full table for a quick sanity check.
print(f"Age-adjusted race data shape: {od_race_data.shape}")
print(f"Original data shape: {dodr_data.shape}")

Age-adjusted race data shape: (2628, 15)
Original data shape: (6228, 15)


In [69]:
# Keep only the rows whose UNIT_NUM is 2 (the crude rates); this frame is the
# basis for the sex / age-group breakdown.

is_crude = dodr_data['UNIT_NUM'].eq(2)
od_age_data = dodr_data.loc[is_crude]

# Report the subset size next to the full table for a quick sanity check.
print(f"Crude age-related data shape: {od_age_data.shape}")
print(f"Original data shape: {dodr_data.shape}")

Crude age-related data shape: (3600, 15)
Original data shape: (6228, 15)


In [70]:
# Parsing STUB_LABEL to extract sex together with race/ethnicity or age group

def parse_sex_race_stub_label(label):
    """Split a STUB_LABEL value into sex and race/ethnicity components.

    Labels containing a colon are split on the FIRST colon only, so
    "Male: Hispanic or Latino: All races" yields sex "Male" and
    race_ethnicity "Hispanic or Latino: All races". Labels without a colon
    are taken as the sex, with race_ethnicity set to "All". Any sex value
    containing "All" (e.g. "All persons") is normalised to "All".

    Parameters
    ----------
    label : str or missing value
        A single STUB_LABEL entry.

    Returns
    -------
    pandas.Series
        Two entries, 'sex' and 'race_ethnicity'. Both are None when `label`
        is not a string (e.g. NaN/None), so a missing label propagates as
        missing instead of raising TypeError.
    """
    # A non-string label (NaN, None, ...) cannot be searched with `in`;
    # return missing values rather than failing the whole apply().
    if not isinstance(label, str):
        return pd.Series({
            'sex': None,
            'race_ethnicity': None
        })

    if ":" in label:
        # maxsplit=1 keeps any further colons inside the race/ethnicity part.
        head, tail = label.split(":", 1)
        sex = head.strip()
        race_ethnicity = tail.strip()
    else:
        # Strip here too, so "Male " and "Male" do not become two categories
        # (the colon branch already strips both parts).
        sex = label.strip()
        race_ethnicity = "All"

    # Collapse every "All ..." variant to one canonical value.
    if "All" in sex:
        sex = "All"

    return pd.Series({
        'sex': sex,
        'race_ethnicity': race_ethnicity
    })

def parse_sex_age_stub_label(label):
    """Split a STUB_LABEL value into sex and age-group components.

    Rules, applied in order:
      * "<sex>: <age group>"  -> split on the first colon.
      * contains "Male" or "Female" (no colon) -> that label is the sex,
        age_group is "All".
      * contains "All persons" -> sex "All", age_group "All".
      * anything else -> treated as an age group for sex "All".
    Any sex value containing "All" is normalised to "All".

    Parameters
    ----------
    label : str or missing value
        A single STUB_LABEL entry.

    Returns
    -------
    pandas.Series
        Two entries, 'sex' and 'age_group'. Both are None when `label` is not
        a string (e.g. NaN/None), so a missing label propagates as missing
        instead of raising TypeError.
    """
    # A non-string label (NaN, None, ...) cannot be searched with `in`;
    # return missing values rather than failing the whole apply().
    if not isinstance(label, str):
        return pd.Series({
            'sex': None,
            'age_group': None
        })

    # Normalise surrounding whitespace once so every branch below yields
    # trimmed values (previously only the colon branch stripped).
    label = label.strip()

    if ":" in label:
        head, tail = label.split(":", 1)
        sex, age_group = head.strip(), tail.strip()
    elif "Male" in label or "Female" in label:
        # The test is case-sensitive, so "Female" does not match "Male".
        sex, age_group = label, "All"
    elif "All persons" in label:
        sex, age_group = "All", "All"
    else:
        # No sex information in the label: it names an age group only.
        sex, age_group = "All", label

    # Collapse every "All ..." variant to one canonical value.
    if "All" in sex:
        sex = "All"

    return pd.Series({
        'sex': sex,
        'age_group': age_group
    })


In [71]:

# Work on a copy so adding columns does not touch the filtered frame.
df_race_data = od_race_data.copy()
# The parser returns a two-entry Series per label, so apply() produces a
# two-column frame that fills both new columns in one assignment.
df_race_data[['sex', 'race_ethnicity']] = df_race_data['STUB_LABEL'].apply(parse_sex_race_stub_label)

# Use an f-string: bare braces around the shape build a one-element set and
# print it as "{(rows, cols)}".
print(f"Race data shape: {df_race_data.shape}")
print(df_race_data[['STUB_LABEL', 'sex', 'race_ethnicity']].head(20))

# Spot-check the parsed categories.
print("\nUnique sex values:")
print(df_race_data['sex'].unique())

print("\nUnique race_ethnicity values:")
print(df_race_data['race_ethnicity'].unique())

print("\nValue counts for sex:")
print(df_race_data['sex'].value_counts())


{(2628, 17)}
     STUB_LABEL   sex race_ethnicity
0   All persons   All            All
1   All persons   All            All
2   All persons   All            All
3   All persons   All            All
4   All persons   All            All
5   All persons   All            All
6   All persons   All            All
7   All persons   All            All
8   All persons   All            All
9   All persons   All            All
10  All persons   All            All
11  All persons   All            All
12  All persons   All            All
13  All persons   All            All
14  All persons   All            All
15  All persons   All            All
16  All persons   All            All
17  All persons   All            All
18  All persons   All            All
19         Male  Male            All

Unique sex values:
['All' 'Male' 'Female']

Unique race_ethnicity values:
['All' 'White' 'Black or African American'
 'American Indian or Alaska Native' 'Asian or Pacific Islander'
 'Hispanic or Latino: All ra

In [72]:

# Work on a copy so adding columns does not touch the filtered frame.
df_age_data = od_age_data.copy()
# The parser returns a two-entry Series per label, so apply() produces a
# two-column frame that fills both new columns in one assignment.
df_age_data[['sex', 'age_group']] = df_age_data['STUB_LABEL'].apply(parse_sex_age_stub_label)

# Use an f-string: bare braces around the shape build a one-element set and
# print it as "{(rows, cols)}".
print(f"Age data shape: {df_age_data.shape}")
print(df_age_data[['STUB_LABEL', 'sex', 'age_group']].tail(25))

# Spot-check the parsed categories.
print("\nUnique sex values:")
print(df_age_data['sex'].unique())

print("\nUnique age_group values:")
print(df_age_data['age_group'].unique())

print("\nValue counts for sex:")
print(df_age_data['sex'].value_counts())


{(3600, 17)}
                     STUB_LABEL     sex          age_group
6203                45-54 years     All        45-54 years
6204                55-64 years     All        55-64 years
6205                65-74 years     All        65-74 years
6206                75-84 years     All        75-84 years
6207          85 years and over     All  85 years and over
6208                       Male    Male                All
6209                     Female  Female                All
6210       Male: Under 15 years    Male     Under 15 years
6211          Male: 15-24 years    Male        15-24 years
6212          Male: 25-34 years    Male        25-34 years
6213          Male: 35-44 years    Male        35-44 years
6214          Male: 45-54 years    Male        45-54 years
6215          Male: 55-64 years    Male        55-64 years
6216          Male: 65-74 years    Male        65-74 years
6217          Male: 75-84 years    Male        75-84 years
6218    Male: 85 years and over    Male  85

In [73]:
# Removing unreliable estimates: rows with a non-null FLAG are dropped.

# Use an f-string: bare braces around the count build a one-element set and
# print it as "{696}".
print(f"Flagged age rows: {df_age_data['FLAG'].notna().sum()}")
# .copy() makes the cleaned frame independent of df_age_data.
df_age_data_clean = df_age_data[df_age_data['FLAG'].isna()].copy()
print(f"\nCleaned age row count: {len(df_age_data_clean)}")
print(f"Rows age removed: {len(df_age_data) - len(df_age_data_clean)}")

# Sanity check: this count should be 0 after the filter above.
print(f"\nRemaining age rows with FLAG not null: {df_age_data_clean['FLAG'].notna().sum()}")

{696}

Cleaned age row count: 2904
Rows age removed: 696

Remaining age rows with FLAG not null: 0


In [74]:
# Removing unreliable estimates: rows with a non-null FLAG are dropped.

# Use an f-string: bare braces around the count build a one-element set and
# print it as "{415}".
print(f"Flagged race rows: {df_race_data['FLAG'].notna().sum()}")
# .copy() makes the cleaned frame independent of df_race_data.
df_race_data_clean = df_race_data[df_race_data['FLAG'].isna()].copy()
# These messages describe the race frame (built from the age-adjusted rows),
# so they are labelled "race" rather than "crude".
print(f"\nCleaned race row count: {len(df_race_data_clean)}")
print(f"Rows race removed: {len(df_race_data) - len(df_race_data_clean)}")

# Sanity check: this count should be 0 after the filter above.
print(f"\nRemaining race rows with FLAG not null: {df_race_data_clean['FLAG'].notna().sum()}")

{415}

Cleaned crude row count: 2213
Rows crude removed: 415

Remaining crude rows with FLAG not null: 0


In [76]:
# TODO: DELETE this pass-through cell.
# A rename of PANEL -> 'drug_type' and PANEL_NUM -> 'drug_type_code' was
# drafted here for both cleaned frames, but it is not applied: per the
# original note, PANEL is used in analysis_utils.py and cannot be renamed.
# The *_final names are therefore plain aliases of the cleaned frames
# (no copy is made and no column is changed).
df_age_data_clean_final = df_age_data_clean
df_race_data_clean_final = df_race_data_clean

In [77]:
# Output file -> frame, in the order the files are written (race, then age).
clean_outputs = {
    'overdose_race_data_clean.csv': df_race_data_clean_final,
    'overdose_age_data_clean.csv': df_age_data_clean_final,
}

print("\nWrite clean data to files: 'overdose_race_data_clean.csv' and 'overdose_age_data_clean.csv'")
# index=False keeps the row index out of the CSV files.
for csv_path, clean_frame in clean_outputs.items():
    clean_frame.to_csv(csv_path, index=False)


Write clean data to files: 'overdose_race_data_clean.csv' and 'overdose_age_data_clean.csv'
