In [2]:
import pandas as pd

# Source: data.cdc.gov export of the "Drug overdose death rates" table.
url = 'https://data.cdc.gov/api/views/95ax-ymtc/rows.csv?accessType=DOWNLOAD'

# Download the CSV straight into a DataFrame and preview the first rows.
dodr_data = pd.read_csv(filepath_or_buffer=url)
dodr_data.head(5)


Unnamed: 0,INDICATOR,PANEL,PANEL_NUM,UNIT,UNIT_NUM,STUB_NAME,STUB_NAME_NUM,STUB_LABEL,STUB_LABEL_NUM,YEAR,YEAR_NUM,AGE,AGE_NUM,ESTIMATE,FLAG
0,Drug overdose death rates,All drug overdose deaths,0,"Deaths per 100,000 resident population, age-ad...",1,Total,0,All persons,0.1,1999,1,All ages,1.1,6.1,
1,Drug overdose death rates,All drug overdose deaths,0,"Deaths per 100,000 resident population, age-ad...",1,Total,0,All persons,0.1,2000,2,All ages,1.1,6.2,
2,Drug overdose death rates,All drug overdose deaths,0,"Deaths per 100,000 resident population, age-ad...",1,Total,0,All persons,0.1,2001,3,All ages,1.1,6.8,
3,Drug overdose death rates,All drug overdose deaths,0,"Deaths per 100,000 resident population, age-ad...",1,Total,0,All persons,0.1,2002,4,All ages,1.1,8.2,
4,Drug overdose death rates,All drug overdose deaths,0,"Deaths per 100,000 resident population, age-ad...",1,Total,0,All persons,0.1,2003,5,All ages,1.1,8.9,


In [10]:
# Restrict the analysis to age-adjusted rates only.

# Trim whitespace around UNIT first so the exact-match filter below cannot
# miss rows because of stray padding.
dodr_data['UNIT'] = dodr_data['UNIT'].str.strip()
age_adjusted = dodr_data.loc[
    dodr_data['UNIT'].eq('Deaths per 100,000 resident population, age-adjusted')
]

# Report the filtered shape, a preview, and the unfiltered shape for comparison.
print(
    f"Age-adjusted data shape: {age_adjusted.shape}",
    age_adjusted.head(),
    f"Original data shape: {dodr_data.shape}",
    sep="\n",
)

Age-adjusted data shape: (2628, 15)
                   INDICATOR                     PANEL  PANEL_NUM  \
0  Drug overdose death rates  All drug overdose deaths          0   
1  Drug overdose death rates  All drug overdose deaths          0   
2  Drug overdose death rates  All drug overdose deaths          0   
3  Drug overdose death rates  All drug overdose deaths          0   
4  Drug overdose death rates  All drug overdose deaths          0   

                                                UNIT  UNIT_NUM STUB_NAME  \
0  Deaths per 100,000 resident population, age-ad...         1     Total   
1  Deaths per 100,000 resident population, age-ad...         1     Total   
2  Deaths per 100,000 resident population, age-ad...         1     Total   
3  Deaths per 100,000 resident population, age-ad...         1     Total   
4  Deaths per 100,000 resident population, age-ad...         1     Total   

   STUB_NAME_NUM   STUB_LABEL  STUB_LABEL_NUM  YEAR  YEAR_NUM       AGE  \
0              0 

In [4]:
# Row counts per UNIT value (crude vs. age-adjusted).
print("\nBreakdown:", dodr_data['UNIT'].value_counts(), sep="\n")


Breakdown:
UNIT
Deaths per 100,000 resident population, crude           3600
Deaths per 100,000 resident population, age-adjusted    2628
Name: count, dtype: int64


In [5]:
# Parse STUB_LABEL into separate demographic columns.

# age_adjusted was produced by boolean-filtering dodr_data; take an explicit
# copy before new columns are assigned to it so the assignment targets an
# independent frame (presumably also to avoid SettingWithCopyWarning).
age_adjusted = age_adjusted.copy()

def demo_stub(label):
    """Split a STUB_LABEL value into sex / age_group / race_ethnicity.

    Parameters
    ----------
    label : str or missing value
        One STUB_LABEL entry, e.g. "All persons", "Male", or
        "Female: Not Hispanic or Latino: White". Surrounding whitespace is
        ignored.

    Returns
    -------
    pandas.Series
        Series with the keys 'sex', 'age_group' and 'race_ethnicity'.
        'age_group' is always None: this function does not derive an age
        group from the label.

    Notes
    -----
    A missing or non-string label (NaN, None) yields sex "Unknown" and a
    missing race_ethnicity instead of raising TypeError on the ":" check.
    """
    sex = None
    age_group = None
    race_ethnicity = None

    if not isinstance(label, str):
        # NaN / None cannot be searched for ":"; treat as unparseable.
        sex = "Unknown"
    else:
        label = label.strip()
        if label == "All persons":
            sex = "All"
            race_ethnicity = "All"
        elif label in ("Male", "Female"):
            sex = label
            race_ethnicity = "All"
        elif ":" in label:
            # Split on the first colon only, so a multi-part race/ethnicity
            # description stays intact after the sex prefix.
            head, tail = label.split(":", 1)
            sex = head.strip()
            race_ethnicity = tail.strip()
        else:
            sex = "Unknown"
            race_ethnicity = label

    return pd.Series({
        'sex': sex,
        'age_group': age_group,
        'race_ethnicity': race_ethnicity
    })


# Expand every STUB_LABEL into the three demographic columns; demo_stub
# returns a Series per row, so apply() yields a three-column frame.
age_adjusted[['sex', 'age_group', 'race_ethnicity']] = age_adjusted['STUB_LABEL'].apply(demo_stub)

# f-string (the original printed a one-element set literal instead).
print(f"Shape after parsing STUB_LABEL: {age_adjusted.shape}")
print(age_adjusted[['STUB_LABEL', 'sex', 'age_group', 'race_ethnicity']].head(20))

print("\nUnique sex values:")
print(age_adjusted['sex'].unique())

print("\nUnique race_ethnicity values:")
print(age_adjusted['race_ethnicity'].unique())

print("\nValue counts for sex:")
print(age_adjusted['sex'].value_counts())

    

{(2628, 18)}
     STUB_LABEL   sex age_group race_ethnicity
0   All persons   All      None            All
1   All persons   All      None            All
2   All persons   All      None            All
3   All persons   All      None            All
4   All persons   All      None            All
5   All persons   All      None            All
6   All persons   All      None            All
7   All persons   All      None            All
8   All persons   All      None            All
9   All persons   All      None            All
10  All persons   All      None            All
11  All persons   All      None            All
12  All persons   All      None            All
13  All persons   All      None            All
14  All persons   All      None            All
15  All persons   All      None            All
16  All persons   All      None            All
17  All persons   All      None            All
18  All persons   All      None            All
19         Male  Male      None            All


In [6]:
# Remove unreliable estimates: keep only rows whose FLAG is null.

# f-string (the original printed a one-element set literal instead).
print(f"Rows with a non-null FLAG: {age_adjusted['FLAG'].notna().sum()}")
clean_overdose = age_adjusted[age_adjusted['FLAG'].isna()].copy()
print(f"\nCleaned row count: {len(clean_overdose)}")
print(f"Rows removed: {len(age_adjusted) - len(clean_overdose)}")

# Sanity check: nothing flagged should survive the filter.
print(f"\nRemaining rows with FLAG not null: {clean_overdose['FLAG'].notna().sum()}")

{415}

Cleaned row count: 2213
Rows removed: 415

Remaining rows with FLAG not null: 0


In [7]:
# Preview the parsed demographic columns next to the estimate and its flag.
print("\nSample of cleaned data:")
print(
    clean_overdose.loc[
        :,
        ['STUB_LABEL', 'sex', 'age_group', 'race_ethnicity', 'YEAR', 'ESTIMATE', 'FLAG'],
    ].head(20)
)



Sample of cleaned data:
     STUB_LABEL   sex age_group race_ethnicity  YEAR  ESTIMATE FLAG
0   All persons   All      None            All  1999       6.1  NaN
1   All persons   All      None            All  2000       6.2  NaN
2   All persons   All      None            All  2001       6.8  NaN
3   All persons   All      None            All  2002       8.2  NaN
4   All persons   All      None            All  2003       8.9  NaN
5   All persons   All      None            All  2004       9.4  NaN
6   All persons   All      None            All  2005      10.1  NaN
7   All persons   All      None            All  2006      11.5  NaN
8   All persons   All      None            All  2007      11.9  NaN
9   All persons   All      None            All  2008      11.9  NaN
10  All persons   All      None            All  2009      11.9  NaN
11  All persons   All      None            All  2010      12.3  NaN
12  All persons   All      None            All  2011      13.2  NaN
13  All persons   All  

In [8]:
# Parse the AGE column into age_group, then drop the raw label columns.

# Continue from the clean_overdose frame built by the FLAG-filtering cell.
# No cell in this notebook writes 'cleaned_drug_overdose_data.csv', so reading
# it here only worked with a file left over from an earlier session.
clean_overdose = clean_overdose.copy()

if 'AGE' in clean_overdose.columns:
    print(f"Sample AGE values: {clean_overdose['AGE'].unique()[:10]}")

    # Missing values and the "All ages"/"All" labels collapse to 'All';
    # every other label just loses its "years" suffix.
    clean_overdose['age_group'] = clean_overdose['AGE'].apply(
        lambda x: 'All' if (pd.isna(x) or str(x).strip() in ['All ages', 'All'])
        else str(x).replace(' years', '').replace('years', '').strip()
    )
    print(clean_overdose[['AGE', 'age_group']].drop_duplicates().head(15))
    print(sorted([x for x in clean_overdose['age_group'].unique() if pd.notna(x)]))

    # STUB_LABEL and AGE are now represented by the parsed columns.
    columns_to_delete = ['STUB_LABEL', 'AGE']
    existing_cols = [col for col in columns_to_delete if col in clean_overdose.columns]

    if existing_cols:
        clean_overdose = clean_overdose.drop(columns=existing_cols)

    clean_overdose.to_csv('clean_overdose_final.csv', index=False)

Sample AGE values: ['All ages']
        AGE age_group
0  All ages       All
['All']


In [11]:
# Reload the final CSV, rename PANEL to drug_type, and save it back in place.
clean_overdose = pd.read_csv('clean_overdose_final.csv')

# The guard must test the real column name; the original checked 'PANE',
# which never matches, so the rename was silently skipped.
if 'PANEL' in clean_overdose.columns:
    clean_overdose = clean_overdose.rename(columns={'PANEL': 'drug_type'})

clean_overdose.to_csv('clean_overdose_final.csv', index=False)