In [None]:
import pandas as pd

In [None]:
# Load the raw overdose export into a DataFrame.
overdoses_csv_path = (
    "../../pipeline/pipeline_steps/input_files/2012-01-2024-08-overdoses.csv"
)
latest_df = pd.read_csv(overdoses_csv_path)

In [None]:
# Remove the date-of-birth column; a missing column raises, so the removal
# can never be skipped silently.
latest_df = latest_df.drop("DateofBirth", axis=1)

In [None]:
# Parse the DeathDate column into pandas datetimes.
parsed_death_dates = pd.to_datetime(latest_df["DeathDate"])
latest_df["DeathDate"] = parsed_death_dates

In [None]:
# Derive calendar groupings from DeathDate using the vectorized ``.dt``
# accessor. Unlike a per-row ``strftime`` call, this leaves missing dates
# (NaT) as missing values instead of raising.
death_date_col = latest_df["DeathDate"]
latest_df["MonthYear"] = death_date_col.dt.strftime("%Y-%m")
latest_df["Year"] = death_date_col.dt.year
# %U numbers weeks starting on Sunday; days before the year's first Sunday
# fall in week "00".
latest_df["YearWeek"] = death_date_col.dt.strftime("%Y-%U")

In [None]:
# Keep only the rows that have a ZIP code. ``.copy()`` makes the result an
# independent frame, so the column assignments in later cells do not operate
# on a slice of ``latest_df`` (which triggers SettingWithCopyWarning).
latest_df_w_zips = latest_df.dropna(subset=["ZIPCODE"]).copy()

In [None]:
# Convert ZIP codes to strings. The integer round trip removes any trailing
# ".0" but also strips leading zeros, so pad back to the five-digit form
# (e.g. 7030 -> "07030").
# NOTE(review): assumes US five-digit ZIP codes - confirm against the data.
latest_df_w_zips["ZIPCODE"] = (
    latest_df_w_zips["ZIPCODE"].astype("int64").astype(str).str.zfill(5)
)

In [None]:
# Define bin edges and labels for ten-year age groups.
# The last edge is infinity so that the "90+" group really is open-ended;
# with an upper edge of 100 and right=False, ages of 100 and above fell
# outside every bin and became NaN.
bins = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, float("inf")]
labels = [
    "0-9",
    "10-19",
    "20-29",
    "30-39",
    "40-49",
    "50-59",
    "60-69",
    "70-79",
    "80-89",
    "90+",
]

# Assign each age to a bin. right=False makes every interval closed on the
# left and open on the right, e.g. [10, 20) -> "10-19".
latest_df_w_zips["Age_Bin"] = pd.cut(
    latest_df_w_zips["Age"], bins=bins, labels=labels, right=False
)

In [None]:
def reassign_gender(row):
    """Map a raw gender code to a lower-case label.

    Args:
        row: A single gender value (a scalar, despite the parameter name).

    Returns:
        ``"male"`` for ``"M"``, ``"female"`` for ``"F"``, and the case-folded
        text for any other string. Non-string values (for example ``NaN`` or
        ``None``) are returned unchanged so that missing data stays missing
        instead of raising ``AttributeError`` on ``.casefold()``.
    """
    # Missing values arrive as float NaN / None and have no string methods.
    if not isinstance(row, str):
        return row
    if row == "M":
        return "male"
    if row == "F":
        return "female"
    return row.casefold()

In [None]:
# Run every Gender value through reassign_gender.
raw_gender_values = latest_df_w_zips["Gender"]
latest_df_w_zips["Gender"] = raw_gender_values.apply(reassign_gender)

In [None]:
# Standardize gender values to exactly "Male", "Female" or "Unknown".
# Normalize case and trim surrounding spaces before looking values up.
normalized_gender = latest_df_w_zips["Gender"].str.strip().str.lower()

# Define standardization mapping.
# The upper-case keys can never match here because the values are lower-cased
# first; they are kept so the mapping itself is unchanged.
gender_mapping = {
    "male": "Male",
    "female": "Female",
    "m": "Male",
    "f": "Female",
    "MALE": "Male",
    "FEMALE": "Female",
}

# Apply the mapping. ``Series.map`` yields NaN for anything that is not a key,
# so missing values, empty strings and unrecognized values all end up as
# "Unknown". ``.replace(gender_mapping)`` left unrecognized values untouched,
# which allowed both "unknown" and "Unknown" to appear in the output.
latest_df_w_zips["Gender"] = normalized_gender.map(gender_mapping).fillna("Unknown")

In [None]:
# Label missing Race values explicitly as "UNKNOWN".
race_filled = latest_df_w_zips["Race"].fillna("UNKNOWN")
latest_df_w_zips["Race"] = race_filled

In [None]:
# Drop rows without a CT20 value. ``.copy()`` makes the result an independent
# frame, so the CT20 assignment in the next cell does not operate on a slice
# (which triggers SettingWithCopyWarning).
latest_df_w_zips = latest_df_w_zips.dropna(subset=["CT20"]).copy()

In [None]:
# Convert CT20 to a string without a trailing ".0". An explicit 64-bit integer
# is used because plain ``int`` can resolve to 32 bits on some platforms,
# which would overflow for long identifiers.
# NOTE(review): presumably CT20 holds 11-digit census-tract GEOIDs - confirm.
latest_df_w_zips["CT20"] = latest_df_w_zips["CT20"].astype("int64").astype(str)

In [None]:
# Write the ZIP-level frame to the reports folder (the index is written too,
# as per the to_csv default).
zips_report_path = "../../reports/deidentified_overdose_201201202408_zips_0311.csv"
latest_df_w_zips.to_csv(zips_report_path)

In [None]:
# Write the full frame (including rows without a ZIP code) to the reports
# folder; the index is written too, as per the to_csv default.
full_report_path = "../../reports/deidentified_overdose_201201202408.csv"
latest_df.to_csv(full_report_path)

### Deidentify External Test

In [1]:
import pandas as pd

In [17]:
# Load the external coroner test set.
external_test_path = "../../data/external_test_set_coroner.csv"
external_test = pd.read_csv(external_test_path)

In [18]:
# Display the column labels of the external test set.
external_test.columns

Index(['Unnamed: 0', 'Assigned to', 'Case.Number', 'County', 'State', 'Age',
       'Gender.4', 'Age.Group', 'Gender...6', 'Race', 'Date.of.Death', 'Month',
       'Manner.of.Death', 'Primary.Cause',
       'Secondary.Cause...Injury.Description', 'text', 'Any Opioids', 'Heroin',
       'Fentanyl', 'Prescription.opioids', 'Methamphetamine', 'Cocaine',
       'Benzodiazepines', 'Alcohol', 'Others', 'Drug No Opioids', 'Any Drugs'],
      dtype='object')

In [19]:
# Columns whose contents are blanked out for de-identification. The columns
# themselves are kept so the file layout is unchanged.
# The second gender column is spelled 'Gender...6' (three dots); assigning to
# 'Gender..6' created a new empty column and left the real one untouched.
identifying_columns = [
    "Case.Number",
    "Gender.4",
    "Gender...6",
    "Age",
    "Race",
    "Date.of.Death",
    "Month",
    "Age.Group",
]

# Fail loudly on a misspelled or renamed column: assigning to a label that
# does not exist would silently add a column and redact nothing.
missing_columns = [
    column for column in identifying_columns if column not in external_test.columns
]
if missing_columns:
    raise KeyError(f"Columns to de-identify not found: {missing_columns}")

for column in identifying_columns:
    external_test[column] = pd.NA

In [22]:
# Drop the 'Unnamed: 0' column (presumably an index written by an earlier
# ``to_csv`` call - confirm). Reassigning instead of ``inplace=True`` and
# passing ``errors="ignore"`` keeps the cell re-runnable: a second run no
# longer raises KeyError because the column is already gone.
external_test = external_test.drop(columns=["Unnamed: 0"], errors="ignore")

In [23]:
# Write the blanked frame back to the same path it was read from, replacing
# the file on disk.
# NOTE(review): the index is written as well (to_csv default), so it comes
# back as an 'Unnamed: 0' column on the next read - confirm this is intended.
external_output_path = "../../data/external_test_set_coroner.csv"
external_test.to_csv(external_output_path)