<a href="https://colab.research.google.com/github/ProfessorPatrickSlatraigh/CST3512/blob/main/CST3512_groupby_NYCdeaths_18_May_2024.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# CST3512 `groupby()` Using NYC Cause of Deaths  

<b>by Professor Patrick 10-Dec-2022 | updated: 2023, 2024</b>  


## Housekeeping

Import the usual suspects    

In [1]:
import pandas as pd
import numpy as np

In [2]:
from matplotlib import pyplot as plt

Create a copy of the datafile from a `URL`    

In [None]:
!curl "https://raw.githubusercontent.com/ProfessorPatrickSlatraigh/data/main/NYC_Leading_Causes_of_Death_Dec-2022.csv" -o nyc_death_causes.csv

Read the datafile to a dataframe    



---



## Read and Explore the Data  
  

In [63]:
# Load the CSV saved by the curl cell above into a DataFrame
cd_df = pd.read_csv("nyc_death_causes.csv")

In [None]:
# Show the dtype pandas inferred for each column
cd_df.dtypes

In [None]:
# Preview the first 24 rows of the raw data
cd_df.head(24)



---



## Data Wrangling  
  

Clean up the `Deaths` column so it can be converted to integers: remove commas and spaces, and replace periods with `0`.

In [66]:
# `regex=False` makes `.str.replace()` treat each pattern as literal text;
# read as a regular expression, '.' would match every character.
# `.astype(str)` lets this cell be re-run after `Deaths` has been converted to
# integers (the `.str` accessor only works on string values).
deaths_text = (
    cd_df["Deaths"]
    .astype(str)
    .str.replace(',', '', regex=False)
    .str.replace(' ', '', regex=False)
)
# Only a value that is nothing but a period becomes '0' (presumably '.' marks a
# missing count - TODO confirm against the data source). Replacing every period
# would silently turn a value such as '1.5' into '105'.
cd_df["Deaths"] = deaths_text.mask(deaths_text == '.', '0')

In [67]:
# Cast to string first, then to integer, so the cleaned text values become ints
deaths_as_text = cd_df["Deaths"].astype(str)
cd_df["Deaths"] = deaths_as_text.astype(int)

In [None]:
# Preview the first 24 rows again, now with `Deaths` converted to integers
cd_df.head(24)

In [None]:
# Scaffolding to explore rows by Boolean search.
# Uncomment the line below to run it: each condition is wrapped in parentheses
# and the conditions are combined with `&`, so only rows matching all three are shown.
# cd_df.loc[(cd_df["Year"] == 2019) & (cd_df["RaceEthnicity"] == "Other Race/ Ethnicity") & (cd_df["LeadingCause"] == "Septicemia (A40-A41)")]

In [None]:
# List the distinct LeadingCause values; these are the join keys used by enrich_df below
cd_df['LeadingCause'].unique()

In [12]:
# Lookup table mapping each LeadingCause string to a short Cause label and a
# broader Type. One record per row keeps the three values visibly aligned.
# The LeadingCause strings are join keys and must match the data exactly,
# including spellings such as 'Posioning' and 'Nephrisis'.
_cause_records = [
    # (LeadingCause, Cause, Type)
    ('Diseases of Heart (I00-I09, I11, I13, I20-I51)', 'Heart', 'Other_Organ'),
    ('Malignant Neoplasms (Cancer: C00-C97)', 'Neoplasms', 'All_Other'),
    ('Mental and Behavioral Disorders due to Accidental Poisoning and Other Psychoactive Substance Use (F11-F16, F18-F19, X40-X42, X44)', 'Drug', 'Drug'),
    ('Diabetes Mellitus (E10-E14)', 'Diabetes', 'Disease'),
    ('Influenza (Flu) and Pneumonia (J09-J18)', 'Flu/Pneumonia', 'Disease'),
    ('Cerebrovascular Disease (Stroke: I60-I69)', 'Cerebrovascular', 'Other_Organ'),
    ('Accidents Except Drug Poisoning (V01-X39, X43, X45-X59, Y85-Y86)', 'Accidents', 'Accident_Crime'),
    ('Chronic Liver Disease and Cirrhosis (K70, K73-K74)', 'Liver / Cirrhosis', 'Liver_Cirrhosis'),
    ('Chronic Lower Respiratory Diseases (J40-J47)', 'Respiratory', 'Other_Organ'),
    ('Essential Hypertension and Renal Diseases (I10, I12)', 'Renal', 'Other_Organ'),
    ('All Other Causes', 'All Other', 'All_Other'),
    # NOTE(review): this Self-Harm row is typed 'Accident_Crime' while the other
    # Self-Harm row further down is typed 'Behavior' - confirm which is intended.
    ('Intentional Self-Harm (Suicide: U03, X60-X84, Y87.0)', 'Self-Harm', 'Accident_Crime'),
    ("Alzheimer's Disease (G30)", "Alzheimer's", 'Other_Organ'),
    # NOTE(review): labeled 'Homicide' here but 'Assault' for the other
    # Assault (Homicide: ...) row further down - confirm which label is intended.
    ('Assault (Homicide: U01-U02, Y87.1, X85-Y09)', 'Homicide', 'Accident_Crime'),
    ('Mental and Behavioral Disorders due to Use of Alcohol (F10)', 'Alcohol', 'Behavior'),
    ('Congenital Malformations, Deformations, and Chromosomal Abnormalities (Q00-Q99)', 'Congenital', 'All_Other'),
    ('Aortic Aneurysm and Dissection (I71)', 'Aortic', 'Other_Organ'),
    ('Viral Hepatitis (B15-B19)', 'Hepatitis', 'Disease'),
    ('Septicemia (A40-A41)', 'Septicemia', 'Disease'),
    ('Complications of Medical and Surgical Care (Y40-Y84, Y88)', 'Surgical', 'Accident_Crime'),
    ('Human Immunodeficiency Virus Disease (HIV: B20-B24)', 'HIV', 'Disease'),
    ('Cholelithiasis and Disorders of Gallbladder (K80-K82)', 'Cholelithiasis', 'Disease'),
    ('Certain Conditions originating in the Perinatal Period (P00-P96)', 'Perinatal', 'All_Other'),
    ('Nephritis, Nephrotic Syndrome and Nephrisis (N00-N07, N17-N19, N25-N27)', 'Nephritis', 'Disease'),
    ('Insitu or Benign / Uncertain Neoplasms (D00-D48)', 'Neoplasms', 'All_Other'),
    ('Anemias (D50-D64)', 'Anemias', 'All_Other'),
    ("Parkinson's Disease (G20)", 'Parkinsons', 'Disease'),
    ('Peptic Ulcer (K25-K28)', 'Peptic Ulcer', 'Behavior'),
    ('Assault (Homicide: Y87.1, X85-Y09)', 'Assault', 'Accident_Crime'),
    ('Accidents Except Drug Posioning (V01-X39, X43, X45-X59, Y85-Y86)', 'Accidents', 'Accident_Crime'),
    ('Intentional Self-Harm (Suicide: X60-X84, Y87.0)', 'Self-Harm', 'Behavior'),
    ('Chronic Liver Disease and Cirrhosis (K70, K73)', 'Liver / Cirrhosis', 'Liver_Cirrhosis'),
    ('Atherosclerosis (I70)', 'Atherosclerosis', 'Disease'),
    ('Tuberculosis (A16-A19)', 'Tuberculosis', 'Disease'),
]

enrich_df = pd.DataFrame(_cause_records, columns=['LeadingCause', 'Cause', 'Type'])



---



## Using `.groupby()`  

In [None]:
# Display the lookup table of LeadingCause -> Cause / Type
enrich_df

In [14]:
# Report any LeadingCause value that has no row in enrich_df: the inner join
# below drops those records, and without this check nothing would say so.
unmatched_causes = sorted(set(cd_df["LeadingCause"].dropna()) - set(enrich_df["LeadingCause"]))
if unmatched_causes:
    print(f"Warning: {len(unmatched_causes)} LeadingCause value(s) missing from enrich_df: {unmatched_causes}")

# Join on the shared key explicitly. validate="many_to_one" raises a MergeError
# if enrich_df ever holds a duplicate LeadingCause, which would duplicate rows
# (and therefore death counts) in the result.
labeled_df = cd_df.merge(enrich_df, on="LeadingCause", how="inner", validate="many_to_one")

In [None]:
# Sum the numeric columns for every combination of the grouping keys, then
# turn the group keys back into ordinary columns for display
group_keys = ["Year", "Type", "LeadingCause", "Cause", "Sex", "RaceEthnicity"]
labeled_df.groupby(group_keys).sum(numeric_only=True).reset_index()

In [16]:
# Keep only the columns used by the summaries and plots that follow
cause_columns = ["Year", "Type", "Cause", "Deaths"]
cause_df = labeled_df[cause_columns]

In [None]:
# Display the four-column subset (Year, Type, Cause, Deaths)
cause_df

In the following code we pass `numeric_only=True` because, in pandas 2.0 and later, the default for `numeric_only` is `False`. With that default, `.sum()` is applied to every column, including non-numeric ones, which can raise errors or produce unexpected results (for example, string values being concatenated) if the DataFrame contains non-numeric columns.

In [26]:
# Total deaths per Type.
# Selecting "Deaths" before summing avoids adding up the Year values (a
# meaningless total that then had to be deleted) and keeps the result to a
# single Deaths column even if cause_df has gained extra numeric columns
# by the time this cell is re-run.
type_df = cause_df.groupby(by=["Type"])[["Deaths"]].sum()

In [None]:
# Display total deaths for each Type
type_df



---



## Plotting Results  
  

In [None]:
# Assuming 'Deaths' is the name of the column with the death counts.
# Pick the exploded slice by its label rather than by position, so the list
# always has one entry per slice and the right slice is pulled out even if
# the set or order of Types in type_df changes.
EXPLODED_TYPE = "Liver_Cirrhosis"
explode_lst = [0.85 if cause_type == EXPLODED_TYPE else 0 for cause_type in type_df.index]

# draw a pie chart with labels, percentages, and the chosen slice exploded
fig, ax = plt.subplots()
ax.pie(type_df['Deaths'], labels=type_df.index, explode=explode_lst, shadow=True, autopct='%1.1f%%')

# display the title
ax.set_title("Liver/Cirrhosis vs. Other Causes of Death")

# show (or save) the plot
plt.show()

In [31]:
# drop=True discards the old index instead of inserting it as a new column.
# That avoids a redundant "index" column and lets this cell be re-run after the
# index has been set to Year below (inserting an index named Year would collide
# with the existing Year column and raise a ValueError).
cause_df = cause_df.reset_index(drop=True)

In [32]:
# Label each row by its Year while keeping Year as a regular column too
cause_df = cause_df.set_index("Year", drop=False)

In [None]:
# Display cause_df, whose row labels are now the Year values
cause_df



---



## Using `.melt()`  
  

In [34]:
# Unpivot the dataframe: Year and Deaths stay as identifier columns, and the
# Type column is stacked into a `variable` / `value` pair
type_time_df = pd.melt(cause_df, id_vars=['Year', 'Deaths'], value_vars=['Type'])

In [35]:
# The melted `value` column holds the Type labels, so give it that name
type_time_df = type_time_df.rename(columns={'value': 'Type'})

In [36]:
# Remove the `variable` column left over from the melt.
# errors="ignore" makes the cell safe to re-run: `del` raises a KeyError once
# the column has already been removed.
type_time_df = type_time_df.drop(columns=["variable"], errors="ignore")

## Wrangling the `.melt()` Result  
  

In [None]:
# Display the melted frame
type_time_df

In [None]:
# Preview the frame indexed by Year. The result is not assigned, so
# type_time_df itself is left unchanged.
type_time_df.set_index("Year")

In [39]:
# Sum the remaining columns for each (Year, Type) pair
stack_keys = ["Year", "Type"]
type_stack_df = type_time_df.groupby(stack_keys).sum()

In [None]:
# Display the grouped totals, indexed by (Year, Type)
type_stack_df

In [41]:
# Move the Type index level back into a regular column, leaving Year as the index.
# The guard makes the cell safe to re-run: once Type is no longer an index
# level, resetting it again would raise a KeyError.
if "Type" in type_stack_df.index.names:
    type_stack_df = type_stack_df.reset_index(level="Type")



---



## Plotting the `.melt()` Result  
  

In [None]:
# Display the totals again, after the index reset in the previous section
type_stack_df

In [57]:
# Reshape so each Type of cause of death becomes its own column of Deaths,
# with one row per value of the current index
type_plot_df = pd.pivot_table(
    type_stack_df,
    values='Deaths',
    index=type_stack_df.index,
    columns='Type',
    aggfunc='first',
)

In [None]:
# Display the pivoted table (one column per Type)
type_plot_df

In [None]:

# First, pivot the DataFrame so years are the index and each Type is a column.
# fill_value=0: a (Year, Type) pair with no rows would otherwise be NaN, and the
# running total that stackplot builds would carry that NaN into the layers
# stacked above it, leaving gaps in the chart.
pivot_df = type_stack_df.pivot_table(values='Deaths', index='Year', columns='Type', aggfunc='sum', fill_value=0)

# Now, plot the stacked area chart on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(10, 6))  # Set the figure size for better readability

# stackplot takes x (years) and y (one series of deaths per column)
years = pivot_df.index
causes = [pivot_df[cause] for cause in pivot_df.columns]

ax.stackplot(years, causes, labels=pivot_df.columns)

# Adding labels and title for clarity
ax.set_title('Deaths by Cause Over Time')
ax.set_xlabel('Year')
ax.set_ylabel('Number of Deaths')
ax.legend(loc='upper left')  # Adjust legend location as needed

fig.tight_layout()  # Adjust layout to not cut off labels
plt.show()




---

