# TRANSFORMATION - AGGREGATION OF DEATHS BY YEAR

## Scouting

In [2]:
import pandas as pd

# Load the cleaned US deaths sample from the working directory.
csv_path = 'cleaned_us_deaths_sample.csv'
DF = pd.read_csv(csv_path)

# Show the first rows to get a feel for the layout.
first_rows = DF.head()
print(first_rows)

  Province_State   Admin2       UID iso2 iso3  code3    FIPS Country_Region  \
0        Alabama  Autauga  84001001   US  USA    840  1001.0             US   
1        Alabama  Baldwin  84001003   US  USA    840  1003.0             US   
2        Alabama  Barbour  84001005   US  USA    840  1005.0             US   
3        Alabama     Bibb  84001007   US  USA    840  1007.0             US   
4        Alabama   Blount  84001009   US  USA    840  1009.0             US   

         Lat      Long_  ...  2/28/23  3/1/23  3/2/23  3/3/23  3/4/23  3/5/23  \
0  32.539527 -86.644082  ...      230     232     232     232     232     232   
1  30.727750 -87.722071  ...      724     726     726     726     726     726   
2  31.868263 -85.387129  ...      103     103     103     103     103     103   
3  32.996421 -87.125115  ...      109     109     109     109     109     109   
4  33.982109 -86.567906  ...      261     261     261     261     261     261   

   3/6/23  3/7/23  3/8/23  3/9/23  
0 

In [3]:
# Inspect the first 11 column labels (the ones shown before the date-labelled columns).
leading_columns = DF.columns[:11]
print(leading_columns)

Index(['Province_State', 'Admin2', 'UID', 'iso2', 'iso3', 'code3', 'FIPS',
       'Country_Region', 'Lat', 'Long_', 'Population'],
      dtype='object')


## Dummy example (prototype on a toy DataFrame)

In [4]:
# One-row toy frame: two descriptive columns followed by date-labelled
# columns (m/d/yy) that span three years and are deliberately not in
# chronological order.
data = {'col1': ['foo'], 'col2': ['bar']}
data.update({
    '1/23/20': [1],
    '1/30/20': [2],
    '2/23/20': [3],
    '2/10/20': [4],
    '1/23/21': [2],
    '1/30/22': [8],
})

df = pd.DataFrame(data)

df.head()

Unnamed: 0,col1,col2,1/23/20,1/30/20,2/23/20,2/10/20,1/23/21,1/30/22
0,foo,bar,1,2,3,4,2,8


In [5]:
# Save the non-date columns
df_non_date = df[['col1', 'col2']]

# Parse the column labels as dates; errors='coerce' turns labels that cannot
# be converted into NaT. The result is kept in its own variable instead of
# overwriting df.columns, so the input frame keeps its original labels until
# it is replaced by the final result below.
parsed_labels = pd.to_datetime(df.columns, errors='coerce', format='%m/%d/%y')
is_date = ~parsed_labels.isna()

# Keep only the date columns and label each one with the year it falls in
df_new = df.loc[:, is_date].copy()
df_new.columns = parsed_labels[is_date].to_period('Y')

# Take the max value within each year. DataFrame.groupby(..., axis=1) is
# deprecated (pandas >= 2.1); the recommended replacement is to group the
# transposed frame by its index and transpose the result back.
df_new = df_new.T.groupby(level=0).max().T

# Rename the columns
df_new.columns = df_new.columns.strftime('Total of deaths in %Y')

# Concatenate the non-date columns with the new DataFrame
df = pd.concat([df_non_date, df_new], axis=1)

# Print the DataFrame
print(df)


  col1 col2  Total of deaths in 2020  Total of deaths in 2021  \
0  foo  bar                        4                        2   

   Total of deaths in 2022  
0                        8  


## Generalization

In [6]:
# Parse every column label as a date; errors='coerce' turns labels that
# cannot be converted into NaT.
parsed_columns = pd.to_datetime(DF.columns, errors='coerce', format='%m/%d/%y')

# Save the non-date columns. They are picked with the NaT mask instead of a
# hard-coded count (previously DF.columns[:11]), so the split between
# descriptive and date columns follows the data itself. This has to happen
# before the labels are overwritten below, while DF still carries its
# original column names.
DF_non_date = DF.loc[:, parsed_columns.isna()]

# Relabel DF in place: the following cells pick the date columns by looking
# for labels that are not NaT.
DF.columns = parsed_columns

print(DF.columns)

DatetimeIndex([       'NaT',        'NaT',        'NaT',        'NaT',
                      'NaT',        'NaT',        'NaT',        'NaT',
                      'NaT',        'NaT',
               ...
               '2023-02-28', '2023-03-01', '2023-03-02', '2023-03-03',
               '2023-03-04', '2023-03-05', '2023-03-06', '2023-03-07',
               '2023-03-08', '2023-03-09'],
              dtype='datetime64[ns]', length=1154, freq=None)


In [7]:
# Build an independent frame holding only the columns whose label parsed as a
# date (i.e. is not NaT).
has_date_label = DF.columns.notna()
DF_new = DF.loc[:, has_date_label].copy()

print(DF_new)

   2020-01-22  2020-01-23  2020-01-24  2020-01-25  2020-01-26  2020-01-27  \
0           0           0           0           0           0           0   
1           0           0           0           0           0           0   
2           0           0           0           0           0           0   
3           0           0           0           0           0           0   
4           0           0           0           0           0           0   
5           0           0           0           0           0           0   
6           0           0           0           0           0           0   
7           0           0           0           0           0           0   
8           0           0           0           0           0           0   

   2020-01-28  2020-01-29  2020-01-30  2020-01-31  ...  2023-02-28  \
0           0           0           0           0  ...         230   
1           0           0           0           0  ...         724   
2           0      

In [8]:
# Label each daily column with the yearly period it falls in
DF_new.columns = DF_new.columns.to_period('Y')

# Take the max value within each year. DataFrame.groupby(..., axis=1) is
# deprecated (pandas >= 2.1); the recommended replacement is to group the
# transposed frame by its index and transpose the result back. The columns
# stay a PeriodIndex, which the next cell formats with .strftime.
# NOTE(review): the daily counts look cumulative, in which case the yearly
# max is the running total at the end of that year, not the deaths that
# occurred within it. Confirm this is the intended meaning.
DF_new = DF_new.T.groupby(level=0).max().T

print(DF_new)


   2020  2021  2022  2023
0    48   160   230   232
1   161   593   719   727
2    32    81   103   103
3    46    95   108   109
4    63   198   260   261
5    22    46    54    54
6    45   102   130   132
7   156   532   665   680
8    63   147   170   172


In [9]:
# Turn each yearly period label into a descriptive column name.
yearly_labels = DF_new.columns.strftime('Total_deaths_%Y')
DF_new.columns = yearly_labels

print(DF_new.columns)

Index(['Total_deaths_2020', 'Total_deaths_2021', 'Total_deaths_2022',
       'Total_deaths_2023'],
      dtype='object')


In [10]:
# Put the descriptive columns and the yearly aggregates side by side.
frames_to_join = [DF_non_date, DF_new]
DF = pd.concat(frames_to_join, axis=1)

# Show the combined result.
print(DF)

  Province_State    Admin2       UID iso2 iso3  code3    FIPS Country_Region  \
0        Alabama   Autauga  84001001   US  USA    840  1001.0             US   
1        Alabama   Baldwin  84001003   US  USA    840  1003.0             US   
2        Alabama   Barbour  84001005   US  USA    840  1005.0             US   
3        Alabama      Bibb  84001007   US  USA    840  1007.0             US   
4        Alabama    Blount  84001009   US  USA    840  1009.0             US   
5        Alabama   Bullock  84001011   US  USA    840  1011.0             US   
6        Alabama    Butler  84001013   US  USA    840  1013.0             US   
7        Alabama   Calhoun  84001015   US  USA    840  1015.0             US   
8        Alabama  Chambers  84001017   US  USA    840  1017.0             US   

         Lat      Long_  Population  Total_deaths_2020  Total_deaths_2021  \
0  32.539527 -86.644082       55869                 48                160   
1  30.727750 -87.722071      223234          

In [11]:
# Persist the aggregated frame without writing the row index.
output_path = 'transformed_agg_yearly_us_deaths_sample.csv'
DF.to_csv(output_path, index=False)