# Determine mortality crises

This notebook takes the results generated in "AnalyzeExcessMortality.ipynb" and identifies extended periods with excess mortality and groups them together as "mortality crises". 

The results are collected in a single dataframe, which is saved as a csv file.

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib widget

# Load style
plt.style.use('PlotStyle.mplstyle')
import matplotlib.colors as colors
plt.rcParams['axes.prop_cycle'] = plt.cycler(color=plt.cm.Dark2.colors)

from datetime import datetime
from tqdm import tqdm

import os

# Load functions
import sys
sys.path.append("../../ExcessMortality")
import ExcessMortalityFunctions as emf


# saveFigures = True
# # saveFigures = False
# print('saveFigures is set to: '+str(saveFigures))
print('Done loading packages')

Done loading packages


In [2]:
# Set paths
# Root directory that all other data locations are derived from
pathData = '../Data/'
# Directory with the per-county data collections
pathCollections = f'{pathData}MortalityCollections/'
# Prefix only: a parameter-specific suffix and the trailing slash are appended where it is used
pathResults = f'{pathData}AnalysisResults'

In [3]:
# Load the parish/county relational table and parse its two date columns
dfRel = pd.read_csv('../SupplementaryTable_RelationalTable_ParishCounty.csv')
for dateColumn in ['StartDate', 'EndDate']:
    dfRel[dateColumn] = pd.to_datetime(dfRel[dateColumn])

In [4]:
# Only consider counties that exist after 1810
# (A large restructuring was done between 1800 and 1810)
print('Number of counties:',len(dfRel.AmtID.unique()))
existsAfter1810 = dfRel.EndDate > np.datetime64('1810-01-01')
allAmtIDs = dfRel[existsAfter1810].AmtID.unique()

# Keep every row belonging to one of the remaining counties
dfRel = dfRel[dfRel.AmtID.isin(allAmtIDs)]
print('Number of counties:',len(allAmtIDs))

# Drop Flensborg and Løgumkloster counties (since they only contain two parishes)
for namePart in ['Løgum', 'Flensbo']:
    rowsToDrop = dfRel[dfRel.AmtName.str.contains(namePart)].index
    dfRel = dfRel.drop(rowsToDrop)

# Final list of county IDs used in the rest of the notebook
allAmtIDs = dfRel.AmtID.unique()
print('Number of counties:',len(allAmtIDs))

Number of counties: 46
Number of counties: 27
Number of counties: 25


In [5]:
# Table with population data
# DigDagID is forced to int (missing values become 0), since excel sometimes saves ints as floats
dfPop = pd.read_excel(pathData + 'AmtPopulation.xlsx').assign(
    DigDagID=lambda popTable: popTable['DigDagID'].fillna(0).astype(int)
)

In [6]:
# Define some helper functions for getting county-names and county-files
def getAmtName(amtID):
    """Return the county name stored in dfRel for the given county ID.

    The name of the first matching row is returned. An IndexError is raised
    if no row in dfRel has this AmtID.
    """
    matchingNames = dfRel.loc[dfRel.AmtID == amtID, 'AmtName']
    return matchingNames.values[0]

def getAmtCollections(amtID,pathColl=pathCollections):
    """Load all data-collections for a specific county, with the periods they cover.

    Files in `pathColl` are matched on the integer before the first underscore
    of their name. The start and end of each period are read from the third and
    fourth underscore-separated parts of the filename (the extension is
    stripped from the end part).

    Parameters
    ----------
    amtID : int
        ID of the county to load collections for.
    pathColl : str
        Directory containing the collection files.

    Returns
    -------
    alldfs : list of pandas.DataFrame
        One dataframe per matching file.
    allStarts : list of str
        Start of each period, as written in the filename.
    allEnds : list of str
        End of each period, as written in the filename.

    All three lists follow the alphabetical order of the filenames. Directory
    entries that do not follow the naming pattern (hidden files, checkpoint
    folders and similar) are ignored rather than raising an error.
    """
    curCollectionsFilenames = []

    # os.listdir returns entries in arbitrary order; sorting keeps the periods reproducible
    for filename in sorted(os.listdir(pathColl)):
        nameParts = filename.split('_')

        # Need at least "<ID>_<name>_<start>_<end>" to extract the period below
        if len(nameParts) < 4:
            continue

        # Skip entries whose prefix is not a county ID (e.g. ".DS_Store")
        try:
            fileAmtID = int(nameParts[0])
        except ValueError:
            continue

        if fileAmtID == amtID:
            curCollectionsFilenames.append(filename)

    # Extract start and end dates from the filenames
    allStarts = [x.split('_')[2] for x in curCollectionsFilenames]
    allEnds = [x.split('_')[3].split('.')[0] for x in curCollectionsFilenames]

    # Read every matching file (os.path.join works with or without a trailing slash on pathColl)
    alldfs = [pd.read_csv(os.path.join(pathColl, filename)) for filename in curCollectionsFilenames]

    return alldfs,allStarts,allEnds

# Parameters used in main analysis

In [7]:
# Flags and analysis parameters used in main analysis
# Number of years on both sides of date to use for baseline calculations
numYears = 12
# The "name" of the baseline (i.e. +/- 5 years is a 10-year baseline, +/- 12 is a 24 year baseline)
numYearsTot = 2 * numYears
# Threshold (in terms of Z-scores) for identifying a day as having increased excess
thresholdExcess = 3

# # For sensitivity analyses
# numYears = 6 # Number of years on both sides of date to use for baseline calculations
# numYears = 9 # Number of years on both sides of date to use for baseline calculations


# Determine directory in which results were saved
# (variant without the "_NonSmoothed" tag: pathResults + f'_Years{numYears}_Threshold{thresholdExcess}/')
pathResultsUpper = f'{pathResults}_NonSmoothed_Years{numYears}_Threshold{thresholdExcess}/'


In [8]:
# Define the agegroups analyzed.
# Each key is the name used for directories and filenames,
# each value is the list of age categories combined into that group.
_ageGroupDefinitions = {
    'Total': ['Total'],
    'Infants_stillborn': ['Stillborn', '0'],
    '1-14': ['1-4', '5-9', '10-14'],
    '15-39': ['15-19', '20-24', '25-29', '30-34', '35-39'],
    '40-59': ['40-44', '45-49', '50-54', '55-59'],
    '60+': ['60-64', '65-69', '70-74', '75-79', '80+'],
}

# Parallel lists (same order), as used in the rest of the notebook
ageGroups = list(_ageGroupDefinitions.values())
ageGroupNames = list(_ageGroupDefinitions.keys())

# Parameters for identification of mortality crises

In [9]:
# Parameters to use here
# Lower threshold used for determining the start and end of periods (in terms of Z-scores)
thresholdLower = 2
# Number of days below thresholdLower before a period of excess is "stopped"
maxDaysBelowThreshold = 7
# Minimal number of days above thresholdExcess which is counted as a period of excess
minimumLengthOfEpidemic = 0
# Only save mortality crises with more than this number of excess deaths
excessCountThreshold = 50

# # Extra parameters for sensitivity analysis
# minimumLengthOfEpidemic = 4
# maxDaysBelowThreshold = 4
# maxDaysBelowThreshold = 10
# maxDaysBelowThreshold = 20
# excessCountThreshold = 20


# Determine filename to use for final results: every parameter is encoded in the name,
# so that runs with different settings do not overwrite each other
# (drop the 'NonSmoothed' part for the variant without that tag)
finalResultsFilename = '_'.join([
    'AllCrises',
    'NonSmoothed',
    f'Years{numYears}',
    f'Threshold{thresholdExcess}',
    f'LowerThreshold{thresholdLower}',
    f'MaxDaysBelow{maxDaysBelowThreshold}',
    f'minLength{minimumLengthOfEpidemic}',
    f'minCount{excessCountThreshold}',
])
finalResultsFilename

'AllCrises_NonSmoothed_Years12_Threshold3_LowerThreshold2_MaxDaysBelow7_minLength0_minCount50'

# Helper functions for saving season and quarter

In [10]:
def getSeason(d):
    """Return the season ('Winter', 'Spring', 'Summer' or 'Fall') of a date.

    `d` is anything accepted by pd.to_datetime. Seasons follow calendar
    months: December-February is winter, March-May spring, June-August
    summer and September-November fall.
    """
    monthsPerSeason = {
        'Winter': (12, 1, 2),
        'Spring': (3, 4, 5),
        'Summer': (6, 7, 8),
        'Fall': (9, 10, 11),
    }
    # Invert the mapping to look the season up by month number
    seasonOfMonth = {
        month: season
        for season, months in monthsPerSeason.items()
        for month in months
    }
    return seasonOfMonth[pd.to_datetime(d).month]
    
def getQuarter(d):
    """Return the calendar quarter ('Q1' to 'Q4') of a date.

    `d` is anything accepted by pd.to_datetime. January-March is Q1,
    April-June Q2, July-September Q3 and October-December Q4.
    """
    # Months 1-3 map to Q1, 4-6 to Q2, 7-9 to Q3 and 10-12 to Q4
    quarterOfMonth = {month: f'Q{(month - 1) // 3 + 1}' for month in range(1, 13)}
    return quarterOfMonth[pd.to_datetime(d).month]

# Functions for interpolation of population counts

In [11]:
def getPopRow(amtID):
    """Return the row(s) of dfPop whose DigDagID equals amtID.

    All matching rows are returned, also when there is more than one.
    If there is no match, the ID and a message are printed and None is returned.
    """
    matchingRows = dfPop[dfPop.DigDagID == amtID]

    # One or several matches: hand them back unchanged
    if len(matchingRows) > 0:
        return matchingRows

    # No match: report it and signal the failure with None
    print(amtID)
    print('----- Amt not found in population table -----')
    return None

def AmtIDToDailyPopulation(amtID,dateToFocusOn=np.datetime64('1870-01-01')):
    """Build a daily population series for a county from the population table.

    Population counts are taken from the columns of the population row after
    the first two (the column labels are interpreted as years and placed on
    January 1st). Values between counts are filled by interpolating linearly
    in log-space, i.e. exponential growth between consecutive counts.

    Parameters
    ----------
    amtID : int
        ID of the county, looked up through getPopRow.
    dateToFocusOn : datetime-like
        Only used for amtID 118846 (Århus Amt), where the population table row
        to use depends on whether the date is before or after 1867-01-01
        (for deciding whether to include Skanderborg Amt counts or not).

    Returns
    -------
    curdfPop : pandas.DataFrame
        One row per day with columns 'Date' and 'Population'; the population is
        only filled in on the dates of the original counts.
    curdfPopInterpolate : pandas.DataFrame
        Indexed by 'Date', with the interpolated daily 'Population'.

    Raises
    ------
    ValueError
        If the county is not found in the population table.
    """
    # Get current row from population table
    curPopRow = getPopRow(amtID)
    if curPopRow is None:
        # Fail with a clear message instead of an AttributeError on None further down
        raise ValueError(f'No population data found for AmtID {amtID}')

    # Remove name and ID (the first two columns)
    curPopRow = curPopRow.iloc[:,2:]

    # Special case for Århus Amt (for deciding whether to include Skanderborg Amt counts or not)
    if (amtID == 118846):
        if dateToFocusOn >= np.datetime64('1867-01-01'):
            curPopRow = curPopRow.iloc[1:]
        else:
            curPopRow = curPopRow.iloc[:1]

    # Transpose (one row per count) and rename
    curdfPop = curPopRow.T.reset_index()
    curdfPop = curdfPop.rename(columns={curdfPop.columns[0]:'Year',curdfPop.columns[1]:'Population'})

    # Get years-values as dates
    curdfPop['Date'] = [np.datetime64(str(x)+'-01-01') for x in curdfPop.Year]

    # Make a dataframe with all dates from the first count up to (not including) the last count
    dfAllDates = pd.DataFrame(
        {'Date':np.arange(curdfPop.Date.iloc[0],curdfPop.Date.iloc[-1],np.timedelta64(1,'D'))}
    )

    # Merge with empty (the outer merge keeps the date of the last count as well)
    curdfPop = pd.merge(curdfPop,dfAllDates,on='Date',how='outer').sort_values('Date').drop(columns='Year')

    # Exponential interpolation rather than linear: interpolate the logarithm of the counts.
    # This is done on a separate frame, so the raw counts in curdfPop are returned exactly as read.
    curdfPopInterpolate = curdfPop.set_index('Date')
    curdfPopInterpolate['Population'] = np.log(curdfPopInterpolate['Population'])
    curdfPopInterpolate = curdfPopInterpolate.interpolate()

    # Take exponential of interpolated number to get back to population counts
    curdfPopInterpolate['Population'] = np.exp(curdfPopInterpolate['Population'])

    return curdfPop,curdfPopInterpolate


# Run through all saved files, identify crises and collect them in a dataframe

In [12]:
# Define dataframe to collect results in (one row per mortality crisis)

# Columns describing the crisis as a whole
_crisisColumns = [
    'Amt',
    'Start',
    'End',
    'NumberOfDays',
    'DayWithMostDeaths',
    'Excess',
    'ExcessPct',
    'GenderRatio',
    'TimeOfYear',
    'Season',
    'PopulationEstimate',
]

# Age-specific columns: one block per measure, each block holding one column per agegroup
_ageColumnPrefixes = ['Exc', 'Pct', 'DataSum', 'Baseline']
_ageColumnGroups = ['Infants_stillborn', '1-14', '15-39', '40-59', '60+']
_ageColumns = [
    f'{prefix}_{groupName}'
    for prefix in _ageColumnPrefixes
    for groupName in _ageColumnGroups
]

dfCrisesCollectAge = pd.DataFrame(columns=_crisisColumns + _ageColumns)


In [13]:
# Main loop: for every county and every period in which the county exists, load the
# saved excess-mortality results, identify mortality crises and append one row per
# sufficiently large crisis to dfCrisesCollectAge.
# Prepare progressbar and go through all counties
pbar = tqdm(allAmtIDs)
for curAmtID in pbar:

    # Get county name, data and periods
    curAmtName = getAmtName(curAmtID)
    alldfs,allStarts,allEnds = getAmtCollections(curAmtID)


    # Go through each possible period
    for i in range(len(allStarts)):

        # Get the dataframe, start date and end date
        # (copy, so that the dataframe in alldfs is not modified)
        curdf = alldfs[i].copy()
        curStart = allStarts[i]
        curEnd = allEnds[i]

        # Make sure that the "Date" column in the data-dataframe is a datetime64 object
        curdf['Date'] = pd.to_datetime(curdf['Date'])

        # Update progressbar
        pbar.set_postfix(
            {
                'Amt':curAmtName,
                'Period':i,
                'Total periods':len(allStarts),
                'Start':curStart,
                'End':curEnd,
            }
        )

        # Determine filename of results-file: "<AmtID>_<AmtName>_<Start>_<End>.csv"
        curFileName =  str(int(curAmtID)) + '_'+curAmtName + '_'+pd.to_datetime(curStart).strftime('%Y-%m-%d') +'_'+pd.to_datetime(curEnd).strftime('%Y-%m-%d')
        # curFileName = curFileName + '_Total.csv' # Use results from analysis of all ages
        curFileName = curFileName + '.csv' 

        # Load analysis file
        dfPeriod = pd.read_csv(pathResultsUpper+curFileName)
        # Make sure date is a datetime64 object
        dfPeriod['Date'] = pd.to_datetime(dfPeriod.Date)

        # Restrict to valid period (start date included, end date excluded)
        dfPeriod = dfPeriod[(dfPeriod.Date >= curStart) & (dfPeriod.Date < curEnd)].reset_index(drop=True)

        # Get results and re-calculate excess (raw daily counts minus baseline)
        curTime = dfPeriod.Date 
        curExcess = dfPeriod.Total_Data - dfPeriod.Total_Baseline 
        # curExcess = dfPeriod.Total_Data7DayMean - dfPeriod.Total_Baseline  
        
        curZscore = dfPeriod.Total_Zscore 
        
        # Use function from ExcessMortalityFunctions to determine mortality crisis period
        # NOTE(review): judging from how the results are used below, dateGroups holds one
        # (start, end) pair per crisis and allExcess the matching total excess - confirm in emf.
        dateGroups,allExcess  = emf.determineMortalityCrisis(curTime,curExcess,curZscore,upperThreshold=thresholdExcess,lowerThreshold=thresholdLower,maxDaysBelowThreshold=maxDaysBelowThreshold,minDurationOfCrisis=minimumLengthOfEpidemic,returnExcessCount=True)
        
        # Go through each mortality crisis
        for excID in range(len(dateGroups)):
            
            # Get current start, end and total excess
            curGroup = dateGroups[excID]
            curExc = allExcess[excID]

            # If the period is significant enough (total excess at or above the threshold)
            if (curExc >= excessCountThreshold):
                
                # Get start and end
                curCrisisStart = curGroup[0]
                curCrisisEnd = curGroup[1]

                # Determine duration (in days)
                # NOTE(review): this is end minus start, so the end day itself is not counted,
                # while the selections below include both the start and the end day.
                curDuration = int((curCrisisEnd - curCrisisStart)/np.timedelta64(1,'D'))

                # Calculate gender ratio of all deaths in period from data-collection dataframe
                # (share of male deaths among male + female deaths)
                dfDataCrisis = curdf[(curdf.Date >= curCrisisStart) & (curdf.Date <= curCrisisEnd)]
                GenderRatioInPeriod = dfDataCrisis.Male.sum()/(dfDataCrisis.Male.sum()+dfDataCrisis.Female.sum())

                # Determine the date (during the crisis) where raw data is highest.
                curDeadliestDay = dfDataCrisis.iloc[dfDataCrisis.Total.argmax()]['Date']
                
                # Make a dataframe consisting of analysis-results only during period
                dfCrisis = dfPeriod[(dfPeriod.Date >= curCrisisStart) & (dfPeriod.Date <= curCrisisEnd)]

                # Total deaths in period, data
                curTotData = dfCrisis.Total_Data.sum()
                # curTotData = dfCrisis.Total_Data7DayMean.sum() 
                
                # Total deaths in period, baseline
                curTotBase = dfCrisis.Total_Baseline.sum()
                # Excess deaths in entire period, in percent (rounded, but kept as a float)
                curExcPct = (curTotData - curTotBase)/curTotBase
                # curExcPct = int(np.round(100 * curExcPct))
                curExcPct = np.round(100 * curExcPct)

                # Get population estimate (rounded to a whole number) on the deadliest day
                # NOTE(review): the daily population series is rebuilt for every crisis.
                _,populationInterpolated = AmtIDToDailyPopulation(curAmtID,curDeadliestDay)
                curPopulation = np.round(populationInterpolated.loc[curDeadliestDay]['Population'])

                # Collect results as a row to add to dataframe
                curRowToAdd = pd.Series({
                    'Amt': curAmtName,
                    'Start': curCrisisStart,
                    'End': curCrisisEnd,
                    'NumberOfDays': curDuration,
                    'DayWithMostDeaths': curDeadliestDay,
                    # 'DayWithMostBurials': curDeadliestDay,
                    'Excess': int(np.round(curExc)),
                    'ExcessPct': curExcPct,
                    'GenderRatio': GenderRatioInPeriod,
                    'TimeOfYear': getQuarter(curDeadliestDay),
                    'Season': getSeason(curDeadliestDay),
                    'PopulationEstimate': curPopulation,
                })

                #### Get results of age-specific analysis
                # Go through each agegroup
                # NOTE(review): this includes the 'Total' group, but dfCrisesCollectAge has no
                # '*_Total' columns - presumably those entries are dropped on assignment; confirm.
                for ageIndex in range(len(ageGroups)):
                    
                    # Get the agegroup and the name of the group
                    curAgeGroup = ageGroups[ageIndex]
                    curAgeName = ageGroupNames[ageIndex]

                    # Calculate age-specific measures in excess-mortality-period
                    curTotDataAge = dfCrisis[curAgeName+'_Data'].sum()
                    # curTotDataAge = dfCrisis[curAgeName+'_Data7DayMean'].sum() 
                    
                    curTotBaseAge = dfCrisis[curAgeName+'_Baseline'].sum()

                    # Excess in absolute numbers and in percent of the baseline
                    # NOTE(review): the saved output shows a warning raised on the division
                    # below - presumably a baseline sum of zero for some agegroup; confirm.
                    curExcAge = curTotDataAge - curTotBaseAge
                    curExcPctAge = (curExcAge)/curTotBaseAge
                    # curExcPctAge = int(np.round(100 * curExcPctAge))
                    curExcPctAge = np.round(100 * curExcPctAge)


                    curRowToAdd['Exc_'+curAgeName] = curExcAge
                    curRowToAdd['Pct_'+curAgeName] = curExcPctAge
                    curRowToAdd['DataSum_'+curAgeName] = curTotDataAge
                    curRowToAdd['Baseline_'+curAgeName] = curTotBaseAge
                    
                # Add row to primary dataframe (appended with the next integer label)
                dfCrisesCollectAge.loc[len(dfCrisesCollectAge)] = curRowToAdd
                                

  0%|          | 0/25 [00:00<?, ?it/s, Amt=Sorø Amt, Period=0, Total periods=1, Start=1810-01-01, End=1915-01-01]

  curExcPctAge = (curExcAge)/curTotBaseAge
100%|██████████| 25/25 [01:28<00:00,  3.54s/it, Amt=Nordborg Amt, Period=1, Total periods=2, Start=1867-09-22, End=1915-01-01]


In [15]:
# Remove crises in earliest period of data, since data is not reliable in this period
# (i.e. all crises that ended before 1820)
endedBefore1820 = dfCrisesCollectAge.End < np.datetime64('1820-01-01')
dfCrisesCollectAge = dfCrisesCollectAge.drop(dfCrisesCollectAge[endedBefore1820].index)

In [16]:
# Sort by total excess, largest crises first
dfCrisesCollectAge = dfCrisesCollectAge.sort_values(by='Excess', ascending=False)

In [17]:
# Save results as csv (without the dataframe index)
finalResultsPathCsv = f'{pathData}{finalResultsFilename}.csv'
dfCrisesCollectAge.to_csv(finalResultsPathCsv, index=False)


In [18]:
# Also save file as excel (with dates as strings to avoid excel-problems)
dfCrisesCollectAgeExcel = dfCrisesCollectAge.copy()
for dateColumn in ['Start', 'End', 'DayWithMostDeaths']:
    dfCrisesCollectAgeExcel[dateColumn] = dfCrisesCollectAgeExcel[dateColumn].astype(str)

finalResultsPathExcel = f'{pathData}{finalResultsFilename}.xlsx'
dfCrisesCollectAgeExcel.to_excel(finalResultsPathExcel, index=False)

# Extract some numbers for main text

In [59]:
# Number of mortality crises in the final table
len(dfCrisesCollectAge)

# NOTE(review): a commented-out debugging snippet used to follow here. It read an earlier
# copy of the results csv from an absolute path on a local drive and listed the rows that
# differed from dfCrisesCollectAge (concat + drop_duplicates(keep=False) on the first columns).
# It has been condensed to this note, since the path only exists on the original machine.

418

In [19]:
# Show the first rows of the final table, followed by the key numbers for the main text
display(dfCrisesCollectAge.head())

numberOfCrises = len(dfCrisesCollectAge)
totalExcess = dfCrisesCollectAge.Excess.sum()
print(f'Number of crises: {numberOfCrises}')
print(f'Total excess: {totalExcess}')

Unnamed: 0,Amt,Start,End,NumberOfDays,DayWithMostDeaths,Excess,ExcessPct,GenderRatio,TimeOfYear,Season,...,DataSum_Infants_stillborn,DataSum_1-14,DataSum_15-39,DataSum_40-59,DataSum_60+,Baseline_Infants_stillborn,Baseline_1-14,Baseline_15-39,Baseline_40-59,Baseline_60+
324,Staden København,1853-07-02,1853-09-07,67,1853-07-28,3833,546.0,0.468578,Q3,Summer,...,212.0,555.0,1069.0,1438.0,989.0,130.480378,110.718309,126.5008,109.150856,106.94298
272,Maribo Amt,1831-08-01,1832-07-06,340,1831-09-08,2181,150.0,0.504945,Q3,Fall,...,420.0,368.0,605.0,946.0,1170.0,249.792074,301.259705,189.946481,207.321237,337.495203
22,Præstø Amt,1831-07-30,1831-12-14,137,1831-08-28,1563,392.0,0.512232,Q3,Summer,...,195.0,127.0,215.0,541.0,854.0,79.046466,61.886264,47.638109,48.519376,109.552978
0,Sorø Amt,1831-08-05,1832-01-18,166,1831-08-25,1230,261.0,0.495591,Q3,Summer,...,243.0,157.0,203.0,394.0,649.0,100.149275,67.222551,56.794885,62.29792,117.473185
387,Holbæk Amt,1831-08-02,1832-02-06,188,1831-08-26,1217,209.0,0.528365,Q3,Summer,...,199.0,163.0,226.0,424.0,703.0,85.152793,83.962899,69.535545,72.929956,152.730832


Number of crises: 418
Total excess: 60179


In [None]:
# Only the last expression of a cell is shown automatically, so both tables are
# displayed explicitly (previously the result of .tail() was silently discarded).

# Last rows of the final table
display(dfCrisesCollectAge.tail())

# All crises ordered by duration, shortest first
display(dfCrisesCollectAge.sort_values('NumberOfDays'))

Unnamed: 0,Amt,Start,End,NumberOfDays,DayWithMostDeaths,Excess,ExcessPct,GenderRatio,TimeOfYear,Season,...,DataSum_Infants_stillborn,DataSum_1-14,DataSum_15-39,DataSum_40-59,DataSum_60+,Baseline_Infants_stillborn,Baseline_1-14,Baseline_15-39,Baseline_40-59,Baseline_60+
306,Maribo Amt,1872-11-13,1872-11-14,1,1872-11-13,60,700.0,0.5,Q4,Fall,...,3.0,23.0,18.0,14.0,10.0,1.575758,1.507905,0.772727,1.061265,2.338768
361,Staden København,1894-07-02,1894-07-05,3,1894-07-02,53,86.0,0.573913,Q3,Summer,...,30.0,26.0,13.0,23.0,19.0,15.186924,9.0,9.07971,9.922101,14.776515
366,Staden København,1901-08-10,1901-08-17,7,1901-08-16,77,59.0,0.533654,Q3,Summer,...,77.0,32.0,22.0,29.0,47.0,39.61973,16.958333,17.038044,20.038561,31.062088
68,Københavns Amt,1899-03-08,1899-03-15,7,1899-03-09,55,63.0,0.432624,Q1,Spring,...,32.0,18.0,11.0,16.0,61.0,17.161749,9.034209,10.07971,13.757246,33.168478
362,Staden København,1857-11-21,1857-11-28,7,1857-11-23,52,66.0,0.564885,Q4,Fall,...,12.0,21.0,27.0,19.0,34.0,12.791667,13.038867,13.385869,12.717391,16.197464
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
388,Holbæk Amt,1829-07-11,1829-12-18,160,1829-08-28,590,136.0,0.523902,Q3,Summer,...,81.0,187.0,110.0,229.0,354.0,63.697176,57.979523,54.424133,57.539007,112.275793
0,Sorø Amt,1831-08-05,1832-01-18,166,1831-08-25,1230,261.0,0.495591,Q3,Summer,...,243.0,157.0,203.0,394.0,649.0,100.149275,67.222551,56.794885,62.29792,117.473185
408,Bornholms Amt,1857-08-27,1858-02-20,177,1858-01-10,321,140.0,0.483636,Q1,Winter,...,78.0,254.0,62.0,48.0,95.0,27.303345,22.517411,14.528883,23.349034,70.251181
387,Holbæk Amt,1831-08-02,1832-02-06,188,1831-08-26,1217,209.0,0.528365,Q3,Summer,...,199.0,163.0,226.0,424.0,703.0,85.152793,83.962899,69.535545,72.929956,152.730832
