# Main analysis
This notebook carries out the main analysis of excess mortality.

The notebook goes through each county file (for each possible period) and determines a mortality baseline using the functions in the ExcessMortalityFunctions repository.


In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib widget

# Load style
plt.style.use('PlotStyle.mplstyle')
import matplotlib.colors as colors
plt.rcParams['axes.prop_cycle'] = plt.cycler(color=plt.cm.Dark2.colors)

from datetime import datetime
from tqdm import tqdm

import os

# Load functions
import sys
sys.path.append("../../ExcessMortality")
import ExcessMortalityFunctions as emf

# Flag presumably controlling whether figures are written to disk
# (it is not referenced in the cells visible here - verify before relying on it)
saveFigures = True
# saveFigures = False
print(f'saveFigures is set to: {saveFigures}')
print('Done loading packages')

saveFigures is set to: True
Done loading packages


In [2]:
# Set paths
# Root folder for all data, relative to the location of this notebook
pathData = '../Data/'
# Folder with the mortality collections
# (keeps its trailing slash: filenames are appended directly to this string)
pathCollections = f'{pathData}MortalityCollections/'
# Base name for analysis results (no trailing slash)
# NOTE(review): presumably used as a prefix for the result-folder name - confirm against the cell that builds the output path
pathResults = f'{pathData}AnalysisResults'

In [3]:
# Load the relational table (parish/county) and parse its two date columns
dfRel = pd.read_csv('../SupplementaryTable_RelationalTable_ParishCounty.csv')
for dateColumn in ('StartDate', 'EndDate'):
    dfRel[dateColumn] = pd.to_datetime(dfRel[dateColumn])

In [4]:
# Only consider counties that exist after 1810
# (A large restructuring was done between 1800 and 1810)
print('Number of counties:', len(dfRel.AmtID.unique()))
endsAfter1810 = dfRel.EndDate > np.datetime64('1810-01-01')
allAmtIDs = dfRel.loc[endsAfter1810, 'AmtID'].unique()

# Keep every row belonging to one of those counties
dfRel = dfRel.loc[dfRel.AmtID.isin(allAmtIDs)]
print('Number of counties:', len(allAmtIDs))

# Drop Flensborg and Løgumkloster counties (since they only contain two parishes)
for namePart in ('Løgum', 'Flensbo'):
    rowsToDrop = dfRel[dfRel.AmtName.str.contains(namePart)].index
    dfRel = dfRel.drop(rowsToDrop)

# Refresh the list of county IDs after the removals
allAmtIDs = dfRel.AmtID.unique()
print('Number of counties:', len(allAmtIDs))

Number of counties: 46
Number of counties: 27
Number of counties: 25


In [5]:
# Define some helper functions for getting county-names and county-files
def getAmtName(amtID):
    """Return the name of the county with the given ID.

    The lookup is done in the notebook-level ``dfRel`` table: the ``AmtName``
    of the first row whose ``AmtID`` equals ``amtID`` is returned.
    An ``IndexError`` is raised if no row matches.
    """
    matchingNames = dfRel.loc[dfRel.AmtID == amtID, 'AmtName']
    return matchingNames.values[0]

def getAmtCollections(amtID,pathColl=None):
    """Load all data collections for one county, with the period each one covers.

    Collection files are expected to be named
    ``<AmtID>_<AmtName>_<StartDate>_<EndDate>.<extension>``. Files in the
    directory that do not follow this pattern (fewer than four ``_``-separated
    fields, or a non-integer first field, e.g. ``.DS_Store`` or a README) are
    skipped instead of raising.

    Parameters
    ----------
    amtID : int
        ID of the county to load.
    pathColl : str, optional
        Directory containing the collection files. When omitted, the
        notebook-level ``pathCollections`` is used (looked up at call time, so
        re-running the path cell takes effect without redefining this function).

    Returns
    -------
    alldfs : list of pandas.DataFrame
        One dataframe per matching file.
    allStarts : list of str
        Start date of each period, as written in the filename.
    allEnds : list of str
        End date of each period, as written in the filename.

    The three lists are aligned and sorted by start date, so the period order
    does not depend on the (arbitrary) order in which the OS lists the files.
    """
    if pathColl is None:
        pathColl = pathCollections

    # Collect (start, end, filename) for every file belonging to this county
    matches = []
    for filename in os.listdir(pathColl):
        fields = filename.split('_')
        if len(fields) < 4:
            # Not a collection file
            continue
        try:
            fileAmtID = int(fields[0])
        except ValueError:
            # First field is not a county ID
            continue
        if fileAmtID != amtID:
            continue

        # Dates are the last two fields, so county names containing '_' are handled too
        curStart = fields[-2]
        curEnd = fields[-1].split('.')[0]
        matches.append((curStart, curEnd, filename))

    # Deterministic order (chronological when the dates are written as YYYY-MM-DD)
    matches.sort()

    allStarts = [curStart for curStart, _, _ in matches]
    allEnds = [curEnd for _, curEnd, _ in matches]
    # os.path.join works whether or not pathColl ends with a path separator
    alldfs = [pd.read_csv(os.path.join(pathColl, filename)) for _, _, filename in matches]

    return alldfs,allStarts,allEnds

# def getAmtCollections(amtID):
#     # Function for getting all data-collections for a specific county, as well as the periods for which the county exists.

#     # Get the name of all files in "collections" directory
#     allCollections = np.array(os.listdir(pathData))

#     # Get the ones relevant to the current amt
#     curCollectionsFilenames = allCollections[[int(x.split('_')[0]) == amtID for x in allCollections]]

#     # Extract start and end dates
#     allStarts = [x.split('_')[2] for x in curCollectionsFilenames]
#     allEnds = [x.split('_')[3].split('.')[0] for x in curCollectionsFilenames]

#     # Make a list of all dataframes
#     alldfs = []
#     for filename in curCollectionsFilenames:
#         # Read the next file
#         curdf = pd.read_csv(pathData + filename)
#         # Append to list
#         alldfs.append(curdf)
    
#     return alldfs,allStarts,allEnds

In [6]:
# Helper function for summing columns
def sumColumns(curdf,columnsToUse=['Total']):
    """Return the row-wise sum of the selected columns as a new Series.

    Parameters
    ----------
    curdf : pandas.DataFrame
        Dataframe holding the columns to add up.
    columnsToUse : list of str
        Names of the columns to sum. Defaults to the single column 'Total'.

    Returns
    -------
    pandas.Series
        One value per row of ``curdf`` (same index), independent of ``curdf``.
    """
    selectedColumns = curdf[columnsToUse]
    rowTotals = selectedColumns.sum(axis=1)
    return rowTotals.copy()

# Set parameters for analysis

In [7]:
# Flags and analysis parameters

# Number of years on both sides of a date to use for baseline calculations
numYears = 12

# The "name" of the baseline, i.e. the full window width
# (+/- 5 years is a 10-year baseline, +/- 12 years is a 24-year baseline)
numYearsTot = 2 * numYears

# Threshold (in terms of Z-scores) for identifying a day as having increased excess
thresholdExcess = 3

# # For sensitivity analyses
# numYears = 6 # Number of years on both sides of date to use for baseline calculations 
# numYears = 9 # Number of years on both sides of date to use for baseline calculations 

In [8]:
# Define the agegroups to analyze.
# Each key is the name used to label the group (directories, filenames, column prefixes);
# each value lists the data columns that are summed to form that group.
ageGroupDefinitions = {
    'Total': ['Total'],
    'Infants_stillborn': ['Stillborn', '0'],
    '1-14': ['1-4', '5-9', '10-14'],
    '15-39': ['15-19', '20-24', '25-29', '30-34', '35-39'],
    '40-59': ['40-44', '45-49', '50-54', '55-59'],
    '60+': ['60-64', '65-69', '70-74', '75-79', '80+'],
}

# Parallel lists, in the same order: entry k of ageGroupNames names entry k of ageGroups
ageGroupNames = list(ageGroupDefinitions.keys())
ageGroups = list(ageGroupDefinitions.values())

# Run analysis on 7-day mean data (currently disabled: the cells in this section are commented out)

In [9]:

# pathToSaveResultsInUpper = pathToSaveResultsIn + f'_Years{numYears}_Threshold{thresholdExcess}/'

# # Create directory if it doesn't already exist
# try:
#     os.mkdir(pathToSaveResultsInUpper)
#     print('Created directory')
# except:
#     2+2
    

In [10]:
# ##################################################
# ##### Run analysis for each of the agegroups #####
# ############ Takes about half an hour ############ 
# ##################################################

# # Prepare progressbar and go through each county
# pbar = tqdm(allAmtIDs)
# for curAmtID in pbar:

#     # Get county name, data and periods
#     curAmtName = ps.getAmtName(curAmtID)
#     alldfs,allStarts,allEnds = ps.getAmtCollections(curAmtID)

#     # Go through each possible period
#     for i in range(len(allStarts)):

#         # Get the dataframe, start date and end date
#         curdf = alldfs[i].copy()
#         curStart = allStarts[i]
#         curEnd = allEnds[i]

#         # Ensure date is datetime
#         curdf['Date'] = pd.to_datetime(curdf.Date)
#         # Set date as index
#         curdf = curdf.set_index('Date')
        
#         # Prepare dataframe to save to file
#         dfToSave = curdf.copy()

#         for ageIndex in range(len(ageGroups)):
#             # Get current agegroups and name
#             curAgeGroup = ageGroups[ageIndex]
#             curAgeName = ageGroupNames[ageIndex]

#             # Update progressbar
#             pbar.set_postfix(
#                 {
#                     'Amt':curAmtName,
#                     'Period':i+1,
#                     'Total periods':len(allStarts),
#                     'Start':curStart,
#                     'End':curEnd,
#                     'Agegroup':curAgeName,
#                 }
#             )
            

#             # Sum the columns of the given agegroups
#             curSeries = sumColumns(curdf,curAgeGroup)

#             # Calculate the 7-day average to avoid trouble with Sundays having more burials than other weekdays
#             curSeriesRn = curSeries.rolling(window=7,center=True).mean()

#             # this_curTime,this_curVals,this_corrMean,this_corrStd,this_postResi,this_postResiStd,this_postResiPct = emf.runFullAnalysisDailySeries(curSeriesRn,numYears=numYears,ZscoreThreshold=thresholdExcess)
#             # curBaseline,curStandardDeviation,curExcess,curZscore,curExcessPct
#             curBaseline,curStandardDeviation,curExcess,curZscore,curExcessPct = emf.runFullAnalysisDailySeries(curSeriesRn,numYears=numYears,ZscoreThreshold=thresholdExcess)
            
#             # Make a dataframe for results
#             dfResults = pd.DataFrame(
#                 {
#                     # curAgeName+'_Data':curSeries,
#                     curAgeName+'_Data7DayMean':curSeriesRn,
#                     curAgeName+'_Baseline':curBaseline, 
#                     curAgeName+'_StandardDeviation':curStandardDeviation,
#                     curAgeName+'_Zscore':curZscore, 
#                 }
#             )

#             # Merge with data and results-so-far
#             dfToSave = pd.merge(dfToSave,dfResults, left_index=True, right_index=True)
    
#         # Determine filename to save file as
#         curFileName =  str(int(curAmtID)) + '_'+curAmtName + '_'+pd.to_datetime(curStart).strftime('%Y-%m-%d') +'_'+pd.to_datetime(curEnd).strftime('%Y-%m-%d')
#         # curFileName = curFileName + '_'+curAgeName+'.csv'
#         curFileName = curFileName + '.csv'
        
#         # # Save county results to file
#         # dfToSave.reset_index().to_csv('test.csv')
#         # Only save analysis results, to save space in github repo
#         # dfToSave.iloc[:,curdf.shape[1]:].reset_index().to_csv(pathToSaveResultsInUpper + curFileName,index=False) # Saves everything, with very high float precision
#         dfToSave.iloc[:,curdf.shape[1]:].reset_index().round(7).to_csv(pathToSaveResultsInUpper + curFileName,index=False) # Rounds everything to 7 decimals before saving, reduces filesize significantly compared to the line above
# 
# print('Done with analysis')

# Run analysis without 7-day mean smoothing

In [11]:

# Folder for the non-smoothed results; its name encodes the analysis parameters.
# NOTE(review): this line previously used `pathToSaveResultsIn`, which is not defined in
# any visible cell (NameError on a fresh kernel). `pathResults` from the path-setup cell
# is assumed to be the intended prefix - confirm the folder name matches existing results.
pathToSaveResultsInUpperNonSmooth = pathResults + f'_NonSmoothed_Years{numYears}_Threshold{thresholdExcess}/'

# Create directory if it doesn't already exist.
# Any other problem (e.g. missing permissions) is raised here instead of being silently
# ignored, so it surfaces before the long-running analysis starts.
if not os.path.isdir(pathToSaveResultsInUpperNonSmooth):
    os.makedirs(pathToSaveResultsInUpperNonSmooth)
    print('Created directory')
    

In [12]:
##################################################
##### Run analysis for each of the agegroups #####
###### Slow: the recorded run took ~42 min #######
##################################################

# Loop over all counties, reporting progress in a tqdm bar
pbar = tqdm(allAmtIDs)
for curAmtID in pbar:

    # Look up the county name and load its collections (one per period)
    curAmtName = getAmtName(curAmtID)
    alldfs,allStarts,allEnds = getAmtCollections(curAmtID)
    numPeriods = len(allStarts)

    # Handle each period of the county in turn
    for i,(curStart,curEnd) in enumerate(zip(allStarts,allEnds)):

        # Work on a copy with 'Date' parsed as datetime and used as index,
        # leaving the loaded dataframe untouched
        curdf = alldfs[i].assign(Date=pd.to_datetime(alldfs[i].Date)).set_index('Date')

        # Results are accumulated next to the input columns
        dfToSave = curdf.copy()

        for ageIndex,(curAgeGroup,curAgeName) in enumerate(zip(ageGroups,ageGroupNames)):

            # Show what is currently being processed in the progressbar
            pbar.set_postfix(
                {
                    'Amt':curAmtName,
                    'Period':i+1,
                    'Total periods':numPeriods,
                    'Start':curStart,
                    'End':curEnd,
                    'Agegroup':curAgeName,
                }
            )

            # Daily series for this agegroup: the sum of its columns
            curSeries = sumColumns(curdf,curAgeGroup)

            # The series is analysed as-is (no 7-day rolling mean is applied in this run).
            # Only baseline, standard deviation and Z-score are stored below;
            # the two excess outputs are unpacked but not saved.
            curBaseline,curStandardDeviation,curExcess,curZscore,curExcessPct = emf.runFullAnalysisDailySeries(
                curSeries,
                numYears=numYears,
                ZscoreThreshold=thresholdExcess,
            )

            # Collect the results in columns prefixed with the agegroup name
            dfResults = pd.DataFrame(
                {
                    f'{curAgeName}_Data':curSeries,
                    f'{curAgeName}_Baseline':curBaseline,
                    f'{curAgeName}_StandardDeviation':curStandardDeviation,
                    f'{curAgeName}_Zscore':curZscore,
                }
            )

            # Add them to the data and results-so-far, matching rows on the date index
            dfToSave = dfToSave.merge(dfResults,left_index=True,right_index=True)

        # Filename: <AmtID>_<AmtName>_<start>_<end>.csv, with dates formatted as YYYY-MM-DD
        startLabel = pd.to_datetime(curStart).strftime('%Y-%m-%d')
        endLabel = pd.to_datetime(curEnd).strftime('%Y-%m-%d')
        curFileName = f'{int(curAmtID)}_{curAmtName}_{startLabel}_{endLabel}.csv'

        # Only save analysis results (the columns added after the input columns), to save space in github repo.
        # Everything is rounded to 7 decimals before saving, which reduces the filesize significantly.
        numInputColumns = curdf.shape[1]
        dfAnalysisOnly = dfToSave.iloc[:,numInputColumns:].reset_index().round(7)
        dfAnalysisOnly.to_csv(pathToSaveResultsInUpperNonSmooth + curFileName,index=False)


print('Done with analysis')

  0%|          | 0/25 [00:00<?, ?it/s, Amt=Sorø Amt, Period=1, Total periods=1, Start=1810-01-01, End=1915-01-01, Agegroup=Total]

100%|██████████| 25/25 [42:06<00:00, 101.08s/it, Amt=Nordborg Amt, Period=2, Total periods=2, Start=1867-09-22, End=1915-01-01, Agegroup=60+]                   

Done with analysis



