# Notebook for calculating the total of the groups presented in the main paper
Note that the groups themselves were decided through a combination of inspection of the quantitative measures calculated in the analysis and manual inspection of data from individual crises (as well as checks with historical records)

Hence this notebooks primarily exists to groups together crises for calculating the numbers reported in the text.

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib widget

# Load style
plt.style.use('PlotStyle.mplstyle')
import matplotlib.colors as colors
plt.rcParams['axes.prop_cycle'] = plt.cycler(color=plt.cm.Dark2.colors)

pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)
pd.set_option('display.min_rows', 50)

saveFigures = True
# saveFigures = False
print('saveFigures is set to: '+str(saveFigures))
print('Done loading packages')

saveFigures is set to: True
Done loading packages


In [2]:
# Set paths
pathData = '../Data/'
pathResults = '../Data/AnalysisResults'
pathFigs = '../Figures/'

In [3]:
# Flags and analysis parameters used in main analysis, set here to load correct results
numYears = 12 # Number of years on both sides of date to use for baseline calculations 
numYearsTot = (numYears*2) # The "name" of the baseline (i.e. +/- 5 years is a 10-year baseline, +/- 12 is a 24 year baseline)
thresholdExcess = 3 # Threshold (in terms of Z-scores) for identifying a day as having increased excess

# Additional parameters used
thresholdLower = 2 # Lower threshold used for determining the start and end of periods (in terms of Z-scores)
maxDaysBelowThreshold = 7 # Number of days below thresholdLower before a period of excess is "stopped"
minimumLengthOfEpidemic = 0 # Minimal number of days above thresholdExcess which is counted as a period of excess 
excessCountThreshold = 50 # Only save mortality crises with more than this number of excess deaths

# Various tests for sensitivity
# maxDaysBelowThreshold = 4 # Number of days below thresholdLower before a period of excess is "stopped"
# minimumLengthOfEpidemic = 4 # Minimal number of days above thresholdExcess which is counted as a period of excess 
# numYears = 6 # Number of years on both sides of date to use for baseline calculations 
# maxDaysBelowThreshold = 7 # Number of days below thresholdLower before a period of excess is "stopped"
# excessCountThreshold = 20 # Only save mortality crises with more than this number of excess deaths


# Determine filename to use for final results
# finalResultsFilename = 'AllCrises'+f'_Years{numYears}_Threshold{thresholdExcess}_LowerThreshold{thresholdLower}_MaxDaysBelow{maxDaysBelowThreshold}_minLength{minimumLengthOfEpidemic}_minCount{excessCountThreshold}'
finalResultsFilename = 'AllCrises'+f'_NonSmoothed_Years{numYears}_Threshold{thresholdExcess}_LowerThreshold{thresholdLower}_MaxDaysBelow{maxDaysBelowThreshold}_minLength{minimumLengthOfEpidemic}_minCount{excessCountThreshold}'
finalResultsFilename = finalResultsFilename +'_Clustered'
# finalResultsFilename = finalResultsFilename +'_Clustered_Grouped'
finalResultsFilename 

'AllCrises_NonSmoothed_Years12_Threshold3_LowerThreshold2_MaxDaysBelow7_minLength0_minCount50_Clustered'

In [4]:
# Load the table of results
dfCrises = pd.read_csv(pathData + finalResultsFilename + '.csv')
dfCrises['Start'] = pd.to_datetime(dfCrises['Start'])
dfCrises['End'] = pd.to_datetime(dfCrises['End'])
dfCrises['DayWithMostDeaths'] = pd.to_datetime(dfCrises['DayWithMostDeaths'])

In [5]:
# Add a flag for group
dfCrises['Group'] = 'None'

# Cholera

Mainly identified by timing, cluster and elderly mortality.

In [6]:
# Deadliest crises in 1853 is cholera. Get the cluster-ID
cholCluster = dfCrises[(dfCrises.DayWithMostDeaths.dt.year == 1853)].iloc[0].Cluster

# All crises with same age-pattern in 1853 appears to be cholera
dfChol = dfCrises[(dfCrises.Cluster == cholCluster)&(dfCrises.DayWithMostDeaths.dt.year == 1853)].copy()

# except a single crisis in Svendborg, which is in the same cluster, but without no elderly mortality. Sorted out here:
dfChol = dfChol[dfChol['FracAll_60+'] > 0.001]

# Add label to main dataframe
dfCrises.loc[(dfCrises.Cluster == cholCluster)&(dfCrises.DayWithMostDeaths.dt.year == 1853),'Group'] = 'Cholera (1853)'

# Print results 
print(dfChol.Amt.values)
print(f'Total excess: {dfChol.Excess.sum()}')
print(f'Indices: {dfChol.index.values}')
print(dfChol.Cluster.value_counts())
dfChol

['Staden København' 'Københavns Amt' 'Ålborg Amt' 'Århus Amt'
 'Frederiksborg Amt' 'Hjørring Amt']
Total excess: 5154
Indices: [  0  11  18  62 340 399]
Cluster
D    6
Name: count, dtype: int64


Unnamed: 0,Amt,Start,End,NumberOfDays,DayWithMostDeaths,Excess,ExcessPct,GenderRatio,TimeOfYear,Season,PopulationEstimate,Exc_Infants_stillborn,Exc_1-14,Exc_15-39,Exc_40-59,Exc_60+,Pct_Infants_stillborn,Pct_1-14,Pct_15-39,Pct_40-59,Pct_60+,DataSum_Infants_stillborn,DataSum_1-14,DataSum_15-39,DataSum_40-59,DataSum_60+,Baseline_Infants_stillborn,Baseline_1-14,Baseline_15-39,Baseline_40-59,Baseline_60+,FracAll_Infants_stillborn,FracAll_1-14,FracAll_15-39,FracAll_40-59,FracAll_60+,AgeSplitSumExc,Cluster,ClusterA_Prob,ClusterB_Prob,ClusterC_Prob,ClusterD_Prob,ClusterE_Prob,ClusterF_Prob,Group
0,Staden København,1853-07-02,1853-09-07,67,1853-07-28,3833,546.0,0.468578,Q3,Summer,139473.0,81.519622,444.281691,942.4992,1328.849144,882.05702,62.0,401.0,745.0,1217.0,825.0,212.0,555.0,1069.0,1438.0,989.0,130.480378,110.718309,126.5008,109.150857,106.94298,0.022157,0.120755,0.256169,0.361178,0.239741,3679.206677,D,0.0,0.0,0.0,0.9866,0.0134,0.0,
11,Københavns Amt,1853-07-16,1853-09-24,70,1853-07-31,578,206.0,0.480186,Q3,Summer,76787.0,16.390904,97.367909,149.32477,167.477201,155.673631,27.0,240.0,431.0,424.0,250.0,77.0,138.0,184.0,207.0,218.0,60.609096,40.632091,34.67523,39.522799,62.326369,0.02796,0.16609,0.254719,0.285683,0.265548,586.234414,D,0.0,0.0,0.0011,0.7465,0.2524,0.0,
18,Ålborg Amt,1853-08-09,1853-09-17,39,1853-08-18,434,330.0,0.485866,Q3,Summer,69476.0,19.101878,56.522083,119.507082,126.132553,121.216074,49.0,306.0,825.0,910.0,394.0,58.0,75.0,134.0,140.0,152.0,38.898122,18.477917,14.492918,13.867448,30.783926,0.04317,0.127739,0.270085,0.285058,0.273947,442.479669,D,0.0,0.0,0.0003,0.793,0.2067,0.0,
62,Århus Amt,1853-07-15,1853-09-15,62,1853-08-24,201,138.0,0.488439,Q3,Summer,43290.0,8.343036,23.270502,56.005618,79.659821,47.305049,42.0,94.0,311.0,519.0,177.0,28.0,48.0,74.0,95.0,74.0,19.656964,24.729498,17.994382,15.340179,26.694951,0.03888,0.108445,0.260996,0.371229,0.22045,214.584026,D,0.0,0.0,0.0,0.9954,0.0046,0.0,
340,Frederiksborg Amt,1853-07-15,1853-08-13,29,1853-08-04,57,58.0,0.532051,Q3,Summer,77726.0,9.376153,5.274449,17.887493,11.916761,14.59305,88.0,34.0,136.0,85.0,60.0,20.0,21.0,31.0,26.0,39.0,10.623847,15.725551,13.112507,14.083239,24.40695,0.158789,0.089325,0.302932,0.201815,0.247139,59.047907,D,0.0,0.0,0.0065,0.6896,0.3038,0.0,
399,Hjørring Amt,1853-08-14,1853-08-31,17,1853-08-14,51,96.0,0.596154,Q3,Summer,74944.0,3.839448,10.508379,10.477626,11.644575,10.094203,62.0,124.0,232.0,347.0,93.0,10.0,19.0,15.0,15.0,21.0,6.160552,8.491621,4.522374,3.355425,10.905797,0.082455,0.225675,0.225014,0.250076,0.21678,46.56423,D,0.0,0.0,0.0649,0.6187,0.3165,0.0,


In [7]:
# The 1857 cholera was only in Sorø amt (and also the only crisis in Sorø in 1857)
dfChol2 = dfCrises[(dfCrises.Cluster == cholCluster)&(dfCrises.DayWithMostDeaths.dt.year == 1857) & (dfCrises.Amt.str.contains('Sorø'))].copy()

dfCrises.loc[(dfCrises.Cluster == cholCluster)&(dfCrises.DayWithMostDeaths.dt.year == 1857),'Group'] = 'Cholera (1857)'

print(dfChol2.Amt.values)
print(f'Total excess: {dfChol2.Excess.sum()}')
print(f'Indices: {dfChol2.index.values}')
print(dfChol2.Cluster.value_counts())
dfChol2


['Sorø Amt']
Total excess: 431
Indices: [20]
Cluster
D    1
Name: count, dtype: int64


Unnamed: 0,Amt,Start,End,NumberOfDays,DayWithMostDeaths,Excess,ExcessPct,GenderRatio,TimeOfYear,Season,PopulationEstimate,Exc_Infants_stillborn,Exc_1-14,Exc_15-39,Exc_40-59,Exc_60+,Pct_Infants_stillborn,Pct_1-14,Pct_15-39,Pct_40-59,Pct_60+,DataSum_Infants_stillborn,DataSum_1-14,DataSum_15-39,DataSum_40-59,DataSum_60+,Baseline_Infants_stillborn,Baseline_1-14,Baseline_15-39,Baseline_40-59,Baseline_60+,FracAll_Infants_stillborn,FracAll_1-14,FracAll_15-39,FracAll_40-59,FracAll_60+,AgeSplitSumExc,Cluster,ClusterA_Prob,ClusterB_Prob,ClusterC_Prob,ClusterD_Prob,ClusterE_Prob,ClusterF_Prob,Group
20,Sorø Amt,1857-09-03,1857-10-24,51,1857-09-13,431,293.0,0.49827,Q3,Fall,78369.0,59.13914,98.851967,90.269904,115.338439,74.557213,165.0,393.0,540.0,618.0,216.0,95.0,124.0,107.0,134.0,109.0,35.86086,25.148033,16.730096,18.661561,34.442787,0.134973,0.225609,0.206022,0.263236,0.170161,438.156663,D,0.0,0.0,0.0394,0.8564,0.1042,0.0,


In [8]:
print('Total excess for cholera:',pd.concat([dfChol,dfChol2]).Excess.sum())

Total excess for cholera: 5585


# Harvest epidemics, 1826-1832
Between 1825 and 1833, most deadly between July to October, mostly cluster E

A number of crises in 1832 are also included, although the age-cluster differs a little (cluster C or D instead of E).

In [9]:

dfHarvest = dfCrises.copy()
dfHarvest = dfHarvest[dfHarvest.DayWithMostDeaths > np.datetime64('1825')]
dfHarvest = dfHarvest[dfHarvest.DayWithMostDeaths <= np.datetime64('1833')]

dfHarvest = dfHarvest[dfHarvest.DayWithMostDeaths.dt.month > 7]
dfHarvest = dfHarvest[dfHarvest.DayWithMostDeaths.dt.month <=10]


# Add some extra crises from 1832 in the same amts
dfHarvestExtra = dfCrises[(dfCrises.DayWithMostDeaths.dt.year == 1832) & (dfCrises.Amt.isin(dfHarvest.Amt.unique()))].copy()

# Add extra to dfHarvest
dfHarvest = pd.concat([dfHarvest,dfHarvestExtra]).sort_index()

# Sort away Ribe amt
dfHarvest = dfHarvest[~dfHarvest.Amt.str.contains('Ribe')]

print(dfHarvest.Amt.unique())
print(f'Total excess: {dfHarvest.Excess.sum()}')
print(f'Part of that which is from excess in extra crises in 1832: {dfHarvestExtra.Excess.sum()}')
print(f'Indices: {dfHarvest.index.values}')

print(dfHarvest.Cluster.value_counts())
# Update main dataframe
dfCrises.loc[dfHarvest.index,'Group'] = 'Harvest-epidemics'


['Maribo Amt' 'Præstø Amt' 'Sorø Amt' 'Holbæk Amt' 'Københavns Amt'
 'Frederiksborg Amt' 'Svendborg Amt' 'Odense Amt']
Total excess: 10974
Part of that which is from excess in extra crises in 1832: 773
Indices: [  1   2   3   4   6  10  24  26  41  43  52  54  60  61  72  82 122 126
 130 144 151 166 186 193 212 256 276]
Cluster
E    21
D     3
C     2
F     1
Name: count, dtype: int64


# Child mortality in 1829

High mortality among 1-14 in first half of 1829. 

Cluster C. One of them gets classified as B, but has been determined to be related.

A few crises occur at the same time, but all with mortality among the elderly. These end up in clusters E or F, and are omitted from this group

In [10]:

df1829 = dfCrises.copy()
df1829 = df1829[df1829.DayWithMostDeaths >= np.datetime64('1829')]
df1829 = df1829[df1829.DayWithMostDeaths <= np.datetime64('1830')]

df1829 = df1829[df1829.DayWithMostDeaths.dt.month > 0]
df1829 = df1829[df1829.DayWithMostDeaths.dt.month <=7]

# Remove any with a lot of elderly mortality
df1829 = df1829[df1829.Cluster != 'E']
df1829 = df1829[df1829.Cluster != 'F']

print(df1829.Amt.unique())
print(f'Total excess: {df1829.Excess.sum()}')
print(f'Indices: {df1829.index.values}')

# Update main dataframe
dfCrises.loc[df1829.index,'Group'] = 'Child mortality 1829'
# # df1829.iloc[:,:-6]
display(df1829['Cluster'].value_counts())


['Sorø Amt' 'Maribo Amt' 'Holbæk Amt' 'Svendborg Amt' 'Odense Amt'
 'Københavns Amt' 'Præstø Amt' 'Frederiksborg Amt']
Total excess: 2720
Indices: [  8  17  30  44  65  70  77  94 199 271 291 296 358]


Cluster
C    12
B     1
Name: count, dtype: int64

# 1891/1892 Pandemic Flu

Everything around new years 1891/1892

All are cluster F, except Åbenrå which gets classified as cluster E

In [11]:
dfFlu1892 = dfCrises.copy()


dfFlu1892 = dfFlu1892[dfFlu1892.DayWithMostDeaths >= np.datetime64('1891-11')]
dfFlu1892 = dfFlu1892[dfFlu1892.DayWithMostDeaths <= np.datetime64('1892-03')]

dfFlu1892  
print(np.sort(dfFlu1892.Amt.unique()))

russfluAmt = list(np.sort(dfFlu1892.Amt.unique()))
print(f'Total excess: {dfFlu1892.Excess.sum()}')
print(f'Indices: {dfFlu1892.index.values}')

# Update main dataframe
dfCrises.loc[dfFlu1892.index,'Group'] = 'Pandemic Flu (1891/1892)'

print('Clusters:')
print(dfFlu1892.Cluster.value_counts())
# dfFlu1892

['Bornholms Amt' 'Frederiksborg Amt' 'Haderslev Amt' 'Hjørring Amt'
 'Holbæk Amt' 'Københavns Amt' 'Maribo Amt' 'Odense Amt' 'Præstø Amt'
 'Randers Amt' 'Ribe Amt' 'Ringkøbing Amt' 'Sorø Amt' 'Staden København'
 'Svendborg Amt' 'Thisted Amt' 'Tønder Amt' 'Vejle Amt' 'Viborg Amt'
 'Åbenrå Amt' 'Ålborg Amt' 'Århus Amt']
Total excess: 6462
Indices: [  7   9  12  19  23  29  31  32  34  35  38  42  46  53  55 105 111 128
 210 217 224 240 301 328 385]
Clusters:
Cluster
F    24
E     1
Name: count, dtype: int64


In [12]:
print('Counties not experiencing flu: ')
print(np.setdiff1d(dfCrises.Amt.unique(),dfFlu1892.Amt.unique()))
print('(i.e. counties that either do not exist or arent part of Denmark at the time)')

Counties not experiencing flu: 
['Nordborg Amt' 'Skanderborg Amt' 'Sønderborg Amt']
(i.e. counties that either do not exist or arent part of Denmark at the time)


# Pandemic flu, spring 1900 (The so-called "Pseudo-pandemic")

As 1892-flu, based on timing and all in cluster F (except Vejle which ends up in E)

In [13]:
dfPseudo = dfCrises.copy()


dfPseudo = dfPseudo[dfPseudo.DayWithMostDeaths >= np.datetime64('1900-01')]
dfPseudo = dfPseudo[dfPseudo.DayWithMostDeaths <= np.datetime64('1900-08')]

# print(np.sort(dfCrises.Amt.unique()))
print(np.sort(dfPseudo.Amt.unique()))
pseuAmt = list(np.sort(dfPseudo.Amt.unique()))
print(f'Total excess: {dfPseudo.Excess.sum()}')
print(f'Indices: {dfPseudo.index.values}')

# Update main dataframe
dfCrises.loc[dfPseudo.index,'Group'] = 'Pandemic flu (1900)'

print('Clusters:')
print(dfPseudo.Cluster.value_counts())
# dfPseudo

['Frederiksborg Amt' 'Haderslev Amt' 'Hjørring Amt' 'Holbæk Amt'
 'Maribo Amt' 'Odense Amt' 'Præstø Amt' 'Sorø Amt' 'Svendborg Amt'
 'Thisted Amt' 'Vejle Amt' 'Ålborg Amt']
Total excess: 1874
Indices: [ 39  48  49  64  89 101 129 132 223 231 272 287 306 414]
Clusters:
Cluster
F    13
E     1
Name: count, dtype: int64


In [14]:
dfBothFlu = pd.concat([dfFlu1892,dfPseudo])
print('Number of crises in clusters:',dfBothFlu.Cluster.value_counts().to_dict())
print('Total mortality for pandemic influenza:',dfBothFlu.Excess.sum())

Number of crises in clusters: {'F': 37, 'E': 2}
Total mortality for pandemic influenza: 8336


In [75]:
dfPseudo.Amt.unique()

array(['Odense Amt', 'Maribo Amt', 'Hjørring Amt', 'Holbæk Amt',
       'Præstø Amt', 'Svendborg Amt', 'Frederiksborg Amt', 'Vejle Amt',
       'Thisted Amt', 'Haderslev Amt', 'Sorø Amt', 'Ålborg Amt'],
      dtype=object)

In [74]:
print('Counties not experiencing Pseudo-influenza: ')
print(np.setdiff1d(dfCrises.Amt.unique(),dfPseudo.Amt.unique()))

Counties not experiencing Pseudo-influenza: 
['Bornholms Amt' 'Københavns Amt' 'Nordborg Amt' 'Randers Amt' 'Ribe Amt'
 'Ringkøbing Amt' 'Skanderborg Amt' 'Staden København' 'Sønderborg Amt'
 'Tønder Amt' 'Viborg Amt' 'Åbenrå Amt' 'Århus Amt']


# Scarlatina


Criteria:

Age: Mostly cluster B (Almost all 1-14)

Geography: Mid and northern Jutland

Time: Winter 1857/1858

Comment: High infant mortality is observed in the rest of the country in the fall/winter the following years (in 1858/1859: Odense, Svendborg, Præstø, Maribo, and 1859/1860 in Maribo). 

Further investigation of historical records may reveal this to also be scarlet fever, but it is omitted from the count for the winter 1857/1858.

Something also seems to occur on Bornholm, but is also ignored here.

Extra: Three mortality crises are related, but seem to also have a wave of some disease among elderly, so they end up with a strange age-distribution (and not cluster A)
<!-- (232,117,180) -->

In [15]:
amtJutlandMidNorth = ['Thisted Amt','Randers Amt','Århus Amt','Ålborg Amt','Ringkøbing Amt','Viborg Amt','Hjørring Amt','Skanderborg Amt']

In [56]:
# dfScar = dfCrises.copy()

# dfScar = dfScar[dfScar.DayWithMostDeaths >= np.datetime64('1857')]
# dfScar = dfScar[dfScar.DayWithMostDeaths <= np.datetime64('1859')]

# dfScar = dfScar[dfScar.Amt.isin(amtJutlandMidNorth)]
# dfScar

# dfCrises[dfCrises.Amt == 'Hjørring Amt']

In [58]:
dfScar = dfCrises.copy()

dfScar = dfScar[dfScar.DayWithMostDeaths >= np.datetime64('1857')]
dfScar = dfScar[dfScar.DayWithMostDeaths <= np.datetime64('1859')]

dfScar = dfScar[dfScar.Amt.isin(amtJutlandMidNorth)]

dfScar = dfScar[dfScar.Cluster != 'E']
dfScar = dfScar[dfScar.Cluster != 'F']

print(dfScar.Amt.unique())
print(f'Total excess: {dfScar.Excess.sum()}')
print(f'Indices: {dfScar.index.values}')
 
# Update main dataframe
dfCrises.loc[dfScar.index,'Group'] = 'Scarlatina'

dfScar.Cluster.value_counts()

['Thisted Amt' 'Ålborg Amt' 'Viborg Amt' 'Randers Amt' 'Århus Amt'
 'Skanderborg Amt']
Total excess: 2241
Indices: [  5  22  37  86 110 154 170 180 246 247 313 325 331]


Cluster
B    9
C    4
Name: count, dtype: int64

In [59]:
dfScar[dfScar.Cluster != 'B'].Excess.sum()

360

In [60]:
dfScar[['Amt','Cluster']]

Unnamed: 0,Amt,Cluster
5,Thisted Amt,B
22,Ålborg Amt,B
37,Viborg Amt,B
86,Viborg Amt,B
110,Randers Amt,C
154,Århus Amt,B
170,Århus Amt,C
180,Skanderborg Amt,C
246,Skanderborg Amt,C
247,Thisted Amt,B


# Determine how big a proportion of all excess deaths that has been identified as disease-related

In [18]:

numFound    = dfCrises[dfCrises.Group != 'None']['Excess'].sum()
numNotFound = dfCrises[dfCrises.Group == 'None']['Excess'].sum()
print(str(numFound) + ' burials grouped')
print(str(numNotFound) + ' burials still ungrouped')
print(f'Hence, {100 * numFound/(numFound+numNotFound):2.0f}% of burials has been classified')

30198 burials grouped
29981 burials still ungrouped
Hence, 50% of burials has been classified


# War

Crises that coincide with wars or at least occur in years with major wars.

Manual checks of these suggest that many were probably all war-related, as also suggested by the gender ratio. 

This group is however not included directly in paper, as further validation with historical sources is required, and since war-related mortality may also be more spuriously related to war, e.g. due to troop movements or generally worsened conditions during wartime.

In [19]:
dfWar = dfCrises.copy()

dfWar = dfWar[(dfWar.DayWithMostDeaths.dt.year == 1864) | (dfWar.DayWithMostDeaths.dt.year == 1849)]

dfWar.sort_values(by='GenderRatio',ascending=False)


print(np.sort(dfWar.Amt.unique()))
print(f'Total excess: {dfWar.Excess.sum()}')
# print(f'Indices: {dfWar.index.values}')

# Update main dataframe
dfCrises.loc[dfWar.index,'Group'] = 'Possibly war-related'

['Haderslev Amt' 'Maribo Amt' 'Nordborg Amt' 'Odense Amt' 'Præstø Amt'
 'Skanderborg Amt' 'Staden København' 'Svendborg Amt' 'Sønderborg Amt'
 'Vejle Amt' 'Åbenrå Amt' 'Ålborg Amt']
Total excess: 3665


In [20]:

numFound    = dfCrises[dfCrises.Group != 'None']['Excess'].sum()
numNotFound = dfCrises[dfCrises.Group == 'None']['Excess'].sum()
print(str(numFound) + ' burials grouped')
print(str(numNotFound) + ' burials still ungrouped')
print(f'Hence, {100 * numFound/(numFound+numNotFound):2.0f}% of burials has been classified')

33863 burials grouped
26316 burials still ungrouped
Hence, 56% of burials has been classified


# Save main table

In [21]:
# Save the results in the 'Data' directory
curFileName = finalResultsFilename + '_Grouped'
dfCrises.to_csv(pathData+ curFileName + '.csv',index=False)

# Save a copy to the main directory
Reorder results, rename some columns and save to top directory

In [22]:
saveToMainFile = True
# saveToMainFile = False

In [23]:
dfCrisesMainFile = dfCrises.copy()

In [24]:
ageGroupNames = ['Infants_stillborn','1-14','15-39','40-59','60+']
ageExc = ['Exc_'+x for x in ageGroupNames]
agePct = ['Pct_'+x for x in ageGroupNames]
ageDataSum = ['DataSum_'+x for x in ageGroupNames]
ageBaseline = ['Baseline_'+x for x in ageGroupNames]
ageFracAll = ['FracAll_'+x for x in ageGroupNames]


clusterNames = ['ClusterA_Prob','ClusterB_Prob','ClusterC_Prob','ClusterD_Prob','ClusterE_Prob','ClusterF_Prob']


In [25]:
dfCrisesMainFile.replace([np.inf, -np.inf], np.nan, inplace=True) # Replace infinite values with nan

# Round some of the columns to make it easier to work with for others
dfCrisesMainFile['ExcessPct'] = dfCrisesMainFile['ExcessPct'].astype(int) # Is already an integer, so no need to round
dfCrisesMainFile['PopulationEstimate'] = dfCrisesMainFile['PopulationEstimate'].astype('Int64') # Is already an integer, so no need to round. Has to be Int64 as some are NaN 
dfCrisesMainFile['GenderRatio'] = dfCrisesMainFile['GenderRatio'].round(4)

dfCrisesMainFile[ageExc] = dfCrisesMainFile[ageExc].round(2)
dfCrisesMainFile[ageBaseline] = dfCrisesMainFile[ageBaseline].round(2)
dfCrisesMainFile[ageFracAll] = dfCrisesMainFile[ageFracAll].round(4)
dfCrisesMainFile[agePct] = dfCrisesMainFile[agePct].round().astype('Int64')
dfCrisesMainFile[ageDataSum] = dfCrisesMainFile[ageDataSum].round().astype('Int64')

dfCrisesMainFile[clusterNames] = dfCrisesMainFile[clusterNames].round(4)

dfCrisesMainFile['AgeSplitSumExc'] = dfCrisesMainFile['AgeSplitSumExc'].round(2)

In [26]:
# Reorder manually
# print(dfCrisesMainFile.columns)
newOrder = [
    'Amt',
    'Group',
    'Start',
    'End',
    'DayWithMostDeaths',
    'NumberOfDays',
    'Excess',
    'ExcessPct',
    'GenderRatio',
    'Cluster',
    'Season',
    'TimeOfYear',
    'PopulationEstimate',
    'Exc_Infants_stillborn',
    'Exc_1-14',
    'Exc_15-39',
    'Exc_40-59',
    'Exc_60+',
    'Pct_Infants_stillborn',
    'Pct_1-14',
    'Pct_15-39',
    'Pct_40-59',
    'Pct_60+',
    'DataSum_Infants_stillborn',
    'DataSum_1-14',
    'DataSum_15-39',
    'DataSum_40-59',
    'DataSum_60+',
    'Baseline_Infants_stillborn',
    'Baseline_1-14',
    'Baseline_15-39',
    'Baseline_40-59',
    'Baseline_60+',
    'FracAll_Infants_stillborn',
    'FracAll_1-14',
    'FracAll_15-39',
    'FracAll_40-59',
    'FracAll_60+',
    'AgeSplitSumExc',
    'ClusterA_Prob',
    'ClusterB_Prob',
    'ClusterC_Prob',
    'ClusterD_Prob',
    'ClusterE_Prob',
    'ClusterF_Prob',
]


In [27]:
# Reorder
dfCrisesMainFile = dfCrisesMainFile[newOrder]

In [28]:
# Rename columns to more readable names
ageExcNew = ['Excess_'+x for x in ageGroupNames]
agePctNew = ['PercentExcess_'+x for x in ageGroupNames]
ageFracAllNew = ['FractionOfAllExcess_'+x for x in ageGroupNames]
clusterProbsFull = [x+'ability' for x in clusterNames]

dfCrisesMainFile = dfCrisesMainFile.rename(columns=dict(zip(ageExc,ageExcNew)))
dfCrisesMainFile = dfCrisesMainFile.rename(columns=dict(zip(agePct,agePctNew)))
dfCrisesMainFile = dfCrisesMainFile.rename(columns=dict(zip(ageFracAll,ageFracAllNew)))
dfCrisesMainFile = dfCrisesMainFile.rename(columns=dict(zip(clusterNames,clusterProbsFull)))
dfCrisesMainFile = dfCrisesMainFile.rename(columns={'AgeSplitSumExc':'SumOfAgeSpecificExcess'})

In [29]:
dfCrisesMainFile['Amt'] = dfCrisesMainFile.Amt.str.replace('Amt','County')
dfCrisesMainFile['Amt'] = dfCrisesMainFile.Amt.str.replace('Staden København','Copenhagen City')
dfCrisesMainFile['Amt'] = dfCrisesMainFile.Amt.str.replace('Københavns','Copenhagen')


In [30]:
# dfCrisesMainFile.head()
if saveToMainFile:
    dfCrisesMainFile.to_csv('../SupplementaryTable_AllCrises.csv',index=False)

    # Also save file as excel (with dates as strings to avoid excel-problems)
    dfCrisesMainFileExcel = dfCrisesMainFile.copy()
    dfCrisesMainFileExcel['Start'] = dfCrisesMainFileExcel['Start'].astype(str)
    dfCrisesMainFileExcel['End'] = dfCrisesMainFileExcel['End'].astype(str)
    dfCrisesMainFileExcel['DayWithMostDeaths'] = dfCrisesMainFileExcel['DayWithMostDeaths'].astype(str)

    dfCrisesMainFileExcel.to_excel('../SupplementaryTable_AllCrises.xlsx',index=False)

# Various checks for use in article text

In [31]:
dfCrises.groupby('Group')['Excess'].sum()

Group
Child mortality 1829         2720
Cholera (1853)               5217
Cholera (1857)                550
Harvest-epidemics           10974
None                        26316
Pandemic Flu (1891/1892)     6462
Pandemic flu (1900)          1874
Possibly war-related         3665
Scarlatina                   2401
Name: Excess, dtype: int64

In [32]:
# dfCrises.groupby('Group').sum()['Excess']
finalResultsFilename

'AllCrises_NonSmoothed_Years12_Threshold3_LowerThreshold2_MaxDaysBelow7_minLength0_minCount50_Clustered'

In [33]:
# dfCrises[dfCrises.DayWithMostBurials.dt.year == 1872]
dfCrises[dfCrises.DayWithMostDeaths.dt.year == 1872]

Unnamed: 0,Amt,Start,End,NumberOfDays,DayWithMostDeaths,Excess,ExcessPct,GenderRatio,TimeOfYear,Season,PopulationEstimate,Exc_Infants_stillborn,Exc_1-14,Exc_15-39,Exc_40-59,Exc_60+,Pct_Infants_stillborn,Pct_1-14,Pct_15-39,Pct_40-59,Pct_60+,DataSum_Infants_stillborn,DataSum_1-14,DataSum_15-39,DataSum_40-59,DataSum_60+,Baseline_Infants_stillborn,Baseline_1-14,Baseline_15-39,Baseline_40-59,Baseline_60+,FracAll_Infants_stillborn,FracAll_1-14,FracAll_15-39,FracAll_40-59,FracAll_60+,AgeSplitSumExc,Cluster,ClusterA_Prob,ClusterB_Prob,ClusterC_Prob,ClusterD_Prob,ClusterE_Prob,ClusterF_Prob,Group
311,Maribo Amt,1872-11-13,1872-11-14,1,1872-11-13,60,700.0,0.5,Q4,Fall,92469.0,1.424242,21.492095,17.227273,12.938735,7.661232,90.0,1425.0,2229.0,1219.0,328.0,3.0,23.0,18.0,14.0,10.0,1.575758,1.507905,0.772727,1.061265,2.338768,0.023447,0.353817,0.283606,0.213006,0.126124,60.743577,D,0.0,0.0,0.397,0.5801,0.0229,0.0,


In [61]:
# dfCrises[(dfCrises.Cluster == cholCluster)]

# dfCrises[dfCrises.DayWithMostBurials.dt.year == 1872]
dfCrises[dfCrises.DayWithMostDeaths.dt.year == 1877]
dfCrises[dfCrises.DayWithMostDeaths.dt.year == 1874]
# dfCrises[dfCrises.DayWithMostDeaths.dt.year == 1875]

Unnamed: 0,Amt,Start,End,NumberOfDays,DayWithMostDeaths,Excess,ExcessPct,GenderRatio,TimeOfYear,Season,PopulationEstimate,Exc_Infants_stillborn,Exc_1-14,Exc_15-39,Exc_40-59,Exc_60+,Pct_Infants_stillborn,Pct_1-14,Pct_15-39,Pct_40-59,Pct_60+,DataSum_Infants_stillborn,DataSum_1-14,DataSum_15-39,DataSum_40-59,DataSum_60+,Baseline_Infants_stillborn,Baseline_1-14,Baseline_15-39,Baseline_40-59,Baseline_60+,FracAll_Infants_stillborn,FracAll_1-14,FracAll_15-39,FracAll_40-59,FracAll_60+,AgeSplitSumExc,Cluster,ClusterA_Prob,ClusterB_Prob,ClusterC_Prob,ClusterD_Prob,ClusterE_Prob,ClusterF_Prob,Group
241,Københavns Amt,1874-05-02,1874-05-18,16,1874-05-02,69,55.0,0.507692,Q2,Spring,111181.0,16.02388,15.515481,17.108696,13.653656,12.942029,64.0,69.0,133.0,71.0,36.0,41.0,38.0,30.0,33.0,49.0,24.97612,22.484519,12.891304,19.346344,36.057971,0.21296,0.206203,0.227377,0.181459,0.172001,75.243742,C,0.0002,0.0,0.6831,0.185,0.1317,0.0,
268,Københavns Amt,1874-07-26,1874-08-12,17,1874-08-10,65,57.0,0.52514,Q3,Summer,111659.0,47.817029,7.847826,3.956192,5.173395,2.025362,132.0,49.0,30.0,37.0,8.0,84.0,24.0,17.0,19.0,27.0,36.182971,16.152174,13.043808,13.826605,24.974638,0.715612,0.117448,0.059207,0.077423,0.030311,66.819805,A,1.0,0.0,0.0,0.0,0.0,0.0,
416,Århus Amt,1874-06-12,1874-06-30,18,1874-06-15,50,37.0,0.526882,Q2,Summer,132028.0,2.167984,21.459721,13.242095,7.739977,9.82971,12.0,78.0,64.0,48.0,32.0,21.0,49.0,34.0,24.0,41.0,18.832016,27.540279,20.757905,16.260023,31.17029,0.039824,0.394194,0.243244,0.142176,0.180562,54.439488,C,0.0,0.0007,0.9967,0.0004,0.0021,0.0,


In [64]:

dfCrises[dfCrises.DayWithMostDeaths.dt.year == 1889]

Unnamed: 0,Amt,Start,End,NumberOfDays,DayWithMostDeaths,Excess,ExcessPct,GenderRatio,TimeOfYear,Season,PopulationEstimate,Exc_Infants_stillborn,Exc_1-14,Exc_15-39,Exc_40-59,Exc_60+,Pct_Infants_stillborn,Pct_1-14,Pct_15-39,Pct_40-59,Pct_60+,DataSum_Infants_stillborn,DataSum_1-14,DataSum_15-39,DataSum_40-59,DataSum_60+,Baseline_Infants_stillborn,Baseline_1-14,Baseline_15-39,Baseline_40-59,Baseline_60+,FracAll_Infants_stillborn,FracAll_1-14,FracAll_15-39,FracAll_40-59,FracAll_60+,AgeSplitSumExc,Cluster,ClusterA_Prob,ClusterB_Prob,ClusterC_Prob,ClusterD_Prob,ClusterE_Prob,ClusterF_Prob,Group
51,Staden København,1889-07-01,1889-08-03,33,1889-07-29,245,48.0,0.493386,Q3,Summer,309050.0,123.182642,37.860107,33.766963,7.996424,-5.548254,89.0,49.0,47.0,11.0,-5.0,261.0,115.0,106.0,82.0,101.0,137.817358,77.139893,72.233037,74.003576,106.548254,0.607391,0.186681,0.166499,0.039429,0.0,202.806136,A,1.0,0.0,0.0,0.0,0.0,0.0,
168,Århus Amt,1889-06-28,1889-07-23,25,1889-07-02,87,45.0,0.55914,Q3,Summer,156331.0,18.53195,26.413867,5.819782,3.392457,12.70471,46.0,84.0,21.0,14.0,26.0,59.0,58.0,33.0,27.0,62.0,40.46805,31.586133,27.180218,23.607543,49.29529,0.277164,0.395046,0.087041,0.050738,0.190012,66.862766,C,0.0,0.0439,0.956,0.0,0.0,0.0001,
242,Københavns Amt,1889-07-12,1889-07-30,18,1889-07-29,69,44.0,0.482456,Q3,Summer,151222.0,39.526374,21.306607,14.181983,-0.8706,-0.55863,87.0,108.0,80.0,-4.0,-1.0,85.0,41.0,32.0,22.0,42.0,45.473626,19.693393,17.818017,22.8706,42.55863,0.526913,0.284031,0.189055,0.0,0.0,75.014963,A,0.9777,0.0,0.0223,0.0,0.0,0.0,
379,Odense Amt,1889-07-12,1889-07-22,10,1889-07-17,53,91.0,0.544643,Q3,Summer,135775.0,26.050725,8.838622,7.29529,1.488801,13.724638,238.0,123.0,84.0,23.0,65.0,37.0,16.0,16.0,8.0,35.0,10.949275,7.161378,8.70471,6.511199,21.275362,0.453861,0.153988,0.1271,0.025938,0.239113,57.398076,A,0.8512,0.0,0.1481,0.0,0.0001,0.0006,


In [35]:
dfCrises.Amt.value_counts().index

Index(['Staden København', 'Maribo Amt', 'Odense Amt', 'Københavns Amt',
       'Præstø Amt', 'Århus Amt', 'Sorø Amt', 'Holbæk Amt',
       'Frederiksborg Amt', 'Ringkøbing Amt', 'Svendborg Amt', 'Vejle Amt',
       'Hjørring Amt', 'Ålborg Amt', 'Thisted Amt', 'Ribe Amt',
       'Skanderborg Amt', 'Haderslev Amt', 'Randers Amt', 'Viborg Amt',
       'Bornholms Amt', 'Sønderborg Amt', 'Åbenrå Amt', 'Nordborg Amt',
       'Tønder Amt'],
      dtype='object', name='Amt')

In [36]:
allAmts = [
'Staden København', 
'Maribo Amt', 
'Odense Amt', 
'Københavns Amt',
'Præstø Amt', 
'Århus Amt', 
'Sorø Amt', 
'Holbæk Amt',
'Frederiksborg Amt', 
'Ringkøbing Amt', 
'Svendborg Amt', 
'Vejle Amt',
'Hjørring Amt', 
'Ålborg Amt', 
'Thisted Amt', 
'Ribe Amt',
'Skanderborg Amt', 
'Haderslev Amt', 
'Randers Amt', 
'Viborg Amt',
'Bornholms Amt', 
'Sønderborg Amt', 
'Åbenrå Amt', 
'Nordborg Amt',
'Tønder Amt',
]

In [70]:
allAmts = [
'Staden København', 
'Københavns Amt',
'Frederiksborg Amt', 
'Holbæk Amt',
'Sorø Amt', 
'Præstø Amt', 
'Maribo Amt', 
'Bornholms Amt', 
'Odense Amt', 
'Svendborg Amt', 
'Hjørring Amt',
'Thisted Amt', 
'Ålborg Amt', 
'Viborg Amt',
'Ringkøbing Amt', 
'Ribe Amt',
'Randers Amt', 
'Skanderborg Amt', 
'Århus Amt', 
'Vejle Amt',
'Sønderborg Amt', 
'Haderslev Amt', 
'Åbenrå Amt', 
'Nordborg Amt',
'Tønder Amt',
]

In [71]:
# for amt in dfCrises.Amt.unique():
for amt in allAmts:

    curdfAmt = dfCrises[dfCrises.Amt == amt]
    print(f'{amt:18} , {len(curdfAmt):2.0f} , {curdfAmt.Excess.sum():5.0f}')

Staden København   , 53 , 11090
Københavns Amt     , 28 ,  3823
Frederiksborg Amt  , 19 ,  1763
Holbæk Amt         , 21 ,  4305
Sorø Amt           , 22 ,  4511
Præstø Amt         , 25 ,  4532
Maribo Amt         , 41 ,  6796
Bornholms Amt      ,  7 ,   886
Odense Amt         , 33 ,  4068
Svendborg Amt      , 17 ,  1755
Hjørring Amt       , 15 ,  1375
Thisted Amt        , 11 ,  1448
Ålborg Amt         , 13 ,  1901
Viborg Amt         ,  9 ,  1131
Ringkøbing Amt     , 19 ,  1757
Ribe Amt           , 11 ,   741
Randers Amt        ,  9 ,  1039
Skanderborg Amt    ,  9 ,   607
Århus Amt          , 23 ,  2513
Vejle Amt          , 16 ,  1624
Sønderborg Amt     ,  3 ,   948
Haderslev Amt      ,  9 ,  1083
Åbenrå Amt         ,  2 ,   285
Nordborg Amt       ,  2 ,   140
Tønder Amt         ,  1 ,    58


In [39]:
# Define the agegroups analyzed
ageGroups = [
    ['Total'],
    ['Stillborn','0'],
    ['1-4','5-9', '10-14'],
    ['15-19', '20-24', '25-29', '30-34', '35-39'],
    ['40-44', '45-49', '50-54', '55-59'],
    ['60-64', '65-69', '70-74', '75-79', '80+']
]

# And the names used for directories and filenames
ageGroupNames = [
    'Total',
    'Infants_stillborn',
    '1-14',
    '15-39',
    '40-59',
    '60+'
]

ageGroupNamesBaseline = ['Baseline_'+x for x in ageGroupNames[1:]]
ageGroupNamesDataSum = ['DataSum_'+x for x in ageGroupNames[1:]]


In [40]:
dfHarvest[ageGroupNamesDataSum].sum().sum() / dfHarvest[ageGroupNamesBaseline].sum().sum()

2.982705446054282

In [47]:
dfChol[dfChol.Amt.str.contains('Køben')].Excess.sum()
# dfChol[dfChol.Amt.str.contains('Staden ')]


4411

In [48]:
dfChol[dfChol.Amt.str.contains('Køben')]

Unnamed: 0,Amt,Start,End,NumberOfDays,DayWithMostDeaths,Excess,ExcessPct,GenderRatio,TimeOfYear,Season,PopulationEstimate,Exc_Infants_stillborn,Exc_1-14,Exc_15-39,Exc_40-59,Exc_60+,Pct_Infants_stillborn,Pct_1-14,Pct_15-39,Pct_40-59,Pct_60+,DataSum_Infants_stillborn,DataSum_1-14,DataSum_15-39,DataSum_40-59,DataSum_60+,Baseline_Infants_stillborn,Baseline_1-14,Baseline_15-39,Baseline_40-59,Baseline_60+,FracAll_Infants_stillborn,FracAll_1-14,FracAll_15-39,FracAll_40-59,FracAll_60+,AgeSplitSumExc,Cluster,ClusterA_Prob,ClusterB_Prob,ClusterC_Prob,ClusterD_Prob,ClusterE_Prob,ClusterF_Prob,Group
0,Staden København,1853-07-02,1853-09-07,67,1853-07-28,3833,546.0,0.468578,Q3,Summer,139473.0,81.519622,444.281691,942.4992,1328.849144,882.05702,62.0,401.0,745.0,1217.0,825.0,212.0,555.0,1069.0,1438.0,989.0,130.480378,110.718309,126.5008,109.150857,106.94298,0.022157,0.120755,0.256169,0.361178,0.239741,3679.206677,D,0.0,0.0,0.0,0.9866,0.0134,0.0,
11,Københavns Amt,1853-07-16,1853-09-24,70,1853-07-31,578,206.0,0.480186,Q3,Summer,76787.0,16.390904,97.367909,149.32477,167.477201,155.673631,27.0,240.0,431.0,424.0,250.0,77.0,138.0,184.0,207.0,218.0,60.609096,40.632091,34.67523,39.522799,62.326369,0.02796,0.16609,0.254719,0.285683,0.265548,586.234414,D,0.0,0.0,0.0011,0.7465,0.2524,0.0,


In [43]:
dfChol.Excess.sum()

5154

In [50]:
dfScar.head(1)

Unnamed: 0,Amt,Start,End,NumberOfDays,DayWithMostDeaths,Excess,ExcessPct,GenderRatio,TimeOfYear,Season,PopulationEstimate,Exc_Infants_stillborn,Exc_1-14,Exc_15-39,Exc_40-59,Exc_60+,Pct_Infants_stillborn,Pct_1-14,Pct_15-39,Pct_40-59,Pct_60+,DataSum_Infants_stillborn,DataSum_1-14,DataSum_15-39,DataSum_40-59,DataSum_60+,Baseline_Infants_stillborn,Baseline_1-14,Baseline_15-39,Baseline_40-59,Baseline_60+,FracAll_Infants_stillborn,FracAll_1-14,FracAll_15-39,FracAll_40-59,FracAll_60+,AgeSplitSumExc,Cluster,ClusterA_Prob,ClusterB_Prob,ClusterC_Prob,ClusterD_Prob,ClusterE_Prob,ClusterF_Prob,Group
5,Thisted Amt,1857-11-21,1858-04-02,132,1858-01-23,659,162.0,0.494845,Q1,Winter,53386.0,33.94634,594.252136,33.646821,9.460964,3.609919,72.0,995.0,151.0,40.0,3.0,81.0,654.0,56.0,33.0,114.0,47.05366,59.747864,22.353179,23.539036,110.390081,0.050297,0.880483,0.049853,0.014018,0.005349,674.916179,B,0.0,1.0,0.0,0.0,0.0,0.0,


In [65]:
dfCrises.Excess.sum()

60179

In [69]:
dfCrises[dfCrises.Amt == 'Odense Amt'].iloc[1:2]

Unnamed: 0,Amt,Start,End,NumberOfDays,DayWithMostDeaths,Excess,ExcessPct,GenderRatio,TimeOfYear,Season,PopulationEstimate,Exc_Infants_stillborn,Exc_1-14,Exc_15-39,Exc_40-59,Exc_60+,Pct_Infants_stillborn,Pct_1-14,Pct_15-39,Pct_40-59,Pct_60+,DataSum_Infants_stillborn,DataSum_1-14,DataSum_15-39,DataSum_40-59,DataSum_60+,Baseline_Infants_stillborn,Baseline_1-14,Baseline_15-39,Baseline_40-59,Baseline_60+,FracAll_Infants_stillborn,FracAll_1-14,FracAll_15-39,FracAll_40-59,FracAll_60+,AgeSplitSumExc,Cluster,ClusterA_Prob,ClusterB_Prob,ClusterC_Prob,ClusterD_Prob,ClusterE_Prob,ClusterF_Prob,Group
14,Odense Amt,1849-05-08,1849-09-09,124,1849-07-08,508,88.0,0.632841,Q3,Summer,101343.0,26.60799,110.365307,97.145798,36.929296,32.133729,26.0,104.0,112.0,48.0,20.0,130.0,216.0,184.0,114.0,194.0,103.39201,105.634693,86.854202,77.070704,161.866271,0.087762,0.364023,0.320421,0.121806,0.105988,303.18212,C,0.0,0.0,0.9865,0.012,0.0015,0.0,Possibly war-related
