In [1]:
import pandas as pd
from matplotlib import pyplot as plt
import altair as alt
import seaborn as sns
import numpy as np

# Data Inputs and descriptions

- underlying_data_2020_06_01 


### 1. load cross reference data and group by LSOA11CD

In [2]:
cross_ref = pd.read_csv("Output_Area_to_LSOA_to_MSOA_to_Local_Authority_District_(December_2017)_Lookup_with_Area_Classifications_in_Great_Britain.csv")

cross_ref.drop(['OA11CD'], axis=1, inplace=True)
cross_ref.drop(['LAD17CD'], axis=1, inplace=True)

In [3]:
lsoa_to_msoa = cross_ref.groupby(['LSOA11CD','LSOA11NM','MSOA11CD','MSOA11NM']).count().reset_index()
lsoa_to_msoa.count()

LSOA11CD    4835
LSOA11NM    4835
MSOA11CD    4835
MSOA11NM    4835
dtype: int64

### 2. load london only covid data and group it all into one table
From this one dataset (containing data up to 28 May, 2020) we load and then merge the following:
- deaths 
- population over 70
- occupation at risk
- ethnic_group
- medical_conditions

In [4]:
deaths = pd.read_excel("underlying_data_2020_06_01.xlsx", sheet_name='1 deaths')
population = pd.read_excel("underlying_data_2020_06_01.xlsx", sheet_name='2 population')
occupation = pd.read_excel("underlying_data_2020_06_01.xlsx", sheet_name='3 occupation')
ethnicity = pd.read_excel("underlying_data_2020_06_01.xlsx", sheet_name='4 ethnic_group')
health = pd.read_excel("underlying_data_2020_06_01.xlsx", sheet_name='7 medical_conditions')

london_covid_total = pd.merge(deaths, population, left_on='MSOA11CD', right_on='MSOA11CD', how = 'inner')
london_covid_total = pd.merge(london_covid_total, occupation, left_on='MSOA11CD', right_on='MSOA11CD', how = 'inner')
london_covid_total = pd.merge(london_covid_total, ethnicity, left_on='MSOA11CD', right_on='MSOA11CD', how = 'inner')
london_covid_total = pd.merge(london_covid_total, health, left_on='MSOA11CD', right_on='MSOA11CD', how = 'inner')

london_covid_total.count()

MSOA11CD                         983
MSOA11NM_x                       983
Local authority_x                983
covid_19_deaths                  983
covid_19_deaths_per_thousand     983
MSOA11NM_y                       983
Local authority_y                983
total_population_mid_2018        983
over_70_prop                     983
MSOA11NM_x                       983
Local authority_x                983
proportion_at_risk_jobs          983
insecure_proportion              983
MSOA11NM_y                       983
Local Authority                  983
all_bame_prop                    983
all_black_prop                   983
pakistani_or_bangladeshi_prop    983
all_indian_prop                  983
MSOA11NM                         983
Local authority_y                983
total_registered_patients        983
Hypertension                     983
Obesity (18+)                    983
Diabetes                         983
Asthma                           983
Coronary heart disease           983
d

In [5]:
london_covid_total.drop(['MSOA11NM_x'], axis=1, inplace=True)
london_covid_total.drop(['MSOA11NM_y'], axis=1, inplace=True)
london_covid_total.drop(['Local authority_x'], axis=1, inplace=True)
london_covid_total.drop(['Local authority_y'], axis=1, inplace=True)
london_covid_total.head()

Unnamed: 0,MSOA11CD,covid_19_deaths,covid_19_deaths_per_thousand,total_population_mid_2018,over_70_prop,proportion_at_risk_jobs,insecure_proportion,Local Authority,all_bame_prop,all_black_prop,pakistani_or_bangladeshi_prop,all_indian_prop,MSOA11NM,total_registered_patients,Hypertension,Obesity (18+),Diabetes,Asthma,Coronary heart disease
0,E02000001,3,0.34459,8706,0.123593,0.120794,0.077743,City of London,0.213695,0.026169,0.033627,0.029288,City of London 001,8584,7.95,3.62,2.7,3.08,1.5
1,E02000002,2,0.259067,7720,0.103886,0.345813,0.318841,Barking and Dagenham,0.350111,0.167232,0.04797,0.039852,Barking and Dagenham 001,8315,9.26,6.15,4.99,3.91,1.88
2,E02000003,3,0.271469,11051,0.085965,0.309099,0.26085,Barking and Dagenham,0.453858,0.161075,0.104629,0.080836,Barking and Dagenham 002,11873,10.33,7.6,5.79,4.64,1.83
3,E02000004,9,1.366535,6586,0.124658,0.311407,0.26557,Barking and Dagenham,0.19023,0.104982,0.016338,0.007603,Barking and Dagenham 003,6852,9.34,8.36,4.95,3.58,1.93
4,E02000005,5,0.488806,10229,0.066282,0.337135,0.322392,Barking and Dagenham,0.337304,0.168769,0.058748,0.035155,Barking and Dagenham 004,11150,9.37,8.79,5.15,4.26,1.47


### 3. load deprivation data
The domains of deprivation gives us a ranking from 1 to many thousands for deprivation along a number of dimensions:
- income_rank
- environment_rank
- education_rank
- health_rank
- housing_rank
- living_environment_rank

In [6]:
uk_deprivation_data = pd.read_excel("File_2_-_IoD2019_Domains_of_Deprivation.xlsx", sheet_name='IoD2019 Domains',
                                    usecols = "A,B,C,D,G,I,K, M, Q, S", names=['lsoa_code', 'lsoa_name', 
                                                            'local_auth_code', 'local_auth_name',  'income_rank', 
                                                            'employment_rank', 'education_rank', 'health_rank',
                                                            'housing_rank', 'living_environment_rank'])

london_deprivation = uk_deprivation_data[uk_deprivation_data['local_auth_code'].str.contains('E09')]
print(london_deprivation.count())
london_deprivation.head()

lsoa_code                  4835
lsoa_name                  4835
local_auth_code            4835
local_auth_name            4835
income_rank                4835
employment_rank            4835
education_rank             4835
health_rank                4835
housing_rank               4835
living_environment_rank    4835
dtype: int64


Unnamed: 0,lsoa_code,lsoa_name,local_auth_code,local_auth_name,income_rank,employment_rank,education_rank,health_rank,housing_rank,living_environment_rank
0,E01000001,City of London 001A,E09000001,City of London,32831,32742,32842,32113,7319,7789
1,E01000002,City of London 001B,E09000001,City of London,29901,31190,32832,29705,11707,13070
2,E01000003,City of London 001C,E09000001,City of London,18510,15103,26386,17600,2157,4092
3,E01000005,City of London 001E,E09000001,City of London,6029,7833,12370,17907,2217,9397
4,E01000006,Barking and Dagenham 016A,E09000002,Barking and Dagenham,14023,21692,17511,21581,1033,10629


### Now merge deprivation and cross reference to get the link between lsoa and msoa

In [7]:
# Now merge with cross ref and then group by msoa code

print(london_deprivation.count())
london_deprivation_new = pd.merge(london_deprivation, lsoa_to_msoa, left_on='lsoa_code', right_on='LSOA11CD', how = 'inner')

print(london_deprivation_new.count())

london_deprivation_new.head()

lsoa_code                  4835
lsoa_name                  4835
local_auth_code            4835
local_auth_name            4835
income_rank                4835
employment_rank            4835
education_rank             4835
health_rank                4835
housing_rank               4835
living_environment_rank    4835
dtype: int64
lsoa_code                  4835
lsoa_name                  4835
local_auth_code            4835
local_auth_name            4835
income_rank                4835
employment_rank            4835
education_rank             4835
health_rank                4835
housing_rank               4835
living_environment_rank    4835
LSOA11CD                   4835
LSOA11NM                   4835
MSOA11CD                   4835
MSOA11NM                   4835
dtype: int64


Unnamed: 0,lsoa_code,lsoa_name,local_auth_code,local_auth_name,income_rank,employment_rank,education_rank,health_rank,housing_rank,living_environment_rank,LSOA11CD,LSOA11NM,MSOA11CD,MSOA11NM
0,E01000001,City of London 001A,E09000001,City of London,32831,32742,32842,32113,7319,7789,E01000001,City of London 001A,E02000001,City of London 001
1,E01000002,City of London 001B,E09000001,City of London,29901,31190,32832,29705,11707,13070,E01000002,City of London 001B,E02000001,City of London 001
2,E01000003,City of London 001C,E09000001,City of London,18510,15103,26386,17600,2157,4092,E01000003,City of London 001C,E02000001,City of London 001
3,E01000005,City of London 001E,E09000001,City of London,6029,7833,12370,17907,2217,9397,E01000005,City of London 001E,E02000001,City of London 001
4,E01000006,Barking and Dagenham 016A,E09000002,Barking and Dagenham,14023,21692,17511,21581,1033,10629,E01000006,Barking and Dagenham 016A,E02000017,Barking and Dagenham 016


### Now group by MSOA and then merge with London covid data to get one large dataset

In [8]:
london_deprivation_msoa = london_deprivation_new.groupby(['local_auth_code', 
                                                          'local_auth_name', 'MSOA11CD', 'MSOA11NM']).median().reset_index()

print(london_deprivation_msoa.count())
london_deprivation_msoa.head()

local_auth_code            983
local_auth_name            983
MSOA11CD                   983
MSOA11NM                   983
income_rank                983
employment_rank            983
education_rank             983
health_rank                983
housing_rank               983
living_environment_rank    983
dtype: int64


Unnamed: 0,local_auth_code,local_auth_name,MSOA11CD,MSOA11NM,income_rank,employment_rank,education_rank,health_rank,housing_rank,living_environment_rank
0,E09000001,City of London,E02000001,City of London 001,30837.5,30316.5,30856.5,23587.0,2635.5,5940.5
1,E09000002,Barking and Dagenham,E02000002,Barking and Dagenham 001,4670.0,6519.5,7695.5,14751.5,844.5,14801.0
2,E09000002,Barking and Dagenham,E02000003,Barking and Dagenham 002,11740.5,16384.0,15123.5,15674.5,1912.5,10359.0
3,E09000002,Barking and Dagenham,E02000004,Barking and Dagenham 003,12869.0,14473.0,13759.5,15516.0,4320.0,16398.5
4,E09000002,Barking and Dagenham,E02000005,Barking and Dagenham 004,9665.0,10100.0,8974.0,14418.0,1689.0,6036.0


### Now merge with the covid deaths data based on msoa code

In [9]:
london_covid_all = pd.merge(london_covid_total, london_deprivation_msoa, 
                            left_on='MSOA11CD', right_on='MSOA11CD', how = 'inner')

print(london_covid_all.count())
london_covid_all.head()

MSOA11CD                         983
covid_19_deaths                  983
covid_19_deaths_per_thousand     983
total_population_mid_2018        983
over_70_prop                     983
proportion_at_risk_jobs          983
insecure_proportion              983
Local Authority                  983
all_bame_prop                    983
all_black_prop                   983
pakistani_or_bangladeshi_prop    983
all_indian_prop                  983
MSOA11NM_x                       983
total_registered_patients        983
Hypertension                     983
Obesity (18+)                    983
Diabetes                         983
Asthma                           983
Coronary heart disease           983
local_auth_code                  983
local_auth_name                  983
MSOA11NM_y                       983
income_rank                      983
employment_rank                  983
education_rank                   983
health_rank                      983
housing_rank                     983
l

Unnamed: 0,MSOA11CD,covid_19_deaths,covid_19_deaths_per_thousand,total_population_mid_2018,over_70_prop,proportion_at_risk_jobs,insecure_proportion,Local Authority,all_bame_prop,all_black_prop,...,Coronary heart disease,local_auth_code,local_auth_name,MSOA11NM_y,income_rank,employment_rank,education_rank,health_rank,housing_rank,living_environment_rank
0,E02000001,3,0.34459,8706,0.123593,0.120794,0.077743,City of London,0.213695,0.026169,...,1.5,E09000001,City of London,City of London 001,30837.5,30316.5,30856.5,23587.0,2635.5,5940.5
1,E02000002,2,0.259067,7720,0.103886,0.345813,0.318841,Barking and Dagenham,0.350111,0.167232,...,1.88,E09000002,Barking and Dagenham,Barking and Dagenham 001,4670.0,6519.5,7695.5,14751.5,844.5,14801.0
2,E02000003,3,0.271469,11051,0.085965,0.309099,0.26085,Barking and Dagenham,0.453858,0.161075,...,1.83,E09000002,Barking and Dagenham,Barking and Dagenham 002,11740.5,16384.0,15123.5,15674.5,1912.5,10359.0
3,E02000004,9,1.366535,6586,0.124658,0.311407,0.26557,Barking and Dagenham,0.19023,0.104982,...,1.93,E09000002,Barking and Dagenham,Barking and Dagenham 003,12869.0,14473.0,13759.5,15516.0,4320.0,16398.5
4,E02000005,5,0.488806,10229,0.066282,0.337135,0.322392,Barking and Dagenham,0.337304,0.168769,...,1.47,E09000002,Barking and Dagenham,Barking and Dagenham 004,9665.0,10100.0,8974.0,14418.0,1689.0,6036.0


In [10]:
london_covid_all.drop(['Local Authority'], axis=1, inplace=True)
london_covid_all.drop(['total_registered_patients'], axis=1, inplace=True)
london_covid_all.drop(['MSOA11NM_y'], axis=1, inplace=True)

london_covid_all.rename(columns = {'MSOA11CD':'msoa_code'}, inplace = True)
london_covid_all.rename(columns = {'MSOA11NM_x':'msoa_name'}, inplace = True)
london_covid_all.rename(columns = {'Hypertension':'hypertension'}, inplace = True)
london_covid_all.rename(columns = {'Obesity (18+)':'obesity'}, inplace = True)
london_covid_all.rename(columns = {'Diabetes':'diabetes'}, inplace = True)
london_covid_all.rename(columns = {'Asthma':'asthma'}, inplace = True)
london_covid_all.rename(columns = {'Coronary heart disease':'heart_disease'}, inplace = True)

london_covid_all.head()


Unnamed: 0,msoa_code,covid_19_deaths,covid_19_deaths_per_thousand,total_population_mid_2018,over_70_prop,proportion_at_risk_jobs,insecure_proportion,all_bame_prop,all_black_prop,pakistani_or_bangladeshi_prop,...,asthma,heart_disease,local_auth_code,local_auth_name,income_rank,employment_rank,education_rank,health_rank,housing_rank,living_environment_rank
0,E02000001,3,0.34459,8706,0.123593,0.120794,0.077743,0.213695,0.026169,0.033627,...,3.08,1.5,E09000001,City of London,30837.5,30316.5,30856.5,23587.0,2635.5,5940.5
1,E02000002,2,0.259067,7720,0.103886,0.345813,0.318841,0.350111,0.167232,0.04797,...,3.91,1.88,E09000002,Barking and Dagenham,4670.0,6519.5,7695.5,14751.5,844.5,14801.0
2,E02000003,3,0.271469,11051,0.085965,0.309099,0.26085,0.453858,0.161075,0.104629,...,4.64,1.83,E09000002,Barking and Dagenham,11740.5,16384.0,15123.5,15674.5,1912.5,10359.0
3,E02000004,9,1.366535,6586,0.124658,0.311407,0.26557,0.19023,0.104982,0.016338,...,3.58,1.93,E09000002,Barking and Dagenham,12869.0,14473.0,13759.5,15516.0,4320.0,16398.5
4,E02000005,5,0.488806,10229,0.066282,0.337135,0.322392,0.337304,0.168769,0.058748,...,4.26,1.47,E09000002,Barking and Dagenham,9665.0,10100.0,8974.0,14418.0,1689.0,6036.0


### ONS monthly death data
The London death data only extends to 17th April and so doesn't really encompass the entire first wave, and for that we would need data extending to the end of June, 2020. 
- We therefore load additional ONS monthly death data from March to end of July to provide a more comprehensive set of death data that covers the vast majority of deaths within the first wave. THis data is also at MSOA level

In [11]:
ONS_covid_deaths = pd.read_excel("covidlocalareadeprivationupdate.xlsx", sheet_name='Table 5', 
                                 usecols = "A,Q:U", names=['msoa_code', 'march_deaths', 
                                                           'april_deaths', 'may_deaths', 'june_deaths', 'july_deaths'], skiprows=12)

ONS_covid_deaths['ons_total_deaths'] = ONS_covid_deaths.march_deaths + ONS_covid_deaths.april_deaths + ONS_covid_deaths.may_deaths + ONS_covid_deaths.june_deaths + ONS_covid_deaths.july_deaths

print(ONS_covid_deaths.count())
ONS_covid_deaths.head()

msoa_code           7201
march_deaths        7201
april_deaths        7201
may_deaths          7201
june_deaths         7201
july_deaths         7201
ons_total_deaths    7201
dtype: int64


Unnamed: 0,msoa_code,march_deaths,april_deaths,may_deaths,june_deaths,july_deaths,ons_total_deaths
0,E02000001,1,2,1,0,0,4
1,E02000002,1,7,0,0,0,8
2,E02000003,0,7,1,0,0,8
3,E02000004,2,8,2,0,0,12
4,E02000005,2,4,1,0,0,7


#### Merge with london_covid_all

In [12]:
london_covid_all_ons = pd.merge(london_covid_all, ONS_covid_deaths, 
                                left_on='msoa_code', right_on='msoa_code', how = 'inner')

print(london_covid_all_ons.count())
london_covid_all_ons.head(10)

msoa_code                        983
covid_19_deaths                  983
covid_19_deaths_per_thousand     983
total_population_mid_2018        983
over_70_prop                     983
proportion_at_risk_jobs          983
insecure_proportion              983
all_bame_prop                    983
all_black_prop                   983
pakistani_or_bangladeshi_prop    983
all_indian_prop                  983
msoa_name                        983
hypertension                     983
obesity                          983
diabetes                         983
asthma                           983
heart_disease                    983
local_auth_code                  983
local_auth_name                  983
income_rank                      983
employment_rank                  983
education_rank                   983
health_rank                      983
housing_rank                     983
living_environment_rank          983
march_deaths                     983
april_deaths                     983
m

Unnamed: 0,msoa_code,covid_19_deaths,covid_19_deaths_per_thousand,total_population_mid_2018,over_70_prop,proportion_at_risk_jobs,insecure_proportion,all_bame_prop,all_black_prop,pakistani_or_bangladeshi_prop,...,education_rank,health_rank,housing_rank,living_environment_rank,march_deaths,april_deaths,may_deaths,june_deaths,july_deaths,ons_total_deaths
0,E02000001,3,0.34459,8706,0.123593,0.120794,0.077743,0.213695,0.026169,0.033627,...,30856.5,23587.0,2635.5,5940.5,1,2,1,0,0,4
1,E02000002,2,0.259067,7720,0.103886,0.345813,0.318841,0.350111,0.167232,0.04797,...,7695.5,14751.5,844.5,14801.0,1,7,0,0,0,8
2,E02000003,3,0.271469,11051,0.085965,0.309099,0.26085,0.453858,0.161075,0.104629,...,15123.5,15674.5,1912.5,10359.0,0,7,1,0,0,8
3,E02000004,9,1.366535,6586,0.124658,0.311407,0.26557,0.19023,0.104982,0.016338,...,13759.5,15516.0,4320.0,16398.5,2,8,2,0,0,12
4,E02000005,5,0.488806,10229,0.066282,0.337135,0.322392,0.337304,0.168769,0.058748,...,8974.0,14418.0,1689.0,6036.0,2,4,1,0,0,7
5,E02000007,9,0.889065,10123,0.080707,0.350692,0.36183,0.328177,0.227164,0.026846,...,8286.0,11879.0,375.0,14002.0,5,5,2,1,0,13
6,E02000008,10,0.791452,12635,0.072497,0.330408,0.357394,0.30357,0.17374,0.035699,...,9793.0,10842.0,1183.0,7318.0,0,11,0,0,0,11
7,E02000009,7,0.603865,11592,0.047533,0.332381,0.331237,0.388207,0.188326,0.071352,...,9315.5,10245.0,950.5,7396.0,0,7,0,0,0,7
8,E02000010,6,0.652387,9197,0.064586,0.330713,0.355459,0.307951,0.190482,0.03401,...,10318.0,12006.0,902.0,11351.0,0,7,0,0,0,7
9,E02000011,6,0.85923,6983,0.092081,0.325423,0.307084,0.33635,0.185227,0.047357,...,11974.0,12000.5,2195.0,11636.0,1,7,2,2,0,12


In [13]:
# first add a field for ons deaths as a proportion of population and then reorder
london_covid_all_ons['ons_deaths_per_thousand'] = (1000*london_covid_all_ons.ons_total_deaths) / london_covid_all_ons.total_population_mid_2018
# now reorder before melting
london_covid_all_ons = london_covid_all_ons[['msoa_code', 'msoa_name', 'local_auth_code', 'local_auth_name', 
                                     'covid_19_deaths', 'covid_19_deaths_per_thousand', 'ons_total_deaths', 
                                     'ons_deaths_per_thousand', 'march_deaths', 'april_deaths', 'may_deaths',
                                     'june_deaths', 'july_deaths', 'total_population_mid_2018',
                                     'over_70_prop', 'proportion_at_risk_jobs', 'insecure_proportion', 
                                     'all_bame_prop', 'all_black_prop', 'pakistani_or_bangladeshi_prop',
                                     'all_indian_prop', 'hypertension',
                                     'obesity', 'diabetes', 'asthma', 'heart_disease', 'income_rank', 
                                     'education_rank', 'health_rank', 'housing_rank', 'living_environment_rank'
                                    ]]

print(london_covid_all_ons.count())
london_covid_all_ons.head()

msoa_code                        983
msoa_name                        983
local_auth_code                  983
local_auth_name                  983
covid_19_deaths                  983
covid_19_deaths_per_thousand     983
ons_total_deaths                 983
ons_deaths_per_thousand          983
march_deaths                     983
april_deaths                     983
may_deaths                       983
june_deaths                      983
july_deaths                      983
total_population_mid_2018        983
over_70_prop                     983
proportion_at_risk_jobs          983
insecure_proportion              983
all_bame_prop                    983
all_black_prop                   983
pakistani_or_bangladeshi_prop    983
all_indian_prop                  983
hypertension                     983
obesity                          983
diabetes                         983
asthma                           983
heart_disease                    983
income_rank                      983
e

Unnamed: 0,msoa_code,msoa_name,local_auth_code,local_auth_name,covid_19_deaths,covid_19_deaths_per_thousand,ons_total_deaths,ons_deaths_per_thousand,march_deaths,april_deaths,...,hypertension,obesity,diabetes,asthma,heart_disease,income_rank,education_rank,health_rank,housing_rank,living_environment_rank
0,E02000001,City of London 001,E09000001,City of London,3,0.34459,4,0.459453,1,2,...,7.95,3.62,2.7,3.08,1.5,30837.5,30856.5,23587.0,2635.5,5940.5
1,E02000002,Barking and Dagenham 001,E09000002,Barking and Dagenham,2,0.259067,8,1.036269,1,7,...,9.26,6.15,4.99,3.91,1.88,4670.0,7695.5,14751.5,844.5,14801.0
2,E02000003,Barking and Dagenham 002,E09000002,Barking and Dagenham,3,0.271469,8,0.723916,0,7,...,10.33,7.6,5.79,4.64,1.83,11740.5,15123.5,15674.5,1912.5,10359.0
3,E02000004,Barking and Dagenham 003,E09000002,Barking and Dagenham,9,1.366535,12,1.822047,2,8,...,9.34,8.36,4.95,3.58,1.93,12869.0,13759.5,15516.0,4320.0,16398.5
4,E02000005,Barking and Dagenham 004,E09000002,Barking and Dagenham,5,0.488806,7,0.684329,2,4,...,9.37,8.79,5.15,4.26,1.47,9665.0,8974.0,14418.0,1689.0,6036.0


### We now want to normalise the ranking and other fields so they are between 0 and 1
And then replace in london_covid_all

In [14]:
from sklearn.preprocessing import MinMaxScaler
min_max_scaler = MinMaxScaler()

df_numeric = london_covid_all_ons[['hypertension', 'obesity', 'diabetes', 'asthma', 'heart_disease',
                                  'income_rank', 'education_rank', 'health_rank', 
                                   'housing_rank', 'living_environment_rank']].copy()
min_max = min_max_scaler.fit_transform(df_numeric)

num_col_names = df_numeric.columns
df_min_max = pd.DataFrame(min_max, columns=num_col_names)
df_min_max.head(10)

Unnamed: 0,hypertension,obesity,diabetes,asthma,heart_disease,income_rank,education_rank,health_rank,housing_rank,living_environment_rank
0,0.324074,0.210938,0.120575,0.296296,0.332103,0.937041,0.933212,0.678423,0.089869,0.206279
1,0.434343,0.493304,0.373894,0.488426,0.472325,0.100337,0.134818,0.371421,0.028259,0.559091
2,0.524411,0.655134,0.462389,0.657407,0.453875,0.326416,0.390872,0.403492,0.064998,0.382217
3,0.441077,0.739955,0.369469,0.412037,0.490775,0.3625,0.343853,0.397985,0.147816,0.6227
4,0.443603,0.787946,0.391593,0.569444,0.321033,0.260052,0.17889,0.359833,0.05731,0.210082
5,0.506734,0.616071,0.348451,0.481481,0.372694,0.149898,0.155173,0.271612,0.012109,0.527276
6,0.518519,0.799107,0.439159,0.564815,0.357934,0.171769,0.207122,0.23558,0.039904,0.261129
7,0.474747,0.628348,0.384956,0.511574,0.332103,0.121409,0.190662,0.214837,0.031906,0.264235
8,0.534512,0.824777,0.359513,0.418981,0.317343,0.13503,0.225219,0.276025,0.030237,0.421717
9,0.570707,0.741071,0.373894,0.372685,0.332103,0.284849,0.282304,0.275834,0.074716,0.433065


In [15]:
# now merge the datasets, having first dropped the original columns
london_covid_new = london_covid_all_ons[['msoa_code', 'msoa_name', 'local_auth_code', 'local_auth_name', 'covid_19_deaths',
                                        'covid_19_deaths_per_thousand', 'ons_total_deaths', 'ons_deaths_per_thousand', 
                                        'march_deaths', 'april_deaths', 'may_deaths', 'june_deaths', 'july_deaths',
                                        'total_population_mid_2018', 'over_70_prop', 'proportion_at_risk_jobs', 'insecure_proportion',
                                        'all_bame_prop', 'all_black_prop',
                                        'pakistani_or_bangladeshi_prop', 'all_indian_prop']].copy()

london_covid_merged = pd.merge(left=london_covid_new, left_index=True,
                  right=df_min_max, right_index=True,
                  how='inner')

london_covid_merged.head()

Unnamed: 0,msoa_code,msoa_name,local_auth_code,local_auth_name,covid_19_deaths,covid_19_deaths_per_thousand,ons_total_deaths,ons_deaths_per_thousand,march_deaths,april_deaths,...,hypertension,obesity,diabetes,asthma,heart_disease,income_rank,education_rank,health_rank,housing_rank,living_environment_rank
0,E02000001,City of London 001,E09000001,City of London,3,0.34459,4,0.459453,1,2,...,0.324074,0.210938,0.120575,0.296296,0.332103,0.937041,0.933212,0.678423,0.089869,0.206279
1,E02000002,Barking and Dagenham 001,E09000002,Barking and Dagenham,2,0.259067,8,1.036269,1,7,...,0.434343,0.493304,0.373894,0.488426,0.472325,0.100337,0.134818,0.371421,0.028259,0.559091
2,E02000003,Barking and Dagenham 002,E09000002,Barking and Dagenham,3,0.271469,8,0.723916,0,7,...,0.524411,0.655134,0.462389,0.657407,0.453875,0.326416,0.390872,0.403492,0.064998,0.382217
3,E02000004,Barking and Dagenham 003,E09000002,Barking and Dagenham,9,1.366535,12,1.822047,2,8,...,0.441077,0.739955,0.369469,0.412037,0.490775,0.3625,0.343853,0.397985,0.147816,0.6227
4,E02000005,Barking and Dagenham 004,E09000002,Barking and Dagenham,5,0.488806,7,0.684329,2,4,...,0.443603,0.787946,0.391593,0.569444,0.321033,0.260052,0.17889,0.359833,0.05731,0.210082


In [16]:
london_covid_merged.count()

msoa_code                        983
msoa_name                        983
local_auth_code                  983
local_auth_name                  983
covid_19_deaths                  983
covid_19_deaths_per_thousand     983
ons_total_deaths                 983
ons_deaths_per_thousand          983
march_deaths                     983
april_deaths                     983
may_deaths                       983
june_deaths                      983
july_deaths                      983
total_population_mid_2018        983
over_70_prop                     983
proportion_at_risk_jobs          983
insecure_proportion              983
all_bame_prop                    983
all_black_prop                   983
pakistani_or_bangladeshi_prop    983
all_indian_prop                  983
hypertension                     983
obesity                          983
diabetes                         983
asthma                           983
heart_disease                    983
income_rank                      983
e

In [17]:
# now pro rate the monthly deaths data
london_covid_merged['march_deaths_per_thousand'] = (1000*london_covid_merged.march_deaths) / london_covid_merged.total_population_mid_2018
london_covid_merged['april_deaths_per_thousand'] = (1000*london_covid_merged.april_deaths) / london_covid_merged.total_population_mid_2018
london_covid_merged['may_deaths_per_thousand'] = (1000*london_covid_merged.may_deaths) / london_covid_merged.total_population_mid_2018
london_covid_merged['june_deaths_per_thousand'] = (1000*london_covid_merged.june_deaths) / london_covid_merged.total_population_mid_2018
london_covid_merged['july_deaths_per_thousand'] = (1000*london_covid_merged.july_deaths) / london_covid_merged.total_population_mid_2018

# now delete original fields
london_covid_merged.drop(['march_deaths'], axis=1, inplace=True)
london_covid_merged.drop(['april_deaths'], axis=1, inplace=True)
london_covid_merged.drop(['may_deaths'], axis=1, inplace=True)
london_covid_merged.drop(['june_deaths'], axis=1, inplace=True)
london_covid_merged.drop(['july_deaths'], axis=1, inplace=True)

## Now load weekly cases and merge with london_covid_merged
- weekly cases are for whole country so need to extract London cases

In [19]:
# now load up weekly cases by cases and then extract data for weeks 10-31
# week 5 ends on 2nd February. so 10-31 covers the period March to end of July
uk_covid_cases = pd.read_excel("MSOAs_latest.xlsx", sheet_name='AmendedData')

uk_covid_cases = uk_covid_cases[['msoa_code', 'msoa_name', 10, 11, 12, 13, 14, 15,
                                             16, 17, 18, 19, 20, 21, 22, 
                                             23, 24, 25, 26, 27, 28, 29, 30, 31]].copy()

uk_covid_cases['total_cases'] = uk_covid_cases[10] + uk_covid_cases[11] + uk_covid_cases[12] + uk_covid_cases[13]
uk_covid_cases['total_cases'] = uk_covid_cases['total_cases'] + uk_covid_cases[14] + uk_covid_cases[15] + uk_covid_cases[16]
uk_covid_cases['total_cases'] = uk_covid_cases['total_cases'] + uk_covid_cases[17] + uk_covid_cases[18] + uk_covid_cases[19]
uk_covid_cases['total_cases'] = uk_covid_cases['total_cases'] + uk_covid_cases[20] + uk_covid_cases[21] + uk_covid_cases[22]
uk_covid_cases['total_cases'] = uk_covid_cases['total_cases'] + uk_covid_cases[23] + uk_covid_cases[24] + uk_covid_cases[25]
uk_covid_cases['total_cases'] = uk_covid_cases['total_cases'] + uk_covid_cases[26] + uk_covid_cases[27] + uk_covid_cases[28]
uk_covid_cases['total_cases'] = uk_covid_cases['total_cases'] + uk_covid_cases[29] + uk_covid_cases[30] + uk_covid_cases[31]

print(uk_covid_cases['total_cases'].count())
uk_covid_cases.head()


6791


Unnamed: 0,msoa_code,msoa_name,10,11,12,13,14,15,16,17,...,23,24,25,26,27,28,29,30,31,total_cases
0,E02000001,City of London,0,0,4,3,3,0,0,3,...,0,0,0,0,0,0,0,0,0,13
1,E02000002,Marks Gate,0,0,0,0,5,3,6,0,...,0,0,0,0,0,0,0,3,0,17
2,E02000003,Chadwell Heath East,0,0,0,3,10,10,3,6,...,0,0,0,0,0,0,0,6,0,41
3,E02000004,Eastbrookend,0,0,0,6,10,4,3,5,...,0,0,0,0,0,0,0,0,0,34
4,E02000005,Becontree Heath,0,0,0,4,6,9,3,0,...,0,0,0,0,0,0,0,0,0,22


In [20]:
# Now we want just msoa for London so get a list and then merge
london_covid_cases = pd.merge(london_covid_merged, uk_covid_cases, left_on='msoa_code', right_on='msoa_code', how = 'inner')
london_covid_cases['cases_per_thousand'] = (1000*london_covid_cases.total_cases) / london_covid_cases.total_population_mid_2018

london_covid_cases[10] = (1000*london_covid_cases[10])/london_covid_cases.total_population_mid_2018
london_covid_cases[11] = (1000*london_covid_cases[11])/london_covid_cases.total_population_mid_2018
london_covid_cases[12] = (1000*london_covid_cases[12])/london_covid_cases.total_population_mid_2018
london_covid_cases[13] = (1000*london_covid_cases[13])/london_covid_cases.total_population_mid_2018
london_covid_cases[14] = (1000*london_covid_cases[14])/london_covid_cases.total_population_mid_2018
london_covid_cases[15] = (1000*london_covid_cases[15])/london_covid_cases.total_population_mid_2018
london_covid_cases[16] = (1000*london_covid_cases[16])/london_covid_cases.total_population_mid_2018
london_covid_cases[17] = (1000*london_covid_cases[17])/london_covid_cases.total_population_mid_2018
london_covid_cases[18] = (1000*london_covid_cases[18])/london_covid_cases.total_population_mid_2018
london_covid_cases[19] = (1000*london_covid_cases[19])/london_covid_cases.total_population_mid_2018
london_covid_cases[20] = (1000*london_covid_cases[20])/london_covid_cases.total_population_mid_2018
london_covid_cases[21] = (1000*london_covid_cases[21])/london_covid_cases.total_population_mid_2018
london_covid_cases[22] = (1000*london_covid_cases[22])/london_covid_cases.total_population_mid_2018
london_covid_cases[23] = (1000*london_covid_cases[23])/london_covid_cases.total_population_mid_2018
london_covid_cases[24] = (1000*london_covid_cases[24])/london_covid_cases.total_population_mid_2018
london_covid_cases[25] = (1000*london_covid_cases[25])/london_covid_cases.total_population_mid_2018
london_covid_cases[26] = (1000*london_covid_cases[26])/london_covid_cases.total_population_mid_2018
london_covid_cases[27] = (1000*london_covid_cases[27])/london_covid_cases.total_population_mid_2018
london_covid_cases[28] = (1000*london_covid_cases[28])/london_covid_cases.total_population_mid_2018
london_covid_cases[29] = (1000*london_covid_cases[29])/london_covid_cases.total_population_mid_2018
london_covid_cases[30] = (1000*london_covid_cases[30])/london_covid_cases.total_population_mid_2018
london_covid_cases[31] = (1000*london_covid_cases[31])/london_covid_cases.total_population_mid_2018

# now delete original fields
london_covid_cases.drop(['msoa_name_x'], axis=1, inplace=True)
london_covid_cases.rename(columns = {'msoa_name_y':'msoa_name'}, inplace = True)

print(london_covid_cases['msoa_code'].count())
london_covid_cases.head()

983


Unnamed: 0,msoa_code,local_auth_code,local_auth_name,covid_19_deaths,covid_19_deaths_per_thousand,ons_total_deaths,ons_deaths_per_thousand,total_population_mid_2018,over_70_prop,proportion_at_risk_jobs,...,24,25,26,27,28,29,30,31,total_cases,cases_per_thousand
0,E02000001,E09000001,City of London,3,0.34459,4,0.459453,8706,0.123593,0.120794,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13,1.493223
1,E02000002,E09000002,Barking and Dagenham,2,0.259067,8,1.036269,7720,0.103886,0.345813,...,0.0,0.0,0.0,0.0,0.0,0.0,0.388601,0.0,17,2.202073
2,E02000003,E09000002,Barking and Dagenham,3,0.271469,8,0.723916,11051,0.085965,0.309099,...,0.0,0.0,0.0,0.0,0.0,0.0,0.542937,0.0,41,3.710071
3,E02000004,E09000002,Barking and Dagenham,9,1.366535,12,1.822047,6586,0.124658,0.311407,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,34,5.162466
4,E02000005,E09000002,Barking and Dagenham,5,0.488806,7,0.684329,10229,0.066282,0.337135,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,22,2.150748


## Now load centroid data and merge

In [21]:
london_centroids_full = pd.read_csv("./Topojson/centroids.csv")
london_centroids = london_centroids_full[['MSOA11CD', 'POPDEN', 'cx', 'cy']]
london_centroids.head()

Unnamed: 0,MSOA11CD,POPDEN,cx,cy
0,E02000001,25.5,-0.092332,51.514854
1,E02000002,31.3,0.139442,51.58828
2,E02000003,46.9,0.14091,51.57504
3,E02000004,24.8,0.176676,51.555418
4,E02000005,72.1,0.143325,51.561423


In [22]:
min_max_scaler = MinMaxScaler()

df_density = london_centroids[['POPDEN']].copy()
min_max = min_max_scaler.fit_transform(df_density)

df_min_max = pd.DataFrame(min_max, columns=['population_density'])

london_centroids = pd.merge(left=london_centroids, left_index=True,
                  right=df_min_max, right_index=True,
                  how='inner')

london_centroids.drop(['POPDEN'], axis=1, inplace=True)

In [23]:
print(london_centroids['MSOA11CD'].count())

983


In [24]:
# now merge london_covid_cases and london_centroids
london_covid_geo = pd.merge(london_covid_cases, london_centroids, left_on='msoa_code', right_on='MSOA11CD', how = 'inner')
london_covid_geo.drop(['MSOA11CD'], axis=1, inplace=True)

In [25]:
# now reorder the dataset so a bit tidyer
london_covid_final = london_covid_geo[['msoa_code',
                                       'msoa_name',
                                       'local_auth_code',
                                       'local_auth_name',
                                       'total_population_mid_2018',
                                       'covid_19_deaths',
                                       'covid_19_deaths_per_thousand',
                                       'ons_total_deaths',
                                       'ons_deaths_per_thousand',
                                       'total_cases',
                                       'cases_per_thousand',
                                       'cx',
                                       'cy',
                                       'population_density',                                       
                                       'over_70_prop',
                                       'proportion_at_risk_jobs',
                                       'insecure_proportion',
                                       'all_bame_prop',
                                       'all_black_prop',
                                       'pakistani_or_bangladeshi_prop',
                                       'all_indian_prop',
                                       'hypertension',
                                       'obesity',
                                       'diabetes',
                                       'asthma',
                                       'heart_disease',
                                       'income_rank',
                                       'education_rank',
                                       'health_rank',
                                       'housing_rank',
                                       'living_environment_rank',
                                       'march_deaths_per_thousand',
                                       'april_deaths_per_thousand',
                                       'may_deaths_per_thousand',
                                       'june_deaths_per_thousand',
                                       'july_deaths_per_thousand',
                                       10,
                                       11,
                                       12,
                                       13,
                                       14,
                                       15,
                                       16,
                                       17,
                                       18,
                                       19,
                                       20,
                                       21,
                                       22,
                                       23,
                                       24,
                                       25,
                                       26,
                                       27,
                                       28,
                                       29,
                                       30,
                                       31
                                       ]].copy()


print(london_covid_final['msoa_code'].count())
london_covid_final.head()

983


Unnamed: 0,msoa_code,msoa_name,local_auth_code,local_auth_name,total_population_mid_2018,covid_19_deaths,covid_19_deaths_per_thousand,ons_total_deaths,ons_deaths_per_thousand,total_cases,...,22,23,24,25,26,27,28,29,30,31
0,E02000001,City of London,E09000001,City of London,8706,3,0.34459,4,0.459453,13,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,E02000002,Marks Gate,E09000002,Barking and Dagenham,7720,2,0.259067,8,1.036269,17,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.388601,0.0
2,E02000003,Chadwell Heath East,E09000002,Barking and Dagenham,11051,3,0.271469,8,0.723916,41,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.542937,0.0
3,E02000004,Eastbrookend,E09000002,Barking and Dagenham,6586,9,1.366535,12,1.822047,34,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,E02000005,Becontree Heath,E09000002,Barking and Dagenham,10229,5,0.488806,7,0.684329,22,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Now do choropleths

In [None]:
import geopandas as gpd

gb = gpd.read_file("Topojson/MSOA_2011_London_gen_MHW.shp") # a gis format that has geographical boundaries QGIS is a package for looking at shape files
gb.crs = "epsg:27700" # code for the UK national grid

gb.head()

In [None]:
london_covid_geo = pd.merge(gb, london_covid_final, left_on='MSOA11CD', right_on='msoa_code', how = 'inner')
london_covid_geo.head()

In [None]:
col_names = london_covid_final.columns.to_list()

col_names = col_names[14:58]
london_covid_tidy = pd.melt(london_covid_final, id_vars=['msoa_code',
                                                'msoa_name',
                                                'local_auth_code',
                                                'local_auth_name',
                                                'total_population_mid_2018',
                                                'covid_19_deaths',
                                                'covid_19_deaths_per_thousand',
                                                'ons_total_deaths',
                                                'ons_deaths_per_thousand',
                                                'total_cases',
                                                'cases_per_thousand',
                                                'cx',
                                                'cy',
                                                'population_density'], 
                                                value_vars=col_names, var_name = 'measure', value_name='value')

london_covid_tidy.head()

In [None]:
data_geo = alt.InlineData(values = london_covid_geo.to_json(), #geopandas to geojson string
                       format = alt.DataFormat(property='features',type='json'))

alt.Chart(data_geo).mark_geoshape(strokeWidth=1,stroke='lightgray',strokeOpacity=0.2
).encode(
    color=alt.Color('properties.ons_deaths_per_thousand:Q'),
    tooltip=['properties.local_auth_name:N', 'properties.msoa_name:N', 'properties.ons_deaths_per_thousand:Q']
).properties(
    projection={'type': 'identity','reflectY': True},
    width=800,
    height=800,
    title='cases by msoa - to jul'
)

In [None]:
data_geo = alt.InlineData(values = london_covid_geo.to_json(), #geopandas to geojson string
                       format = alt.DataFormat(property='features',type='json'))

alt.Chart(data_geo).mark_geoshape(strokeWidth=1,stroke='lightgray',strokeOpacity=0.2
).encode(
    color=alt.Color('properties.ons_deaths_per_thousand:Q'),
    tooltip=['properties.local_auth_name:N', 'properties.msoa_name:N', 'properties.ons_deaths_per_thousand:Q']
).properties(
    projection={'type': 'identity','reflectY': True},
    width=800,
    height=800,
    title='cases by msoa - to jul'
)


slider = alt.binding_range(min=10, max=31, step=1)
select_week = alt.selection_single(name="week", fields=['week_no'],
                                   bind=slider, init={'week_no': 10})

alt.Chart(london_grid_grouped).mark_bar().encode(
    x='grid_x:O',
    y='grid_y:O',
    color=alt.Color('cummulative_cases:Q'), 
    tooltip = ['cummulative_cases', 'cases']
).properties(
    width=500,
    height=360
).add_selection(
    select_week
).transform_filter(
    select_week
).interactive()

## Now generate grid coordinates

In [None]:
print("max-lon=" + str(london_covid_final.cx.max()))
print("min-lon=" + str(london_covid_final.cx.min()))
print("max-lat=" + str(london_covid_final.cy.max()))
print("min-lat=" + str(london_covid_final.cy.min()))

In [None]:
lon_range = (-0.5, 0.32)
lon_cells = 30

lat_range = (51.2, 51.8)
lat_cells = 26

In [None]:
from shapely.geometry import Polygon

lon_incr = (lon_range[1] - lon_range[0]) / lon_cells
lat_incr = (lat_range[1] - lat_range[0]) / lat_cells
x0, y0 = lon_range[0], lat_range[0]

cell_ids = []
grid_cells = []
for c in range(lon_cells):
    x1 = x0 + lon_incr
    for r in range(lat_cells):
        y1 = y0 + lat_incr
        grid_cells.append(Polygon([(x0,y0),(x0,y1),(x1,y1),(x1,y0)]))
        cell_ids.append('{:02d}_{:02d}'.format(c, r))
        y0 = y1
    x0 = x1
    y0 = lat_range[0]


In [None]:
london_grid_wide = london_covid_final[['msoa_code', 'msoa_name', 'ons_deaths_per_thousand', 'cx', 'cy',
                                 'wk_10', 'wk_11','wk_12','wk_13','wk_14','wk_15','wk_16','wk_17','wk_18','wk_19',
                                  'wk_20','wk_21','wk_22','wk_23','wk_24','wk_25','wk_26','wk_27','wk_28','wk_29','wk_30',
                                  'wk_31']]

london_grid_wide.head()

In [None]:
col_names = london_grid_wide.columns.tolist()
col_names = col_names[5:28]
london_grid = pd.melt(london_grid_wide, id_vars=['msoa_code', 
                                                 'msoa_name',
                                                 'ons_deaths_per_thousand',
                                                 'cx',
                                                 'cy'], 
                        value_vars=col_names, var_name = 'week', value_name='cases')

london_grid.head()

In [None]:
london_grid['grid_x']  = np.floor((london_grid['cx'] - lon_range[0]) / (lon_range[1] - lon_range[0]) * lon_cells).astype(int)
london_grid['grid_y']  = np.floor((london_grid['cy']  - lat_range[0]) / (lat_range[1] - lat_range[0]) * lat_cells).astype(int)
# The cell_id column will be used to link our aggregate data to the grid GeoJSON object for plotting
london_grid['cell_id']  = london_grid[['grid_x','grid_y']].apply(lambda x: '{:02d}_{:02d}'.format(x.grid_x, x.grid_y), axis=1)

london_grid.head()

In [None]:
london_grid_grouped = london_grid.groupby(['cell_id', 'grid_x', 'grid_y', 'week']).sum().reset_index()
london_grid_grouped['cummulative_cases'] = london_grid_grouped.groupby(['cell_id', 'grid_x', 'grid_y'])['cases'].cumsum()


london_grid_grouped.head()

In [None]:
london_grid_grouped['week_no'] = london_grid_grouped['week'].str[3:].astype('int')
london_grid_grouped.head()

In [None]:
slider = alt.binding_range(min=10, max=31, step=1)
select_week = alt.selection_single(name="week", fields=['week_no'],
                                   bind=slider, init={'week_no': 10})

alt.Chart(london_grid_grouped).mark_bar().encode(
    x='grid_x:O',
    y='grid_y:O',
    color=alt.Color('cummulative_cases:Q'), 
    tooltip = ['cummulative_cases', 'cases']
).properties(
    width=500,
    height=360
).add_selection(
    select_week
).transform_filter(
    select_week
).interactive()

In [None]:
london_covid_final.to_csv('output_data/london_covid_merged.csv', index=False)
london_grid_grouped.to_csv('output_data/london_grid_grouped.csv', index=False)
london_grid.to_csv('output_data/london_grid.csv', index=False)

# Check scatter plots and correlations
### ONS deaths versus characteristics
This is just to test that we can produce reasonable scatter plots and that the correlations remain the same irrespective of having scaled the data (we can, they do)

In [None]:
col_names = london_covid_final.columns.tolist()
col_names

In [None]:
col_names = london_covid_final.columns.tolist()
col_names = col_names[14:59]

london_covid_stats = pd.melt(london_covid_final, id_vars=['msoa_code', 
                                                         'msoa_name',
                                                         'local_auth_code',
                                                         'local_auth_name',
                                                         'total_population_mid_2018',
                                                         'covid_19_deaths',
                                                         'covid_19_deaths_per_thousand',
                                                         'ons_total_deaths',
                                                         'ons_deaths_per_thousand',
                                                         'total_cases',
                                                         'cases_per_thousand',
                                                         'cx',
                                                         'cy',
                                                         'population_density'
                                                       ], 
                        value_vars=col_names, var_name = 'measure', value_name='value')

print(london_covid_stats.count())
london_covid_stats.head()

In [None]:
import altair as alt
alt.data_transformers.disable_max_rows()

ignore_values = ['march_deaths_per_thousand', 'april_deaths_per_thousand', 'may_deaths_per_thousand',
                  'june_deaths_per_thousand', 'july_deaths_per_thousand', 'wk_10', 'wk_11', 'wk_12',
                  'wk_13', 'wk_14', 'wk_15', 'wk_16', 'wk_17', 'wk_18', 'wk_19', 'wk_20', 'wk_21', 'wk_22', 'wk_23', 
                  'wk_24', 'wk_25', 'wk_26', 'wk_27', 'wk_28', 'wk_29', 'wk_30', 'wk_31']

alt.Chart(london_covid_stats[~london_covid_stats.measure.isin(ignore_values)]).mark_point().encode(
    x='value:Q',
    y='ons_deaths_per_thousand:Q',     
    tooltip = ['msoa_name']
).properties(
    width=180,
    height=180
).facet(
    column='measure:N'
).interactive()

In [None]:
ignore_values = ['march_deaths_per_thousand', 'april_deaths_per_thousand', 'may_deaths_per_thousand',
                  'june_deaths_per_thousand', 'july_deaths_per_thousand', 'wk_10', 'wk_11', 'wk_12',
                  'wk_13', 'wk_14', 'wk_15', 'wk_16', 'wk_17', 'wk_18', 'wk_19', 'wk_20', 'wk_21', 'wk_22', 'wk_23', 
                  'wk_24', 'wk_25', 'wk_26', 'wk_27', 'wk_28', 'wk_29', 'wk_30', 'wk_31']

alt.Chart(london_covid_stats[~london_covid_stats.measure.isin(ignore_values)]).mark_point().encode(
    x='value:Q',
    y='cases_per_thousand:Q',     
    tooltip = ['msoa_name']
).properties(
    width=180,
    height=180
).facet(
    column='measure:N'
).interactive()

In [None]:
alt.Chart(london_covid_stats).mark_point().encode(
    x='population_density:Q',
    y='ons_deaths_per_thousand:Q',     
    tooltip = ['msoa_name']
).properties(
    width=180,
    height=180
).interactive()

In [None]:
alt.Chart(london_covid_stats).mark_point().encode(
    x='population_density:Q',
    y='cases_per_thousand:Q',     
    tooltip = ['msoa_name']
).properties(
    width=180,
    height=180
).interactive()

In [None]:
london_corr_merged_input = london_covid_final[['ons_deaths_per_thousand', 'total_population_mid_2018', 'population_density',
                                     'over_70_prop', 'all_bame_prop', 'all_black_prop', 'pakistani_or_bangladeshi_prop',
                                     'all_indian_prop', 'hypertension', 'proportion_at_risk_jobs', 'insecure_proportion',
                                     'obesity', 'diabetes', 'asthma', 'heart_disease', 'income_rank', 
                                     'education_rank', 'health_rank', 'housing_rank', 'living_environment_rank']]
london_corr_merged = london_corr_merged_input.corr()

corr_deaths_merged=london_corr_merged[['ons_deaths_per_thousand']]
#sort by the amount of correlation
corr_deaths_merged=corr_deaths_merged.sort_values(by ='ons_deaths_per_thousand',ascending=True)

corr_deaths_merged

### Now check to see level of correlation between cases and deaths
- To see if it's a useful proxy

In [None]:
import altair as alt

alt.data_transformers.disable_max_rows()

alt.Chart(london_covid_cases).mark_point().encode(
    x='ons_deaths_per_thousand:Q',
    y='cases_per_thousand:Q', 
    tooltip = ['msoa_name']
).properties(
    width=180,
    height=180
).interactive()

In [None]:
import scipy.stats as stats

corrPearson, pValPearson = stats.pearsonr(london_covid_cases.ons_deaths_per_thousand, london_covid_cases.cases_per_thousand)
corrSpearman,pValSpearman = stats.spearmanr(london_covid_cases.ons_deaths_per_thousand, london_covid_cases.cases_per_thousand)

print("Cased versus deaths: Pearson = " + str(corrPearson) + ", Spearman = " + str(corrSpearman) + "," + str(pValPearson))

### Conclusions
- Even incorporating death data up until the end of the first wave, it is reasonable to confirm the London Datastore findings that there are no correlations between demographic, health or deprivation factors and the total number of covid deaths in a London boroughs when considered in isolation. So a further avenue of research would be to perform a regression analysis, incrementally adding in each feature to understand which combination of borough characteristics have the biggest correlation with the number of deaths within the borough
- There is MODERATE correlation between cases and deaths and so it will be instructive to see how cases by region progress over time. 

## Working - a facet grid of grids

In [None]:
alt.Chart(london_grid_grouped).mark_square().encode(
    x='grid_x:O',
    y='grid_y:O',
    color = alt.Color('cummulative_cases:Q', scale=alt.Scale(scheme='reds')), 
    tooltip = ['cummulative_cases']
).properties(
    width=250,
    height=180
).facet(
    column='week:O'
).interactive()