In [1]:
#load libraries
import pandas as pd
import re

In [2]:
import os
from requests import get
from urllib.parse import urlparse

def cache_data(src:str, dest:str) -> str:
    """Downloads and caches a remote file locally.
    
    The function sits between the 'read' step of a pandas or geopandas
    data frame and downloading the file from a remote location. The idea
    is that it will save it locally so that you don't need to remember to
    do so yourself. Subsequent re-reads of the file will return instantly
    rather than downloading the entire file for a second or n-th itme.
    
    Parameters
    ----------
    src : str
        The remote *source* for the file, any valid URL should work.
    dest : str
        The *destination* location to save the downloaded file.
        
    Returns
    -------
    str
        A string representing the local location of the file.
    """
    
    url = urlparse(src) # We assume that this is some kind of valid URL 
    fn  = os.path.split(url.path)[-1] # Extract the filename
    dfn = os.path.join(dest,fn) # Destination filename
    
    # Check if dest+filename does *not* exist -- 
    # that would mean we have to download it!
    if not os.path.isfile(dfn):
        
        print(f"{dfn} not found, downloading!")

        # Convert the path back into a list (without)
        # the filename -- we need to check that directories
        # exist first.
        path = os.path.split(dest)
        
        # Create any missing directories in dest(ination) path
        # -- os.path.join is the reverse of split (as you saw above)
        # but it doesn't work with lists... so I had to google how
        # to use the 'splat' operator! os.makedirs creates missing
        # directories in a path automatically.
        if len(path) >= 1 and path[0] != '':
            os.makedirs(os.path.join(*path), exist_ok=True)
            
        # Download and write the file
        with open(dfn, "wb") as file:
            response = get(src)
            file.write(response.content)
            
        print('Done downloading...')

    else:
        print(f"Found {dfn} locally!")

    return dfn

# nowork_rate、middle_high_class_pro、advanced_education_rate

In [198]:
src_url   = 'https://data.london.gov.uk/download/2021-census-wards-labour-market/ab2e314e-dbd8-4afe-8304-1f5402eda542/NSSEC.xlsx'
dest_path = os.path.join('data','ward_socioeconomic')

status = pd.read_excel(
    cache_data(src_url, dest_path), 
    sheet_name='2021')       

Found data/ward_socioeconomic/NSSEC.xlsx locally!


In [199]:
status.head()

Unnamed: 0,ward code,ward name,local authority code,local authority name,All usual residents aged 16+,Higher managerial admin and professional,Lower managerial admin and professional,Intermediate,"Small employers, own account workers",Lower supervisory and technical,Semi routine,Routine,Never worked and long term unemployed,Full time students
0,E09000001,City of London,E09000001,City of London,8003,3173,1914,431,571,133,289,257,441,794
1,E05014053,Abbey,E09000002,Barking and Dagenham,2940,455,481,228,358,107,299,308,415,289
2,E05014054,Alibon,E09000002,Barking and Dagenham,7458,419,1139,769,1044,414,964,1039,893,777
3,E05014055,Barking Riverside,E09000002,Barking and Dagenham,6901,680,1271,736,757,341,743,785,785,803
4,E05014056,Beam,E09000002,Barking and Dagenham,6092,305,873,654,783,320,791,893,764,709


In [200]:
status.columns.to_list()

['ward code',
 'ward name',
 'local authority code',
 'local authority name',
 'All usual residents aged 16+ ',
 'Higher managerial admin and professional ',
 'Lower managerial admin and professional ',
 'Intermediate ',
 'Small employers, own account workers',
 'Lower supervisory and technical ',
 'Semi routine',
 'Routine ',
 'Never worked and long term unemployed',
 'Full time students']

In [201]:
# Dropping the specified columns
status = status.drop(columns=['ward code', 'local authority code', 'local authority name'])
status.head()

Unnamed: 0,ward name,All usual residents aged 16+,Higher managerial admin and professional,Lower managerial admin and professional,Intermediate,"Small employers, own account workers",Lower supervisory and technical,Semi routine,Routine,Never worked and long term unemployed,Full time students
0,City of London,8003,3173,1914,431,571,133,289,257,441,794
1,Abbey,2940,455,481,228,358,107,299,308,415,289
2,Alibon,7458,419,1139,769,1044,414,964,1039,893,777
3,Barking Riverside,6901,680,1271,736,757,341,743,785,785,803
4,Beam,6092,305,873,654,783,320,791,893,764,709


In [202]:
status=status.groupby('ward name').sum()

In [203]:
#失业率=Never worked and long term unemployed 中高产阶级人群所占比例=[5:8] 高等教育比例[Full time students]
status['nonwork_rate'] = (status['Never worked and long term unemployed']/status["All usual residents aged 16+ "]).round(2)
status['middle_high_class_pro'] = ((status['Higher managerial admin and professional '] + status['Lower managerial admin and professional '] + status['Intermediate '])/status["All usual residents aged 16+ "]).round(2)
status['advanced_education_rate'] = (status['Full time students']/status["All usual residents aged 16+ "]).round(2)

In [204]:
status.head()

Unnamed: 0_level_0,All usual residents aged 16+,Higher managerial admin and professional,Lower managerial admin and professional,Intermediate,"Small employers, own account workers",Lower supervisory and technical,Semi routine,Routine,Never worked and long term unemployed,Full time students,nonwork_rate,middle_high_class_pro,advanced_education_rate
ward name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Abbey,11322,2877,2646,986,1188,342,822,791,907,763,0.08,0.57,0.07
Abbey Road,11003,3520,2250,691,1047,191,545,408,1156,1195,0.11,0.59,0.11
Abbey Wood,13144,1089,2062,1403,1365,735,1737,1742,1645,1366,0.13,0.35,0.1
Abingdon,7076,2229,1416,414,787,106,290,204,708,922,0.1,0.57,0.13
Addiscombe East,9659,1610,2264,1281,1088,418,863,713,750,672,0.08,0.53,0.07


In [364]:
status = status.reset_index()
status

Unnamed: 0,ward name,All usual residents aged 16+,Higher managerial admin and professional,Lower managerial admin and professional,Intermediate,"Small employers, own account workers",Lower supervisory and technical,Semi routine,Routine,Never worked and long term unemployed,Full time students,nonwork_rate,middle_high_class_pro,advanced_education_rate
0,Abbey,11322,2877,2646,986,1188,342,822,791,907,763,0.08,0.57,0.07
1,Abbey Road,11003,3520,2250,691,1047,191,545,408,1156,1195,0.11,0.59,0.11
2,Abbey Wood,13144,1089,2062,1403,1365,735,1737,1742,1645,1366,0.13,0.35,0.10
3,Abingdon,7076,2229,1416,414,787,106,290,204,708,922,0.10,0.57,0.13
4,Addiscombe East,9659,1610,2264,1281,1088,418,863,713,750,672,0.08,0.53,0.07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
665,Worcester Park North,6415,1205,1478,794,794,278,533,477,446,410,0.07,0.54,0.06
666,Worcester Park South,5820,1126,1313,870,767,249,489,374,321,311,0.06,0.57,0.05
667,Wormholt,6374,904,1290,614,622,276,549,526,747,846,0.12,0.44,0.13
668,Yeading,10153,763,1535,1023,1213,571,1337,1257,1506,948,0.15,0.33,0.09


# public_transport_station_coverage 

In [220]:
src_url   = 'https://data.london.gov.uk/download/2021-census-wards-labour-market/d9f01451-3960-4a1f-a43d-392e45aa0830/Method%20of%20travel%20to%20work.xlsx'
dest_path = os.path.join('data','ward_socioeconomic')

method_of_work = pd.read_excel(
    cache_data(src_url, dest_path), 
    sheet_name='2021')       

Found data/ward_socioeconomic/Method%20of%20travel%20to%20work.xlsx locally!


In [221]:
method_of_work.columns.to_list()

['ward code',
 'ward name',
 'local authority code',
 'local authority name',
 'All usual residents aged 16+ in employment',
 'Work mainly at or from home',
 'Under-ground; metro; light rail; tram',
 'Train',
 'Bus; minibus or coach',
 'Taxi',
 'Motorcycle; scooter or moped',
 'Driving a car or van',
 'Passenger in a car or van',
 'Bicycle',
 'On foot',
 'Other method of travel to work']

In [222]:
# Dropping the specified columns
method_of_work = method_of_work.drop(columns=['ward code', 'local authority code', 'local authority name'])
method_of_work=method_of_work.groupby('ward name').sum()
method_of_work.head()

Unnamed: 0_level_0,All usual residents aged 16+ in employment,Work mainly at or from home,Under-ground; metro; light rail; tram,Train,Bus; minibus or coach,Taxi,Motorcycle; scooter or moped,Driving a car or van,Passenger in a car or van,Bicycle,On foot,Other method of travel to work
ward name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Abbey,8044,4105,1290,394,443,12,56,912,59,196,512,65
Abbey Road,6354,4061,595,81,328,69,25,426,34,173,496,66
Abbey Wood,7707,1827,679,963,1221,35,35,2241,182,92,321,111
Abingdon,4019,2578,356,79,152,51,40,228,42,112,331,50
Addiscombe East,6073,2589,374,542,495,15,44,1490,93,63,300,68


In [223]:
method_of_work.columns.to_list()

['All usual residents aged 16+ in employment',
 'Work mainly at or from home',
 'Under-ground; metro; light rail; tram',
 'Train',
 'Bus; minibus or coach',
 'Taxi',
 'Motorcycle; scooter or moped',
 'Driving a car or van',
 'Passenger in a car or van',
 'Bicycle',
 'On foot',
 'Other method of travel to work']

In [224]:
#select representive indicator to analyse. 
#we want to find the quantity of public transportation around places where people live, 
#so we decide to choose "Under-ground; metro; light rail; tram","Bus; minibus or coach","Train" columns to represent.
method_of_work["public_transport_station_coverage"] = ((method_of_work["Under-ground; metro; light rail; tram"]+method_of_work["Train"]+method_of_work["Bus; minibus or coach"])/(method_of_work["All usual residents aged 16+ in employment"]-method_of_work["Work mainly at or from home"])).round(2)
method_of_work.head()

Unnamed: 0_level_0,All usual residents aged 16+ in employment,Work mainly at or from home,Under-ground; metro; light rail; tram,Train,Bus; minibus or coach,Taxi,Motorcycle; scooter or moped,Driving a car or van,Passenger in a car or van,Bicycle,On foot,Other method of travel to work,public_transport_station_coverage
ward name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Abbey,8044,4105,1290,394,443,12,56,912,59,196,512,65,0.54
Abbey Road,6354,4061,595,81,328,69,25,426,34,173,496,66,0.44
Abbey Wood,7707,1827,679,963,1221,35,35,2241,182,92,321,111,0.49
Abingdon,4019,2578,356,79,152,51,40,228,42,112,331,50,0.41
Addiscombe East,6073,2589,374,542,495,15,44,1490,93,63,300,68,0.4


In [342]:
method_of_work.rename(columns={'Ward name': 'ward name'}, inplace=True)

In [396]:
method_of_work

Unnamed: 0,ward name,All usual residents aged 16+ in employment,Work mainly at or from home,Under-ground; metro; light rail; tram,Train,Bus; minibus or coach,Taxi,Motorcycle; scooter or moped,Driving a car or van,Passenger in a car or van,Bicycle,On foot,Other method of travel to work,public_transport_station_coverage
0,Abbey,8044,4105,1290,394,443,12,56,912,59,196,512,65,0.54
1,Abbey Road,6354,4061,595,81,328,69,25,426,34,173,496,66,0.44
2,Abbey Wood,7707,1827,679,963,1221,35,35,2241,182,92,321,111,0.49
3,Abingdon,4019,2578,356,79,152,51,40,228,42,112,331,50,0.41
4,Addiscombe East,6073,2589,374,542,495,15,44,1490,93,63,300,68,0.40
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
665,Worcester Park North,4264,1784,127,307,190,23,47,1347,102,85,206,46,0.25
666,Worcester Park South,3583,1572,88,190,135,7,27,1243,78,45,161,37,0.21
667,Wormholt,3717,1567,453,116,378,30,41,434,34,221,409,34,0.44
668,Yeading,5796,1205,311,129,800,49,43,2716,186,90,174,93,0.27


In [365]:
method_of_work = method_of_work.reset_index()

# commute_distance

In [228]:
src_url   = ' https://data.london.gov.uk/download/2021-census-wards-labour-market/7725cc70-ab2b-4674-99cb-b825bc9ef4ca/Distance%20traveled%20to%20work.xlsx'
dest_path = os.path.join('data','ward_socioeconomic')

commute_distance = pd.read_excel(
    cache_data(src_url, dest_path), 
    sheet_name='2021')  

Found data/ward_socioeconomic/Distance%20traveled%20to%20work.xlsx locally!


In [229]:
commute_distance.columns.to_list()

['ward code',
 'ward name',
 'local authority code',
 'local authority name',
 'All usual residents aged 16+ in employment',
 'Less than 2km',
 '2km to less than 5km',
 '5km to less than 10km',
 '10km to less than 20km',
 '20km to less than 30km',
 '30km to less than 40km',
 '40km to less than 60km',
 '60km and over',
 'Works mainly from home',
 'Other']

In [230]:
# Dropping the specified columns
commute_distance = commute_distance.drop(columns=['ward code', 'local authority code', 'local authority name'])
commute_distance=commute_distance.groupby('ward name').sum()
commute_distance.head()

Unnamed: 0_level_0,All usual residents aged 16+ in employment,Less than 2km,2km to less than 5km,5km to less than 10km,10km to less than 20km,20km to less than 30km,30km to less than 40km,40km to less than 60km,60km and over,Works mainly from home,Other
ward name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Abbey,8042,588,520,786,971,116,33,24,48,4105,851
Abbey Road,6354,366,772,435,209,27,13,11,27,4061,433
Abbey Wood,7692,480,1013,1027,1403,248,101,27,86,1827,1480
Abingdon,4013,299,439,281,79,10,9,14,24,2578,280
Addiscombe East,6082,382,719,559,758,122,45,30,32,2589,846


In [231]:
commute_distance["average_commute_distance"] = ((commute_distance["Less than 2km"]*1+commute_distance["2km to less than 5km"]*3.5+commute_distance["5km to less than 10km"]*7.5+commute_distance["10km to less than 20km"]*15+commute_distance["20km to less than 30km"]*25+commute_distance["30km to less than 40km"]*35+commute_distance["40km to less than 60km"]*50+commute_distance["60km and over"]*60)/commute_distance["All usual residents aged 16+ in employment"]).round(2)

In [232]:
commute_distance.head()

Unnamed: 0_level_0,All usual residents aged 16+ in employment,Less than 2km,2km to less than 5km,5km to less than 10km,10km to less than 20km,20km to less than 30km,30km to less than 40km,40km to less than 60km,60km and over,Works mainly from home,Other,average_commute_distance
ward name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Abbey,8042,588,520,786,971,116,33,24,48,4105,851,3.86
Abbey Road,6354,366,772,435,209,27,13,11,27,4061,433,2.01
Abbey Wood,7692,480,1013,1027,1403,248,101,27,86,1827,1480,6.37
Abingdon,4013,299,439,281,79,10,9,14,24,2578,280,1.95
Addiscombe East,6082,382,719,559,758,122,45,30,32,2589,846,4.36


In [343]:
commute_distance.rename(columns={'Ward name': 'ward name'}, inplace=True)

In [366]:
commute_distance = commute_distance.reset_index()

# uhealthy_rate、usual_residents

In [240]:
src_url   = 'https://data.london.gov.uk/download/2021-census-wards-qualifications-health-disability-and-care/af5f35f0-fdd3-4531-9616-ff1f1a64101a/General%20health.xlsx'
dest_path = os.path.join('data','ward_socioeconomic')

health = pd.read_excel(
    cache_data(src_url, dest_path), 
    sheet_name='2021')  

Found data/ward_socioeconomic/General%20health.xlsx locally!


In [241]:
health.columns.to_list()

['ward code',
 'ward name',
 'local authority code',
 'local authority name',
 'All Usual residents ',
 'Very good health',
 'Good health',
 'Fair health',
 'Bad health',
 'Very bad health']

In [242]:
# Dropping the specified columns
health = health.drop(columns=['ward code', 'local authority code', 'local authority name'])
health=health.groupby('ward name').sum()
health.head()

Unnamed: 0_level_0,All Usual residents,Very good health,Good health,Fair health,Bad health,Very bad health
ward name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Abbey,14229,8184,4477,1188,275,105
Abbey Road,12977,7800,3610,1051,355,161
Abbey Wood,17425,9173,5475,1917,671,189
Abingdon,8158,5072,2236,604,180,66
Addiscombe East,11802,5852,4189,1266,351,144


In [244]:
#i think bad health rate can be representaive
health['unhealthy_rate']=((health['Bad health']+health['Very bad health'])/health['All Usual residents ']).round(2)

In [245]:
health.rename(columns={'All Usual residents ': 'All Usual residents'}, inplace=True)

In [246]:
health.head()

Unnamed: 0_level_0,All Usual residents,Very good health,Good health,Fair health,Bad health,Very bad health,unhealthy_rate
ward name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Abbey,14229,8184,4477,1188,275,105,0.03
Abbey Road,12977,7800,3610,1051,355,161,0.04
Abbey Wood,17425,9173,5475,1917,671,189,0.05
Abingdon,8158,5072,2236,604,180,66,0.03
Addiscombe East,11802,5852,4189,1266,351,144,0.04


In [367]:
health = health.reset_index()

# disability

In [247]:
src_url   = 'https://data.london.gov.uk/download/2021-census-wards-qualifications-health-disability-and-care/a9f40c75-fbe5-4a52-acc5-d5a9ac496c4d/Disability.xlsx'
dest_path = os.path.join('data','ward_socioeconomic')

disability = pd.read_excel(
    cache_data(src_url, dest_path), 
    sheet_name='2021')  

Found data/ward_socioeconomic/Disability.xlsx locally!


In [248]:
disability.columns.to_list()

['ward code',
 'ward name',
 'local authority code',
 'local authority name',
 'All Usual residents ',
 'Disabled; activities limited a lot',
 'Disabled; activities limited a little',
 'Not disabled']

In [250]:
# Dropping the specified columns
disability = disability.drop(columns=['ward code', 'local authority code', 'local authority name'])
disability=disability.groupby('ward name').sum()
disability.head()

Unnamed: 0_level_0,All Usual residents,Disabled; activities limited a lot,Disabled; activities limited a little,Not disabled
ward name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Abbey,14230,498,921,12811
Abbey Road,12987,689,737,11561
Abbey Wood,17422,1306,1302,14814
Abingdon,8162,294,485,7383
Addiscombe East,11798,638,1017,10143


In [251]:
disability['disabled_pro'] = ((disability['Disabled; activities limited a lot']+disability['Disabled; activities limited a little'])/disability['All Usual residents ']).round(2)

In [252]:
disability.head()

Unnamed: 0_level_0,All Usual residents,Disabled; activities limited a lot,Disabled; activities limited a little,Not disabled,disabled_pro
ward name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Abbey,14230,498,921,12811,0.1
Abbey Road,12987,689,737,11561,0.11
Abbey Wood,17422,1306,1302,14814,0.15
Abingdon,8162,294,485,7383,0.1
Addiscombe East,11798,638,1017,10143,0.14


In [368]:
disability = disability.reset_index()

# house_sale_price

In [265]:
houseprice = pd.read_excel(("data/ward_socioeconomic/HPSSA Dataset 37 - Median price paid by ward.xls"),
                           sheet_name='1a',
                           header=[5])

In [266]:
houseprice2=houseprice[
    houseprice["Local authority code"].str.contains(r'E090000(?:0[1-9]|1[0-9]|2[0-9]|3[0-1])'
, regex=True, flags=re.IGNORECASE) 
]
houseprice2.head()

Unnamed: 0,Local authority code,Local authority name,Ward code,Ward name,Year ending Dec 1995,Year ending Mar 1996,Year ending Jun 1996,Year ending Sep 1996,Year ending Dec 1996,Year ending Mar 1997,...,Year ending Dec 2020,Year ending Mar 2021,Year ending Jun 2021,Year ending Sep 2021,Year ending Dec 2021,Year ending Mar 2022,Year ending Jun 2022,Year ending Sep 2022,Year ending Dec 2022,Year ending Mar 2023
6561,E09000001,City of London,E05009288,Aldersgate,100000,106250,112375,120250,121500,130076.5,...,775000,750000,687500,712500,751250,720000,722500,730000,745000,770000
6562,E09000001,City of London,E05009292,Bishopsgate,:,183750,178750,178750,178750,:,...,:,:,:,:,1250850,1259550,1259550,1296900,1445300,1642650
6563,E09000001,City of London,E05009302,Cripplegate,111250,113000,110000,114000,113500,115000,...,797500,792500,787500,800000,805000,797500,797500,805000,810000,842500
6564,E09000001,City of London,E05009304,Farringdon Within,:,:,:,:,195000,195000,...,:,:,:,:,:,870000,870000,783500,897500,925000
6565,E09000001,City of London,E05009305,Farringdon Without,86500,115000,115000,139000,125000,116000,...,1132125,840000,827700,805000,812000,846300,820000,810000,800000,1005678


In [267]:
# Selecting the relevant columns for the year 2022
columns_2022 = ['Ward name', 'Year ending Mar 2022', 'Year ending Jun 2022', 'Year ending Sep 2022', 'Year ending Dec 2022']

# Create a new dataframe with only the relevant columns
houseprice_2022 = houseprice2[columns_2022]

# Replace ':' with NaN (to handle missing values) and convert the columns to numeric
houseprice_2022.replace(':', pd.NA, inplace=True)
houseprice_2022 = houseprice_2022.apply(pd.to_numeric, errors='ignore')

# Display the dataframe after modifications and the summary of missing values
houseprice_2022.head()
houseprice_2022.isna().sum()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  houseprice_2022.replace(':', pd.NA, inplace=True)


Ward name               0
Year ending Mar 2022    1
Year ending Jun 2022    2
Year ending Sep 2022    1
Year ending Dec 2022    1
dtype: int64

In [270]:
houseprice_2022 = houseprice_2022.set_index('Ward name')

In [271]:
houseprice_2022=houseprice_2022.groupby('Ward name').sum()

In [272]:
houseprice_2022['median_houseprice_2022'] = houseprice_2022[
    ['Year ending Mar 2022', 'Year ending Jun 2022', 'Year ending Sep 2022', 'Year ending Dec 2022']
].median(axis=1, skipna=True)

# Display the updated dataframe with the median house price column
houseprice_2022.sample(3)

Unnamed: 0_level_0,Year ending Mar 2022,Year ending Jun 2022,Year ending Sep 2022,Year ending Dec 2022,median_houseprice_2022
Ward name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Pembridge,1350000.0,1295000.0,1295000.0,1275000.0,1295000.0
Avonmore and Brook Green,627000.0,602500.0,588750.0,590000.0,596250.0
Haggerston,518000.0,506250.0,525000.0,507000.0,512500.0


In [379]:
houseprice_2022.rename(columns={'Ward name': 'ward name'}, inplace=True)

In [274]:
houseprice_2022.shape[0]

585

In [380]:
houseprice_2022.head()

Unnamed: 0,ward name,Year ending Mar 2022,Year ending Jun 2022,Year ending Sep 2022,Year ending Dec 2022,median_houseprice_2022
0,Abbey,953000.0,973000.0,980200.0,995000.0,976600.0
1,Abbey Wood,375000.0,380000.0,390000.0,400000.0,385000.0
2,Abingdon,1430000.0,1400000.0,1440000.0,1440000.0,1435000.0
3,Acton Central,570000.0,600000.0,706250.0,652500.0,626250.0
4,Addiscombe East,440000.0,435000.0,442000.0,457538.0,441000.0


In [369]:
houseprice_2022 = houseprice_2022.reset_index()

# house_rent_price

In [280]:
house_rent_price = pd.read_excel(("data/ward_socioeconomic/London Living Rent ward benchmark data 2023-24_0.xlsx"),
    )

In [281]:
house_rent_price.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 680 entries, 0 to 679
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Local authority  680 non-null    object 
 1   Ward             680 non-null    object 
 2   One bedroom      680 non-null    float64
 3   Two bedrooms     680 non-null    float64
 4   Three bedrooms   680 non-null    float64
 5   Four bedrooms    680 non-null    float64
 6   Five bedrooms    680 non-null    float64
 7   Six bedrooms     680 non-null    float64
dtypes: float64(6), object(2)
memory usage: 42.6+ KB


In [282]:
house_rent_price.columns.to_list()

['Local authority',
 'Ward',
 'One bedroom',
 'Two bedrooms',
 'Three bedrooms',
 'Four bedrooms',
 'Five bedrooms',
 'Six bedrooms']

In [283]:
# Dropping the specified columns
house_rent_price = house_rent_price.drop(columns=['Local authority'])
house_rent_price=house_rent_price.groupby('Ward').sum()
house_rent_price.head()

Unnamed: 0_level_0,One bedroom,Two bedrooms,Three bedrooms,Four bedrooms,Five bedrooms,Six bedrooms
Ward,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Abbey,1994.915499,2216.572776,2438.230054,2624.573552,2726.621349,2800.0
Abbey Road,1204.003737,1337.78193,1400.0,1400.0,1400.0,1400.0
Abbey Wood,915.608079,1017.34231,1119.076541,1220.810772,1322.545003,1400.0
Abingdon,1345.228828,1400.0,1400.0,1400.0,1400.0,1400.0
Addiscombe East,1108.071851,1231.190945,1354.31004,1400.0,1400.0,1400.0


In [286]:
house_rent_price['mean_rent_price'] = (house_rent_price.iloc[:, 1:].mean(axis=1)).round(2)
house_rent_price.head()

Unnamed: 0_level_0,One bedroom,Two bedrooms,Three bedrooms,Four bedrooms,Five bedrooms,Six bedrooms,mean_rent_price
Ward,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Abbey,1994.915499,2216.572776,2438.230054,2624.573552,2726.621349,2800.0,2563.59
Abbey Road,1204.003737,1337.78193,1400.0,1400.0,1400.0,1400.0,1387.9
Abbey Wood,915.608079,1017.34231,1119.076541,1220.810772,1322.545003,1400.0,1217.33
Abingdon,1345.228828,1400.0,1400.0,1400.0,1400.0,1400.0,1400.0
Addiscombe East,1108.071851,1231.190945,1354.31004,1400.0,1400.0,1400.0,1357.98


In [383]:
house_rent_price.rename(columns={'Ward': 'ward name'}, inplace=True)

In [370]:
house_rent_price = house_rent_price.reset_index()

In [384]:
house_rent_price

Unnamed: 0,ward name,One bedroom,Two bedrooms,Three bedrooms,Four bedrooms,Five bedrooms,Six bedrooms,mean_rent_price
0,Abbey,1994.915499,2216.572776,2438.230054,2624.573552,2726.621349,2800.0,2563.59
1,Abbey Road,1204.003737,1337.781930,1400.000000,1400.000000,1400.000000,1400.0,1387.90
2,Abbey Wood,915.608079,1017.342310,1119.076541,1220.810772,1322.545003,1400.0,1217.33
3,Abingdon,1345.228828,1400.000000,1400.000000,1400.000000,1400.000000,1400.0,1400.00
4,Addiscombe East,1108.071851,1231.190945,1354.310040,1400.000000,1400.000000,1400.0,1357.98
...,...,...,...,...,...,...,...,...
665,Worcester Park North,1175.388013,1305.986681,1400.000000,1400.000000,1400.000000,1400.0,1381.72
666,Worcester Park South,1175.676169,1306.306854,1400.000000,1400.000000,1400.000000,1400.0,1381.78
667,Wormholt,1094.705403,1216.339337,1337.973270,1400.000000,1400.000000,1400.0,1351.80
668,Yeading,907.274258,1008.082509,1108.890759,1209.699010,1310.507261,1400.0,1208.82


# crime

In [299]:
crime = pd.read_csv("data/ward_socioeconomic/MPS Ward Level Crime (most recent 24 months).csv")

In [300]:
crime.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23453 entries, 0 to 23452
Data columns (total 29 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   MajorText           23453 non-null  object
 1   MinorText           23453 non-null  object
 2   WardName            23453 non-null  object
 3   WardCode            23453 non-null  object
 4   LookUp_BoroughName  23453 non-null  object
 5   202111              23453 non-null  int64 
 6   202112              23453 non-null  int64 
 7   202201              23453 non-null  int64 
 8   202202              23453 non-null  int64 
 9   202203              23453 non-null  int64 
 10  202204              23453 non-null  int64 
 11  202205              23453 non-null  int64 
 12  202206              23453 non-null  int64 
 13  202207              23453 non-null  int64 
 14  202208              23453 non-null  int64 
 15  202209              23453 non-null  int64 
 16  202210              23

In [301]:
# Dropping the specified columns
crime = crime.drop(columns=['MajorText', 'MinorText', 'LookUp_BoroughName', 'WardCode'])

# Grouping by 'WardName' and summing up the crime counts for each month
crime = crime.groupby('WardName').sum()

crime.head()

Unnamed: 0_level_0,202111,202112,202201,202202,202203,202204,202205,202206,202207,202208,...,202301,202302,202303,202304,202305,202306,202307,202308,202309,202310
WardName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Abbey,188,177,187,184,208,165,178,172,184,184,...,177,201,200,203,187,208,203,198,214,217
Abbey Road,59,57,69,40,65,42,41,72,57,78,...,69,67,56,68,58,82,71,74,54,70
Abbey Wood,145,114,146,120,144,104,109,144,124,131,...,120,134,125,121,151,143,121,122,152,120
Abingdon,73,57,61,52,87,87,76,70,76,56,...,83,49,64,63,85,75,89,84,106,83
Addiscombe East,100,77,76,58,79,72,72,73,76,66,...,82,63,71,74,79,48,89,63,76,90


In [310]:
# Selecting the relevant columns for the year 2022
columns_2023 = crime[['202301','202302','202303','202304','202305',
                '202306','202307','202308','202309','202310']]
columns_2023

Unnamed: 0_level_0,202301,202302,202303,202304,202305,202306,202307,202308,202309,202310
WardName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Abbey,177,201,200,203,187,208,203,198,214,217
Abbey Road,69,67,56,68,58,82,71,74,54,70
Abbey Wood,120,134,125,121,151,143,121,122,152,120
Abingdon,83,49,64,63,85,75,89,84,106,83
Addiscombe East,82,63,71,74,79,48,89,63,76,90
...,...,...,...,...,...,...,...,...,...,...
Worcester Park North,38,29,35,41,51,42,22,26,29,27
Worcester Park South,11,20,19,10,21,11,24,12,20,9
Wormholt,50,47,42,29,46,33,47,37,59,38
Yeading,111,101,101,91,114,125,90,130,117,112


In [313]:
crime_2023['average_crime']= crime_2023.iloc[:,1:].mean(axis=1).round(2)

In [315]:
crime_2023.rename(columns={'WardName': 'ward name'}, inplace=True)

In [371]:
crime_2023 = crime_2023.reset_index()

# religon

In [42]:
src_url   = 'https://data.london.gov.uk/download/2021-census-wards-ethnicity-language-identity-religion/7bb0ed88-e92f-4084-bdc6-66c6f1cba9f9/Religion.xlsx'
dest_path = os.path.join('data','ward_socioeconomic')

religon = pd.read_excel(
    cache_data(src_url, dest_path), 
    sheet_name='2021')  

Found data/ward_socioeconomic/Religion.xlsx locally!


In [43]:
religon.head()

Unnamed: 0,ward code,ward name,local authority code,local authority name,All usual residents,Christian,Buddhist,Hindu,Jewish,Muslim,Sikh,Other religion,No religion,Religion not stated
0,E09000001,City of London (aggregated),E09000001,City of London,8586,2981,94,203,180,538,8,54,3762,766
1,E05014053,Abbey,E09000002,Barking and Dagenham,3962,1355,28,728,2,1060,160,23,388,218
2,E05014054,Alibon,E09000002,Barking and Dagenham,9883,5114,26,154,8,1661,58,21,2247,594
3,E05014055,Barking Riverside,E09000002,Barking and Dagenham,10066,4882,46,209,4,2560,79,50,1698,538
4,E05014056,Beam,E09000002,Barking and Dagenham,8355,4551,30,141,10,1615,148,34,1362,464


# Ethnic group

In [44]:
src_url   = 'https://data.london.gov.uk/download/2021-census-wards-ethnicity-language-identity-religion/4243a044-6b22-4d32-8dbe-9113526b1976/Ethnic%20group.xlsx'
dest_path = os.path.join('data','ward_socioeconomic')

ethnic_group = pd.read_excel(
    cache_data(src_url, dest_path), 
    sheet_name='2021')  

Found data/ward_socioeconomic/Ethnic%20group.xlsx locally!


In [45]:
ethnic_group.head()

Unnamed: 0,ward code,ward name,local authority code,local authority name,All usual residents,White British,White Irish,White Gypsy/Irish Traveller,White Roma,White Other,...,Asian Bangladeshi,Asian Chinese,Asian Indian,Asian Pakistani,Asian Other,Black African,Black Caribbean,Black Other,Other Arab,Other Any other
0,E09000001,City of London (aggregated),E09000001,City of London,8584,3652,186,0,62,2065,...,283,544,322,32,259,150,54,26,113,369
1,E05014053,Abbey,E09000002,Barking and Dagenham,3963,447,17,3,34,691,...,346,87,968,358,131,453,68,70,42,125
2,E05014054,Alibon,E09000002,Barking and Dagenham,9889,3845,71,15,47,1555,...,859,67,247,282,296,1529,214,237,57,240
3,E05014055,Barking Riverside,E09000002,Barking and Dagenham,10055,1943,23,3,18,1163,...,969,91,374,593,201,2815,402,547,92,247
4,E05014056,Beam,E09000002,Barking and Dagenham,8360,2488,73,17,25,1202,...,699,41,301,358,156,1814,251,297,39,256


# age

In [400]:
src_url   = 'https://data.london.gov.uk/download/2021-census-wards-demography-and-migration/35b33393-f355-431b-9f87-ae55addaa636/Five%20year%20age%20bands.xlsx'
dest_path = os.path.join('data','ward_socioeconomic')

age = pd.read_excel(
    cache_data(src_url, dest_path), 
    sheet_name='2021')  

Found data/ward_socioeconomic/Five%20year%20age%20bands.xlsx locally!


In [401]:
age.columns.to_list()

['ward code',
 'ward name',
 'local authority code',
 'local authority name',
 'All usual residents',
 'Aged 4 years and under',
 'Aged 5 to 9 years',
 'Aged 10 to 14 years',
 'Aged 15 to 19 years',
 'Aged 20 to 24 years',
 'Aged 25 to 29 years',
 'Aged 30 to 34 years',
 'Aged 35 to 39 years',
 'Aged 40 to 44 years',
 'Aged 45 to 49 years',
 'Aged 50 to 54 years',
 'Aged 55 to 59 years',
 'Aged 60 to 64 years',
 'Aged 65 to 69 years',
 'Aged 70 to 74 years',
 'Aged 75 to 79 years',
 'Aged 80 to 84 years',
 'Aged 85 years and over']

In [402]:
# Dropping the specified columns
age = age.drop(columns=['ward code', 'local authority code', 'local authority name'])
age=age.groupby('ward name').sum()
age.head()

Unnamed: 0_level_0,All usual residents,Aged 4 years and under,Aged 5 to 9 years,Aged 10 to 14 years,Aged 15 to 19 years,Aged 20 to 24 years,Aged 25 to 29 years,Aged 30 to 34 years,Aged 35 to 39 years,Aged 40 to 44 years,Aged 45 to 49 years,Aged 50 to 54 years,Aged 55 to 59 years,Aged 60 to 64 years,Aged 65 to 69 years,Aged 70 to 74 years,Aged 75 to 79 years,Aged 80 to 84 years,Aged 85 years and over
ward name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
Abbey,14225,1121,948,697,512,728,1649,1826,1624,1299,950,776,625,453,328,277,186,104,122
Abbey Road,12980,589,612,669,580,864,1176,1345,1092,1004,916,934,735,561,474,481,355,279,314
Abbey Wood,17417,1147,1394,1475,1114,1029,1283,1336,1384,1265,1204,1198,1054,829,606,415,315,179,190
Abingdon,8180,356,343,355,279,736,760,656,630,539,691,672,524,451,328,346,238,141,135
Addiscombe East,11794,780,612,620,565,647,833,976,941,827,767,869,863,698,553,510,304,241,188


In [403]:
age['young_people']=((age['Aged 15 to 19 years']+age['Aged 20 to 24 years'])/age['All usual residents']).round(2)
age['children_teenager']=((age['Aged 4 years and under']+age['Aged 5 to 9 years']+age['Aged 10 to 14 years'])/age['All usual residents']).round(2)
age['old_people']=((age['Aged 65 to 69 years']+age['Aged 70 to 74 years']+age['Aged 75 to 79 years']+age['Aged 80 to 84 years']+age['Aged 85 years and over'])/age['All usual residents']).round(2)

In [404]:
age['middle_aged']=(1-(age['young_people']+age['children_teenager']+age['old_people'])).round(2)

In [405]:
age.head()

Unnamed: 0_level_0,All usual residents,Aged 4 years and under,Aged 5 to 9 years,Aged 10 to 14 years,Aged 15 to 19 years,Aged 20 to 24 years,Aged 25 to 29 years,Aged 30 to 34 years,Aged 35 to 39 years,Aged 40 to 44 years,...,Aged 60 to 64 years,Aged 65 to 69 years,Aged 70 to 74 years,Aged 75 to 79 years,Aged 80 to 84 years,Aged 85 years and over,young_people,children_teenager,old_people,middle_aged
ward name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Abbey,14225,1121,948,697,512,728,1649,1826,1624,1299,...,453,328,277,186,104,122,0.09,0.19,0.07,0.65
Abbey Road,12980,589,612,669,580,864,1176,1345,1092,1004,...,561,474,481,355,279,314,0.11,0.14,0.15,0.6
Abbey Wood,17417,1147,1394,1475,1114,1029,1283,1336,1384,1265,...,829,606,415,315,179,190,0.12,0.23,0.1,0.55
Abingdon,8180,356,343,355,279,736,760,656,630,539,...,451,328,346,238,141,135,0.12,0.13,0.15,0.6
Addiscombe East,11794,780,612,620,565,647,833,976,941,827,...,698,553,510,304,241,188,0.1,0.17,0.15,0.58


In [406]:
age = age.reset_index()

In [407]:
age

Unnamed: 0,ward name,All usual residents,Aged 4 years and under,Aged 5 to 9 years,Aged 10 to 14 years,Aged 15 to 19 years,Aged 20 to 24 years,Aged 25 to 29 years,Aged 30 to 34 years,Aged 35 to 39 years,...,Aged 60 to 64 years,Aged 65 to 69 years,Aged 70 to 74 years,Aged 75 to 79 years,Aged 80 to 84 years,Aged 85 years and over,young_people,children_teenager,old_people,middle_aged
0,Abbey,14225,1121,948,697,512,728,1649,1826,1624,...,453,328,277,186,104,122,0.09,0.19,0.07,0.65
1,Abbey Road,12980,589,612,669,580,864,1176,1345,1092,...,561,474,481,355,279,314,0.11,0.14,0.15,0.60
2,Abbey Wood,17417,1147,1394,1475,1114,1029,1283,1336,1384,...,829,606,415,315,179,190,0.12,0.23,0.10,0.55
3,Abingdon,8180,356,343,355,279,736,760,656,630,...,451,328,346,238,141,135,0.12,0.13,0.15,0.60
4,Addiscombe East,11794,780,612,620,565,647,833,976,941,...,698,553,510,304,241,188,0.10,0.17,0.15,0.58
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
665,Worcester Park North,8317,566,616,617,454,386,407,595,730,...,360,297,254,212,127,110,0.10,0.22,0.12,0.56
666,Worcester Park South,7167,401,474,419,336,331,328,375,470,...,475,341,433,257,215,256,0.09,0.18,0.21,0.52
667,Wormholt,7825,402,408,542,516,765,836,697,553,...,350,241,222,130,107,106,0.16,0.17,0.10,0.57
668,Yeading,13564,1045,1092,1078,870,772,903,1113,1108,...,566,390,312,200,146,144,0.12,0.24,0.09,0.55


# deprivation

In [324]:
src_url   = 'https://data.london.gov.uk/download/2021-census-wards-demography-and-migration/05b99a3e-7e9a-467d-8d73-8417ee42bc9f/Household%20deprivation.xlsx'
dest_path = os.path.join('data','ward_socioeconomic')

household_deprivation = pd.read_excel(
    cache_data(src_url, dest_path), 
    sheet_name='2021')  

Found data/ward_socioeconomic/Household%20deprivation.xlsx locally!


In [325]:
# Dropping the specified columns
household_deprivation = household_deprivation.drop(columns=['ward code', 'local authority code', 'local authority name'])
household_deprivation=household_deprivation.groupby('ward name').sum()
household_deprivation.head()

Unnamed: 0_level_0,All Households,deprived in: no dimensions,1 dimension,2 dimensions,3 dimensions,4 dimensions
ward name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Abbey,5796,3217,1757,650,157,15
Abbey Road,6062,3425,1786,594,235,22
Abbey Wood,6207,2432,2239,1116,374,46
Abingdon,3893,2276,1171,324,106,16
Addiscombe East,4796,2391,1612,629,144,20


In [326]:
household_deprivation['household_deprivation_rate']=((household_deprivation['All Households']-household_deprivation['deprived in: no dimensions'])/household_deprivation['All Households']).round(2)

In [327]:
household_deprivation.sample(5)

Unnamed: 0_level_0,All Households,deprived in: no dimensions,1 dimension,2 dimensions,3 dimensions,4 dimensions,household_deprivation_rate
ward name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Wealdstone South,2623,1037,965,471,134,16,0.6
Colham & Cowley,6627,2675,2455,1144,321,32,0.6
Brimsdown,5606,1979,2104,1119,362,42,0.65
Kenton East,4491,1867,1730,720,163,11,0.58
Charlton Village & Riverside,3969,1666,1355,688,245,15,0.58


In [373]:
household_deprivation = household_deprivation.reset_index()

# economiaclly active total 不需要

In [54]:
src_url   = 'https://data.london.gov.uk/download/2021-census-wards-labour-market/6030ee1e-7f1f-47ca-9175-b30a12fe5b0c/Economic%20Activity.xlsx'
dest_path = os.path.join('data','ward_socioeconomic')

economic_activity = pd.read_excel(
    cache_data(src_url, dest_path), 
    sheet_name='2021')  

Found data/ward_socioeconomic/Economic%20Activity.xlsx locally!


In [55]:
economic_activity.head()

Unnamed: 0,ward code,ward name,local authority code,local authority name,All usual residents aged 16+,Economically active Total,Employee; Part time,Employee; Full time,Self employed with employees Part time,Self employed with employees Full time,Self employed without employees Part time,Self employed without employees Full time,Unemployed,Economically active Full time student,Economically inactive Total,Economically inactive: retired,Economically inactive full time students,Looking after home or family,Long term sick or disabled,Economically inactive: other
0,E09000001,City of London,E09000001,City of London,8014,5677,441,3761,28,149,365,520,283,130,2337,993,723,195,123,303
1,E05014053,Abbey,E09000002,Barking and Dagenham,2952,2045,315,1099,21,51,86,195,143,135,907,149,168,313,116,161
2,E05014054,Alibon,E09000002,Barking and Dagenham,7458,4867,926,2331,31,106,285,567,339,282,2591,863,534,545,294,355
3,E05014055,Barking Riverside,E09000002,Barking and Dagenham,6903,4970,961,2625,39,75,242,393,361,274,1933,275,555,578,250,275
4,E05014056,Beam,E09000002,Barking and Dagenham,6080,3901,841,1803,37,98,222,368,296,236,2179,697,492,462,237,291


# creat whole dataset

In [408]:
neighbor_socioeconomic=status[['ward name','nonwork_rate','middle_high_class_pro','advanced_education_rate']]

In [409]:
neighbor_socioeconomic.head()

Unnamed: 0,ward name,nonwork_rate,middle_high_class_pro,advanced_education_rate
0,Abbey,0.08,0.57,0.07
1,Abbey Road,0.11,0.59,0.11
2,Abbey Wood,0.13,0.35,0.1
3,Abingdon,0.1,0.57,0.13
4,Addiscombe East,0.08,0.53,0.07


In [410]:
common_key='ward name'

# add every column into the first dataset
neighbor_socioeconomic = neighbor_socioeconomic.merge(method_of_work[[common_key, 'public_transport_station_coverage']], on=common_key, how='left')
neighbor_socioeconomic = neighbor_socioeconomic.merge(commute_distance[[common_key, 'average_commute_distance']], on=common_key, how='left')
neighbor_socioeconomic = neighbor_socioeconomic.merge(health[[common_key, 'All Usual residents','unhealthy_rate']], on=common_key, how='left')
neighbor_socioeconomic = neighbor_socioeconomic.merge(disability[[common_key, 'disabled_pro']], on=common_key, how='left')

neighbor_socioeconomic = neighbor_socioeconomic.merge(houseprice_2022[[common_key, 'median_houseprice_2022']], on=common_key, how='left')
neighbor_socioeconomic = neighbor_socioeconomic.merge(house_rent_price[[common_key, 'mean_rent_price']], on=common_key, how='left')
neighbor_socioeconomic = neighbor_socioeconomic.merge(crime_2023[[common_key, 'average_crime']], on=common_key, how='left')
neighbor_socioeconomic = neighbor_socioeconomic.merge(age[[common_key, 'young_people','children_teenager','old_people','middle_aged']], on=common_key, how='left')
neighbor_socioeconomic = neighbor_socioeconomic.merge(household_deprivation[[common_key, 'household_deprivation_rate']], on=common_key, how='left')

In [411]:
neighbor_socioeconomic.head()

Unnamed: 0,ward name,nonwork_rate,middle_high_class_pro,advanced_education_rate,public_transport_station_coverage,average_commute_distance,All Usual residents,unhealthy_rate,disabled_pro,median_houseprice_2022,mean_rent_price,average_crime,young_people,children_teenager,old_people,middle_aged,household_deprivation_rate
0,Abbey,0.08,0.57,0.07,0.54,3.86,14229,0.03,0.1,976600.0,2563.59,365.09,0.09,0.19,0.07,0.65,0.44
1,Abbey Road,0.11,0.59,0.11,0.44,2.01,12977,0.04,0.11,,1387.9,121.64,0.11,0.14,0.15,0.6,0.44
2,Abbey Wood,0.13,0.35,0.1,0.49,6.37,17425,0.05,0.15,385000.0,1217.33,238.0,0.12,0.23,0.1,0.55,0.61
3,Abingdon,0.1,0.57,0.13,0.41,1.95,8158,0.03,0.1,1435000.0,1400.0,142.0,0.12,0.13,0.15,0.6,0.42
4,Addiscombe East,0.08,0.53,0.07,0.4,4.36,11802,0.04,0.14,441000.0,1357.98,133.64,0.1,0.17,0.15,0.58,0.5


In [415]:
neighbor_socioeconomic.to_excel('Neighbourdata_Cleaned.xlsx')

# zhi biao
commute_distance["average_commute_distance"] 2

health['unhealthy_rate']3 4

disability['disabled_pro']5 

houseprice_2022['median_houseprice_2022']6

house_rent_price['mean_rent_price'] 7

crime_sum_ward = crime_sum_2023.groupby('WardName')['avarge_crime'] 8

young_people	children_teenager	old_people	middle_aged 12

household_deprivation['household_deprivation_rate'] 13

In [416]:
#read airbnb data
airbnb_file = 'airbnb_ward_clean.csv'
airbnb_data = pd.read_csv(airbnb_file)

In [417]:
airbnb_data.head()

Unnamed: 0.1,Unnamed: 0,WD22NM,total_room,Entire home/apt,Hotel room,Private room,Shared room,avg_price,avg_minimum_nights,avg_reviews_per_month,avg_availability_365,avg_star_rate
0,0,Abbey,170,110.0,0.0,60.0,0.0,146.958824,3.670588,0.843016,131.158824,4.67046
1,1,Abbey Road,195,152.0,0.0,43.0,0.0,183.897436,5.287179,1.362932,128.512821,4.677921
2,2,Abbey Wood,85,37.0,0.0,48.0,0.0,87.376471,4.176471,0.822787,228.435294,4.650213
3,3,Abingdon,278,227.0,0.0,51.0,0.0,220.147482,4.521583,0.874352,150.741007,4.572039
4,4,Addiscombe East,33,13.0,0.0,20.0,0.0,80.393939,3.363636,1.2348,113.848485,4.737778


In [419]:
# Rename the column in Airbnb data to match the Neighbourhood data
airbnb_data = airbnb_data.rename(columns={"WD22NM": "ward name"})

# Merge the two datasets on 'ward name'
merged_data = pd.merge(airbnb_data_renamed, neighbor_socioeconomic, on="ward name", how="left")

# Display the first few rows of the merged dataframe
merged_data.head()

Unnamed: 0.1,Unnamed: 0,ward name,total_room,Entire home/apt,Hotel room,Private room,Shared room,avg_price,avg_minimum_nights,avg_reviews_per_month,...,unhealthy_rate,disabled_pro,median_houseprice_2022,mean_rent_price,average_crime,young_people,children_teenager,old_people,middle_aged,household_deprivation_rate
0,0,Abbey,170,110.0,0.0,60.0,0.0,146.958824,3.670588,0.843016,...,0.03,0.1,976600.0,2563.59,365.09,0.09,0.19,0.07,0.65,0.44
1,1,Abbey Road,195,152.0,0.0,43.0,0.0,183.897436,5.287179,1.362932,...,0.04,0.11,,1387.9,121.64,0.11,0.14,0.15,0.6,0.44
2,2,Abbey Wood,85,37.0,0.0,48.0,0.0,87.376471,4.176471,0.822787,...,0.05,0.15,385000.0,1217.33,238.0,0.12,0.23,0.1,0.55,0.61
3,3,Abingdon,278,227.0,0.0,51.0,0.0,220.147482,4.521583,0.874352,...,0.03,0.1,1435000.0,1400.0,142.0,0.12,0.13,0.15,0.6,0.42
4,4,Addiscombe East,33,13.0,0.0,20.0,0.0,80.393939,3.363636,1.2348,...,0.04,0.14,441000.0,1357.98,133.64,0.1,0.17,0.15,0.58,0.5


In [420]:
merged_data.to_excel('Airbnb_Merged.xlsx')