In [1]:
#load libraries
import pandas as pd
import re

In [2]:
import os
from requests import get
from urllib.parse import urlparse

def cache_data(src:str, dest:str) -> str:
    """Downloads and caches a remote file locally.
    
    The function sits between the 'read' step of a pandas or geopandas
    data frame and downloading the file from a remote location. The idea
    is that it will save it locally so that you don't need to remember to
    do so yourself. Subsequent re-reads of the file will return instantly
    rather than downloading the entire file for a second or n-th itme.
    
    Parameters
    ----------
    src : str
        The remote *source* for the file, any valid URL should work.
    dest : str
        The *destination* location to save the downloaded file.
        
    Returns
    -------
    str
        A string representing the local location of the file.
    """
    
    url = urlparse(src) # We assume that this is some kind of valid URL 
    fn  = os.path.split(url.path)[-1] # Extract the filename
    dfn = os.path.join(dest,fn) # Destination filename
    
    # Check if dest+filename does *not* exist -- 
    # that would mean we have to download it!
    if not os.path.isfile(dfn):
        
        print(f"{dfn} not found, downloading!")

        # Convert the path back into a list (without)
        # the filename -- we need to check that directories
        # exist first.
        path = os.path.split(dest)
        
        # Create any missing directories in dest(ination) path
        # -- os.path.join is the reverse of split (as you saw above)
        # but it doesn't work with lists... so I had to google how
        # to use the 'splat' operator! os.makedirs creates missing
        # directories in a path automatically.
        if len(path) >= 1 and path[0] != '':
            os.makedirs(os.path.join(*path), exist_ok=True)
            
        # Download and write the file
        with open(dfn, "wb") as file:
            response = get(src)
            file.write(response.content)
            
        print('Done downloading...')

    else:
        print(f"Found {dfn} locally!")

    return dfn

# socioeconomic data tidying

## nowork_rate、middle_high_class_pro、advanced_education_rate indictors

In [3]:
src_url   = 'https://data.london.gov.uk/download/2021-census-wards-labour-market/ab2e314e-dbd8-4afe-8304-1f5402eda542/NSSEC.xlsx'
dest_path = os.path.join('data','ward_socioeconomic')

status = pd.read_excel(
    cache_data(src_url, dest_path), 
    sheet_name='2021')       

Found data/ward_socioeconomic/NSSEC.xlsx locally!


In [4]:
status.head()

Unnamed: 0,ward code,ward name,local authority code,local authority name,All usual residents aged 16+,Higher managerial admin and professional,Lower managerial admin and professional,Intermediate,"Small employers, own account workers",Lower supervisory and technical,Semi routine,Routine,Never worked and long term unemployed,Full time students
0,E09000001,City of London,E09000001,City of London,8003,3173,1914,431,571,133,289,257,441,794
1,E05014053,Abbey,E09000002,Barking and Dagenham,2940,455,481,228,358,107,299,308,415,289
2,E05014054,Alibon,E09000002,Barking and Dagenham,7458,419,1139,769,1044,414,964,1039,893,777
3,E05014055,Barking Riverside,E09000002,Barking and Dagenham,6901,680,1271,736,757,341,743,785,785,803
4,E05014056,Beam,E09000002,Barking and Dagenham,6092,305,873,654,783,320,791,893,764,709


In [5]:
status.columns.to_list()

['ward code',
 'ward name',
 'local authority code',
 'local authority name',
 'All usual residents aged 16+ ',
 'Higher managerial admin and professional ',
 'Lower managerial admin and professional ',
 'Intermediate ',
 'Small employers, own account workers',
 'Lower supervisory and technical ',
 'Semi routine',
 'Routine ',
 'Never worked and long term unemployed',
 'Full time students']

In [6]:
# Dropping the specified columns
status = status.drop(columns=['local authority code', 'local authority name'])
status.head()

Unnamed: 0,ward code,ward name,All usual residents aged 16+,Higher managerial admin and professional,Lower managerial admin and professional,Intermediate,"Small employers, own account workers",Lower supervisory and technical,Semi routine,Routine,Never worked and long term unemployed,Full time students
0,E09000001,City of London,8003,3173,1914,431,571,133,289,257,441,794
1,E05014053,Abbey,2940,455,481,228,358,107,299,308,415,289
2,E05014054,Alibon,7458,419,1139,769,1044,414,964,1039,893,777
3,E05014055,Barking Riverside,6901,680,1271,736,757,341,743,785,785,803
4,E05014056,Beam,6092,305,873,654,783,320,791,893,764,709


In [7]:
status=status.groupby('ward code').sum()

In [8]:
#add no_work_people_rate column and no_work_people_rate=Never worked and long term unemployed  
status['nonwork_rate'] = (status['Never worked and long term unemployed']/status["All usual residents aged 16+ "]).round(2)
#add proportion of middle and high class population 
status['middle_high_class_pro'] = ((status['Higher managerial admin and professional '] + status['Lower managerial admin and professional '] + status['Intermediate '])/status["All usual residents aged 16+ "]).round(2)
#add proportion of higher education
status['advanced_education_rate'] = (status['Full time students']/status["All usual residents aged 16+ "]).round(2)

In [9]:
status.head()

Unnamed: 0_level_0,ward name,All usual residents aged 16+,Higher managerial admin and professional,Lower managerial admin and professional,Intermediate,"Small employers, own account workers",Lower supervisory and technical,Semi routine,Routine,Never worked and long term unemployed,Full time students,nonwork_rate,middle_high_class_pro,advanced_education_rate
ward code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
E05009317,Bethnal Green,17109,2607,3056,1363,1207,663,1422,1299,2314,3178,0.14,0.41,0.19
E05009318,Blackwall & Cubitt Town,18518,5891,3608,1293,1204,514,1020,859,1678,2451,0.09,0.58,0.13
E05009319,Bow East,16311,3621,3849,1314,1202,547,1196,1246,1875,1461,0.11,0.54,0.09
E05009320,Bow West,11331,2199,2562,948,931,433,865,984,1275,1134,0.11,0.5,0.1
E05009321,Bromley North,9015,1249,1645,804,677,366,889,795,1554,1036,0.17,0.41,0.11


In [10]:
status = status.reset_index()
status

Unnamed: 0,ward code,ward name,All usual residents aged 16+,Higher managerial admin and professional,Lower managerial admin and professional,Intermediate,"Small employers, own account workers",Lower supervisory and technical,Semi routine,Routine,Never worked and long term unemployed,Full time students,nonwork_rate,middle_high_class_pro,advanced_education_rate
0,E05009317,Bethnal Green,17109,2607,3056,1363,1207,663,1422,1299,2314,3178,0.14,0.41,0.19
1,E05009318,Blackwall & Cubitt Town,18518,5891,3608,1293,1204,514,1020,859,1678,2451,0.09,0.58,0.13
2,E05009319,Bow East,16311,3621,3849,1314,1202,547,1196,1246,1875,1461,0.11,0.54,0.09
3,E05009320,Bow West,11331,2199,2562,948,931,433,865,984,1275,1134,0.11,0.50,0.10
4,E05009321,Bromley North,9015,1249,1645,804,677,366,889,795,1554,1036,0.17,0.41,0.11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
675,E05014116,Streatham Wells,8229,1647,2067,747,889,329,646,679,701,524,0.09,0.54,0.06
676,E05014117,Vauxhall,8602,2173,1792,588,671,224,514,583,590,1467,0.07,0.53,0.17
677,E05014118,Waterloo & South Bank,8012,1708,1475,526,501,241,478,545,536,2002,0.07,0.46,0.25
678,E05014119,West Dulwich,9269,2441,2453,778,905,274,572,595,657,594,0.07,0.61,0.06


## public_transport_station_coverage 

In [11]:
# download data
src_url   = 'https://data.london.gov.uk/download/2021-census-wards-labour-market/d9f01451-3960-4a1f-a43d-392e45aa0830/Method%20of%20travel%20to%20work.xlsx'
dest_path = os.path.join('data','ward_socioeconomic')

method_of_work = pd.read_excel(
    cache_data(src_url, dest_path), 
    sheet_name='2021')       

Found data/ward_socioeconomic/Method%20of%20travel%20to%20work.xlsx locally!


In [12]:
# check the column names
method_of_work.columns.to_list()

['ward code',
 'ward name',
 'local authority code',
 'local authority name',
 'All usual residents aged 16+ in employment',
 'Work mainly at or from home',
 'Under-ground; metro; light rail; tram',
 'Train',
 'Bus; minibus or coach',
 'Taxi',
 'Motorcycle; scooter or moped',
 'Driving a car or van',
 'Passenger in a car or van',
 'Bicycle',
 'On foot',
 'Other method of travel to work']

In [13]:
# Dropping the specified columns
method_of_work = method_of_work.drop(columns=['local authority code', 'local authority name'])
method_of_work=method_of_work.groupby('ward code').sum()
method_of_work.head()

Unnamed: 0_level_0,ward name,All usual residents aged 16+ in employment,Work mainly at or from home,Under-ground; metro; light rail; tram,Train,Bus; minibus or coach,Taxi,Motorcycle; scooter or moped,Driving a car or van,Passenger in a car or van,Bicycle,On foot,Other method of travel to work
ward code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
E05009317,Bethnal Green,9653,4273,1290,444,699,42,56,1006,79,622,1030,112
E05009318,Blackwall & Cubitt Town,12792,7803,1913,479,384,52,35,876,66,274,799,111
E05009319,Bow East,10861,5917,1308,363,657,78,52,928,57,759,650,92
E05009320,Bow West,7265,3598,929,255,478,43,32,706,55,559,509,101
E05009321,Bromley North,5264,2078,983,373,433,33,33,622,41,240,359,69


In [14]:
#select a representive indicator to analyse. 
#we want to find the quantity of public transportation around places where people live, 
#so we decide to choose "Under-ground; metro; light rail; tram","Bus; minibus or coach","Train" columns to represent.
method_of_work["public_transport_station_coverage"] = ((method_of_work["Under-ground; metro; light rail; tram"]+method_of_work["Train"]+method_of_work["Bus; minibus or coach"])/(method_of_work["All usual residents aged 16+ in employment"]-method_of_work["Work mainly at or from home"])).round(2)
method_of_work.head()

Unnamed: 0_level_0,ward name,All usual residents aged 16+ in employment,Work mainly at or from home,Under-ground; metro; light rail; tram,Train,Bus; minibus or coach,Taxi,Motorcycle; scooter or moped,Driving a car or van,Passenger in a car or van,Bicycle,On foot,Other method of travel to work,public_transport_station_coverage
ward code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
E05009317,Bethnal Green,9653,4273,1290,444,699,42,56,1006,79,622,1030,112,0.45
E05009318,Blackwall & Cubitt Town,12792,7803,1913,479,384,52,35,876,66,274,799,111,0.56
E05009319,Bow East,10861,5917,1308,363,657,78,52,928,57,759,650,92,0.47
E05009320,Bow West,7265,3598,929,255,478,43,32,706,55,559,509,101,0.45
E05009321,Bromley North,5264,2078,983,373,433,33,33,622,41,240,359,69,0.56


In [15]:
#change ward name to merge other indicators
method_of_work.rename(columns={'Ward name': 'ward name'}, inplace=True)

In [16]:
method_of_work

Unnamed: 0_level_0,ward name,All usual residents aged 16+ in employment,Work mainly at or from home,Under-ground; metro; light rail; tram,Train,Bus; minibus or coach,Taxi,Motorcycle; scooter or moped,Driving a car or van,Passenger in a car or van,Bicycle,On foot,Other method of travel to work,public_transport_station_coverage
ward code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
E05009317,Bethnal Green,9653,4273,1290,444,699,42,56,1006,79,622,1030,112,0.45
E05009318,Blackwall & Cubitt Town,12792,7803,1913,479,384,52,35,876,66,274,799,111,0.56
E05009319,Bow East,10861,5917,1308,363,657,78,52,928,57,759,650,92,0.47
E05009320,Bow West,7265,3598,929,255,478,43,32,706,55,559,509,101,0.45
E05009321,Bromley North,5264,2078,983,373,433,33,33,622,41,240,359,69,0.56
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
E05014116,Streatham Wells,5688,2719,437,309,748,20,75,723,48,252,289,68,0.50
E05014117,Vauxhall,5516,3077,582,225,437,41,30,282,24,279,466,73,0.51
E05014118,Waterloo & South Bank,4425,2268,416,167,395,37,27,242,25,227,570,51,0.45
E05014119,West Dulwich,6237,3549,313,441,477,24,40,673,30,335,307,48,0.46


In [17]:
method_of_work = method_of_work.reset_index()

## commute_distance

In [18]:
# download data
src_url   = ' https://data.london.gov.uk/download/2021-census-wards-labour-market/7725cc70-ab2b-4674-99cb-b825bc9ef4ca/Distance%20traveled%20to%20work.xlsx'
dest_path = os.path.join('data','ward_socioeconomic')

commute_distance = pd.read_excel(
    cache_data(src_url, dest_path), 
    sheet_name='2021')  

Found data/ward_socioeconomic/Distance%20traveled%20to%20work.xlsx locally!


In [19]:
# check columns
commute_distance.columns.to_list()

['ward code',
 'ward name',
 'local authority code',
 'local authority name',
 'All usual residents aged 16+ in employment',
 'Less than 2km',
 '2km to less than 5km',
 '5km to less than 10km',
 '10km to less than 20km',
 '20km to less than 30km',
 '30km to less than 40km',
 '40km to less than 60km',
 '60km and over',
 'Works mainly from home',
 'Other']

In [20]:
# Dropping the specified columns
commute_distance = commute_distance.drop(columns=['local authority code', 'local authority name'])
commute_distance=commute_distance.groupby('ward code').sum()
commute_distance.head()

Unnamed: 0_level_0,ward name,All usual residents aged 16+ in employment,Less than 2km,2km to less than 5km,5km to less than 10km,10km to less than 20km,20km to less than 30km,30km to less than 40km,40km to less than 60km,60km and over,Works mainly from home,Other
ward code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
E05009317,Bethnal Green,9649,1087,1385,1159,390,82,36,54,79,4273,1104
E05009318,Blackwall & Cubitt Town,12793,885,754,1553,643,89,47,53,61,7803,905
E05009319,Bow East,10880,630,1305,1376,462,48,31,35,52,5917,1024
E05009320,Bow West,7261,515,1096,880,292,55,24,21,48,3598,732
E05009321,Bromley North,5263,433,824,842,317,51,30,28,34,2078,626


In [21]:
# add average_commute_distance 
# average_commute_distance = sum of the total proportion of different commuting distances
commute_distance["average_commute_distance"] = ((commute_distance["Less than 2km"]*1+commute_distance["2km to less than 5km"]*3.5+commute_distance["5km to less than 10km"]*7.5+commute_distance["10km to less than 20km"]*15+commute_distance["20km to less than 30km"]*25+commute_distance["30km to less than 40km"]*35+commute_distance["40km to less than 60km"]*50+commute_distance["60km and over"]*60)/commute_distance["All usual residents aged 16+ in employment"]).round(2)

In [22]:
#reset index
commute_distance = commute_distance.reset_index()

In [23]:
commute_distance

Unnamed: 0,ward code,ward name,All usual residents aged 16+ in employment,Less than 2km,2km to less than 5km,5km to less than 10km,10km to less than 20km,20km to less than 30km,30km to less than 40km,40km to less than 60km,60km and over,Works mainly from home,Other,average_commute_distance
0,E05009317,Bethnal Green,9649,1087,1385,1159,390,82,36,54,79,4273,1104,3.24
1,E05009318,Blackwall & Cubitt Town,12793,885,754,1553,643,89,47,53,61,7803,905,2.74
2,E05009319,Bow East,10880,630,1305,1376,462,48,31,35,52,5917,1024,2.72
3,E05009320,Bow West,7261,515,1096,880,292,55,24,21,48,3598,732,2.96
4,E05009321,Bromley North,5263,433,824,842,317,51,30,28,34,2078,626,3.83
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
675,E05014116,Streatham Wells,5694,243,518,978,398,44,17,16,29,2719,732,3.44
676,E05014117,Vauxhall,5506,391,869,393,167,35,23,10,27,3077,514,2.30
677,E05014118,Waterloo & South Bank,4420,565,554,331,154,25,13,19,22,2268,469,2.41
678,E05014119,West Dulwich,6247,279,561,942,256,28,8,8,32,3549,584,2.63


## uhealthy_rate、usual_residents

In [24]:
# download data
src_url   = 'https://data.london.gov.uk/download/2021-census-wards-qualifications-health-disability-and-care/af5f35f0-fdd3-4531-9616-ff1f1a64101a/General%20health.xlsx'
dest_path = os.path.join('data','ward_socioeconomic')

health = pd.read_excel(
    cache_data(src_url, dest_path), 
    sheet_name='2021')  

Found data/ward_socioeconomic/General%20health.xlsx locally!


In [25]:
# check column name
health.columns.to_list()

['ward code',
 'ward name',
 'local authority code',
 'local authority name',
 'All Usual residents ',
 'Very good health',
 'Good health',
 'Fair health',
 'Bad health',
 'Very bad health']

In [26]:
# Dropping the specified columns
health = health.drop(columns=['local authority code', 'local authority name'])
health=health.groupby('ward code').sum()
health.head()

Unnamed: 0_level_0,ward name,All Usual residents,Very good health,Good health,Fair health,Bad health,Very bad health
ward code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
E05009317,Bethnal Green,21219,11024,6891,2194,843,267
E05009318,Blackwall & Cubitt Town,21823,12609,6931,1633,483,167
E05009319,Bow East,19535,10495,6152,1931,692,265
E05009320,Bow West,13713,7246,4279,1450,552,186
E05009321,Bromley North,11436,5596,3854,1340,462,184


In [27]:
#i think bad health rate can be representaive
#add unhealtht_rate column
health['unhealthy_rate']=((health['Bad health']+health['Very bad health'])/health['All Usual residents ']).round(2)

In [28]:
health.rename(columns={'All Usual residents ': 'All Usual residents'}, inplace=True)

In [29]:
health.head()

Unnamed: 0_level_0,ward name,All Usual residents,Very good health,Good health,Fair health,Bad health,Very bad health,unhealthy_rate
ward code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
E05009317,Bethnal Green,21219,11024,6891,2194,843,267,0.05
E05009318,Blackwall & Cubitt Town,21823,12609,6931,1633,483,167,0.03
E05009319,Bow East,19535,10495,6152,1931,692,265,0.05
E05009320,Bow West,13713,7246,4279,1450,552,186,0.05
E05009321,Bromley North,11436,5596,3854,1340,462,184,0.06


In [30]:
health = health.reset_index()

## disability

In [31]:
# download data
src_url   = 'https://data.london.gov.uk/download/2021-census-wards-qualifications-health-disability-and-care/a9f40c75-fbe5-4a52-acc5-d5a9ac496c4d/Disability.xlsx'
dest_path = os.path.join('data','ward_socioeconomic')

disability = pd.read_excel(
    cache_data(src_url, dest_path), 
    sheet_name='2021')  

Found data/ward_socioeconomic/Disability.xlsx locally!


In [32]:
#check data cloumn names
disability.columns.to_list()

['ward code',
 'ward name',
 'local authority code',
 'local authority name',
 'All Usual residents ',
 'Disabled; activities limited a lot',
 'Disabled; activities limited a little',
 'Not disabled']

In [33]:
# Dropping the specified columns
disability = disability.drop(columns=['local authority code', 'local authority name'])
disability=disability.groupby('ward code').sum()
disability.head()

Unnamed: 0_level_0,ward name,All Usual residents,Disabled; activities limited a lot,Disabled; activities limited a little,Not disabled
ward code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
E05009317,Bethnal Green,21209,1261,1729,18219
E05009318,Blackwall & Cubitt Town,21824,775,1289,19760
E05009319,Bow East,19534,1167,1584,16783
E05009320,Bow West,13712,828,1144,11740
E05009321,Bromley North,11434,736,911,9787


In [34]:
#add the proportion of disabled people
disability['disabled_pro'] = ((disability['Disabled; activities limited a lot']+disability['Disabled; activities limited a little'])/disability['All Usual residents ']).round(2)

In [35]:
disability.head()

Unnamed: 0_level_0,ward name,All Usual residents,Disabled; activities limited a lot,Disabled; activities limited a little,Not disabled,disabled_pro
ward code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
E05009317,Bethnal Green,21209,1261,1729,18219,0.14
E05009318,Blackwall & Cubitt Town,21824,775,1289,19760,0.09
E05009319,Bow East,19534,1167,1584,16783,0.14
E05009320,Bow West,13712,828,1144,11740,0.14
E05009321,Bromley North,11434,736,911,9787,0.14


In [36]:
disability = disability.reset_index()

## house_rent_price

In [37]:
# download data
src_url   = 'https://www.london.gov.uk/media/88817/download'
dest_path = os.path.join('data','ward_socioeconomic')

house_rent_price = pd.read_excel(
    cache_data(src_url, dest_path),header=[2])  

Found data/ward_socioeconomic/download locally!


In [38]:
house_rent_price.head()

Unnamed: 0,Borough name,Ward name,One bedroom,Two bedrooms,Three bedrooms,Four bedrooms,Five bedrooms,Six bedrooms
0,Barking and Dagenham,Abbey,827.607183,919.563537,1011.51989,1103.476244,1195.432598,1287.388951
1,Barking and Dagenham,Alibon,745.09227,827.8803,910.66833,993.45636,1076.24439,1159.03242
2,Barking and Dagenham,Barking Riverside,981.192343,1090.213714,1199.235085,1308.256457,1400.0,1400.0
3,Barking and Dagenham,Beam,654.128228,726.809143,799.490057,872.170971,944.851885,1017.5328
4,Barking and Dagenham,Becontree,721.867408,802.074898,882.282388,962.489878,1042.697367,1122.904857


In [39]:
#check information of data
house_rent_price.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 680 entries, 0 to 679
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Borough name    680 non-null    object 
 1   Ward name       680 non-null    object 
 2   One bedroom     680 non-null    float64
 3   Two bedrooms    680 non-null    float64
 4   Three bedrooms  680 non-null    float64
 5   Four bedrooms   680 non-null    float64
 6   Five bedrooms   680 non-null    float64
 7   Six bedrooms    680 non-null    float64
dtypes: float64(6), object(2)
memory usage: 42.6+ KB


In [40]:
house_rent_price.columns.to_list()

['Borough name',
 'Ward name',
 'One bedroom',
 'Two bedrooms',
 'Three bedrooms',
 'Four bedrooms',
 'Five bedrooms',
 'Six bedrooms']

In [41]:
# Dropping the specified columns
house_rent_price=house_rent_price.groupby('Ward name').sum()
house_rent_price.head()

Unnamed: 0_level_0,Borough name,One bedroom,Two bedrooms,Three bedrooms,Four bedrooms,Five bedrooms,Six bedrooms
Ward name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Abbey,Barking and DagenhamMerton,1856.438836,2062.709818,2268.980799,2475.251781,2595.432598,2687.388951
Abbey Road,Westminster,1100.067184,1222.296871,1344.526558,1400.0,1400.0,1400.0
Abbey Wood,Greenwich,823.022316,914.46924,1005.916164,1097.363088,1188.810012,1280.256936
Abingdon,Kensington and Chelsea,1231.811425,1368.679361,1400.0,1400.0,1400.0,1400.0
Addiscombe East,Croydon,986.287446,1095.874941,1205.462435,1315.049929,1400.0,1400.0


In [42]:
#calculate mean ret price
house_rent_price['mean_rent_price'] = (house_rent_price.iloc[:, 1:].mean(axis=1)).round(2)
house_rent_price.head()

Unnamed: 0_level_0,Borough name,One bedroom,Two bedrooms,Three bedrooms,Four bedrooms,Five bedrooms,Six bedrooms,mean_rent_price
Ward name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Abbey,Barking and DagenhamMerton,1856.438836,2062.709818,2268.980799,2475.251781,2595.432598,2687.388951,2324.37
Abbey Road,Westminster,1100.067184,1222.296871,1344.526558,1400.0,1400.0,1400.0,1311.15
Abbey Wood,Greenwich,823.022316,914.46924,1005.916164,1097.363088,1188.810012,1280.256936,1051.64
Abingdon,Kensington and Chelsea,1231.811425,1368.679361,1400.0,1400.0,1400.0,1400.0,1366.75
Addiscombe East,Croydon,986.287446,1095.874941,1205.462435,1315.049929,1400.0,1400.0,1233.78


In [43]:
house_rent_price['mean_rent_price'].isna()

Ward name
Abbey                   False
Abbey Road              False
Abbey Wood              False
Abingdon                False
Addiscombe East         False
                        ...  
Worcester Park North    False
Worcester Park South    False
Wormholt                False
Yeading                 False
Yiewsley                False
Name: mean_rent_price, Length: 670, dtype: bool

In [44]:
house_rent_price = house_rent_price.reset_index()

In [45]:
#rename ward name to merge
house_rent_price.rename(columns={'Ward name': 'ward name'}, inplace=True)

## crime_rate

In [46]:
src_url   = 'https://data.london.gov.uk/download/recorded_crime_summary/5daf1d8d-29b8-414f-bfe5-3f37566ef4c4/MPS%20Ward%20Level%20Crime%20%28most%20recent%2024%20months%29.csv'
dest_path = os.path.join('data','ward_socioeconomic')

crime = pd.read_csv(
    cache_data(src_url, dest_path))  

Found data/ward_socioeconomic/MPS%20Ward%20Level%20Crime%20%28most%20recent%2024%20months%29.csv locally!


In [47]:
crime.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23438 entries, 0 to 23437
Data columns (total 29 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   MajorText           23438 non-null  object
 1   MinorText           23438 non-null  object
 2   WardName            23438 non-null  object
 3   WardCode            23438 non-null  object
 4   LookUp_BoroughName  23438 non-null  object
 5   202112              23438 non-null  int64 
 6   202201              23438 non-null  int64 
 7   202202              23438 non-null  int64 
 8   202203              23438 non-null  int64 
 9   202204              23438 non-null  int64 
 10  202205              23438 non-null  int64 
 11  202206              23438 non-null  int64 
 12  202207              23438 non-null  int64 
 13  202208              23438 non-null  int64 
 14  202209              23438 non-null  int64 
 15  202210              23438 non-null  int64 
 16  202211              23

In [48]:
# Dropping the specified columns
crime = crime.drop(columns=['MajorText', 'MinorText', 'LookUp_BoroughName'])

# Grouping by 'WardName' and summing up the crime counts for each month
crime = crime.groupby('WardCode').sum()

crime.head()

Unnamed: 0_level_0,WardName,202112,202201,202202,202203,202204,202205,202206,202207,202208,...,202302,202303,202304,202305,202306,202307,202308,202309,202310,202311
WardCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
E05009317,Bethnal GreenBethnal GreenBethnal GreenBethnal...,181,202,195,192,204,244,188,194,176,...,201,246,228,220,212,213,274,202,214,217
E05009318,Blackwall & Cubitt TownBlackwall & Cubitt Town...,138,129,117,119,120,125,138,139,129,...,122,108,125,161,155,183,139,151,145,102
E05009319,Bow EastBow EastBow EastBow EastBow EastBow Ea...,148,170,148,169,195,207,163,177,166,...,168,168,170,169,135,186,179,157,180,157
E05009320,Bow WestBow WestBow WestBow WestBow WestBow We...,87,115,115,126,107,115,87,110,137,...,125,137,116,113,123,124,150,131,128,115
E05009321,Bromley NorthBromley NorthBromley NorthBromley...,91,82,71,81,92,96,76,105,99,...,99,90,80,104,98,101,102,109,111,134


In [49]:
# Selecting the relevant columns for the year 2022
columns_2023 = crime[['202301','202302','202303','202304','202305',
                '202306','202307','202308','202309','202310']]
columns_2023

Unnamed: 0_level_0,202301,202302,202303,202304,202305,202306,202307,202308,202309,202310
WardCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
E05009317,223,201,246,228,220,212,213,274,202,214
E05009318,131,122,108,125,161,155,183,139,151,145
E05009319,160,168,168,170,169,135,186,179,157,180
E05009320,130,125,137,116,113,123,124,150,131,128
E05009321,99,99,90,80,104,98,101,102,109,111
...,...,...,...,...,...,...,...,...,...,...
E05014115,134,122,150,136,163,127,150,137,126,126
E05014116,55,53,63,53,58,58,61,50,73,52
E05014117,164,139,141,155,155,222,141,181,196,215
E05014118,347,372,339,379,392,429,420,402,349,357


In [50]:
# calculate avareage crime rate in every ward
crime['average_crime']= crime.iloc[:,1:].mean(axis=1).round(2)

In [51]:
crime = crime.reset_index()

In [52]:
crime.rename(columns={'WardCode': 'ward code'}, inplace=True)

In [53]:
crime.head()

Unnamed: 0,ward code,WardName,202112,202201,202202,202203,202204,202205,202206,202207,...,202303,202304,202305,202306,202307,202308,202309,202310,202311,average_crime
0,E05009317,Bethnal GreenBethnal GreenBethnal GreenBethnal...,181,202,195,192,204,244,188,194,...,246,228,220,212,213,274,202,214,217,209.46
1,E05009318,Blackwall & Cubitt TownBlackwall & Cubitt Town...,138,129,117,119,120,125,138,139,...,108,125,161,155,183,139,151,145,102,134.58
2,E05009319,Bow EastBow EastBow EastBow EastBow EastBow Ea...,148,170,148,169,195,207,163,177,...,168,170,169,135,186,179,157,180,157,164.42
3,E05009320,Bow WestBow WestBow WestBow WestBow WestBow We...,87,115,115,126,107,115,87,110,...,137,116,113,123,124,150,131,128,115,119.96
4,E05009321,Bromley NorthBromley NorthBromley NorthBromley...,91,82,71,81,92,96,76,105,...,90,80,104,98,101,102,109,111,134,95.54


## age

In [54]:
src_url   = 'https://data.london.gov.uk/download/2021-census-wards-demography-and-migration/35b33393-f355-431b-9f87-ae55addaa636/Five%20year%20age%20bands.xlsx'
dest_path = os.path.join('data','ward_socioeconomic')

age = pd.read_excel(
    cache_data(src_url, dest_path), 
    sheet_name='2021')  

Found data/ward_socioeconomic/Five%20year%20age%20bands.xlsx locally!


In [55]:
age.columns.to_list()

['ward code',
 'ward name',
 'local authority code',
 'local authority name',
 'All usual residents',
 'Aged 4 years and under',
 'Aged 5 to 9 years',
 'Aged 10 to 14 years',
 'Aged 15 to 19 years',
 'Aged 20 to 24 years',
 'Aged 25 to 29 years',
 'Aged 30 to 34 years',
 'Aged 35 to 39 years',
 'Aged 40 to 44 years',
 'Aged 45 to 49 years',
 'Aged 50 to 54 years',
 'Aged 55 to 59 years',
 'Aged 60 to 64 years',
 'Aged 65 to 69 years',
 'Aged 70 to 74 years',
 'Aged 75 to 79 years',
 'Aged 80 to 84 years',
 'Aged 85 years and over']

In [56]:
# Dropping the specified columns
age = age.drop(columns=['local authority code', 'local authority name'])
age=age.groupby('ward code').sum()
age.head()

Unnamed: 0_level_0,ward name,All usual residents,Aged 4 years and under,Aged 5 to 9 years,Aged 10 to 14 years,Aged 15 to 19 years,Aged 20 to 24 years,Aged 25 to 29 years,Aged 30 to 34 years,Aged 35 to 39 years,Aged 40 to 44 years,Aged 45 to 49 years,Aged 50 to 54 years,Aged 55 to 59 years,Aged 60 to 64 years,Aged 65 to 69 years,Aged 70 to 74 years,Aged 75 to 79 years,Aged 80 to 84 years,Aged 85 years and over
ward code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
E05009317,Bethnal Green,21205,1303,1343,1218,1949,2192,2647,2428,1807,1522,1131,973,796,654,469,302,190,139,142
E05009318,Blackwall & Cubitt Town,21816,1212,1007,919,919,2653,3997,3519,2363,1582,1132,761,594,388,250,205,140,99,76
E05009319,Bow East,19523,1128,967,952,898,1495,3028,3122,2178,1430,960,849,787,568,379,283,191,143,165
E05009320,Bow West,13733,860,695,704,662,1228,1778,1760,1310,918,775,750,624,514,410,305,184,125,131
E05009321,Bromley North,11431,814,774,687,654,979,1410,1441,1140,858,677,539,468,340,256,139,104,88,63


In [57]:
#according to the census, young people age include 15-24
age['young_people']=((age['Aged 15 to 19 years']+age['Aged 20 to 24 years'])/age['All usual residents']).round(2)
#according to the census, children and teenager age include 0-14
age['children_teenager']=((age['Aged 4 years and under']+age['Aged 5 to 9 years']+age['Aged 10 to 14 years'])/age['All usual residents']).round(2)
#according to the census, old people age include 65+
age['old_people']=((age['Aged 65 to 69 years']+age['Aged 70 to 74 years']+age['Aged 75 to 79 years']+age['Aged 80 to 84 years']+age['Aged 85 years and over'])/age['All usual residents']).round(2)

In [58]:
age['middle_aged']=(1-(age['young_people']+age['children_teenager']+age['old_people'])).round(2)

In [59]:
age.head()

Unnamed: 0_level_0,ward name,All usual residents,Aged 4 years and under,Aged 5 to 9 years,Aged 10 to 14 years,Aged 15 to 19 years,Aged 20 to 24 years,Aged 25 to 29 years,Aged 30 to 34 years,Aged 35 to 39 years,...,Aged 60 to 64 years,Aged 65 to 69 years,Aged 70 to 74 years,Aged 75 to 79 years,Aged 80 to 84 years,Aged 85 years and over,young_people,children_teenager,old_people,middle_aged
ward code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
E05009317,Bethnal Green,21205,1303,1343,1218,1949,2192,2647,2428,1807,...,654,469,302,190,139,142,0.2,0.18,0.06,0.56
E05009318,Blackwall & Cubitt Town,21816,1212,1007,919,919,2653,3997,3519,2363,...,388,250,205,140,99,76,0.16,0.14,0.04,0.66
E05009319,Bow East,19523,1128,967,952,898,1495,3028,3122,2178,...,568,379,283,191,143,165,0.12,0.16,0.06,0.66
E05009320,Bow West,13733,860,695,704,662,1228,1778,1760,1310,...,514,410,305,184,125,131,0.14,0.16,0.08,0.62
E05009321,Bromley North,11431,814,774,687,654,979,1410,1441,1140,...,340,256,139,104,88,63,0.14,0.2,0.06,0.6


In [60]:
age = age.reset_index()

In [61]:
age

Unnamed: 0,ward code,ward name,All usual residents,Aged 4 years and under,Aged 5 to 9 years,Aged 10 to 14 years,Aged 15 to 19 years,Aged 20 to 24 years,Aged 25 to 29 years,Aged 30 to 34 years,...,Aged 60 to 64 years,Aged 65 to 69 years,Aged 70 to 74 years,Aged 75 to 79 years,Aged 80 to 84 years,Aged 85 years and over,young_people,children_teenager,old_people,middle_aged
0,E05009317,Bethnal Green,21205,1303,1343,1218,1949,2192,2647,2428,...,654,469,302,190,139,142,0.20,0.18,0.06,0.56
1,E05009318,Blackwall & Cubitt Town,21816,1212,1007,919,919,2653,3997,3519,...,388,250,205,140,99,76,0.16,0.14,0.04,0.66
2,E05009319,Bow East,19523,1128,967,952,898,1495,3028,3122,...,568,379,283,191,143,165,0.12,0.16,0.06,0.66
3,E05009320,Bow West,13733,860,695,704,662,1228,1778,1760,...,514,410,305,184,125,131,0.14,0.16,0.08,0.62
4,E05009321,Bromley North,11431,814,774,687,654,979,1410,1441,...,340,256,139,104,88,63,0.14,0.20,0.06,0.60
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
675,E05014116,Streatham Wells,9879,543,540,475,386,520,1150,1210,...,380,303,238,159,140,138,0.09,0.16,0.10,0.65
676,E05014117,Vauxhall,9633,350,296,310,493,1356,1683,1138,...,317,214,139,102,78,75,0.19,0.10,0.06,0.65
677,E05014118,Waterloo & South Bank,8872,254,263,290,770,1464,1222,846,...,363,280,226,150,112,98,0.25,0.09,0.10,0.56
678,E05014119,West Dulwich,11505,681,701,720,565,523,968,1117,...,529,417,412,255,143,148,0.09,0.18,0.12,0.61


## deprivation:poverty 

In [62]:
src_url   = 'https://data.london.gov.uk/download/2021-census-wards-demography-and-migration/05b99a3e-7e9a-467d-8d73-8417ee42bc9f/Household%20deprivation.xlsx'
dest_path = os.path.join('data','ward_socioeconomic')

household_deprivation = pd.read_excel(
    cache_data(src_url, dest_path), 
    sheet_name='2021')  

Found data/ward_socioeconomic/Household%20deprivation.xlsx locally!


In [63]:
# Dropping the specified columns
household_deprivation = household_deprivation.drop(columns=['local authority code', 'local authority name'])
household_deprivation=household_deprivation.groupby('ward code').sum()
household_deprivation.head()

Unnamed: 0_level_0,ward name,All Households,deprived in: no dimensions,1 dimension,2 dimensions,3 dimensions,4 dimensions
ward code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
E05009317,Bethnal Green,7370,2898,2519,1379,531,43
E05009318,Blackwall & Cubitt Town,9694,5391,2974,929,366,34
E05009319,Bow East,8429,4234,2426,1280,450,39
E05009320,Bow West,5280,2387,1707,876,284,26
E05009321,Bromley North,4421,1784,1458,813,332,34


In [64]:
#add household_deprivation_rate
household_deprivation['household_deprivation_rate']=((household_deprivation['All Households']-household_deprivation['deprived in: no dimensions'])/household_deprivation['All Households']).round(2)

In [65]:
household_deprivation.sample(5)

Unnamed: 0_level_0,ward name,All Households,deprived in: no dimensions,1 dimension,2 dimensions,3 dimensions,4 dimensions,household_deprivation_rate
ward code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
E05013557,Rayners Lane,3060,1604,1055,353,47,1,0.48
E05013736,College Park & Old Oak,3885,1483,1408,720,253,21,0.62
E05011475,Park Hill & Whitgift,2392,1337,733,264,51,7,0.44
E05013582,Wood End,6186,2157,2299,1319,367,44,0.65
E05013934,Green Lane & St James,2372,1179,752,337,100,4,0.5


In [66]:
household_deprivation = household_deprivation.reset_index()

## creat whole socioeconomics dataset

In [67]:
# build up initial datafrmae
neighbor_socioeconomic=status[['ward code','ward name','nonwork_rate','middle_high_class_pro','advanced_education_rate']]

In [68]:
neighbor_socioeconomic.head()

Unnamed: 0,ward code,ward name,nonwork_rate,middle_high_class_pro,advanced_education_rate
0,E05009317,Bethnal Green,0.14,0.41,0.19
1,E05009318,Blackwall & Cubitt Town,0.09,0.58,0.13
2,E05009319,Bow East,0.11,0.54,0.09
3,E05009320,Bow West,0.11,0.5,0.1
4,E05009321,Bromley North,0.17,0.41,0.11


In [69]:
common_key='ward code'

# add every column into the first dataset
neighbor_socioeconomic = neighbor_socioeconomic.merge(method_of_work[[common_key, 'public_transport_station_coverage']], on=common_key, how='left')
neighbor_socioeconomic = neighbor_socioeconomic.merge(commute_distance[[common_key, 'average_commute_distance']], on=common_key, how='left')
neighbor_socioeconomic = neighbor_socioeconomic.merge(health[[common_key, 'All Usual residents','unhealthy_rate']], on=common_key, how='left')
neighbor_socioeconomic = neighbor_socioeconomic.merge(disability[[common_key, 'disabled_pro']], on=common_key, how='left')
neighbor_socioeconomic = neighbor_socioeconomic.merge(crime[[common_key, 'average_crime']], on=common_key, how='left')
neighbor_socioeconomic = neighbor_socioeconomic.merge(age[[common_key, 'young_people','children_teenager','old_people','middle_aged']], on=common_key, how='left')
neighbor_socioeconomic = neighbor_socioeconomic.merge(household_deprivation[[common_key, 'household_deprivation_rate']], on=common_key, how='left')
neighbor_socioeconomic = neighbor_socioeconomic.merge(house_rent_price[['ward name', 'mean_rent_price']], on='ward name', how='left')


In [70]:
neighbor_socioeconomic.head()

Unnamed: 0,ward code,ward name,nonwork_rate,middle_high_class_pro,advanced_education_rate,public_transport_station_coverage,average_commute_distance,All Usual residents,unhealthy_rate,disabled_pro
0,E05009317,Bethnal Green,0.14,0.41,0.19,0.45,3.24,21219,0.05,0.14
1,E05009318,Blackwall & Cubitt Town,0.09,0.58,0.13,0.56,2.74,21823,0.03,0.09
2,E05009319,Bow East,0.11,0.54,0.09,0.47,2.72,19535,0.05,0.14
3,E05009320,Bow West,0.11,0.5,0.1,0.45,2.96,13713,0.05,0.14
4,E05009321,Bromley North,0.17,0.41,0.11,0.56,3.83,11436,0.06,0.14


In [75]:
#Save to csv
neighbor_socioeconomic.to_csv('Neighbourdata_Cleaned.csv')

In [76]:
#check info
eighbor_socioeconomic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 680 entries, 0 to 679
Data columns (total 17 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   ward code                          680 non-null    object 
 1   ward name                          680 non-null    object 
 2   nonwork_rate                       680 non-null    float64
 3   middle_high_class_pro              680 non-null    float64
 4   advanced_education_rate            680 non-null    float64
 5   public_transport_station_coverage  680 non-null    float64
 6   average_commute_distance           680 non-null    float64
 7   All Usual residents                680 non-null    int64  
 8   unhealthy_rate                     680 non-null    float64
 9   disabled_pro                       680 non-null    float64
 10  average_crime                      679 non-null    float64
 11  young_people                       680 non-null    float64

# zhi biao
commute_distance["average_commute_distance"] 2

health['unhealthy_rate']3 4

disability['disabled_pro']5 

houseprice_2022['median_houseprice_2022']6

house_rent_price['mean_rent_price'] 7

crime_sum_ward = crime_sum_2023.groupby('WardName')['avarge_crime'] 8

young_people	children_teenager	old_people	middle_aged 12

household_deprivation['household_deprivation_rate'] 13

In [77]:
#read airbnb data
airbnb_file = 'data/airbnb_wardnumber_clean.csv'
airbnb_data = pd.read_csv(airbnb_file)

In [78]:
airbnb_data.head()

Unnamed: 0.1,Unnamed: 0,WD22CD,total_room,Entire home/apt,Hotel room,Private room,Shared room,avg_price,avg_minimum_nights,avg_reviews_per_month,avg_availability_365,avg_star_rate
0,0,E05009294,27,27.0,0.0,0.0,0.0,158.37037,6.555556,1.952917,36.962963,4.660667
1,1,E05009317,388,180.0,0.0,206.0,2.0,109.103093,4.979381,0.872397,91.185567,4.675286
2,2,E05009318,374,227.0,0.0,143.0,4.0,141.473262,5.409091,1.295324,124.737968,4.666619
3,3,E05009319,432,256.0,0.0,175.0,1.0,112.25463,3.94213,0.853971,74.125,4.780307
4,4,E05009320,282,156.0,1.0,118.0,7.0,105.971631,3.868794,0.874266,84.152482,4.717365


In [79]:
# Rename the column in Airbnb data to match the Neighbourhood data
airbnb_data = airbnb_data.rename(columns={"WD22CD": "ward code"})

In [80]:
# Merge the two datasets on 'ward name'
merged_data = pd.merge(airbnb_data, neighbor_socioeconomic, on="ward code", how="left")

# Display the first few rows of the merged dataframe
merged_data.head()

Unnamed: 0.1,Unnamed: 0,ward code,total_room,Entire home/apt,Hotel room,Private room,Shared room,avg_price,avg_minimum_nights,avg_reviews_per_month,...,All Usual residents,unhealthy_rate,disabled_pro,average_crime,young_people,children_teenager,old_people,middle_aged,household_deprivation_rate,mean_rent_price
0,0,E05009294,27,27.0,0.0,0.0,0.0,158.37037,6.555556,1.952917,...,,,,,,,,,,
1,1,E05009317,388,180.0,0.0,206.0,2.0,109.103093,4.979381,0.872397,...,21219.0,0.05,0.14,209.46,0.2,0.18,0.06,0.56,0.61,1328.37
2,2,E05009318,374,227.0,0.0,143.0,4.0,141.473262,5.409091,1.295324,...,21823.0,0.03,0.09,134.58,0.16,0.14,0.04,0.66,0.44,1348.4
3,3,E05009319,432,256.0,0.0,175.0,1.0,112.25463,3.94213,0.853971,...,19535.0,0.05,0.14,164.42,0.12,0.16,0.06,0.66,0.5,1315.48
4,4,E05009320,282,156.0,1.0,118.0,7.0,105.971631,3.868794,0.874266,...,13713.0,0.05,0.14,119.96,0.14,0.16,0.08,0.62,0.55,1302.59


In [81]:
#Save to csv
merged_data.to_csv('Airbnb_Merged.csv')