# Cleaning London House Prices by Borough Dataset

In [40]:
import pandas as pd
import seaborn as sns

# Load the 'Average price' sheet from the raw UK House Price Index workbook
raw_workbook = '../../Datasets/01_raw_data_files/UK House price index.xlsx'
df = pd.read_excel(raw_workbook, sheet_name='Average price')

In [24]:
# Preview the first five rows of the raw sheet
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,City of London,Barking & Dagenham,Barnet,Bexley,Brent,Bromley,Camden,Croydon,Ealing,...,NORTH WEST,YORKS & THE HUMBER,EAST MIDLANDS,WEST MIDLANDS,EAST OF ENGLAND,LONDON,SOUTH EAST,SOUTH WEST,Unnamed: 47,England
0,NaT,E09000001,E09000002,E09000003,E09000004,E09000005,E09000006,E09000007,E09000008,E09000009,...,E12000002,E12000003,E12000004,E12000005,E12000006,E12000007,E12000008,E12000009,,E92000001
1,1995-01-01,91448.98487,50460.2266,93284.51832,64958.09036,71306.56698,81671.47692,120932.8881,69158.16225,79885.89069,...,43958.48001,44803.42878,45544.52227,48527.52339,56701.5961,74435.76052,64018.87894,54705.1579,,53202.77128
2,1995-02-01,82202.77314,51085.77983,93190.16963,64787.92069,72022.26197,81657.55944,119508.8622,68951.09542,80897.06551,...,43925.42289,44528.80721,46051.57066,49341.29029,56593.59475,72777.93709,63715.02399,54356.14843,,53096.1549
3,1995-03-01,79120.70256,51268.96956,92247.52435,64367.49344,72015.76274,81449.31143,120282.2131,68712.44341,81379.86288,...,44434.8681,45200.46775,45383.82395,49442.17973,56171.18278,73896.84204,64113.60858,53583.07667,,53201.2843
4,1995-04-01,77101.20804,53133.50526,90762.87492,64277.66881,72965.63094,81124.41227,120097.899,68610.04641,82188.90498,...,44267.7796,45614.34341,46124.23045,49455.93299,56567.89582,74455.28754,64623.22395,54786.01938,,53590.8548


In [25]:
# Brief overview of the dataset: prints each column's name, non-null count
# and dtype, which is used below to decide what needs cleaning
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 341 entries, 0 to 340
Data columns (total 49 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   Unnamed: 0            340 non-null    datetime64[ns]
 1   City of London        341 non-null    object        
 2   Barking & Dagenham    341 non-null    object        
 3   Barnet                341 non-null    object        
 4   Bexley                341 non-null    object        
 5   Brent                 341 non-null    object        
 6   Bromley               341 non-null    object        
 7   Camden                341 non-null    object        
 8   Croydon               341 non-null    object        
 9   Ealing                341 non-null    object        
 10  Enfield               341 non-null    object        
 11  Greenwich             341 non-null    object        
 12  Hackney               341 non-null    object        
 13  Hammersmith & Fulham

### Key Observations/Problems to be cleaned:
* Select the time period relevant to our project (2006-2022)
* Change the date column to just show the year and month
* Rename unnamed column to Date
* Keep only the London borough columns, as these are the ones relevant to our project
* Change object types to floats

### Data Cleaning:

In [26]:
# Select relevant rows (2006-2022).
# Filter on the date column itself rather than on hard-coded row positions,
# so the selection stays correct if rows are added to or removed from the
# spreadsheet. The first sheet row holds area codes and has no date (NaT);
# NaT never satisfies the comparison, so that row is excluded as well.
period_start = pd.Timestamp('2006-01-01')
period_end = pd.Timestamp('2022-12-31')
in_period = df['Unnamed: 0'].between(period_start, period_end)

# Renumber the kept rows from 0 so later positional steps start cleanly
prices_df = df.loc[in_period].reset_index(drop=True)

prices_df

Unnamed: 0.1,Unnamed: 0,City of London,Barking & Dagenham,Barnet,Bexley,Brent,Bromley,Camden,Croydon,Ealing,...,NORTH WEST,YORKS & THE HUMBER,EAST MIDLANDS,WEST MIDLANDS,EAST OF ENGLAND,LONDON,SOUTH EAST,SOUTH WEST,Unnamed: 47,England
0,2006-01-01,316121.4492,162339.9562,282333.5446,188012.9565,247765.9162,240254.6582,368709.6304,204536.277,247593.5017,...,130143.7753,130043.163,140443.4005,147576.2808,178992.2244,239845.5309,201251.6422,181152.3421,,166543.5872
1,2006-02-01,323653.2874,163023.9489,281120.306,187235.5686,246054.9316,241717.779,373670.358,207312.2851,251238.8236,...,131608.9907,130997.8575,140814.364,148738.1635,179220.395,239114.9725,201878.4843,180356.2912,,166895.5702
2,2006-03-01,325106.3667,162004.3844,281674.2983,187449.8626,246766.2836,243140.6832,377339.5567,208650.3412,250284.2467,...,131138.846,131602.2981,141417.9469,148993.6565,180623.0245,241380.3315,202725.8266,183672.1074,,168001.0451
3,2006-04-01,319579.8523,164203.9629,283205.9287,189193.9169,246980.2997,243914.4217,382932.5242,209106.4866,251678.6762,...,135796.6246,134859.3407,145424.5235,150422.1431,183444.7579,245474.4153,205285.3579,185201.3196,,171128.7014
4,2006-05-01,313125.8007,166048.3794,286461.884,189826.5174,249085.8169,244570.8703,389595.8246,208379.6343,255037.5534,...,136328.0768,135180.1468,145115.6537,154179.9434,185049.0459,247591.1474,208458.2565,187086.6382,,172698.7875
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199,2022-08-01,865647.8228,350109.2815,601505.3863,400930.044,561214.2226,510180.196,861447.7882,424342.5705,537694.5852,...,215109.6992,209298.9018,249927.7049,251270.7316,356758.1767,544298.7914,396972.3166,332993.2587,,310185.8123
200,2022-09-01,865561.3351,349552.4486,606057.9735,404308.03,580359.8804,512062.7635,861312.601,431380.4029,540072.4799,...,217353.6019,211145.1115,251290.5708,253473.2658,359493.6536,544962.2211,399866.1407,335069.9219,,312345.6612
201,2022-10-01,884896.7212,351242.5951,602165.8061,409127.1711,572148.3116,516601.3391,850555.646,431982.3891,536557.8112,...,217306.4539,211351.5301,250123.2612,253793.7014,357844.3636,536616.4423,398760.9029,333908.8408,,311280.3556
202,2022-11-01,952551.4913,354485.6669,593702.626,407337.3436,562592.5366,515823.3266,840398.4096,430289.6512,537426.6113,...,217834.713,211595.9533,251814.6933,252773.7863,360365.9537,536982.3005,396963.6099,334456.5779,,311568.5504


In [27]:
# Give the unnamed date column a proper name ('Date') and reduce each
# timestamp to a monthly period so only the year and month are shown
prices_df = (
    prices_df
    .rename(columns={'Unnamed: 0': 'Date'})
    .assign(Date=lambda frame: frame['Date'].dt.to_period('M'))
)

# Check changes
prices_df.head()

Unnamed: 0,Date,City of London,Barking & Dagenham,Barnet,Bexley,Brent,Bromley,Camden,Croydon,Ealing,...,NORTH WEST,YORKS & THE HUMBER,EAST MIDLANDS,WEST MIDLANDS,EAST OF ENGLAND,LONDON,SOUTH EAST,SOUTH WEST,Unnamed: 47,England
0,2006-01,316121.4492,162339.9562,282333.5446,188012.9565,247765.9162,240254.6582,368709.6304,204536.277,247593.5017,...,130143.7753,130043.163,140443.4005,147576.2808,178992.2244,239845.5309,201251.6422,181152.3421,,166543.5872
1,2006-02,323653.2874,163023.9489,281120.306,187235.5686,246054.9316,241717.779,373670.358,207312.2851,251238.8236,...,131608.9907,130997.8575,140814.364,148738.1635,179220.395,239114.9725,201878.4843,180356.2912,,166895.5702
2,2006-03,325106.3667,162004.3844,281674.2983,187449.8626,246766.2836,243140.6832,377339.5567,208650.3412,250284.2467,...,131138.846,131602.2981,141417.9469,148993.6565,180623.0245,241380.3315,202725.8266,183672.1074,,168001.0451
3,2006-04,319579.8523,164203.9629,283205.9287,189193.9169,246980.2997,243914.4217,382932.5242,209106.4866,251678.6762,...,135796.6246,134859.3407,145424.5235,150422.1431,183444.7579,245474.4153,205285.3579,185201.3196,,171128.7014
4,2006-05,313125.8007,166048.3794,286461.884,189826.5174,249085.8169,244570.8703,389595.8246,208379.6343,255037.5534,...,136328.0768,135180.1468,145115.6537,154179.9434,185049.0459,247591.1474,208458.2565,187086.6382,,172698.7875


In [28]:
# Flip the table so every original column becomes a row and every month a
# column, then move the old column labels out of the index into an
# ordinary 'index' column
transposed = prices_df.T.reset_index()

# Keep only the first 34 rows: the 'Date' row followed by the next 33 area
# rows, which this notebook treats as the London boroughs
# (assumes the sheet keeps its current column order - TODO confirm)
prices_df2 = transposed.iloc[:34]

# Check changes to the new dataframe
prices_df2.head()

Unnamed: 0,index,0,1,2,3,4,5,6,7,8,...,194,195,196,197,198,199,200,201,202,203
0,Date,2006-01,2006-02,2006-03,2006-04,2006-05,2006-06,2006-07,2006-08,2006-09,...,2022-03,2022-04,2022-05,2022-06,2022-07,2022-08,2022-09,2022-10,2022-11,2022-12
1,City of London,316121.4492,323653.2874,325106.3667,319579.8523,313125.8007,304976.9241,338307.1373,358438.1038,377545.3976,...,823597.83378,811845.5214,795968.1538,814502.7842,820757.0426,865647.8228,865561.3351,884896.7212,952551.4913,930424.7317
2,Barking & Dagenham,162339.9562,163023.9489,162004.3844,164203.9629,166048.3794,169013.0192,170203.9708,171177.1072,170250.5241,...,335997.20854,332490.9095,335163.6297,344971.6741,348396.4143,350109.2815,349552.4486,351242.5951,354485.6669,354799.3903
3,Barnet,282333.5446,281120.306,281674.2983,283205.9287,286461.884,287725.1512,292435.4468,296104.3032,300380.688,...,573415.95151,579692.6666,582782.5567,588913.9018,590896.309,601505.3863,606057.9735,602165.8061,593702.626,597199.0461
4,Bexley,188012.9565,187235.5686,187449.8626,189193.9169,189826.5174,191156.222,189337.5476,193378.0406,194016.7524,...,385473.18358,383803.788,385268.5269,389047.2309,394752.12,400930.044,404308.03,409127.1711,407337.3436,408227.2906


In [29]:
# Promote the first row (the 'Date' row holding the month labels) to be the
# column header, then remove that row from the data
header_row = prices_df2.iloc[0]
prices_df2.columns = header_row
prices_df2 = prices_df2.drop(index=prices_df2.index[0])

# Check the changes
prices_df2.head()

Unnamed: 0,Date,2006-01,2006-02,2006-03,2006-04,2006-05,2006-06,2006-07,2006-08,2006-09,...,2022-03,2022-04,2022-05,2022-06,2022-07,2022-08,2022-09,2022-10,2022-11,2022-12
1,City of London,316121.4492,323653.2874,325106.3667,319579.8523,313125.8007,304976.9241,338307.1373,358438.1038,377545.3976,...,823597.83378,811845.5214,795968.1538,814502.7842,820757.0426,865647.8228,865561.3351,884896.7212,952551.4913,930424.7317
2,Barking & Dagenham,162339.9562,163023.9489,162004.3844,164203.9629,166048.3794,169013.0192,170203.9708,171177.1072,170250.5241,...,335997.20854,332490.9095,335163.6297,344971.6741,348396.4143,350109.2815,349552.4486,351242.5951,354485.6669,354799.3903
3,Barnet,282333.5446,281120.306,281674.2983,283205.9287,286461.884,287725.1512,292435.4468,296104.3032,300380.688,...,573415.95151,579692.6666,582782.5567,588913.9018,590896.309,601505.3863,606057.9735,602165.8061,593702.626,597199.0461
4,Bexley,188012.9565,187235.5686,187449.8626,189193.9169,189826.5174,191156.222,189337.5476,193378.0406,194016.7524,...,385473.18358,383803.788,385268.5269,389047.2309,394752.12,400930.044,404308.03,409127.1711,407337.3436,408227.2906
5,Brent,247765.9162,246054.9316,246766.2836,246980.2997,249085.8169,252877.368,254617.2415,254296.853,254398.7895,...,508995.81868,518865.824,529981.8582,534686.4918,551947.0216,561214.2226,580359.8804,572148.3116,562592.5366,549438.9525


In [30]:
# After the transpose the first column holds the area names, so relabel it
# from 'Date' to 'Borough'
prices_df2 = prices_df2.rename(columns={'Date': 'Borough'})

# Check the changes
prices_df2.head()

Unnamed: 0,Borough,2006-01,2006-02,2006-03,2006-04,2006-05,2006-06,2006-07,2006-08,2006-09,...,2022-03,2022-04,2022-05,2022-06,2022-07,2022-08,2022-09,2022-10,2022-11,2022-12
1,City of London,316121.4492,323653.2874,325106.3667,319579.8523,313125.8007,304976.9241,338307.1373,358438.1038,377545.3976,...,823597.83378,811845.5214,795968.1538,814502.7842,820757.0426,865647.8228,865561.3351,884896.7212,952551.4913,930424.7317
2,Barking & Dagenham,162339.9562,163023.9489,162004.3844,164203.9629,166048.3794,169013.0192,170203.9708,171177.1072,170250.5241,...,335997.20854,332490.9095,335163.6297,344971.6741,348396.4143,350109.2815,349552.4486,351242.5951,354485.6669,354799.3903
3,Barnet,282333.5446,281120.306,281674.2983,283205.9287,286461.884,287725.1512,292435.4468,296104.3032,300380.688,...,573415.95151,579692.6666,582782.5567,588913.9018,590896.309,601505.3863,606057.9735,602165.8061,593702.626,597199.0461
4,Bexley,188012.9565,187235.5686,187449.8626,189193.9169,189826.5174,191156.222,189337.5476,193378.0406,194016.7524,...,385473.18358,383803.788,385268.5269,389047.2309,394752.12,400930.044,404308.03,409127.1711,407337.3436,408227.2906
5,Brent,247765.9162,246054.9316,246766.2836,246980.2997,249085.8169,252877.368,254617.2415,254296.853,254398.7895,...,508995.81868,518865.824,529981.8582,534686.4918,551947.0216,561214.2226,580359.8804,572148.3116,562592.5366,549438.9525


In [31]:
# Convert every month column (everything after the first column) from
# object to float.
# Use astype for an explicit dtype conversion: DataFrame.replace is meant for
# substituting values, not for changing dtypes, so relying on it to produce
# floats is fragile across pandas versions.
month_columns = prices_df2.columns[1:]
prices_df2 = prices_df2.astype({column: float for column in month_columns})

# Check the correct data types have been changed
prices_df2.dtypes

0
Borough     object
2006-01    float64
2006-02    float64
2006-03    float64
2006-04    float64
            ...   
2022-08    float64
2022-09    float64
2022-10    float64
2022-11    float64
2022-12    float64
Length: 205, dtype: object

In [32]:
# Write the cleaned borough-by-month table to a CSV file in the current
# working directory (the row index is written as the first column)
output_csv = 'prices_london_borough.csv'
prices_df2.to_csv(output_csv)