In [4]:
# import Pandas to use for data cleaning 
import pandas as pd


In [5]:
# fetch the file we are going to begin with and visualise the first 5 rows
data = pd.read_csv('price_paid_records.csv')


In [6]:
# sort the data in ascending to check up until what year it goes and the visualise through data.tail()
data = data.sort_values(by=['Date of Transfer'], ascending=True)
data.tail()

Unnamed: 0,Transaction unique identifier,Price,Date of Transfer,Property Type,Old/New,Duration,Town/City,District,County,PPDCategory Type,Record Status - monthly file only
22362577,{5376B386-4BF0-34C1-E053-6B04A8C09FF6},527500,2017-06-29 00:00,D,N,F,HEMEL HEMPSTEAD,DACORUM,HERTFORDSHIRE,B,A
22170578,{5376B385-57BD-34C1-E053-6B04A8C09FF6},277000,2017-06-29 00:00,S,N,F,WICKFORD,BASILDON,ESSEX,A,A
22227348,{5376B385-E03E-34C1-E053-6B04A8C09FF6},551000,2017-06-29 00:00,T,N,F,CHISLEHURST,BROMLEY,GREATER LONDON,A,A
22401945,{5376B386-659E-34C1-E053-6B04A8C09FF6},250000,2017-06-29 00:00,D,N,F,SALISBURY,WILTSHIRE,WILTSHIRE,B,A
22461663,{5376B386-089F-34C1-E053-6B04A8C09FF6},255000,2017-06-29 00:00,D,N,F,WASHINGTON,SUNDERLAND,TYNE AND WEAR,A,A


In [7]:
# create a copy of the original dataframe with the columns we are interested in
new_data = data[['Date of Transfer','Price', 'County']].copy()

In [8]:
# check that the new dataframe looks like we want it to
new_data.sample(n = 10)

Unnamed: 0,Date of Transfer,Price,County
12474116,2006-02-27 00:00,115500,TYNE AND WEAR
13838216,2007-09-12 00:00,52000,LANCASHIRE
1180768,1996-04-29 00:00,79000,GREATER LONDON
4584278,1999-06-25 00:00,64000,NORTH YORKSHIRE
11623321,2005-06-10 00:00,155000,GREATER LONDON
12787029,2006-06-30 00:00,249950,CONWY
12313041,2005-05-23 00:00,148000,WILTSHIRE
18385568,2013-02-28 00:00,326000,SURREY
16795574,2010-12-10 00:00,62000,REDCAR AND CLEVELAND
4738061,1999-11-05 00:00,103000,GREATER LONDON


In [9]:
# make sure there are no nulls in the County column, as it will be used to filter later on.
number_of_nans = new_data['County'].isnull().sum()
number_of_nans

0

In [11]:
#  Make a new dataframe from the new_data dataframe but only include rows where the column county equals greater london.
london_data = new_data.loc[new_data['County'].str.contains('LONDON', na=False)]

In [12]:
# check that it looks the way we want to. 
london_data.tail()
# london_data.head()

Unnamed: 0,Date of Transfer,Price,County
22461166,2017-06-28 00:00,369950,GREATER LONDON
22281359,2017-06-28 00:00,485000,GREATER LONDON
22380614,2017-06-28 00:00,775000,GREATER LONDON
22122189,2017-06-28 00:00,420000,GREATER LONDON
22227348,2017-06-29 00:00,551000,GREATER LONDON


In [13]:
# are the datatypes correct for analysis?
london_data.dtypes
# we can see 'Date of Transfer' is an object, ideally it should be a date so we can extract the year later on.

Date of Transfer    object
Price                int64
County              object
dtype: object

In [14]:
# transform the data type from Date of Transfer to 'datetime64'
london_data['Date of Transfer'] = london_data['Date of Transfer'].astype('datetime64')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  london_data['Date of Transfer'] = london_data['Date of Transfer'].astype('datetime64')


In [15]:
# check that we have changed it correctly
london_data.dtypes

Date of Transfer    datetime64[ns]
Price                        int64
County                      object
dtype: object

In [16]:
# create a new column named year with only the year out of date of transfer
london_data['year']= london_data['Date of Transfer'].dt.strftime('%Y')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  london_data['year']= london_data['Date of Transfer'].dt.strftime('%Y')


In [17]:
# check that we have done it correctly
london_data.year

444544      1995
199501      1995
506016      1995
242235      1995
90870       1995
            ... 
22461166    2017
22281359    2017
22380614    2017
22122189    2017
22227348    2017
Name: year, Length: 2993422, dtype: object

In [18]:
# since we dont need date of transfer column anymore, drop it. 
london_data.drop('Date of Transfer', axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  london_data.drop('Date of Transfer', axis=1, inplace=True)


In [19]:
# check that the table looks like we want it to
london_data.head()

Unnamed: 0,Price,County,year
444544,128000,GREATER LONDON,1995
199501,300000,GREATER LONDON,1995
506016,84000,GREATER LONDON,1995
242235,51500,GREATER LONDON,1995
90870,117000,GREATER LONDON,1995


In [21]:
london_data.sample(n=20).sort_values(by=['year'], ascending=True)

Unnamed: 0,Price,County,year
1073290,69950,GREATER LONDON,1996
2415273,105000,GREATER LONDON,1997
3829997,57950,GREATER LONDON,1998
3918545,62000,GREATER LONDON,1999
4964115,85500,GREATER LONDON,1999
6947794,69000,GREATER LONDON,2001
8848721,1250000,GREATER LONDON,2003
10248068,292500,GREATER LONDON,2004
10889612,247500,GREATER LONDON,2004
12392033,209000,GREATER LONDON,2005


In [52]:
# export to csv 
london_data.to_csv('london_clean_data.csv')