# Preparing the dataset for exploratory analysis


## To do:
* ~check columns' datatypes~
* ~decide what needs conversion~
* ~decide what should be deleted~
* ~add date column~
* add data from Kaggle datasets
* create final dataset for DA with all columns
* save it to csv and db

In [163]:
import pandas as pd
from datetime import datetime as dt
import numpy as np
import re

In [164]:
# Load the scraped listings; the first CSV column is used as the row index
listings_path = 'ldn_properties.csv'
df = pd.read_csv(listings_path, index_col=0)
df.head(n=5)

Unnamed: 0,borough,id,address,link,title,num_of_bed,num_of_bath,reception_room,transport_primary,station1_dist(mi),transport_secondary,station2_dist(mi),tag,parking,price
0,city-of-london-london-borough,listing_58614319,"One Crown Place, 54 Wilson Street EC2A",/new-homes/details/58614319/?search_identifier...,1 bed flat for sale,1,1,0,national_rail_station,0.2 miles London Liverpool Street,london_underground_station,0.2 miles Liverpool Street,New home,0,"£1,030,000"
1,city-of-london-london-borough,listing_58614075,"The Barbican, Barbican, London EC2Y",/for-sale/details/58614075/?search_identifier=...,1 bed flat for sale,1,1,1,london_underground_station,0.2 miles Barbican,national_rail_station,0.2 miles Moorgate,New home,0,"£725,000"
2,city-of-london-london-borough,listing_52061413,"Principal Tower, Worship Lane, Shoreditch EC2A",/for-sale/details/52061413/?search_identifier=...,1 bed flat for sale,1,1,1,national_rail_station,0.2 miles Shoreditch High Street,national_rail_station,0.3 miles London Liverpool Street,New home,0,"£875,000"
3,city-of-london-london-borough,listing_55657629,"One Crown Place, Hackney EC2A",/new-homes/details/55657629/?search_identifier...,2 bed flat for sale,2,2,0,national_rail_station,0.2 miles London Liverpool Street,london_underground_station,0.2 miles Liverpool Street,New home,0,"£1,750,000"
4,city-of-london-london-borough,listing_58592857,"Barbican, London EC2Y",/for-sale/details/58592857/?search_identifier=...,Studio for sale,1,1,1,london_underground_station,0 miles Barbican,national_rail_station,0.3 miles Moorgate,New home,0,"£535,000"


In [165]:
# Inspect the dtype pandas inferred for each column of the freshly loaded frame
df.dtypes

borough                object
id                     object
address                object
link                   object
title                  object
num_of_bed              int64
num_of_bath             int64
reception_room          int64
transport_primary      object
station1_dist(mi)      object
transport_secondary    object
station2_dist(mi)      object
tag                    object
parking                 int64
price                  object
dtype: object

## Typecasting and formatting:

* ~**check for duplicates** - use the id column~
* ~**check for missing data**~
* ~**price** - delete currency sign and convert it to int~
* ~**station1/2** - extract miles and convert them to floats~
* **delete** - link

In [166]:
# Number of rows whose listing id repeats an id seen in an earlier row
df['id'].duplicated().sum()

7317

In [167]:
# Keep only the first occurrence of each listing id (keep='first' is the pandas default)
df.drop_duplicates(subset=['id'], inplace=True)

In [168]:
# (rows, columns) remaining once duplicate ids have been removed
df.shape

(63837, 15)

In [169]:
# Count of missing (NaN/None) values in each column
df.isna().sum()

borough                0
id                     0
address                0
link                   0
title                  0
num_of_bed             0
num_of_bath            0
reception_room         0
transport_primary      0
station1_dist(mi)      0
transport_secondary    0
station2_dist(mi)      0
tag                    0
parking                0
price                  0
dtype: int64

In [170]:
# isnull() is an alias of isna(), so this repeats the per-column missing-value count
df.isnull().sum()

borough                0
id                     0
address                0
link                   0
title                  0
num_of_bed             0
num_of_bath            0
reception_room         0
transport_primary      0
station1_dist(mi)      0
transport_secondary    0
station2_dist(mi)      0
tag                    0
parking                0
price                  0
dtype: int64

In [171]:
# Checking that every price is made of digits only, e.g. "£1,030,000":
# an optional pound sign followed by digits and thousands separators.
# - str.match anchors at the start and the trailing `$` anchors at the end, so the
#   whole value is validated instead of merely containing "a digit then a comma".
# - The pattern has no capture group, which avoids the pandas UserWarning
#   ("This pattern has match groups") that str.contains emits for grouped patterns.
# - na=False treats a missing price as "not numeric" so it shows up in the listing below.
filt = df['price'].str.match(r'£?\d[\d,]*$', na=False)
# Show the prices that do NOT look numeric
df.loc[~filt, 'price']

  return func(self, *args, **kwargs)


304      POA
305      POA
595      POA
596      POA
638      POA
        ... 
70832    POA
71014    POA
71069    POA
71095    POA
71130    POA
Name: price, Length: 446, dtype: object

The price column contains "POA" (price on application) values. These rows have no numeric price, so they will be dropped.

In [172]:
# Flag the rows priced as 'POA', then remove them from the frame in place by index label
is_poa = df['price'] == 'POA'
df.drop(index=df[is_poa].index, inplace=True)

In [173]:
# Sanity check: this selection should be empty once the 'POA' rows are gone
df[df['price'].eq('POA')]

Unnamed: 0,borough,id,address,link,title,num_of_bed,num_of_bath,reception_room,transport_primary,station1_dist(mi),transport_secondary,station2_dist(mi),tag,parking,price


In [174]:
# Converting prices into integers: strip the pound sign and the thousands separators,
# then cast to int64.
# astype(str) first makes the cell safe to re-run: once the column is already integer,
# the values are stringified, nothing is stripped, and the cast returns the same numbers.
df['price'] = (
    df['price']
    .astype(str)
    .str.replace(r'[£,]', '', regex=True)
    .astype('int64')
)

In [175]:
# Confirm the price column now holds integers
df['price'].dtype

dtype('int64')

In [177]:
# Extracting the distance in miles: values look like "0.2 miles London Liverpool Street",
# so the first space-separated token is the distance.
# astype(str) first makes the cell safe to re-run after the column has become float.
df['station1_dist(mi)'] = (
    df['station1_dist(mi)']
    .astype(str)
    .str.split(' ')
    .str[0]
    .astype(float)
)

In [178]:
# Display the converted column to confirm it now holds float distances
df['station1_dist(mi)']

0        0.2
1        0.2
2        0.2
3        0.2
4        0.0
        ... 
71146    0.5
71147    0.1
71149    0.1
71150    1.2
71151    1.6
Name: station1_dist(mi), Length: 63391, dtype: float64

In [179]:
# Same extraction for the second station: values look like "0.2 miles Liverpool Street",
# so the first space-separated token is the distance in miles.
# astype(str) first makes the cell safe to re-run after the column has become float.
df['station2_dist(mi)'] = (
    df['station2_dist(mi)']
    .astype(str)
    .str.split(' ')
    .str[0]
    .astype(float)
)

In [180]:
# Display the converted column to confirm it now holds float distances
df['station2_dist(mi)']

0        0.2
1        0.2
2        0.3
3        0.2
4        0.3
        ... 
71146    0.8
71147    1.8
71149    1.8
71150    1.3
71151    1.6
Name: station2_dist(mi), Length: 63391, dtype: float64

In [181]:
# Remove the listing URL column, which is not needed for the analysis.
# errors='ignore' keeps the cell re-runnable: `del df['link']` raises KeyError
# on a second execution because the column is already gone.
df = df.drop(columns='link', errors='ignore')

In [182]:
# Re-check the column dtypes after the price/distance conversions and the column removal
df.dtypes

borough                 object
id                      object
address                 object
title                   object
num_of_bed               int64
num_of_bath              int64
reception_room           int64
transport_primary       object
station1_dist(mi)      float64
transport_secondary     object
station2_dist(mi)      float64
tag                     object
parking                  int64
price                    int64
dtype: object

In [183]:
# Date of accessing the scraped website (this will be useful in the future for time series
# analysis if I continue to scrape every month).
# Stored as a midnight pd.Timestamp so the column has a datetime64 dtype instead of holding
# Python date objects (object dtype), which is what pandas' time-series tooling expects.
# NOTE(review): this stamps the day the notebook is run, not the day the CSV was scraped;
# re-running on a later day changes the value. Confirm that is intended.
df['date'] = pd.Timestamp(dt.now().date())