# Preparing the dataset for exploratory analysis


## To do:
* ~check columns' datatypes~
* ~decide what needs conversion~
* ~decide what should be deleted~
* ~add date column~
* add data from Kaggle datasets
* create final dataset for DA with all columns
* save it to csv and db

In [130]:
import pandas as pd
from datetime import datetime as dt
import numpy as np
import re
import openpyxl, xlrd

In [131]:
df = pd.read_csv('ldn_properties.csv', index_col=0)
df.head()

Unnamed: 0,borough,id,address,link,title,num_of_bed,num_of_bath,reception_room,transport_primary,station1_dist(mi),transport_secondary,station2_dist(mi),tag,parking,price
0,city-of-london-london-borough,listing_58614319,"One Crown Place, 54 Wilson Street EC2A",/new-homes/details/58614319/?search_identifier...,1 bed flat for sale,1,1,0,national_rail_station,0.2 miles London Liverpool Street,london_underground_station,0.2 miles Liverpool Street,New home,0,"£1,030,000"
1,city-of-london-london-borough,listing_58614075,"The Barbican, Barbican, London EC2Y",/for-sale/details/58614075/?search_identifier=...,1 bed flat for sale,1,1,1,london_underground_station,0.2 miles Barbican,national_rail_station,0.2 miles Moorgate,New home,0,"£725,000"
2,city-of-london-london-borough,listing_52061413,"Principal Tower, Worship Lane, Shoreditch EC2A",/for-sale/details/52061413/?search_identifier=...,1 bed flat for sale,1,1,1,national_rail_station,0.2 miles Shoreditch High Street,national_rail_station,0.3 miles London Liverpool Street,New home,0,"£875,000"
3,city-of-london-london-borough,listing_55657629,"One Crown Place, Hackney EC2A",/new-homes/details/55657629/?search_identifier...,2 bed flat for sale,2,2,0,national_rail_station,0.2 miles London Liverpool Street,london_underground_station,0.2 miles Liverpool Street,New home,0,"£1,750,000"
4,city-of-london-london-borough,listing_58592857,"Barbican, London EC2Y",/for-sale/details/58592857/?search_identifier=...,Studio for sale,1,1,1,london_underground_station,0 miles Barbican,national_rail_station,0.3 miles Moorgate,New home,0,"£535,000"


In [132]:
df.dtypes

borough                object
id                     object
address                object
link                   object
title                  object
num_of_bed              int64
num_of_bath             int64
reception_room          int64
transport_primary      object
station1_dist(mi)      object
transport_secondary    object
station2_dist(mi)      object
tag                    object
parking                 int64
price                  object
dtype: object

## Typecasting and formatting:

* ~**check for duplicates** - use the id columns~
* ~**check for missing data**~
* ~**price** - delete currency sign and convert it to int~
* ~**station1/2** - extract miles and convert them to floats~
* **delete** - link

In [10]:
df.duplicated(subset='id').sum()

7317

In [11]:
df.drop_duplicates('id', keep='first', inplace=True)

In [12]:
df.shape

(63837, 15)

In [13]:
df.isna().sum()

borough                0
id                     0
address                0
link                   0
title                  0
num_of_bed             0
num_of_bath            0
reception_room         0
transport_primary      0
station1_dist(mi)      0
transport_secondary    0
station2_dist(mi)      0
tag                    0
parking                0
price                  0
dtype: int64

In [14]:
df.isnull().sum()

borough                0
id                     0
address                0
link                   0
title                  0
num_of_bed             0
num_of_bath            0
reception_room         0
transport_primary      0
station1_dist(mi)      0
transport_secondary    0
station2_dist(mi)      0
tag                    0
parking                0
price                  0
dtype: int64

In [15]:
# Checking if price values are digits only
filt = df['price'].str.contains(re.compile(r'(\d),|\d+,\d+'))
df.loc[~filt,'price']

  return func(self, *args, **kwargs)


304      POA
305      POA
595      POA
596      POA
638      POA
        ... 
70832    POA
71014    POA
71069    POA
71095    POA
71130    POA
Name: price, Length: 446, dtype: object

There seem to be POA values in the price column, these will be dropped

In [16]:
df.drop(index=df[df['price'] == 'POA'].index, inplace=True)

In [17]:
df.loc[df['price'] == 'POA']

Unnamed: 0,borough,id,address,link,title,num_of_bed,num_of_bath,reception_room,transport_primary,station1_dist(mi),transport_secondary,station2_dist(mi),tag,parking,price


In [18]:
#Converting prices into integers
df['price'] = [int(x.replace('£', '').replace(',', '')) for x in df['price']]

In [19]:
df['price'].dtypes

dtype('int64')

In [20]:
# Extracting the distance in miles
df['station1_dist(mi)'] = [float(x.split(' ')[0]) for x in df['station1_dist(mi)']]

In [21]:
df['station1_dist(mi)']

0        0.2
1        0.2
2        0.2
3        0.2
4        0.0
        ... 
71146    0.5
71147    0.1
71149    0.1
71150    1.2
71151    1.6
Name: station1_dist(mi), Length: 63391, dtype: float64

In [22]:
df['station2_dist(mi)'] = [float(x.split(' ')[0]) for x in df['station2_dist(mi)']]

In [23]:
df['station2_dist(mi)']

0        0.2
1        0.2
2        0.3
3        0.2
4        0.3
        ... 
71146    0.8
71147    1.8
71149    1.8
71150    1.3
71151    1.6
Name: station2_dist(mi), Length: 63391, dtype: float64

In [24]:
del df['link']

In [25]:
df.dtypes

borough                 object
id                      object
address                 object
title                   object
num_of_bed               int64
num_of_bath              int64
reception_room           int64
transport_primary       object
station1_dist(mi)      float64
transport_secondary     object
station2_dist(mi)      float64
tag                     object
parking                  int64
price                    int64
dtype: object

In [26]:
# Date of accessing the scraped website (this will be useful in the future for time series analysis if I continue to scrape every month)
df['date'] = dt.now().date()

In [28]:
df['borough']

0        city-of-london-london-borough
1        city-of-london-london-borough
2        city-of-london-london-borough
3        city-of-london-london-borough
4        city-of-london-london-borough
                     ...              
71146        hillingdon-london-borough
71147        hillingdon-london-borough
71149        hillingdon-london-borough
71150        hillingdon-london-borough
71151        hillingdon-london-borough
Name: borough, Length: 63391, dtype: object

In [133]:
# Reformatting the boroughs
regex = re.compile(r'-london-borough|-royal-borough')
df['borough'] = [regex.sub('', x) for x in df['borough']]

In [135]:
df['borough'] = [x.replace('-', ' ') for x in df['borough']]
df['borough']

0        city of london
1        city of london
2        city of london
3        city of london
4        city of london
              ...      
71149        hillingdon
71150        hillingdon
71151        hillingdon
71152            merton
71153           croydon
Name: borough, Length: 71154, dtype: object

# Assessing the downloaded datasets:
* extract data for main dataset for DS:
    * satisfaction index
    * happiness index
    * mean/median salary
    * crime rates
* note down which datasets can be used for Dashboard/Report

In [109]:
wb = pd.read_excel('./datasets/personal-well-being-borough.xlsx', sheet_name=1, header=[0, 1])

In [119]:
wb.dropna(subset=[(' ', 'Area')], axis=0, inplace=True)

In [127]:
wb.loc[:, (' ', 'Area')] = wb[' ']['Area'].str.lower()

In [185]:
wb.loc[:, ('Life Satisfaction','2018/19')] = wb['Life Satisfaction']['2018/19'].replace(np.nan, np.mean(wb['Life Satisfaction']['2018/19']))

In [192]:
wb.loc[:, ('Happiness','2018/19')] = wb['Happiness']['2018/19'].replace('x', np.mean(wb['Life Satisfaction']['2018/19']))

In [200]:
df['satisfaction'] = [wb.loc[wb[' ']['Area'] == x, 'Life Satisfaction']['2018/19'].values[:1] for x in df['borough']]

In [190]:
df['happiness'] = [wb.loc[wb[' ']['Area'] == x, 'Happiness']['2018/19'].values for x in df['borough']]

0           [x]
1           [x]
2           [x]
3           [x]
4           [x]
          ...  
71149    [7.67]
71150    [7.67]
71151    [7.67]
71152    [7.57]
71153    [7.78]
Name: happiness, Length: 71154, dtype: object