# Data exploring 

We would like to to help us gain insights into how people feel about their local communities in Scotland. In particular, we are interested in the relationship between distance to outdoor space, and neighbourhood ratings.

1. Are there certain groups that have local access to green space?
2. Are there groups that are lacking access?
3. What there big differences in how far people have to walk to access their green space?
4. Are there any differences between rural and urban areas?
5. How do people in neighbourhoods with good access to green space differ from those who have no good access? Are there differences in how they rate their neighbourhoods? Are there differences in how they rate their communities?
6. Is there any way to predict which households would have higher ratings?

In [1]:
import pandas as pd 
import numpy as np 
import pandas_profiling
from janitor import clean_names

In [2]:
green_blue = pd.read_csv('raw_data/dis_green_blue.csv').clean_names()
n_rate = pd.read_csv('raw_data/neighbourhood_rate.csv').clean_names()
belong = pd.read_csv('raw_data/community_belonging.csv').clean_names()

In [3]:
green_blue.measurement.unique()

array(['95% Upper Confidence Limit, Percent',
       '95% Lower Confidence Limit, Percent', 'Percent'], dtype=object)

In [95]:
green_blue.head()

Unnamed: 0,featurecode,featurename,featuretype,datecode,measurement,units,value,distance_to_nearest_green_or_blue_space,age,gender,urban_rural_classification,simd_quintiles,type_of_tenure,household_type,ethnicity
0,S12000027,Shetland Islands,Council Area,2015,"95% Upper Confidence Limit, Percent",Percent Of Adults,14.1,Within a 6-10 minute walk,All,Male,All,All,All,All,All
1,S12000027,Shetland Islands,Council Area,2015,"95% Upper Confidence Limit, Percent",Percent Of Adults,14.4,Within a 6-10 minute walk,All,All,All,All,All,Pensioners,All
2,S12000014,Falkirk,Council Area,2017,"95% Upper Confidence Limit, Percent",Percent Of Adults,40.1,Within a 6-10 minute walk,65 years and over,All,All,All,All,All,All
3,S12000027,Shetland Islands,Council Area,2016,"95% Lower Confidence Limit, Percent",Percent Of Adults,3.1,Within a 6-10 minute walk,All,All,All,All,All,Adults,All
4,S12000027,Shetland Islands,Council Area,2015,"95% Upper Confidence Limit, Percent",Percent Of Adults,14.3,Within a 6-10 minute walk,All,All,All,All,Owned Mortgage/Loan,All,All


In [96]:
#pandas_profiling.ProfileReport(green_blue)

In [97]:
green_blue.columns

Index(['featurecode', 'featurename', 'featuretype', 'datecode', 'measurement',
       'units', 'value', 'distance_to_nearest_green_or_blue_space', 'age',
       'gender', 'urban_rural_classification', 'simd_quintiles',
       'type_of_tenure', 'household_type', 'ethnicity'],
      dtype='object')

In [98]:
n_rate.head()

Unnamed: 0,featurecode,featurename,featuretype,datecode,measurement,units,value,neighbourhood_rating,gender,urban_rural_classification,simd_quintiles,type_of_tenure,household_type,ethnicity,walking_distance_to_nearest_greenspace
0,S12000029,South Lanarkshire,Council Area,2017,"95% Lower Confidence Limit, Percent",Percent Of Adults,24.8,Fairly good,All,All,All,All,All,All,More than 10 minutes
1,S12000029,South Lanarkshire,Council Area,2017,"95% Upper Confidence Limit, Percent",Percent Of Adults,49.1,Fairly good,All,All,All,All,All,All,More than 10 minutes
2,S12000033,Aberdeen City,Council Area,2019,"95% Upper Confidence Limit, Percent",Percent Of Adults,57.0,Fairly good,All,All,All,All,All,All,More than 10 minutes
3,S12000033,Aberdeen City,Council Area,2019,Percent,Percent Of Adults,45.0,Fairly good,All,All,All,All,All,All,More than 10 minutes
4,S12000036,City of Edinburgh,Council Area,2016,"95% Upper Confidence Limit, Percent",Percent Of Adults,56.2,Fairly good,All,All,All,All,All,All,More than 10 minutes


In [99]:
#pandas_profiling.ProfileReport(n_rate)

In [100]:
belong.head()

Unnamed: 0,featurecode,featurename,featuretype,datecode,measurement,units,value,community_belonging,gender,urban_rural_classification,simd_quintiles,type_of_tenure,household_type,ethnicity,walking_distance_to_nearest_greenspace
0,S12000011,East Renfrewshire,Council Area,2017,"95% Upper Confidence Limit, Percent",Percent Of Adults,7.8,Not at all strongly,All,All,All,All,All,All,More than 10 minutes
1,S12000011,East Renfrewshire,Council Area,2017,"95% Lower Confidence Limit, Percent",Percent Of Adults,0.0,Not at all strongly,All,All,All,All,All,All,More than 10 minutes
2,S92000003,Scotland,Country,2016,Percent,Percent Of Adults,40.0,Fairly strongly,All,All,All,All,All,All,More than 10 minutes
3,S12000011,East Renfrewshire,Council Area,2017,Percent,Percent Of Adults,3.0,Not at all strongly,All,All,All,All,All,All,More than 10 minutes
4,S12000049,Glasgow City,Council Area,2017,"95% Lower Confidence Limit, Percent",Percent Of Adults,34.2,Fairly strongly,All,All,All,All,All,All,More than 10 minutes


In [101]:
#pandas_profiling.ProfileReport(belong)

In [102]:
# they are so similar 

# Cleaning 

### green_blue cleaning 

In [103]:
green_blue.isna().sum()

featurecode                                0
featurename                                0
featuretype                                0
datecode                                   0
measurement                                0
units                                      0
value                                      0
distance_to_nearest_green_or_blue_space    0
age                                        0
gender                                     0
urban_rural_classification                 0
simd_quintiles                             0
type_of_tenure                             0
household_type                             0
ethnicity                                  0
dtype: int64

In [104]:
green_blue.head()

Unnamed: 0,featurecode,featurename,featuretype,datecode,measurement,units,value,distance_to_nearest_green_or_blue_space,age,gender,urban_rural_classification,simd_quintiles,type_of_tenure,household_type,ethnicity
0,S12000027,Shetland Islands,Council Area,2015,"95% Upper Confidence Limit, Percent",Percent Of Adults,14.1,Within a 6-10 minute walk,All,Male,All,All,All,All,All
1,S12000027,Shetland Islands,Council Area,2015,"95% Upper Confidence Limit, Percent",Percent Of Adults,14.4,Within a 6-10 minute walk,All,All,All,All,All,Pensioners,All
2,S12000014,Falkirk,Council Area,2017,"95% Upper Confidence Limit, Percent",Percent Of Adults,40.1,Within a 6-10 minute walk,65 years and over,All,All,All,All,All,All
3,S12000027,Shetland Islands,Council Area,2016,"95% Lower Confidence Limit, Percent",Percent Of Adults,3.1,Within a 6-10 minute walk,All,All,All,All,All,Adults,All
4,S12000027,Shetland Islands,Council Area,2015,"95% Upper Confidence Limit, Percent",Percent Of Adults,14.3,Within a 6-10 minute walk,All,All,All,All,Owned Mortgage/Loan,All,All


In [149]:
green_blue.drop(['featurecode','featurename','featuretype','measurement','units'], axis = 1, inplace = True)

In [150]:
green_blue

Unnamed: 0,datecode,value,distance_to_nearest_green_or_blue_space,age,gender,urban_rural_classification,simd_quintiles,type_of_tenure,household_type,ethnicity
0,2015,14.1,Within a 6-10 minute walk,All,Male,All,All,All,All,All
1,2015,14.4,Within a 6-10 minute walk,All,All,All,All,All,Pensioners,All
2,2017,40.1,Within a 6-10 minute walk,65 years and over,All,All,All,All,All,All
3,2016,3.1,Within a 6-10 minute walk,All,All,All,All,All,Adults,All
4,2015,14.3,Within a 6-10 minute walk,All,All,All,All,Owned Mortgage/Loan,All,All
...,...,...,...,...,...,...,...,...,...,...
38446,2018,2.0,Don't Know,All,All,All,All,All,All,Other
38447,2017,27.0,Within a 6-10 minute walk,All,All,All,All,All,All,Other
38448,2017,33.0,Within a 6-10 minute walk,All,All,All,All,All,All,Other
38449,2017,50.0,A 5 minute walk or less,All,All,All,All,All,All,Other


In [151]:
(green_blue['distance_to_nearest_green_or_blue_space'].replace({'Within a 6-10 minute walk': 'Less than 10 minutes',
                                                                'A 5 minute walk or less': 'Less than 10 minutes',
                                                                'An 11 minute walk or more': 'More than 10 minutes'}, inplace = True)
)

In [152]:
green_blue[green_blue.distance_to_nearest_green_or_blue_space == "Don't Know"]
#7347 rows 

Unnamed: 0,datecode,value,distance_to_nearest_green_or_blue_space,age,gender,urban_rural_classification,simd_quintiles,type_of_tenure,household_type,ethnicity
18511,2016,0.0,Don't Know,All,All,All,All,All,With Children,All
18512,2013,0.0,Don't Know,All,All,Urban,All,All,All,All
18513,2016,0.0,Don't Know,All,Female,All,All,All,All,All
18514,2013,13.4,Don't Know,All,All,All,All,All,Pensioners,All
18515,2013,1.0,Don't Know,All,Female,All,All,All,All,All
...,...,...,...,...,...,...,...,...,...,...
38426,2019,2.1,Don't Know,All,All,All,All,All,All,Other
38441,2016,0.0,Don't Know,All,All,All,All,All,All,Other
38443,2018,3.6,Don't Know,All,All,All,All,All,All,Other
38445,2013,0.0,Don't Know,All,All,All,All,All,All,Other


In [109]:
green_blue.shape
#38451 rows

(38451, 10)

```
it's about 19% of the data that has the distance to green sapce as 'Don't know', it's a lot to drop 
but they are not very helpful otherwise. 
```

In [153]:
green_blue = green_blue[green_blue.distance_to_nearest_green_or_blue_space != "Don't Know"]
green_blue

Unnamed: 0,datecode,value,distance_to_nearest_green_or_blue_space,age,gender,urban_rural_classification,simd_quintiles,type_of_tenure,household_type,ethnicity
0,2015,14.1,Less than 10 minutes,All,Male,All,All,All,All,All
1,2015,14.4,Less than 10 minutes,All,All,All,All,All,Pensioners,All
2,2017,40.1,Less than 10 minutes,65 years and over,All,All,All,All,All,All
3,2016,3.1,Less than 10 minutes,All,All,All,All,All,Adults,All
4,2015,14.3,Less than 10 minutes,All,All,All,All,Owned Mortgage/Loan,All,All
...,...,...,...,...,...,...,...,...,...,...
38444,2016,23.0,Less than 10 minutes,All,All,All,All,All,All,Other
38447,2017,27.0,Less than 10 minutes,All,All,All,All,All,All,Other
38448,2017,33.0,Less than 10 minutes,All,All,All,All,All,All,Other
38449,2017,50.0,Less than 10 minutes,All,All,All,All,All,All,Other


In [111]:
green_blue[green_blue.featurecode == 'S12000027']

Unnamed: 0,featurecode,featuretype,distance_to_nearest_green_or_blue_space,age,gender,urban_rural_classification,simd_quintiles,type_of_tenure,household_type,ethnicity
0,S12000027,Council Area,Less than 10 minutes,All,Male,All,All,All,All,All
1,S12000027,Council Area,Less than 10 minutes,All,All,All,All,All,Pensioners,All
3,S12000027,Council Area,Less than 10 minutes,All,All,All,All,All,Adults,All
4,S12000027,Council Area,Less than 10 minutes,All,All,All,All,Owned Mortgage/Loan,All,All
6,S12000027,Council Area,Less than 10 minutes,All,All,All,80% least deprived,All,All,All
...,...,...,...,...,...,...,...,...,...,...
37960,S12000027,Council Area,Less than 10 minutes,All,All,All,All,All,All,White
38068,S12000027,Council Area,Less than 10 minutes,All,All,All,All,All,All,White
38108,S12000027,Council Area,Less than 10 minutes,All,All,All,All,All,All,White
38116,S12000027,Council Area,Less than 10 minutes,All,All,All,All,All,All,White


In [112]:
# ummmm be careful with the 'ALl' 
# if i am thinking of building a model i shouldn't have those all????

### n_rate cleaning 

In [113]:
n_rate.head()

Unnamed: 0,featurecode,featurename,featuretype,datecode,measurement,units,value,neighbourhood_rating,gender,urban_rural_classification,simd_quintiles,type_of_tenure,household_type,ethnicity,walking_distance_to_nearest_greenspace
0,S12000029,South Lanarkshire,Council Area,2017,"95% Lower Confidence Limit, Percent",Percent Of Adults,24.8,Fairly good,All,All,All,All,All,All,More than 10 minutes
1,S12000029,South Lanarkshire,Council Area,2017,"95% Upper Confidence Limit, Percent",Percent Of Adults,49.1,Fairly good,All,All,All,All,All,All,More than 10 minutes
2,S12000033,Aberdeen City,Council Area,2019,"95% Upper Confidence Limit, Percent",Percent Of Adults,57.0,Fairly good,All,All,All,All,All,All,More than 10 minutes
3,S12000033,Aberdeen City,Council Area,2019,Percent,Percent Of Adults,45.0,Fairly good,All,All,All,All,All,All,More than 10 minutes
4,S12000036,City of Edinburgh,Council Area,2016,"95% Upper Confidence Limit, Percent",Percent Of Adults,56.2,Fairly good,All,All,All,All,All,All,More than 10 minutes


In [114]:
n_rate.shape

(38055, 15)

In [154]:
n_rate.drop(['featurecode','featurename','featuretype','measurement','units'], axis = 1,inplace = True)


In [155]:
n_rate

Unnamed: 0,datecode,value,neighbourhood_rating,gender,urban_rural_classification,simd_quintiles,type_of_tenure,household_type,ethnicity,walking_distance_to_nearest_greenspace
0,2017,24.8,Fairly good,All,All,All,All,All,All,More than 10 minutes
1,2017,49.1,Fairly good,All,All,All,All,All,All,More than 10 minutes
2,2019,57.0,Fairly good,All,All,All,All,All,All,More than 10 minutes
3,2019,45.0,Fairly good,All,All,All,All,All,All,More than 10 minutes
4,2016,56.2,Fairly good,All,All,All,All,All,All,More than 10 minutes
...,...,...,...,...,...,...,...,...,...,...
38050,2013,42.0,Fairly good,All,All,All,All,All,White,All
38051,2017,53.3,Fairly good,All,Rural,All,All,All,All,All
38052,2017,20.2,Fairly good,All,Rural,All,All,All,All,All
38053,2016,31.0,Fairly good,All,All,All,All,With Children,All,All


In [117]:
belong.shape

(43611, 15)

In [156]:
belong.drop(['featurecode','featurename','featuretype','measurement','units'], axis = 1, inplace = True)
belong

Unnamed: 0,datecode,value,community_belonging,gender,urban_rural_classification,simd_quintiles,type_of_tenure,household_type,ethnicity,walking_distance_to_nearest_greenspace
0,2017,7.8,Not at all strongly,All,All,All,All,All,All,More than 10 minutes
1,2017,0.0,Not at all strongly,All,All,All,All,All,All,More than 10 minutes
2,2016,40.0,Fairly strongly,All,All,All,All,All,All,More than 10 minutes
3,2017,3.0,Not at all strongly,All,All,All,All,All,All,More than 10 minutes
4,2017,34.2,Fairly strongly,All,All,All,All,All,All,More than 10 minutes
...,...,...,...,...,...,...,...,...,...,...
43606,2019,3.3,Not very strongly,All,All,All,All,Adults,All,All
43607,2014,9.6,Not very strongly,All,All,All,All,All,White,All
43608,2017,16.0,Not very strongly,All,All,All,All,All,All,All
43609,2018,16.0,Not very strongly,Female,All,All,All,All,All,All


In [119]:
# how do i handle the all and how do i join them together. 

In [120]:
# testing 

In [121]:
df_a = pd.DataFrame({'a':[1,2,3],'b' : [3,3,3]})
df_b = pd.DataFrame({'a':[3,2,1], 'c': [1,1,1]})


In [122]:
df_a

Unnamed: 0,a,b
0,1,3
1,2,3
2,3,3


In [123]:
df_b

Unnamed: 0,a,c
0,3,1
1,2,1
2,1,1


In [124]:
pd.concat([df_a,df_b], axis=1)

Unnamed: 0,a,b,a.1,c
0,1,3,3,1
1,2,3,2,1
2,3,3,1,1


In [125]:
df_a.merge(df_b, left_index=True, right_index=True)

Unnamed: 0,a_x,b,a_y,c
0,1,3,3,1
1,2,3,2,1
2,3,3,1,1


In [126]:
pd.concat([df_a, df_b], ignore_index=True,axis=0)

Unnamed: 0,a,b,c
0,1,3.0,
1,2,3.0,
2,3,3.0,
3,3,,1.0
4,2,,1.0
5,1,,1.0


In [127]:
# end of testing 

In [128]:
green_blue.columns

Index(['featurecode', 'featuretype', 'distance_to_nearest_green_or_blue_space',
       'age', 'gender', 'urban_rural_classification', 'simd_quintiles',
       'type_of_tenure', 'household_type', 'ethnicity'],
      dtype='object')

In [129]:
n_rate.columns

Index(['featurecode', 'featuretype', 'neighbourhood_rating', 'gender',
       'urban_rural_classification', 'simd_quintiles', 'type_of_tenure',
       'household_type', 'ethnicity',
       'walking_distance_to_nearest_greenspace'],
      dtype='object')

In [130]:
belong.columns

Index(['featurecode', 'featuretype', 'community_belonging', 'gender',
       'urban_rural_classification', 'simd_quintiles', 'type_of_tenure',
       'household_type', 'ethnicity',
       'walking_distance_to_nearest_greenspace'],
      dtype='object')

In [157]:
green_blue.rename(columns={'distance_to_nearest_green_or_blue_space': 'walking_distance_to_nearest_greenspace'}, inplace = True)
green_blue

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  green_blue.rename(columns={'distance_to_nearest_green_or_blue_space': 'walking_distance_to_nearest_greenspace'}, inplace = True)


Unnamed: 0,datecode,value,walking_distance_to_nearest_greenspace,age,gender,urban_rural_classification,simd_quintiles,type_of_tenure,household_type,ethnicity
0,2015,14.1,Less than 10 minutes,All,Male,All,All,All,All,All
1,2015,14.4,Less than 10 minutes,All,All,All,All,All,Pensioners,All
2,2017,40.1,Less than 10 minutes,65 years and over,All,All,All,All,All,All
3,2016,3.1,Less than 10 minutes,All,All,All,All,All,Adults,All
4,2015,14.3,Less than 10 minutes,All,All,All,All,Owned Mortgage/Loan,All,All
...,...,...,...,...,...,...,...,...,...,...
38444,2016,23.0,Less than 10 minutes,All,All,All,All,All,All,Other
38447,2017,27.0,Less than 10 minutes,All,All,All,All,All,All,Other
38448,2017,33.0,Less than 10 minutes,All,All,All,All,All,All,Other
38449,2017,50.0,Less than 10 minutes,All,All,All,All,All,All,Other


In [158]:
df = pd.concat([green_blue, n_rate, belong], ignore_index=True,axis=0)

df

Unnamed: 0,datecode,value,walking_distance_to_nearest_greenspace,age,gender,urban_rural_classification,simd_quintiles,type_of_tenure,household_type,ethnicity,neighbourhood_rating,community_belonging
0,2015,14.1,Less than 10 minutes,All,Male,All,All,All,All,All,,
1,2015,14.4,Less than 10 minutes,All,All,All,All,All,Pensioners,All,,
2,2017,40.1,Less than 10 minutes,65 years and over,All,All,All,All,All,All,,
3,2016,3.1,Less than 10 minutes,All,All,All,All,All,Adults,All,,
4,2015,14.3,Less than 10 minutes,All,All,All,All,Owned Mortgage/Loan,All,All,,
...,...,...,...,...,...,...,...,...,...,...,...,...
112765,2019,3.3,All,,All,All,All,All,Adults,All,,Not very strongly
112766,2014,9.6,All,,All,All,All,All,All,White,,Not very strongly
112767,2017,16.0,All,,All,All,All,All,All,All,,Not very strongly
112768,2018,16.0,All,,Female,All,All,All,All,All,,Not very strongly


In [159]:
df.to_csv('clean_data/green.csv',index=False) 