In [None]:
from google.cloud import bigquery
import pandas as pd

In [None]:
# Storing git credentials with the credential helper works.
# Note: the "store" helper keeps the credentials unencrypted on disk.
#!git config --global credential.helper store

In [None]:
# Create the BigQuery client used by every query cell below.
# `location` is the client's default location; the region name must not carry
# surrounding whitespace, otherwise it does not name a real region.
client = bigquery.Client(location="europe-west2")
print("Client creating using default project: {}".format(client.project))

In [None]:
# Fetch every metric row (area code, indicator, category, value) from the ingest table.
query = """
    SELECT AREACD, Indicator, Category, Value 
    FROM `project.ingest_dataset.ingest_table` 
    
"""

# Submitting the query is an API request. The job location has to match the
# location of the dataset(s) the query references.
# NOTE(review): "location" looks like a placeholder value - confirm the real region.
query_job = client.query(query, location="location")

# Materialise the result as a pandas DataFrame and show it as the cell output.
all_metrics = query_job.to_dataframe()
all_metrics

In [None]:
# Reshape long -> wide: one row per area code (AREACD), one column per Indicator,
# cells filled from Value. pivot (unlike pivot_table) does not aggregate, so it
# raises if an (AREACD, Indicator) pair occurs more than once.
metrics_wide = all_metrics.pivot(index='AREACD', columns='Indicator', values='Value')
metrics_wide

In [None]:
# Pairwise correlation between the indicator columns (pandas' default method),
# rendered as a colour-graded table so strong relationships stand out.
corr = metrics_wide.corr()
corr.style.background_gradient(cmap="rainbow")

In [None]:
# Scatter of the two early-learning-goal indicators against each other, one point per area.
metrics_wide.plot(
    kind="scatter",
    x="5 year olds achieving 'expected level' on communication early learning goals",
    y="5 year olds achieving 'expected level' on maths early learning goals",
)

In [None]:
# Distribution of a single indicator across all areas.
metrics_wide.hist(column="Male healthy life expectancy")

In [None]:
##Subset by category
def _pivot_category(metrics, category_pattern):
    """Return a wide table (AREACD x Indicator) for one category of metrics.

    Parameters
    ----------
    metrics : DataFrame with 'AREACD', 'Indicator', 'Category' and 'Value' columns.
    category_pattern : pattern searched for (case-sensitively) in 'Category'.

    Returns
    -------
    DataFrame indexed by AREACD with one column per Indicator. pivot_table
    aggregates duplicate (AREACD, Indicator) pairs with its default (mean).
    """
    # na=False: rows with a missing Category are treated as "no match" instead
    # of producing NaN in the mask, which boolean indexing rejects.
    in_category = metrics['Category'].str.contains(category_pattern, na=False)
    return pd.pivot_table(
        metrics[in_category], values='Value', columns='Indicator', index='AREACD'
    )


living_standards = _pivot_category(all_metrics, 'Living Standards')
spreading_opp = _pivot_category(all_metrics, 'opportunity')
local_pride = _pivot_category(all_metrics, 'local pride')


In [None]:
# Keep only the areas that have a value for every indicator in the category;
# any row containing a NaN is dropped.
liv_reduced = living_standards.dropna()
loc_reduced = local_pride.dropna()
opp_reduced = spreading_opp.dropna()

# Standardise each category's indicators ahead of the k-means step.
from sklearn.preprocessing import StandardScaler

# A single scaler object is reused: each fit_transform call refits it on the
# frame it is given, so the three results are scaled independently.
scaler = StandardScaler()
liv_scaled = scaler.fit_transform(liv_reduced)
loc_scaled = scaler.fit_transform(loc_reduced)
opp_scaled = scaler.fit_transform(opp_reduced)

In [None]:
#Do the clustering. Here assume it's reasonable-ish to cluster into 3 (high/med/low)
from sklearn.cluster import KMeans

N_CLUSTERS = 3
RANDOM_STATE = 42  # fixed seed so cluster labels are reproducible across re-runs

kmeans = KMeans(
    n_clusters=N_CLUSTERS, n_init=10, max_iter=300, random_state=RANDOM_STATE
)


def _cluster_areas(model, area_frame, scaled_features):
    """Fit `model` on the scaled features and label each area with its cluster.

    Parameters
    ----------
    model : estimator exposing fit() and labels_ (the shared KMeans instance).
    area_frame : NaN-free wide frame indexed by AREACD; supplies the area codes.
    scaled_features : standardised version of `area_frame`, rows in the same order.

    Returns
    -------
    DataFrame with columns 'AREACD' and 'Cluster', one row per area.
    """
    # Cluster on the standardised features: k-means is distance based, so
    # unscaled indicators with large ranges would dominate the result.
    model.fit(scaled_features)
    return area_frame.reset_index()[['AREACD']].assign(Cluster=model.labels_)


liv_clusters = _cluster_areas(kmeans, liv_reduced, liv_scaled)
loc_clusters = _cluster_areas(kmeans, loc_reduced, loc_scaled)
opp_clusters = _cluster_areas(kmeans, opp_reduced, opp_scaled)

In [None]:
#Get geospatial data via a query.
import geopandas

query = """
    SELECT LAD20CD, geom, BNG_E, BNG_N
    FROM `project.geography_ingest_dataset.geography_ingest_table`
    
"""
query_job = client.query(
    query,
    # Location must match that of the dataset(s) referenced in the query.
    location="location",
)  # API request - starts the query

# Download the query result exactly once. Binding the bare method
# (query_job.to_geodataframe) would fetch the rows again on every call.
_la_geo_frame = query_job.to_geodataframe()


def la_geo():
    """Return the local-authority GeoDataFrame fetched above.

    Kept callable because later cells invoke `la_geo()`. A copy is returned so
    each caller still receives its own frame, without another download.
    """
    return _la_geo_frame.copy()


la_geo()

In [None]:
# la_geo() yields a GeoDataFrame while the cluster tables are regular
# DataFrames, so merge is called on the geo frame (left side) and how='right'
# is used to keep one row per clustered area.
geo_frame = la_geo()  # fetch the geometry once and reuse it for all three merges


def _attach_geometry(clusters):
    """Join local-authority geometry onto a cluster table.

    Only 'AREACD' and 'Cluster' are taken from `clusters`, so re-running this
    cell on an already merged frame gives the same result instead of
    duplicating the geometry columns.
    """
    return geo_frame.merge(
        clusters[['AREACD', 'Cluster']],
        left_on='LAD20CD',
        right_on='AREACD',
        how='right',
    )


opp_clusters = _attach_geometry(opp_clusters)
liv_clusters = _attach_geometry(liv_clusters)
loc_clusters = _attach_geometry(loc_clusters)


In [None]:
#Plotting. Testing package installs: geopandas must be installed before the cell that imports it runs.
#!pip install geopandas

In [None]:
def _plot_cluster_map(clustered, title):
    """Draw one map coloured by the 'Cluster' column and return its Axes.

    Cluster ids are labels rather than quantities, so they are drawn as
    categories with a legend instead of on a continuous colour scale.
    """
    ax = clustered.plot(column="Cluster", categorical=True, legend=True)
    ax.set_title(title)
    return ax


# One map per category, titled so the three figures can be told apart.
_plot_cluster_map(opp_clusters, "K-means clusters: spreading opportunity")
_plot_cluster_map(loc_clusters, "K-means clusters: local pride")
_plot_cluster_map(liv_clusters, "K-means clusters: living standards");