In [None]:
#Some simple exploration that will look at the shape of each metric's distribution

In [None]:
#Get the subnational explorer metrics.
from google.cloud import bigquery
import pandas as pd
#BigQuery location shared by the client and the query job. Per the BigQuery docs the job
#location must match that of the dataset(s) referenced in the query, so keep a single value.
#NOTE(review): assumes the ingest dataset lives in europe-west2, as the client default implies - confirm.
BQ_LOCATION = "europe-west2"

client = bigquery.Client(location=BQ_LOCATION)

#Long-format pull: one row per (area, indicator). Project/dataset/table names are placeholders.
query = """
    SELECT AREACD, Indicator, Category, Value 
    FROM `project.ingest_dataset_name.ingest_table_name` 
    
"""
query_job = client.query(
    query,
    # Location must match that of the dataset(s) referenced in the query.
    location=BQ_LOCATION,
)  # API request - starts the query

all_metrics = query_job.to_dataframe()

#Reshape to wide format: one row per area code (AREACD), one column per Indicator.
#pd.pivot raises ValueError if any (AREACD, Indicator) pair appears more than once.
#The previous trailing .reindex() took no arguments and was a no-op, so it has been dropped.
metrics_wide = pd.pivot(all_metrics, values='Value', columns='Indicator', index='AREACD')

In [None]:
import scipy.stats
#Sample skewness of every metric (NaNs omitted), labelled by metric name so the values
#can be read without cross-referencing the column order.
#Reminder: skew > 0 means right-skewed (long right tail). skew < 0 means left-skewed (long left tail).
pd.Series(
    scipy.stats.skew(metrics_wide, axis=0, nan_policy='omit'),
    index=metrics_wide.columns,
    name='skew',
)
#scipy.stats.describe(metrics_wide, axis=0, nan_policy='omit')

In [None]:
#Column labels of metrics_wide in column order, for matching up positional per-metric outputs.
metrics_wide.columns.tolist()

In [None]:
#So we can see some of these metrics have obviously skewed distributions.
#skewtest's null hypothesis is that a metric's skewness matches that of a normal distribution,
#so a small p-value indicates significant skew.
skewtest = scipy.stats.skewtest(metrics_wide, axis=0, nan_policy='omit')

#Label the p-values by metric name rather than printing two positional arrays.
skew_pvalues = pd.Series(skewtest.pvalue, index=metrics_wide.columns, name='pvalue')
pd.DataFrame({
    'pvalue': skew_pvalues,
    'significant_at_5pct': skew_pvalues < 0.05,
})

In [None]:
#Collect the metric pairs that appear to be correlated.
#Each entry is [first metric, second metric, correlation coefficient]. Pairs are visited
#over the upper triangle of the correlation matrix, row by row, so list positions are stable.
corr = metrics_wide.corr()
n_metrics = corr.shape[0]
high_corr = []
med_corr = []

metric_names = corr.columns
coefficients = corr.to_numpy()
for i in range(n_metrics):
    for j in range(i + 1, n_metrics):
        coefficient = coefficients[i, j]
        strength = abs(coefficient)
        #|r| > 0.5 counts as high, 0.3 < |r| <= 0.5 as medium; anything else (including NaN) is skipped.
        if strength > 0.5:
            bucket = high_corr
        elif strength > 0.3:
            bucket = med_corr
        else:
            continue
        bucket.append([metric_names[i], metric_names[j], coefficient])
high_corr


In [None]:
#Number of metric pairs whose absolute correlation exceeded the 0.5 "high" threshold.
len(high_corr)

In [None]:
#Length of the second metric's label in the first high-correlation pair
#(presumably a string name - a quick check of how long the axis labels can get).
#Raises IndexError if high_corr is empty.
len(high_corr[0][1])

In [None]:
def plot_high_corr(i, invert=True, auto_choose_orientation=False):
    """Scatter-plot the two metrics of the i-th entry in ``high_corr``.

    Reads the module-level ``high_corr`` list and ``metrics_wide`` frame.

    Parameters
    ----------
    i : int
        Position of the pair in ``high_corr``.
    invert : bool, default True
        When truthy, the pair's second metric goes on the x axis and the first
        on the y axis; otherwise the first goes on x and the second on y.
    auto_choose_orientation : bool, default False
        When truthy, ``invert`` is ignored and the metric with the longer label
        is put on the x axis (ties keep the first metric on x).
    """
    first_metric, second_metric = high_corr[i][0], high_corr[i][1]

    #Long labels are easier to read along the x axis, so optionally pick the orientation by length.
    if auto_choose_orientation:
        invert = len(second_metric) > len(first_metric)

    x_metric, y_metric = (
        (second_metric, first_metric) if invert else (first_metric, second_metric)
    )
    metrics_wide.plot.scatter(x=x_metric, y=y_metric)

#Pair 41 (HLE vs childhood attainment) is now easy to inspect, along with pair 38.
#These positions index into high_corr, so they shift if the data or thresholds change.
for selected_pair in (41, 38):
    plot_high_corr(selected_pair)

In [None]:
#Brute force: draw a scatter for every highly correlated pair.
for pair, _pair_entry in enumerate(high_corr):
    plot_high_corr(pair)

In [None]:
#Worth noting at this point on the correlations between apprenticeship starts/completions and housing stock:
#These are all totals, so to a large extent depend on the population of the LA area.
#Suspect these correlations may be spurious, and that normalising per capita would be needed for this analysis.
#Relatedly, the fact that these metrics don't correlate with anything else is not conclusive.

In [None]:
#Trial of showing a third, potentially explanatory, variable as the point colour.
scatter_columns = dict(
    x="Young people achieving GCSEs (and equivalent qualifications) in English and Maths by age 19",
    y="Male healthy life expectancy",
    c="Adults that currently smoke cigarettes",
)
metrics_wide.plot.scatter(cmap='Reds', **scatter_columns)