# 15 - ANOVA analysis

ANOVA - Analysis of variance

## Import libraries

In [7]:
import geopandas as gpd
import pandas as pd
import numpy as np

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import seaborn as sns
from scipy.stats import f_oneway
import scipy.stats as stats
import numpy as np
import matplotlib.pyplot as plt

import os
import sys
module_path = os.path.abspath(os.path.join('../../../'))
if module_path not in sys.path:
    sys.path.append(module_path)
    import aup

## Config notebook

In [2]:
walking_speed = 4.5
str_walking_speed = str(walking_speed).replace('.','_')
str_walking_speed

# Not running _priv educational equipments? Set pub = True. (ALSO VERIFY CHANGES IN PARAMETERS DICT)
pub = True

# Input - Processed HQSL
if pub:
    hex_processed_table = f'santiago_hexproximity_hqsl_{str_walking_speed}_kmh_pub'
else:
    hex_processed_table = f'santiago_hexproximity_hqsl_{str_walking_speed}_kmh'

print(hex_processed_table)

santiago_hexproximity_hqsl_4_5_kmh_pub


In [3]:
# resolution to be loaded and analysed
res = 10

In [4]:
# --- REQUIRED: VERIFY PARAMETERS DICT.
# Structure: {social_functions:{themes:[source_names]}}

parameters_dict = {'supplying':{'groceries':['carniceria','hogar','local_mini_market',
                                             'bakeries','ferias','supermercado'],
                                'services':['centro_recyc'],
                               },
                   'caring':{'health':['clinica_priv','clinica_pub',
                                       'hospital_priv','hospital_pub',
                                       'vacunatorio_priv','vacunatorio_pub',
                                       'consult_ado_priv','consult_ado_pub',
                                       'salud_mental','labs_priv','residencia_adumayor','farmacia'],
                             'public_wellbeing':['bomberos'],
                             'exercise':['club_deportivo','eq_deportivo_pub','eq_deportivo_priv']
                            },
                   'living':{'errands_paperwork':['civic_office','tax_collection',
                                                  'social_security','correos','banco'],
                             'housing':['viv_count','viv_social_count'],
                             'public_services':['police'],
                             'fresh_air':['ep_plaza_small','ep_plaza_big'],
                             'temporary_residence':['hotel_count']
                            },
                   'enjoying':{'culture':['museos_priv','museos_pub','bibliotecas','sitios_historicos'],
                               'greenspace':['ndvi_count'],
                               'entertainment':['cines','librerias','restaurantes_bar_cafe']
                              },
                   'learning':{'education':['jardin_inf_pub',#'jardin_inf_priv',
                                            'edu_especial_pub',#'edu_especial_priv',
                                            'edu_basica_pub',#'edu_basica_priv',
                                            'edu_media_pub',#'edu_media_priv',
                                            'edu_tecnica','universidad',
                                            'edu_adultos_pub',#'edu_adultos_priv',
                                            'centro_edu_amb']
                              },
                   'working':{'workplaces':['oficinas_count'],
                              'sustainable_mobility':['ciclovias','paradas_tp',
                                                      'paradas_tp_tren','paradas_tp_metro']}
                  }

## Load data

## Load data - processed HQSL data

In [5]:
query = f'SELECT * FROM projects_research.{hex_processed_table} WHERE res = {res}'
hex_gdf = aup.gdf_from_query(query)

# Show
print(hex_gdf.shape)
print(hex_gdf.res.unique())
hex_gdf.head(2)

(63727, 201)
[10.]


Unnamed: 0,hex_id,geometry,supermercado_time,supermercado_count_15min,clinica_priv_time,clinica_priv_count_15min,clinica_pub_time,clinica_pub_count_15min,hospital_priv_time,hospital_priv_count_15min,...,culture_count,greenspace_count,entertainment_count,enjoying_count,education_count,learning_count,workplaces_count,sustainable_mobility_count,working_count,hqsl
0,8ab2c556d697fff,"POLYGON ((-70.59586 -33.41622, -70.59649 -33.4...",3.707278,4.251,0.621772,5.167667,38.784833,0.001,65.502859,0.001,...,14.16163,7.882002,17.525184,4.946102,32.996476,4.124559,8.641466,27.953463,7.318986,34.025012
1,8ab2c550c06ffff,"POLYGON ((-70.77623 -33.38186, -70.77686 -33.3...",33.36468,0.001,43.643197,0.001,53.773104,0.001,171.802084,0.001,...,0.0,8.304933,0.0,1.038117,0.0,0.0,0.0,6.918553,1.383711,4.041173


### Test to understand f_oneway

In [12]:
test = f_oneway(hex_gdf['supermercado_time'],hex_gdf['clinica_priv_time'])
test

F_onewayResult(statistic=8233.681808137386, pvalue=0.0)

In [10]:
test[0]

8233.681808137386

In [11]:
test[1]

0.0

## __Social function cols + HQSL__ ANOVA analysis df (Unknown if needed statistic or pvalue, so instead of matrix made Dataframe format)

In [18]:
annova_1 = pd.DataFrame()

# ------------------------------ COLUMNS OF INTEREST
# Social function + HQSL count columns
summary_cols = []

for k in parameters_dict.keys():
    summary_cols.append(k+'_count')
summary_cols.append('hqsl')

# ------------------------------ ANNOVA
# For each col combination
i=0
for j in range(len(summary_cols)):
    for k in range(len(summary_cols)):

        # Calculate annova analysis
        data = f_oneway(hex_gdf[summary_cols[j]],hex_gdf[summary_cols[k]])
        statistic = data[0]
        pvalue = data[1]

        # Register data in matrix
        annova_1.loc[i,'variable_1'] = summary_cols[j]
        annova_1.loc[i,'variable_2'] = summary_cols[k]
        annova_1.loc[i,'statistic'] = statistic
        annova_1.loc[i,'pvalue'] = pvalue

        i = i+1

# Show
annova_1

Unnamed: 0,variable_1,variable_2,statistic,pvalue
0,supplying_count,supplying_count,-1.0826629999999999e-30,
1,supplying_count,caring_count,31016.38,0.0
2,supplying_count,living_count,13040.7,0.0
3,supplying_count,enjoying_count,21520.51,0.0
4,supplying_count,learning_count,7694.958,0.0
5,supplying_count,working_count,4925.692,0.0
6,supplying_count,hqsl,71839.87,0.0
7,caring_count,supplying_count,31016.38,0.0
8,caring_count,caring_count,6.577405e-29,1.0
9,caring_count,living_count,7001.825,0.0


## __Time and Count__ ANOVA analysis df

In [19]:
# Collect all sources from parameters dict
all_sources = []
for sf in parameters_dict.keys():
    for th in parameters_dict[sf]:
        for source in parameters_dict[sf][th]:
            all_sources.append(source)

# Remove sources that do not have '_time' or _count_15min' cols (Not processed through regular proximity analysis)
all_sources.remove('viv_count')
all_sources.remove('viv_social_count')
all_sources.remove('hotel_count')
all_sources.remove('oficinas_count')
all_sources.remove('ndvi_count')

# Create time_cols list
time_cols = []
for source in all_sources:
    time_col = source+'_time'
    time_cols.append(time_col)

# Create count_cols list
count_cols=[]
for source in all_sources:
    count_col = source+'_count_15min'
    count_cols.append(count_col)

# Add sources that have '_count'
count_cols.append('viv_count')
count_cols.append('viv_social_count')
count_cols.append('hotel_count')
count_cols.append('oficinas_count')
count_cols.append('ndvi_count')

print("TIME COLUMNS:")
print(time_cols)
print("COUNT COLUMNS:")
print(count_cols)

TIME COLUMNS:
['carniceria_time', 'hogar_time', 'local_mini_market_time', 'bakeries_time', 'ferias_time', 'supermercado_time', 'centro_recyc_time', 'clinica_priv_time', 'clinica_pub_time', 'hospital_priv_time', 'hospital_pub_time', 'vacunatorio_priv_time', 'vacunatorio_pub_time', 'consult_ado_priv_time', 'consult_ado_pub_time', 'salud_mental_time', 'labs_priv_time', 'residencia_adumayor_time', 'farmacia_time', 'bomberos_time', 'club_deportivo_time', 'eq_deportivo_pub_time', 'eq_deportivo_priv_time', 'civic_office_time', 'tax_collection_time', 'social_security_time', 'correos_time', 'banco_time', 'police_time', 'ep_plaza_small_time', 'ep_plaza_big_time', 'museos_priv_time', 'museos_pub_time', 'bibliotecas_time', 'sitios_historicos_time', 'cines_time', 'librerias_time', 'restaurantes_bar_cafe_time', 'jardin_inf_pub_time', 'edu_especial_pub_time', 'edu_basica_pub_time', 'edu_media_pub_time', 'edu_tecnica_time', 'universidad_time', 'edu_adultos_pub_time', 'centro_edu_amb_time', 'ciclovias_

In [20]:
annova_2 = pd.DataFrame()

# ------------------------------ ANNOVA
# For each col combination
i=0
for j in range(len(time_cols)):
    for k in range(len(time_cols)):

        # Calculate annova analysis
        data = f_oneway(hex_gdf[time_cols[j]],hex_gdf[time_cols[k]])
        statistic = data[0]
        pvalue = data[1]

        # Register data in matrix
        annova_2.loc[i,'variable_1'] = time_cols[j]
        annova_2.loc[i,'variable_2'] = time_cols[k]
        annova_2.loc[i,'statistic'] = statistic
        annova_2.loc[i,'pvalue'] = pvalue

        i = i+1

# Show
annova_2

Unnamed: 0,variable_1,variable_2,statistic,pvalue
0,carniceria_time,carniceria_time,-1.058426e-29,
1,carniceria_time,hogar_time,2.315307e+02,3.072780e-52
2,carniceria_time,local_mini_market_time,4.926691e+03,0.000000e+00
3,carniceria_time,bakeries_time,2.379304e+03,0.000000e+00
4,carniceria_time,ferias_time,2.857640e+02,4.892338e-64
...,...,...,...,...
2495,paradas_tp_metro_time,centro_edu_amb_time,2.049769e+04,0.000000e+00
2496,paradas_tp_metro_time,ciclovias_time,1.432205e+04,0.000000e+00
2497,paradas_tp_metro_time,paradas_tp_time,2.908937e+04,0.000000e+00
2498,paradas_tp_metro_time,paradas_tp_tren_time,3.238787e+04,0.000000e+00


In [21]:
annova_3 = pd.DataFrame()

# ------------------------------ ANNOVA
# For each col combination
i=0
for j in range(len(count_cols)):
    for k in range(len(count_cols)):

        # Calculate annova analysis
        data = f_oneway(hex_gdf[count_cols[j]],hex_gdf[count_cols[k]])
        statistic = data[0]
        pvalue = data[1]

        # Register data in matrix
        annova_3.loc[i,'variable_1'] = count_cols[j]
        annova_3.loc[i,'variable_2'] = count_cols[k]
        annova_3.loc[i,'statistic'] = statistic
        annova_3.loc[i,'pvalue'] = pvalue

        i = i+1

# Show
annova_3

Unnamed: 0,variable_1,variable_2,statistic,pvalue
0,carniceria_count_15min,carniceria_count_15min,1.304415e-29,1.000000e+00
1,carniceria_count_15min,hogar_count_15min,7.605593e+02,6.303128e-167
2,carniceria_count_15min,local_mini_market_count_15min,7.102621e+03,0.000000e+00
3,carniceria_count_15min,bakeries_count_15min,4.922299e+03,0.000000e+00
4,carniceria_count_15min,ferias_count_15min,2.444707e+03,0.000000e+00
...,...,...,...,...
3020,ndvi_count,viv_count,1.319808e+04,0.000000e+00
3021,ndvi_count,viv_social_count,2.450451e+03,0.000000e+00
3022,ndvi_count,hotel_count,3.094209e+02,3.526437e-69
3023,ndvi_count,oficinas_count,3.869189e+02,5.206968e-86
