# Merge accessibility and physical activity data

In [None]:
# Standard Library
import sys
import time
import pickle
from pathlib import Path

# Scientific Computing & Data Analysis
import numpy as np
import pandas as pd
import scipy.sparse as sparse
from scipy import stats
from scipy.spatial import cKDTree

# Visualization
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from matplotlib.colors import LinearSegmentedColormap
import seaborn as sns

# Geospatial Analysis
import geopandas as gpd
import libpysal as lps
from libpysal.weights.distance import get_points_array
import esda

# Machine Learning & Statistics
from sklearn.preprocessing import StandardScaler
from statsmodels.nonparametric.smoothers_lowess import lowess

# File I/O
import h5py

# Local/Custom Modules
sys.path.append('../../../Scripts/Spatial analyses')
import pyspace

In [None]:
# Input data and generated outputs are addressed relative to the working directory
data_folder, results_folder = Path('../data'), Path('../results/')

# Make sure both locations exist before anything is read from or written to them
for _folder in (data_folder, results_folder):
    _folder.mkdir(parents=True, exist_ok=True)

## Data sources
### Load admin boundaries

In [None]:
# =============================================================================
# GEOGRAPHIC DATA LOADING
# =============================================================================

# Load processed Geneva geographic data
canton_ge_contour = gpd.read_file(data_folder / "canton_gecontour.GeoJSON", engine='pyogrio')
canton_ge = gpd.read_file(data_folder / "canton_ge.GeoJSON", engine='pyogrio')
communes_ge = gpd.read_file(data_folder / 'communes_ge.GeoJSON', engine='pyogrio')

print("Geographic data loaded successfully")

### Load PA data

#### All single metrics

In [None]:
df_full_pafq = pd.read_csv(data_folder/"20250602_pafq_bus_1997_2024.csv")

In [None]:
# Restrict to records dated before the data extraction date of the analyses.
# The comparison is done on parsed timestamps rather than on the raw strings, so the
# cut-off does not depend on how each date happens to be written in the CSV.
# The column itself is left untouched; rows whose date is missing are dropped.
_extraction_cutoff = pd.Timestamp('2024-11-04')
_visit_dates = pd.to_datetime(df_full_pafq['dtnumdoc'], format='mixed')
df_full_pafq = df_full_pafq[_visit_dates < _extraction_cutoff]

In [None]:
df_full_pafq['codbar_new'] = df_full_pafq['codbar_new'].astype('string')
df_full_pafq['codbar'] = df_full_pafq['codbar'].astype('string')

In [None]:
pro1_columns = [col for col in df_full_pafq.columns if 'pro1_' in col]

#### Aggregated metrics 

In [None]:
seden_pafq_ddr = pd.read_csv(data_folder/'20250602_summary_pafq_1997_2024.csv')
seden_pafq_ddr.filter(regex='codbar').dtypes
seden_pafq_ddr['codbar_new'] = seden_pafq_ddr['codbar_new'].astype('string')

###  Bussante IDs

In [None]:
# `low_memory` has to be the boolean False: the string 'False' is truthy, so passing it
# does not actually switch the option off.
df_bussante_id = pd.read_csv(data_folder/'bussante_id_ddr_wrichard.csv', low_memory=False)

# Where `codbar_richard` is missing, fall back to `codbar` rendered as an integer string
df_bussante_id['codbar_richard'] = df_bussante_id['codbar_richard'].fillna(
    df_bussante_id['codbar'].astype('Int64').astype('string')
)

### Merge PAFQ-IDs

In [None]:
df_pafq_seden = pd.merge(df_bussante_id[['codbar_richard','x','y']], seden_pafq_ddr, left_on = 'codbar_richard', right_on = 'codbar_new')
df_pafq_all = pd.merge(df_bussante_id[['codbar_richard','x','y']], df_full_pafq, left_on = 'codbar_richard', right_on = 'codbar_new')

### Make geodataframes

In [None]:
# Merge the canton geometries once and reuse the result for both point-in-polygon filters
# NOTE(review): assumes `canton_ge` is in the same CRS (EPSG:2056) as the points — confirm
_canton_ge_footprint = canton_ge.geometry.unary_union

# Build point geometries (CRS 2056) from the x / y columns of the aggregated PAFQ table
gdf_pafq_seden = gpd.GeoDataFrame(df_pafq_seden, crs = 2056, geometry = gpd.points_from_xy(df_pafq_seden.x, df_pafq_seden.y))
# .copy() yields an independent frame, so columns added later are not written to a filtered view
gdf_pafq_seden_ge = gdf_pafq_seden[gdf_pafq_seden.within(_canton_ge_footprint)].copy()

# Same construction and filter for the table holding all single metrics
gdf_pafq_all = gpd.GeoDataFrame(df_pafq_all, crs = 2056, geometry = gpd.points_from_xy(df_pafq_all.x, df_pafq_all.y))
gdf_pafq_all_ge = gdf_pafq_all[gdf_pafq_all.within(_canton_ge_footprint)].copy()

### Add health/SES indicators

- Smoking
- BMI
- Education level
- Occupation category
- Country of birth

### Define age groups

In [None]:
# Define bins and labels.
# Bands are closed on the left and open on the right so that they agree with the labels:
# [18, 35) -> '18-34', [35, 50) -> '35-49', ..., [75, inf) -> '75+'.
bins = [18, 35, 50, 65, 75, float('inf')]  # float('inf') keeps the last band (75+) open-ended
labels = ['18-34', '35-49' , '50-64', '65-74', '75+']

# Categorize ages using pd.cut. right=False is required: with the default right-closed
# intervals an age of exactly 35 would be labelled '18-34', 75 would be labelled '65-74',
# and an age of exactly 18 would fall outside every band.
# Ages outside the bins (or missing) end up as <NA>.
gdf_pafq_all_ge['age_group'] = pd.cut(gdf_pafq_all_ge['age'], bins=bins, labels=labels, right=False)
gdf_pafq_all_ge['age_group'] = gdf_pafq_all_ge['age_group'].astype('string')

#### Filter out > 74

In [None]:
gdf_pafq_all_ge = gdf_pafq_all_ge[gdf_pafq_all_ge.age_group != '75+']

### Rename columns

In [None]:
labels = {
    'Perso':'Personal activities, standardized (min/day)',
    'PAFQ_SE_raw': "Sedentary, raw data, PAFQ (min/day)",
    'PAFQ_SE_pct': "Sedentary, PAFQ (pct)",
    'PAFQ_LPA_raw': "Light PA, raw data, PAFQ (min/day)",
    'PAFQ_LPA_pct': "Light PA, PAFQ (pct)",
    'PAFQ_MPA_raw': "Moderate PA, raw data, PAFQ (min/day)",
    'PAFQ_MPA_pct': "Moderate PA, PAFQ (pct)",
    'PAFQ_VPA_raw': "Vigorous PA, raw data, PAFQ (min/day)",
    'PAFQ_VPA_pct': "Vigorous PA, PAFQ (pct)",
    'Work_se_pct': "Work, sedentary, (% time)",
    'Work_li_pct': "Work, light intensity, (% time)",
    'Work_mo_pct': "Work, moderate intensity, (% time)",
    'Work_vi_pct': "Work, vigorous intensity, (% time)",
    'Home_se_pct': "Home, sedentary, (% time)",
    'Home_li_pct': "Home, light intensity, (% time)",
    'Home_mo_pct': "Home, moderate intensity, (% time)",
    'Home_vi_pct': "Home, vigorous intensity, (% time)",
    'Work_se_raw': "Work, sedentary, (min/day)",
    'Work_li_raw': "Work, light intensity, (min/day)",
    'Work_mo_raw': "Work, moderate intensity, (min/day)",
    'Work_vi_raw': "Work, vigorous intensity, (min/day)",
    'Home_se_raw': "Home, sedentary, (min/day)",
    'Home_li_raw': "Home, light intensity, (min/day)",
    'Home_mo_raw': "Home, moderate intensity, (min/day)",
    'Home_vi_raw': "Home, vigorous intensity, (min/day)",
    'Work_se': "Work, sedentary, standardized (min/day)",
    'Work_li': "Work, light intensity, standardized (min/day)",
    'Work_mo': "Work, moderate intensity, standardized (min/day)",
    'Work_vi': "Work, vigorous intensity, standardized (min/day)",
    'Home_se': "Home, sedentary, standardized (min/day)",
    'Home_li': "Home, light intensity, standardized (min/day)",
    'Home_mo': "Home, moderate intensity, standardized (min/day)",
    'Home_vi': "Home, vigorous intensity, standardized (min/day)",
    'Sport_mo': "Sport, moderate intensity, standardized (min/day)",
    'Sport_vi': "Sport, vigorous intensity, standardized (min/day)",
    'PAFQ_SE': "Sedentary, standardized data, PAFQ (min/day)",
    'PAFQ_LPA': "Light PA, standardized data, PAFQ (min/day)",
    'PAFQ_MPA': "Moderate PA, standardized data, PAFQ (min/day)",
    'PAFQ_VPA': "Vigorous PA, standardized data, PAFQ (min/day)",
    'etj':'Total adjusted energy kcal/day',
    'etsemns':'Total adjusted energy kcal/week (no sleep)',
    'etsemtot': "Total adjusted energy kcal/week (with sleep)"
}

In [None]:
gdf_pafq_all_ge = gdf_pafq_all_ge.rename(columns = labels)
gdf_pafq_seden_ge = gdf_pafq_seden_ge.rename(columns = labels)

## Load 15min accessibility indicators

In [None]:
# df_15min = gpd.read_file(data_folder/'15min_city'/'15min_indicators.gpkg')
df_15min = gpd.read_parquet(data_folder/'h3_accessibility_metrics_ge_final.parquet')

In [None]:
df_15min.plot()

## Create final PA-15min dataset

In [None]:
def get_season(date):
    """
    Convert a date to its corresponding season (based on the calendar month).

    Parameters:
    date (str or datetime-like): Date to convert. Strings are parsed with
        ``pd.to_datetime``; other values must expose a ``.month`` attribute.

    Returns:
    str or None: Season name (Winter, Spring, Summer, or Fall), or None when
        the date is missing (None / NaN / NaT).
    """
    # Convert string to datetime if necessary
    if isinstance(date, str):
        date = pd.to_datetime(date)

    # A missing date has no usable month; without this guard NaT would fall
    # through every branch below and be reported as 'Fall'
    if pd.isna(date):
        return None

    month = date.month

    # December-February, March-May, June-August, September-November
    if month in (12, 1, 2):
        return 'Winter'
    if month in (3, 4, 5):
        return 'Spring'
    if month in (6, 7, 8):
        return 'Summer'
    return 'Fall'

# To apply to your series:
gdf_pafq_all_ge['season'] = gdf_pafq_all_ge['dtnumdoc'].apply(get_season)

In [None]:
## Columns that are commented out contain ~ 9,000 NaNs values due to missing sleep data for the period (1997-2004)
_gdf_pafq_p1 = gdf_pafq_all_ge[['codbar',
                                'codbar_new',
                                'sexe',
                                'appointment_year',
                                'dtnumdoc',
                                'season',
                                'age',
                                'dtnaiss',
                                'age_group',
                                'mixed_weight',
                                'mixed_height',
                                'working_status',
                                'mba_min',
                                'sleep_time',
                                'wake_time',
                                'Sedentary, raw data, PAFQ (min/day)',
                                'Light PA, raw data, PAFQ (min/day)',
                                'Moderate PA, raw data, PAFQ (min/day)',
                                'Vigorous PA, raw data, PAFQ (min/day)',
                                
                                'Sedentary, standardized data, PAFQ (min/day)',
                                'Light PA, standardized data, PAFQ (min/day)',
                                'Moderate PA, standardized data, PAFQ (min/day)',
                                'Vigorous PA, standardized data, PAFQ (min/day)',
                                
                                'Sedentary, PAFQ (pct)',
                                'Light PA, PAFQ (pct)',
                                'Moderate PA, PAFQ (pct)',
                                'Vigorous PA, PAFQ (pct)',
                                
                                'Work, sedentary, (min/day)',
                                'Home, sedentary, (min/day)',
                                'Work, light intensity, (min/day)',
                                'Home, light intensity, (min/day)',
                                'Work, moderate intensity, (min/day)',
                                'Home, moderate intensity, (min/day)',
                                'Work, vigorous intensity, (min/day)',
                                'Home, vigorous intensity, (min/day)',

                                'Work, sedentary, (% time)',
                                'Home, sedentary, (% time)',
                                'Work, light intensity, (% time)',
                                'Home, light intensity, (% time)',
                                'Work, moderate intensity, (% time)',
                                'Home, moderate intensity, (% time)',
                                'Work, vigorous intensity, (% time)',
                                'Home, vigorous intensity, (% time)',

                                'Work, sedentary, standardized (min/day)',
                                'Home, sedentary, standardized (min/day)',
                                'Work, light intensity, standardized (min/day)',
                                'Home, light intensity, standardized (min/day)',
                                'Work, moderate intensity, standardized (min/day)',
                                'Home, moderate intensity, standardized (min/day)',
                                'Work, vigorous intensity, standardized (min/day)',
                                'Home, vigorous intensity, standardized (min/day)',
                                'Mobility (commute & personal time), standardized (min/day)',
                                'Leisure-time MVPA, standardized (min/day)',
                                'Personal activities, standardized (min/day)',
                                'Sport, moderate intensity, standardized (min/day)',
                                'Sport, vigorous intensity, standardized (min/day)',
                                ]]

_gdf_pafq_p2 = gdf_pafq_seden_ge[['codbar','codbar_new','x','y','geometry','period','Total adjusted energy kcal/week (no sleep)','Total adjusted energy kcal/day','pafqttot','seden','seden_label']]

In [None]:
_gdf_pafq_p1.codbar_new.nunique()

In [None]:
gdf_pafq_final_ge = pd.merge(_gdf_pafq_p1.drop('codbar', axis=1), _gdf_pafq_p2.drop('codbar', axis=1), on = ['codbar_new'])

### Compute BMI

In [None]:
def calculate_bmi(weight_kg, height_cm):
    """
    Calculate BMI from weight (kg) and height (cm) and return both BMI and category.

    Parameters:
    weight_kg (int or float, incl. NumPy scalars): Weight in kilograms
    height_cm (int or float, incl. NumPy scalars): Height in centimeters

    Returns:
    tuple: (BMI value rounded to 1 decimal, BMI category).
        - (None, "Invalid input") for non-numeric or non-positive inputs
        - (None, None) when a measurement is missing (NaN) or the values are
          implausible (weight above 0.9 x height, BMI below 12 or above 50)
    """
    # Input validation (NumPy scalars are accepted as well as plain Python numbers)
    numeric_types = (int, float, np.integer, np.floating)
    if not isinstance(weight_kg, numeric_types) or not isinstance(height_cm, numeric_types):
        return None, "Invalid input"

    weight_kg, height_cm = float(weight_kg), float(height_cm)

    # NaN compares False against everything, so without this guard a missing
    # measurement would pass every check below and come out as (nan, "Obesity")
    if np.isnan(weight_kg) or np.isnan(height_cm):
        return None, None

    if weight_kg <= 0 or height_cm <= 0:
        return None, "Invalid input"

    # Plausibility check: discard records where the weight exceeds 0.9 x height
    if weight_kg > height_cm * 0.9:
        return None, None

    # Convert height to meters, calculate BMI and round to 1 decimal place
    height_m = height_cm / 100
    bmi = round(weight_kg / (height_m * height_m), 1)

    # Values outside [12, 50] are treated as implausible
    if bmi > 50 or bmi < 12:
        return None, None

    # Categorize BMI
    if bmi < 18.5:
        category = "Underweight"
    elif bmi < 25:
        category = "Normal weight"
    elif bmi < 30:
        category = "Overweight"
    else:
        category = "Obesity"

    return bmi, category

In [None]:
gdf_pafq_final_ge[['bmi', 'bmi_category']] = gdf_pafq_final_ge.apply(
    lambda x: calculate_bmi(x['mixed_weight'], x['mixed_height']), 
    axis=1, 
    result_type='expand'
)

In [None]:
# Keep only the participants for whom a BMI value is available
_has_bmi = gdf_pafq_final_ge['bmi'].notna()
gdf_pafq_final_ge = gdf_pafq_final_ge[_has_bmi]

## Check NANs

In [None]:
gdf_pafq_final_ge.isna().sum().sort_values().tail(30).plot.bar()

In [None]:
gdf_pafq_final_ge.isna().sum().sort_values()

In [None]:
gdf_pafq_final_ge = gdf_pafq_final_ge.dropna()

In [None]:
gdf_pafq_final_ge.shape

## Check duplicates

In [None]:
gdf_pafq_final_ge.codbar_new.nunique()

In [None]:
gdf_pafq_final_ge = gdf_pafq_final_ge.drop_duplicates(subset = ['codbar_new','period','appointment_year'], keep='first')

### Add health/SES indicators

In [None]:
## Harmonize PAFQ and Health Q data
gdf_pafq_final_ge['sexe'] = gdf_pafq_final_ge['sexe'].map({'Masculin':'M', 'Féminin':'F'})
gdf_pafq_final_ge['appointment_year'] = gdf_pafq_final_ge['appointment_year'] - 1997
gdf_pafq_final_ge['dtnumdoc'] = pd.to_datetime(gdf_pafq_final_ge['dtnumdoc'], format='mixed').dt.date
gdf_pafq_final_ge['dtnaiss'] = pd.to_datetime(gdf_pafq_final_ge['dtnaiss'], format='mixed').dt.date
gdf_pafq_final_ge['dtnumdoc_ym'] = pd.to_datetime(gdf_pafq_final_ge['dtnumdoc']).dt.strftime('%Y-%m')

In [None]:
# Health questionnaire export (exact duplicate rows removed) and the barcode coding key.
# Both are read through `data_folder`, like the other inputs of this notebook, so the data
# location is defined in a single place.
df_qhealth = pd.read_csv(data_folder / '20250109_health_questionnaire_1992_2024.csv').drop_duplicates()
df_codbar_map = pd.read_csv(data_folder / 'cle_codage_BS.csv', sep=';')

In [None]:
df_codbar_map['sexe'] = df_codbar_map['sexe'].map({'Féminin':'F', 'Masculin':'M'})

In [None]:
df_codbar_map['codbar_new'] = df_codbar_map['codbar_new'].fillna(df_codbar_map['codbar'])

In [None]:
df_codbar_map['participant_identifier_sexe'] = df_codbar_map['participant_identifier'] + df_codbar_map['sexe']
df_qhealth['participant_identifier_sexe'] = df_qhealth['participant_identifier'] + df_qhealth['sexe']

In [None]:
df_qhealth['codbar_new'] = df_qhealth['participant_identifier_sexe'].map(df_codbar_map.set_index('participant_identifier_sexe')['codbar_new'].to_dict())

In [None]:
df_qhealth['dtnumdoc'] = pd.to_datetime(df_qhealth['dtnumdoc']).dt.date
df_qhealth['dtnaiss'] = pd.to_datetime(df_qhealth['birthdate']).dt.date
df_qhealth['dtnumdoc_ym'] = pd.to_datetime(df_qhealth['dtnumdoc']).dt.strftime('%Y-%m')

In [None]:
df_qhealth['age_rounded'] = df_qhealth['age'].round(0)
gdf_pafq_final_ge['age_rounded'] = gdf_pafq_final_ge['age'].round(0)

In [None]:
# codbar_matches = pd.merge(df_qhealth[['participant_identifier','dtnumdoc','dtnumdoc_ym','dtnaiss', 'age', 'age_rounded','sexe']], gdf_pafq_final_ge[['codbar_new','dtnumdoc','dtnumdoc_ym','dtnaiss', 'age', 'age_rounded','sexe']], on = ['dtnumdoc_ym','dtnaiss','age_rounded','sexe'])

In [None]:
gdf_pafq_final_ge_wses = pd.merge(gdf_pafq_final_ge, df_qhealth[['codbar_new','sexe','ctry_bth','smoking', 'former_smoker','self_rated_health','education','education_coded','employment_status','last_employment_status','income_grp']], on = ['codbar_new','sexe'], how='left').drop_duplicates()

In [None]:
gdf_pafq_final_ge_wses = gdf_pafq_final_ge_wses.drop_duplicates(subset = ['codbar_new','sexe'])

### Education coded

I noticed (and discussed with Shannon) that there are some mistakes in the coding of the `education_coded` column. The cell below aims to fix them.

In [None]:
gdf_pafq_final_ge_wses.loc[(gdf_pafq_final_ge_wses.period != 'postpandemie')&(gdf_pafq_final_ge_wses.education == 8),'education_coded'] = 'Tertiaire'
gdf_pafq_final_ge_wses.loc[(gdf_pafq_final_ge_wses.period == 'postpandemie')&(gdf_pafq_final_ge_wses.education == 3),'education_coded'] = 'Primaire'

In [None]:
gdf_pafq_final_ge_wses['education_coded'] = gdf_pafq_final_ge_wses['education_coded'].map({'Primaire': 'Primary',
                                                                                           'Secondaire': 'Secondary',
                                                                                           'Tertiaire': 'Tertiary',
                                                                                           'Autre': 'Other'})

In [None]:
gdf_pafq_final_ge_wses['education_coded'].value_counts(dropna=False)

In [None]:
gdf_pafq_final_ge_wses = gdf_pafq_final_ge_wses.dropna(subset = ['education_coded'])

### Smoking variable

In [None]:
# Derive a three-level smoking status from the `smoking` and `former_smoker` flags.
# Each rule is applied once (the duplicated assignments were redundant); rows matching
# none of the rules keep a missing `smoking_status`.
_is_smoker = gdf_pafq_final_ge_wses.smoking == 1
_is_non_smoker = gdf_pafq_final_ge_wses.smoking == 0
gdf_pafq_final_ge_wses.loc[_is_smoker, 'smoking_status'] = 'Active smoker'
gdf_pafq_final_ge_wses.loc[_is_non_smoker & (gdf_pafq_final_ge_wses.former_smoker == 1), 'smoking_status'] = 'Ex-smoker'
gdf_pafq_final_ge_wses.loc[_is_non_smoker & (gdf_pafq_final_ge_wses.former_smoker == 0), 'smoking_status'] = 'Never-smoker'
# gdf_pafq_final_ge_wses.loc[gdf_pafq_final_ge_wses.smoking_status.isnull(),'smoking_status'] = 'Unknown'

In [None]:
gdf_pafq_final_ge_wses.smoking_status.value_counts(dropna=False)

In [None]:
gdf_pafq_final_ge_wses = gdf_pafq_final_ge_wses.dropna(subset = ['smoking_status'])

### Employment status variable

In [None]:
gdf_pafq_final_ge_wses['employment_coded'] = gdf_pafq_final_ge_wses['employment_status'].fillna(gdf_pafq_final_ge_wses['last_employment_status'])

In [None]:
gdf_pafq_final_ge_wses['employment_coded'] = gdf_pafq_final_ge_wses['employment_coded'].map({1: 'Non-manual manager',
                                                2: 'Non-manual worker',
                                                3: 'Manual self-employed worker',
                                                4: 'Manual worker',
                                                5: 'Housewife/Househusband'})
# gdf_pafq_final_ge_wses['employment_coded'] = gdf_pafq_final_ge_wses['employment_coded'].fillna('Unknown')

In [None]:
gdf_pafq_final_ge_wses.employment_coded.value_counts(dropna=False)

In [None]:
gdf_pafq_final_ge_wses = gdf_pafq_final_ge_wses.dropna(subset = ['employment_coded'])

### Self-rated health

In [None]:
gdf_pafq_final_ge_wses['self_rated_health'].value_counts(dropna=False)

## Create final GDF

In [None]:
gdf_pafq_final_ge_wses = gpd.GeoDataFrame(gdf_pafq_final_ge_wses, crs=2056, geometry = gdf_pafq_final_ge_wses['geometry'])

In [None]:
gdf_pafq_final_ge_wses = gdf_pafq_final_ge_wses.drop_duplicates(subset = ['codbar_new','sexe','appointment_year'])

In [None]:
gdf_pafq_final_ge_wses = gdf_pafq_final_ge_wses[gdf_pafq_final_ge_wses.age > 19]

## Spatially join PA and accessibility measures

In [None]:
gdf_pafq_final_ge_15min = gpd.sjoin_nearest(gdf_pafq_final_ge_wses, df_15min.to_crs(2056), how = 'left', distance_col = 'distance_h3')

In [None]:
gdf_pafq_final_ge_wses[gdf_pafq_final_ge_wses.codbar_new == '2011F0033']['geometry']

In [None]:
m = gdf_pafq_final_ge_15min[gdf_pafq_final_ge_15min.distance_h3 > 100][['geometry']].explore(color='red')
df_15min[['geometry']].to_crs(2056).explore(m=m)

In [None]:
gdf_pafq_final_ge_15min = pyspace.add_random_noise(gdf_pafq_final_ge_15min, 'x', 'y')

## E(S)DA 

In [None]:
# distances = [100, 200, 400, 600, 800, 1000, 1200, 1400, 1600]
distances = [1125]

In [None]:
# First calculate and add the distances as a column
def add_nearest_neighbor_distance(gdf):
    """
    Add the distance from each point to its nearest other point.

    Parameters:
    gdf: frame whose ``geometry`` exposes ``x`` and ``y`` coordinate sequences

    Returns:
    The same ``gdf`` object, modified in place, with a new
    ``nearest_neighbor_dist`` column. Distances are in the units of the
    coordinates; a point without any other point gets ``inf``.
    """
    # (n, 2) coordinate array; stays two-dimensional even when there are no points
    coords = np.column_stack((gdf.geometry.x, gdf.geometry.y))

    # With fewer than two points there is no neighbour to look up. Report inf
    # (what the KD-tree query itself returns for a missing neighbour) instead of
    # failing on an empty input.
    if len(coords) < 2:
        gdf['nearest_neighbor_dist'] = np.full(len(coords), np.inf)
        return gdf

    # Query the 2 closest points: the first hit is the point itself (distance 0),
    # the second one is the actual nearest neighbour
    tree = cKDTree(coords)
    distances, _ = tree.query(coords, k=2)

    gdf['nearest_neighbor_dist'] = distances[:, 1]

    return gdf

# Add the distances
gdf_pafq_final_ge_15min = add_nearest_neighbor_distance(gdf_pafq_final_ge_15min)

In [None]:
# One subset per distance threshold, keeping the points whose nearest neighbour lies
# closer than that threshold. Each subset is copied so that columns can be added to it
# later without writing to a filtered view of the full frame.
gdfs_distance = [
    gdf_pafq_final_ge_15min[gdf_pafq_final_ge_15min.nearest_neighbor_dist < dist].copy()
    for dist in distances
]

In [None]:
incremental_weights = [lps.weights.DistanceBand(cKDTree(get_points_array(_gdf.geometry)), dist) for _gdf, dist in zip(gdfs_distance, distances)]

### Global Moran's I

#### Sedentarity

In [None]:
# col = 'Sedentary, raw data, PAFQ (min/day)'
# global_morans_I_seden = []
# for _distance, _w, _gdf in zip(distances, incremental_weights, gdfs_distance):
#     print(_distance)
#     mi = esda.Moran(_gdf[col],  _w, permutations = 999)
#     z_score, p_value = mi.z_sim, mi.p_z_sim
#     mi_values = [_distance, z_score]
#     global_morans_I_seden.append(mi_values)

In [None]:
def plot_incremental_spatial_autocorrelation(distances_zscores, 
                                           title="Incremental Spatial Autocorrelation",
                                           save_path=None,
                                           figsize=(10, 6),
                                           color='#2E86C1'):
    """
    Create a line plot for incremental spatial autocorrelation analysis.
    
    Parameters:
    -----------
    distances_zscores : list of lists or numpy array
        List containing [distance, z_score] pairs
    title : str, optional
        Title of the plot
    save_path : str, optional
        File name (or relative path) of the figure, resolved against the
        notebook-level ``results_folder``. If None, figure is not saved
    figsize : tuple, optional
        Figure size in inches (width, height)
    color : str, optional
        Color of the line plot
        
    Returns:
    --------
    fig : matplotlib figure object
    ax : matplotlib axes object
    """
    
    # Convert to DataFrame
    df = pd.DataFrame(distances_zscores, columns=['Distance', 'Z-Score'])
    
    # The seaborn style must be active before the axes are created; setting it
    # afterwards would leave this figure unstyled
    sns.set_style("whitegrid")
    fig, ax = plt.subplots(figsize=figsize)
    
    # Main line plot
    sns.lineplot(data=df, x='Distance', y='Z-Score', 
                marker='o',
                markersize=8,
                color=color,
                ax=ax)
    
    # Highlight the points whose z-score exceeds the 1.96 threshold
    significant_points = df[df['Z-Score'] > 1.96]
    ax.scatter(significant_points['Distance'], significant_points['Z-Score'],
              color='red', s=100, label='Significant (p<0.05)')
    
    # Add the significance line
    ax.axhline(y=1.96, color='r', linestyle='--', alpha=0.5, 
               label='95% Significance')
    
    # Customize the plot
    ax.set_title(title, fontsize=12, pad=15)
    ax.set_xlabel('Distance (meters)', fontsize=10)
    ax.set_ylabel('Moran\'s I Z-Score', fontsize=10)
    
    # Add legend
    ax.legend()
    
    # Tight layout
    fig.tight_layout()
    
    # Save through the figure object so the right figure is written even if
    # another one has become the current pyplot figure in the meantime
    if save_path:
        fig.savefig(results_folder/save_path, dpi=300, bbox_inches='tight')
    
    return fig, ax

#### Physical activity levels

#### Accessibility measures

### Getis Ord Gi

In [None]:
# Take the first (currently the only) distance subset and its distance-band weights.
# NOTE(review): the `_1200` suffix does not match the threshold of 1125 currently set in
# `distances` — confirm which value is intended.
gdf_1200 = gdfs_distance[0]
w_1200 = incremental_weights[0]
w_1200.transform = 'R'  # Row-standardize the weights

#### Light PA

In [None]:
gdf_spatial = gdf_1200.copy()

In [None]:
col = 'Light PA, standardized data, PAFQ (min/day)'
gdf_spatial[col] = gdf_spatial[col].astype('float64')
getis_values = pyspace.compute_getis(gdf_spatial, col, w_1200, 999, transform_type='R', p_001=True)
fig, ax = pyspace.plotGetisMap_ge(gdf_spatial,f'{col}_G_cl', markersize_s=3, markersize_l=5,p_001 = True, commune_name = False)
plt.savefig(results_folder/'Getis_LightPA_std.png', dpi = 360)

#### Moderate PA

In [None]:
col = 'Moderate PA, standardized data, PAFQ (min/day)'
gdf_spatial[col] = gdf_spatial[col].astype('float64')
getis_values = pyspace.compute_getis(gdf_spatial, col, w_1200, 999, transform_type='R', p_001=True)
fig, ax = pyspace.plotGetisMap_ge(gdf_spatial,f'{col}_G_cl', markersize_s=3, markersize_l=5,p_001 = True, commune_name = False)
plt.savefig(results_folder/'Getis_ModeratePA_std.png', dpi = 360)

#### Vigorous PA

In [None]:
col = 'Vigorous PA, standardized data, PAFQ (min/day)'
gdf_spatial[col] = gdf_spatial[col].astype('float64')
getis_values = pyspace.compute_getis(gdf_spatial, col, w_1200, 999, transform_type='R', p_001=True)
fig, ax = pyspace.plotGetisMap_ge(gdf_spatial,f'{col}_G_cl', markersize_s=3, markersize_l=5,p_001 = True, commune_name = False)
plt.savefig(results_folder/'Getis_VigorousPA_std.png', dpi = 360)

#### Sedentarity

In [None]:
col = 'Sedentary, standardized data, PAFQ (min/day)'
gdf_spatial[col] = gdf_spatial[col].astype('float64')
getis_values = pyspace.compute_getis(gdf_spatial, col, w_1200, 999, transform_type='R', p_001=True)
fig, ax = pyspace.plotGetisMap_ge(gdf_spatial,f'{col}_G_cl', markersize_s=3, markersize_l=5,p_001 = True, commune_name = False)
plt.savefig(results_folder/'Getis_Sedentary_std.png', dpi = 360)

In [None]:
gdf_spatial.groupby(['Sedentary, standardized data, PAFQ (min/day)_G_cl'])['Work, sedentary, standardized (min/day)'].mean()

In [None]:
gdf_spatial[f'{col}_lag'] = lps.weights.lag_spatial(w_1200, gdf_spatial[col])

In [None]:
pyspace.plot_getis_by_class(gdf_spatial, f'{col}_G_cl', f'{col}_lag', label='Sedentarity', p_001 = False)

In [None]:
col = 'Total adjusted energy kcal/week (no sleep)'
gdf_spatial[col] = gdf_spatial[col].astype('float64')
getis_values = pyspace.compute_getis(gdf_spatial, col, w_1200, 999, transform_type='R', p_001=True)
fig, ax = pyspace.plotGetisMap_ge(gdf_spatial,f'{col}_G_cl', markersize_s=3, markersize_l=5,p_001 = True, commune_name = False)

#### Home sedentarity

In [None]:
col = 'Home, sedentary, standardized (min/day)'
gdf_spatial[col] = gdf_spatial[col].astype('float64')
getis_values = pyspace.compute_getis(gdf_spatial, col, w_1200, 999, transform_type='R', p_001=True)
fig, ax = pyspace.plotGetisMap_ge(gdf_spatial,f'{col}_G_cl', markersize_s=3, markersize_l=5,p_001 = True, commune_name = False)
plt.savefig(results_folder/'Getis_HomeSedentary_std.png', dpi = 360)

### Sport, moderate intensity

In [None]:
col = 'Sport, moderate intensity, standardized (min/day)'
gdf_spatial[col] = gdf_spatial[col].astype('float64')
getis_values = pyspace.compute_getis(gdf_spatial, col, w_1200, 999, transform_type='R', p_001=True)
fig, ax = pyspace.plotGetisMap_ge(gdf_spatial,f'{col}_G_cl', markersize_s=3, markersize_l=5,p_001 = True, commune_name = False)
plt.savefig(results_folder/'Getis_SportModerate_std.png', dpi = 360)

## Physical activity

In [None]:
gdf_1200['Total PA, standardized data, PAFQ (min/day)'] = gdf_1200[['Light PA, standardized data, PAFQ (min/day)','Moderate PA, standardized data, PAFQ (min/day)','Vigorous PA, standardized data, PAFQ (min/day)']].sum(axis=1)

In [None]:
gdf_1200_mob_nonull = gdf_1200[gdf_1200['Mobility (commute & personal time), standardized (min/day)'] > 0]

## Heatmap

In [None]:
gdf_1200_final = pd.concat([gdf_1200, 
                      pd.get_dummies(gdf_1200.sexe, prefix='SEX').astype(int),
                      pd.get_dummies(gdf_1200.bmi_category, prefix='BMI').astype(int),    
                      pd.get_dummies(gdf_1200.education_coded, prefix='EDUC').astype(int),    
                      pd.get_dummies(gdf_1200.employment_coded, prefix='JOB').astype(int),    
                      pd.get_dummies(gdf_1200.smoking_status, prefix='SMOKING').astype(int),    
                      pd.get_dummies(gdf_1200.season, prefix='SEASON').astype(int),               
                      pd.get_dummies(gdf_1200.age_group.str.replace(",",'').str.replace("'",'').str.replace("/",'_').str.replace(' ','_'), prefix='AGE').astype(int)], axis = 1)

In [None]:
correlation_matrix = gdf_1200_final[['Mobility (commute & personal time), standardized (min/day)',
                                     'Home, sedentary, standardized (min/day)',
                                     'time_to_20th_physical',
                                     'time_to_20th_transport',
                                     'time_to_20th_outdoor',
                                     'overall_15min_city_proximity_time',
                                     'overall_15min_city_pa_proximity_time']].corr().round(2)

In [None]:
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0)

### MVPA

In [None]:
gdf_1200_final['Total adjusted energy kcal/week (no sleep)'].plot.hist(bins=50)

In [None]:
np.log(gdf_1200_final[gdf_1200_final['Mobility (commute & personal time), standardized (min/day)'] > 0]['Mobility (commute & personal time), standardized (min/day)']).std()

In [None]:
gdf_1200_final['SB_cat'] = pd.qcut(gdf_1200_final['Sedentary, standardized data, PAFQ (min/day)'], q=3, labels = ['Low','Moderate','High'])
gdf_1200_final['LPA_cat'] = pd.qcut(gdf_1200_final['Light PA, standardized data, PAFQ (min/day)'], q=3, labels = ['Low','Moderate','High'])
gdf_1200_final['MPA_cat'] = pd.qcut(gdf_1200_final['Moderate PA, standardized data, PAFQ (min/day)'], q=3, labels = ['Low','Moderate','High'])
# gdf_1200_final['VPA_cat'] = pd.qcut(gdf_1200_final['Vigorous PA, standardized data, PAFQ (min/day)'], q=3, labels = ['Low','Moderate','High'])

In [None]:
gdf_1200_final['MVPA'] = gdf_1200_final['Moderate PA, standardized data, PAFQ (min/day)']+gdf_1200_final['Vigorous PA, standardized data, PAFQ (min/day)']

## Save final dataset

In [None]:
gdf_1200_final['E'], gdf_1200_final['N'] = gdf_1200_final['geometry'].x, gdf_1200_final['geometry'].y

In [None]:
gdf_1200_final['use_active_mobility_binary'] = 0
gdf_1200_final.loc[gdf_1200_final['Mobility (commute & personal time), standardized (min/day)'] > 0, 'use_active_mobility_binary'] = 1

In [None]:
gdf_1200_final['leisure_mvpa_binary'] = 0
gdf_1200_final.loc[gdf_1200_final['Leisure-time MVPA, standardized (min/day)'] > 0, 'leisure_mvpa_binary'] = 1

In [None]:
gdf_1200_final[['dtnumdoc','dtnaiss']] = gdf_1200_final[['dtnumdoc','dtnaiss']].astype('string')

In [None]:
gdf_1200_final['Mobility (commute & personal time), standardized (min/day)_eps'] = gdf_1200_final['Mobility (commute & personal time), standardized (min/day)']+0.01

In [None]:
# Quartile of the overall proximity time, stored as a text label "Q<code> - <interval>".
# NOTE(review): `labels=False` returns zero-based bin codes, so the labels read Q0..Q3
# rather than Q1..Q4 — confirm this is intended before relying on the names downstream.
gdf_1200_final['overall_15min_city_proximity_time_q4'] = 'Q'+pd.qcut(gdf_1200_final['overall_15min_city_proximity_time'], q=4, labels=False).astype('string') +' - '+ pd.qcut(gdf_1200_final['overall_15min_city_proximity_time'], q=4, precision=1).astype('string')

In [None]:
gdf_1200_final['overall_15min_city_proximity_time_cat'] = pd.cut(gdf_1200_final['overall_15min_city_proximity_time'], bins = [0, 5, 10, 15, 20, 25, 30, 40, 1000], labels = ["<5 min", "5-10 min", '10-15 min', "15-20min","20-25 min", "25-30 min", "30-40 min", ">40 min"])
gdf_1200_final['overall_15min_city_proximity_time_cat_4'] = pd.cut(gdf_1200_final['overall_15min_city_proximity_time'], bins = [0, 10, 15, 30, 1000], labels = ["<10 min", '10-15 min', "15-30 min", ">30 min"])

In [None]:
# Participants with a strictly positive active-mobility time. Copied so that columns can
# be added to this subset without writing to a filtered view of `gdf_1200_final`.
gdf_1200_final_mobility = gdf_1200_final[gdf_1200_final['Mobility (commute & personal time), standardized (min/day)']>0].copy()

In [None]:
gdf_1200_final['MVPA_cat'] = pd.qcut(gdf_1200_final['MVPA'], q=4, labels = ['Low','Moderate','High','Very high'])
gdf_1200_final_mobility['active_mobility_cat'] = pd.qcut(gdf_1200_final_mobility['Mobility (commute & personal time), standardized (min/day)'], q=4, labels = ['Low','Moderate','High','Very high'])

In [None]:
# Write the final table through `data_folder`, like the inputs of this notebook, so the
# data location is defined in a single place
gdf_1200_final.to_csv(data_folder / 'gdf_final_1200.csv', index=False)
# gdf_1200_final.to_file('../data/15min_city/gdf_final_1200.gpkg')

## 15-minuteness

In [None]:
gdf_1200_final['dummy'] = 1

In [None]:
def create_cumulative_accessibility_plot(df):
    """Plot the cumulative share of rows reached within a proximity time.

    Rows are ordered by ``overall_15min_city_proximity_time``; the running sum
    of the ``dummy`` column, as a percentage of its total, is drawn against
    that time together with a dashed marker at 15 minutes.

    Parameters
    ----------
    df : DataFrame
        Must provide the ``overall_15min_city_proximity_time`` and ``dummy``
        columns.

    Returns
    -------
    module
        ``matplotlib.pyplot``, whose current figure is the one just drawn.
    """
    time_col = 'overall_15min_city_proximity_time'

    # Cumulative percentage of the summed weights, ordered by proximity time
    ordered = df.sort_values(time_col)
    share_reached = ordered['dummy'].cumsum() / df['dummy'].sum() * 100

    _, ax = plt.subplots(figsize=(5, 5))
    ax.plot(ordered[time_col], share_reached,
            linewidth=2, label='Canton of Geneva')

    # Reference line at the 15-minute mark
    ax.axvline(x=15, color='gray', linestyle='--', alpha=0.5)

    # Axes cosmetics
    ax.set_xlabel('Proximity time PT [min]')
    ax.set_ylabel('Cumulative population % below time')
    ax.set_ylim(0, 100)
    ax.grid(True, alpha=0.3)
    ax.legend()

    plt.tight_layout()

    return plt

# Draw the canton-wide cumulative curve and export it to the configured
# results folder (same `results_folder` used by the other figure exports).
plot = create_cumulative_accessibility_plot(gdf_1200_final)
plt.savefig(results_folder / '15minutecity_cumul.png', dpi=360, bbox_inches='tight')

In [None]:
def create_multiple_cumulative_accessibility_plots(df):
    """Plot one cumulative proximity-time curve per ``time_to_20th`` column.

    Every column whose name contains ``'time_to_20th'`` gets its own curve
    (tab10 colours, in alphabetical column order). When the column
    ``overall_15min_city_proximity_time`` exists it is drawn on top as a thick
    black line, and its interpolated percentage at 15 minutes is marked and
    annotated.

    Parameters
    ----------
    df : DataFrame
        Must provide a ``dummy`` column (per-row weight) in addition to the
        time columns.

    Returns
    -------
    module or None
        ``matplotlib.pyplot`` after drawing, or ``None`` (with a printed
        message and no figure created) when no matching column is found.
    """
    overall_col = 'overall_15min_city_proximity_time'
    amenity_cols = [name for name in df.columns if 'time_to_20th' in name]

    if not amenity_cols:
        print("No columns with 'time_to_20th' substring found in the dataframe")
        return None

    # Denominator shared by every curve
    population = df['dummy'].sum()

    def cumulative_share(column):
        """Return (sorted times, cumulative % of weights) for one column."""
        ordered = df.sort_values(column)
        return ordered[column], ordered['dummy'].cumsum() / population * 100

    _, ax = plt.subplots(figsize=(12, 8))

    # One line per amenity column; colours cycle through the 10 tab10 entries
    palette = plt.cm.tab10
    for position, column in enumerate(sorted(amenity_cols)):
        minutes, share = cumulative_share(column)
        # Legend entry: strip the common prefix and title-case the remainder
        legend_name = column.replace('time_to_20th_', '').replace('_', ' ').title()
        ax.plot(minutes, share, linewidth=1.8, label=legend_name, color=palette(position % 10))

    # Overall curve drawn last so that it sits on top of the others
    if overall_col in df.columns:
        minutes, share = cumulative_share(overall_col)
        ax.plot(minutes, share,
                linewidth=2.5, color='black', label='All categories combined')

        # Percentage reached at 15 minutes, linearly interpolated on the curve
        share_at_15 = np.interp(15, minutes, share)
        ax.plot(15, share_at_15, 'ko', markersize=7)
        ax.text(15.9, share_at_15 + 2, f'{share_at_15:.1f}%', fontsize=10, fontweight='bold')

    # 15-minute reference line
    ax.axvline(x=15, color='gray', linestyle='--', alpha=0.7, label='15-min threshold')

    # Axes cosmetics; the x axis is clipped at 60 minutes
    ax.set_xlabel('Proximity time [min]')
    ax.set_ylabel('Cumulative population (%) below time')
    ax.set_ylim(0, 100)
    ax.set_xlim(0, 60)
    ax.grid(True, alpha=0.3)

    # Small-font legend placed outside the axes, on the right
    ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize='small')

    # Add title
    # plt.title('Cumulative Accessibility by Amenity Type')

    # Keep the right quarter of the figure free for the legend
    plt.tight_layout(rect=[0, 0, 0.75, 1])

    return plt

# Also include original function as an alternative
def create_overall_cumulative_accessibility_plot(df, proximity_col='overall_15min_city_proximity_time'):
    """Plot a single cumulative proximity-time curve with a 15-minute marker.

    Rows are ordered by ``proximity_col`` and the running sum of ``dummy`` is
    expressed as a percentage of its total. The percentage at 15 minutes is
    linearly interpolated on the curve and highlighted with a horizontal
    guide, a red dot and a text label.

    Parameters
    ----------
    df : DataFrame
        Must provide ``proximity_col`` and a ``dummy`` column.
    proximity_col : str, optional
        Column holding the proximity time; defaults to the overall indicator.

    Returns
    -------
    module
        ``matplotlib.pyplot``, whose current figure is the one just drawn.
    """
    threshold_min = 15

    # Cumulative percentage of the summed weights, ordered by proximity time
    ordered = df.sort_values(proximity_col)
    minutes = ordered[proximity_col]
    share = ordered['dummy'].cumsum() / df['dummy'].sum() * 100

    _, ax = plt.subplots(figsize=(6, 6))
    ax.plot(minutes, share,
            linewidth=2.5, color='darkblue', label='Canton of Geneva')

    # Vertical guide at the threshold
    ax.axvline(x=threshold_min, color='red', linestyle='--', alpha=0.7, label='15-min threshold')

    # Horizontal guide, dot and label at the share reached by the threshold
    share_at_threshold = np.interp(threshold_min, minutes, share)
    ax.axhline(y=share_at_threshold, color='gray', linestyle=':', alpha=0.7)
    ax.plot(threshold_min, share_at_threshold, 'ro', markersize=6)
    ax.text(15.5, share_at_threshold + 2, f'{share_at_threshold:.1f}%', fontsize=10, color='red')

    # Axes cosmetics; the x axis is clipped at 60 minutes
    ax.set_xlabel('Proximity time PT [min]')
    ax.set_ylabel('Cumulative population (%) below time')
    ax.set_ylim(0, 100)
    ax.set_xlim(0, 60)
    ax.grid(True, alpha=0.3)
    ax.legend()

    plt.tight_layout()

    return plt

# Draw the per-amenity curves and export the figure to the results folder.
# The plotting function returns None (and creates no figure) when the table
# has no 'time_to_20th' column, so only save when something was drawn.
plot = create_multiple_cumulative_accessibility_plots(gdf_1200_final)
if plot is not None:
    plt.savefig(results_folder / '15minutecity_multi_cumul.png', dpi=360, bbox_inches='tight')

In [None]:
def create_stratified_cumulative_plot(df):
    """Plot cumulative proximity-time curves stratified by ``LPA_cat``.

    One curve is drawn for each of the categories 'Low', 'Moderate' and
    'High' of the ``LPA_cat`` column. Each curve is normalised by the summed
    ``dummy`` weights of its own category, so every curve ends at 100 %.

    Parameters
    ----------
    df : DataFrame
        Must provide ``LPA_cat``, ``overall_15min_city_proximity_time`` and
        ``dummy`` columns.

    Returns
    -------
    module
        ``matplotlib.pyplot``, whose current figure is the one just drawn.
    """
    time_col = 'overall_15min_city_proximity_time'
    # Category -> line colour (blue, orange, green), in plotting order
    category_colors = {'Low': '#1f77b4', 'Moderate': '#ff7f0e', 'High': '#2ca02c'}

    _, ax = plt.subplots(figsize=(5, 5))

    for category, line_color in category_colors.items():
        # Rows of this category, ordered by proximity time
        subset = df[df['LPA_cat'] == category].sort_values(time_col)

        # Within-category cumulative percentage
        share = subset['dummy'].cumsum() / subset['dummy'].sum() * 100

        ax.plot(subset[time_col], share,
                linewidth=2, label=category, color=line_color)

    # Reference line at the 15-minute mark
    ax.axvline(x=15, color='gray', linestyle='--', alpha=0.5)

    # Axes cosmetics
    ax.set_xlabel('Proximity time PT [min]')
    ax.set_ylabel('Cumulative population % below time')
    ax.set_ylim(0, 100)
    ax.grid(True, alpha=0.3)
    ax.legend()
    plt.tight_layout()

    return plt

# Draw the cumulative proximity-time curves stratified by LPA category
# and display the resulting figure.
plot = create_stratified_cumulative_plot(gdf_1200_final)
plot.show()

In [None]:
def create_stratified_cumulative_plot(df, col, levels, label, pt_col):
    """Plot cumulative proximity-time curves for the strata of any column.

    For each value in ``levels`` the rows where ``df[col]`` equals that value
    are ordered by ``pt_col`` and the running sum of ``dummy`` is drawn as a
    percentage of the stratum's own total. Strata without rows are skipped.
    The stratum named 'Refus' is drawn semi-transparent.

    Parameters
    ----------
    df : DataFrame
        Must provide ``col``, ``pt_col`` and a ``dummy`` column.
    col : str
        Column defining the strata.
    levels : sequence
        Stratum values to plot, in drawing order. Levels are paired with a
        fixed 6-colour palette, so entries beyond the sixth are not drawn.
    label : str
        Human-readable name of the stratifying variable (legend title and
        figure title).
    pt_col : str
        Column holding the proximity time plotted on the x axis.

    Returns
    -------
    module
        ``matplotlib.pyplot``, whose current figure is the one just drawn.
    """
    # Sequential light-to-dark blue palette, one shade per level
    palette = ['#eff3ff', '#c6dbef', '#9ecae1', '#6baed6', '#3182bd', '#08519c']

    _, ax = plt.subplots(figsize=(6, 6))

    for level, shade in zip(levels, palette):
        stratum = df[df[col] == level]
        if stratum.empty:
            continue

        # Within-stratum cumulative percentage, ordered by proximity time
        ordered = stratum.sort_values(pt_col)
        share = ordered['dummy'].cumsum() / stratum['dummy'].sum() * 100

        # The 'Refus' stratum is de-emphasised with a lower opacity
        extra_style = {'alpha': 0.3} if level == 'Refus' else {}
        ax.plot(ordered[pt_col], share,
                linewidth=2.5, label=level, color=shade, **extra_style)

    # Reference line at the 15-minute mark
    ax.axvline(x=15, color='gray', linestyle='--', alpha=0.7, label='15 min threshold')

    # Axes cosmetics; the x axis is clipped at 60 minutes
    ax.set_xlabel('Proximity time PT [min]', fontsize=12)
    ax.set_ylabel('Cumulative population % below time', fontsize=12)
    ax.set_xlim(0, 60)
    ax.set_ylim(0, 100)
    ax.grid(True, alpha=0.3)

    # Legend titled with the stratifying variable
    ax.legend(title=label, title_fontsize=12, fontsize=10,
              loc='lower right', framealpha=0.9)

    ax.set_title(f'Cumulative Distribution of Proximity Time by {label}', fontsize=14)

    plt.tight_layout()

    return plt

In [None]:
# Cumulative curves by income group; the strata use the French labels stored
# in `income_grp` (low, lower-middle, upper-middle, high). Note that the x axis
# is the proximity time to education amenities, not the overall indicator.
levels = ['Faible', 'Modéré faible', 'Modéré élevé', 'Elevé']
col = 'income_grp'
label = 'Income level'
plot = create_stratified_cumulative_plot(
    gdf_1200_final,
    col=col,
    levels=levels,
    label=label,
    pt_col='time_to_20th_education',
)
plot.show()

In [None]:
# Cumulative curves of the overall proximity time by MVPA quartile class.
levels = ['Low', 'Moderate', 'High', 'Very high']
col = 'MVPA_cat'
label = 'MVPA level'
plot = create_stratified_cumulative_plot(
    gdf_1200_final,
    col=col,
    levels=levels,
    label=label,
    pt_col='overall_15min_city_proximity_time',
)
plot.show()

### Normality check

In [None]:
# Display the standardized mobility column (notebook cell output).
gdf_1200_final['Mobility (commute & personal time), standardized (min/day)']

In [None]:
def check_normality(data_series, output_name='normality_seden_std.png'):
    """Run two normality tests on a sample and save diagnostic plots.

    Prints the Shapiro-Wilk and D'Agostino-Pearson statistics and p-values,
    then draws a histogram with KDE next to a normal Q-Q plot and saves the
    figure into ``results_folder``.

    Missing values are dropped before anything is computed; otherwise NaN
    propagates into the test statistics and the Q-Q plot.

    Parameters
    ----------
    data_series : array-like
        Sample to test (typically a DataFrame column).
    output_name : str, optional
        File name of the saved figure inside ``results_folder``. The default
        keeps the historical file name; pass a distinct name per variable so
        that successive calls do not overwrite each other's figure.

    Notes
    -----
    SciPy documents that for samples larger than 5000 the Shapiro-Wilk
    statistic is accurate but its p-value may not be.
    """
    # Work on a NaN-free copy; pd.Series() also accepts plain arrays/lists
    values = pd.Series(data_series).dropna()

    # Shapiro-Wilk test
    shapiro_stat, shapiro_p = stats.shapiro(values)

    # D'Agostino-Pearson test
    dagostino_stat, dagostino_p = stats.normaltest(values)

    print("Shapiro-Wilk test:")
    print(f"Statistic: {shapiro_stat:.4f}")
    print(f"p-value: {shapiro_p:.4f}")
    print("\nD'Agostino-Pearson test:")
    print(f"Statistic: {dagostino_stat:.4f}")
    print(f"p-value: {dagostino_p:.4f}")

    # Visual inspection: histogram on the left, Q-Q plot on the right
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))

    # Histogram
    sns.histplot(values, kde=True, ax=ax1)
    ax1.set_title('Distribution with KDE')

    # Q-Q plot against the normal distribution
    stats.probplot(values, dist="norm", plot=ax2)
    ax2.set_title('Q-Q Plot')

    plt.tight_layout()
    plt.savefig(results_folder / output_name, dpi=120)

# Leisure-time MVPA: saved under its own file name so that a later call using
# the default name does not overwrite this figure.
check_normality(gdf_1200_final['Leisure-time MVPA, standardized (min/day)'],
                output_name='normality_leisure_mvpa_std.png')

In [None]:
# Same normality diagnostics for the standardized mobility minutes.
check_normality(gdf_1200_final['Mobility (commute & personal time), standardized (min/day)'])