In [None]:
import os

# Working directory for the notebook: the relative "data/..." paths used below
# are resolved against it (and presumably the local `src` package import too).
# Set the GRADIENT_PROJECT_ROOT environment variable to run on another machine;
# the default keeps the original location.
PROJECT_ROOT = os.environ.get(
    "GRADIENT_PROJECT_ROOT", '/Users/jjaniak/Documents/studia/projekt/gradient'
)
os.chdir(PROJECT_ROOT)

import geopandas as gpd
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
import shap
import numpy as np

from shapely.wkt import loads

from esda.moran import Moran_Local
import libpysal

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

from src.embedders.osm_data_embedder import OSMDataEmbedder
from srai.regionalizers import geocode_to_region_gdf
from srai.embedders import CountEmbedder
from srai.regionalizers import H3Regionalizer
from srai.loaders.osm_loaders.filters import OsmTagsFilter

from IPython.display import display

# Do not truncate columns when a wide DataFrame is displayed.
pd.set_option("display.max_columns", None)

import warnings
# Ignore every FutureWarning for the rest of the session.
# NOTE(review): this also hides deprecation notices raised by the libraries
# used below — confirm that silencing them globally is intended.
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# City label; used as the default `city_name` argument and in the file names built below.
city_name = "Pozna\u0144"
# Query string passed to geocode_to_region_gdf to obtain the study area.
nominatim_city_name = "Pozna\u0144, Poland"
# Year used in the name of the accidents CSV that is loaded.
year = 2022

## Functions

In [None]:
# OSM tag filter handed to OSMDataEmbedder, keyed by "highway", "railway" and "route".
# NOTE(review): `True` presumably means "any value of this tag" — confirm against srai's OsmTagsFilter.
query: OsmTagsFilter = {"highway": True, "railway": True, "route": True}


def create_hex_gds(h3_resolution, city_name=city_name):
    """Return the hexagon GeoDataFrame with OSM count features for one H3 resolution.

    The result is cached as a shapefile under ``data/baseline-datasets/in/``.
    If the cache file exists it is read back and indexed by ``region_id``;
    the embedder (and the geocoding request it needs) is only created when
    the cache is missing, so a cache hit does no embedding work.

    Args:
        h3_resolution: H3 resolution given to ``H3Regionalizer``; also part of
            the cache file name.
        city_name: Label used in the cache file name only. The geocoded area
            always comes from the module-level ``nominatim_city_name``.

    Returns:
        The GeoDataFrame produced by ``OSMDataEmbedder.make_embeddings()`` or
        the cached copy of it.

    NOTE(review): the ESRI Shapefile format limits field names to 10
    characters, so column names read from the cache may differ from freshly
    computed ones — confirm this does not affect downstream feature names.
    """
    filename = f"data/baseline-datasets/in/{city_name}-hex-res-{h3_resolution}-and-features-gdf.shp"

    if os.path.exists(filename):
        # Cache hit: restore the index that was written out as a regular column.
        return gpd.read_file(filename).set_index("region_id")

    # Cache miss: build the embedder only now, because geocoding the area is
    # not needed when the cached file can be used.
    data_embedder = OSMDataEmbedder(
        area=geocode_to_region_gdf(nominatim_city_name),
        embedder=CountEmbedder(),
        regionalizer=H3Regionalizer(resolution=h3_resolution),
        query=query,
    )
    hex_and_features_gdf = data_embedder.make_embeddings()  # type: ignore

    # Make sure the cache directory exists before writing into it.
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    hex_and_features_gdf.to_file(
        filename,
        index=True,
    )

    return hex_and_features_gdf


def get_accidents_gdf(h3_resolution, city_name=city_name, year=year):
    """Load the per-hexagon accidents CSV as a GeoDataFrame in EPSG:4326.

    Args:
        h3_resolution: H3 resolution that is part of the CSV file name.
        city_name: City label that is part of the CSV file name.
        year: Year that is part of the CSV file name.

    Returns:
        A GeoDataFrame built from the CSV, with the ``geometry`` column parsed
        from WKT text.

    Raises:
        FileNotFoundError: If the expected CSV file does not exist.
    """
    filename = f"data/accidents_in_hex/{city_name}_accidents_{year}_res{h3_resolution}.csv"

    # Fail early with an explicit message when the input file is missing.
    if not os.path.exists(filename):
        raise FileNotFoundError(f"The file {filename} does not exist.")

    records = pd.read_csv(filename)
    # Geometries are stored as WKT strings; turn each into a shapely object.
    records['geometry'] = records['geometry'].apply(loads)
    return gpd.GeoDataFrame(records, geometry='geometry', crs="EPSG:4326")
    
def merge_gdf(accidents_gdf, hex_and_features_gdf):
    """Spatially join accident rows with the hexagon feature table.

    An inner join keeps every pair of rows whose geometries intersect. The
    join's ``index_right`` helper column is removed, ``count`` is renamed to
    ``num_accidents`` and cast to ``int``.

    Args:
        accidents_gdf: Left GeoDataFrame; must contain a ``count`` column.
        hex_and_features_gdf: Right GeoDataFrame with the feature columns.

    Returns:
        The joined GeoDataFrame with an integer ``num_accidents`` column.

    NOTE(review): if both inputs hold the same hexagon polygons, 'intersects'
    will also match neighbouring cells that share an edge — confirm that this
    is the intended pairing.
    """
    # `predicate` is the current name of the spatial-relation argument; the
    # older `op` spelling is deprecated and rejected by newer geopandas.
    merged_gdf = gpd.sjoin(
        left_df=accidents_gdf,
        right_df=hex_and_features_gdf,
        how='inner',
        predicate='intersects',
    )
    merged_gdf = merged_gdf.drop(columns='index_right')
    merged_gdf = merged_gdf.rename(columns={'count': 'num_accidents'})
    merged_gdf['num_accidents'] = merged_gdf['num_accidents'].astype(int)

    return merged_gdf

In [None]:
def local_moran(df, column):
    """Run esda's Local Moran analysis on one column of a GeoDataFrame.

    Spatial weights are Queen-contiguity weights built from ``df``.

    Args:
        df: GeoDataFrame passed to ``libpysal.weights.Queen.from_dataframe``.
        column: Name of the column whose values are analysed.

    Returns:
        A tuple of the ``Is``, ``p_sim`` and ``q`` attributes of the fitted
        ``Moran_Local`` object.
    """
    weights = libpysal.weights.Queen.from_dataframe(df)
    observations = df[column].values
    result = Moran_Local(observations, weights)
    return result.Is, result.p_sim, result.q

In [None]:
def correlation_analysis(merged_gdf, iqr_factor=0.5):
    """Select the features whose correlation with ``binary_accidents`` is an outlier.

    Every column except ``geometry``, ``region_id``, ``num_accidents`` and
    ``binary_accidents`` is treated as a feature. A feature is kept when its
    correlation with ``binary_accidents`` lies below ``Q1 - iqr_factor * IQR``
    or above ``Q3 + iqr_factor * IQR`` of all feature correlations. A heatmap
    of the kept features plus the target is shown as a side effect.

    Args:
        merged_gdf: Frame containing the four columns named above plus the
            feature columns.
        iqr_factor: Multiplier of the interquartile range used for the
            outlier fences (default 0.5).

    Returns:
        A tuple ``(best_features, correlation_metrics)``: the list of selected
        feature names, and ``describe()`` of the feature/target correlations.
    """
    target = 'binary_accidents'
    all_features = merged_gdf.drop(columns=['geometry', 'region_id', 'num_accidents', target]).columns.to_list()

    correlation_matrix = merged_gdf[all_features + [target]].corr()
    # Correlation of every feature with the target. The target's own entry is
    # dropped once here, so the selection below can never contain it and no
    # later list.remove() (which raises ValueError when the item is absent)
    # is needed.
    feature_correlations = correlation_matrix[target].drop(target)

    correlation_metrics = feature_correlations.describe()

    # Interquartile range (IQR) of the feature/target correlations
    Q1 = feature_correlations.quantile(0.25)
    Q3 = feature_correlations.quantile(0.75)
    IQR = Q3 - Q1

    # Keep the features outside the IQR fences
    outside_fences = (
        (feature_correlations < (Q1 - iqr_factor * IQR))
        | (feature_correlations > (Q3 + iqr_factor * IQR))
    )
    best_features = feature_correlations[outside_fences].index.to_list()

    correlation_matrix_best_features = merged_gdf[best_features + [target]].corr()

    plt.figure(figsize=(8, 6))
    sns.heatmap(correlation_matrix_best_features, annot=True, cmap='coolwarm', fmt=".2f")
    plt.title('Correlation Heatmap (best features)')
    plt.show()

    return best_features, correlation_metrics

In [None]:
def shap_analysis(h3_resolution, merged_gdf, selected_features, city_name=city_name, save_force_plot=False):
    """Fit a logistic regression on the selected features and plot its SHAP values.

    The data is split 70/30 with a fixed random state; the model is trained on
    the training part and explained on the test part with a
    ``shap.KernelExplainer``. A summary plot is always shown.

    Args:
        h3_resolution: Used only in the name of the saved force-plot file.
        merged_gdf: Frame containing ``binary_accidents`` and the feature columns.
        selected_features: Names of the feature columns used as model input.
        city_name: Used only in the name of the saved force-plot file.
        save_force_plot: When true, a force plot is also written to an HTML
            file in the current working directory.
    """
    target = merged_gdf['binary_accidents']
    features = merged_gdf[selected_features]

    features_train, features_test, target_train, _ = train_test_split(
        features, target, test_size=0.3, random_state=42
    )

    classifier = LogisticRegression(max_iter=1000)
    classifier.fit(features_train, target_train)

    # Build an approximate background sample with k-means (10 centres).
    background = shap.kmeans(features_train, 10)
    explainer = shap.KernelExplainer(classifier.predict_proba, background)

    # SHAP values for the test rows show how each feature contributes to the
    # predicted value.
    shap_values = explainer.shap_values(features_test)
    shap.summary_plot(shap_values, features_test)

    if not save_force_plot:
        return

    shap.initjs()
    force_plot = shap.force_plot(explainer.expected_value[0], shap_values[0], features_test)
    shap.save_html(f'{city_name}_res_{h3_resolution}_force_plot_best_features.html', force_plot)

## Analysis

In [None]:
# Set resolutions
resolutions = [8, 9, 10]

# Column layout of the two per-resolution result tables.
distribution_columns = ['Resolution', 'Num_Hexes', 'Accidents_Mean', 'Accidents_Std','Binary_Accidents_Mean', 'Binary_Accidents_Std']
moran_i_columns = ['Resolution', 'Mean', 'Std', 'Min', 'Max']

# One record per resolution is collected in plain lists; the DataFrames are
# built once after the loop instead of being grown with concat on every pass.
distribution_records = []
moran_i_records = []
correlation_metrics_per_resolution = []

best_features_based_on_correlation = {}
local_moran_results = {}

# Loop through resolutions
for resolution in resolutions:

    print(f"Analysis for Resolution = {resolution}")

    # Create hex dataframes
    hex_and_features_gdf = create_hex_gds(h3_resolution=resolution, city_name=city_name)
    accidents_gdf = get_accidents_gdf(h3_resolution=resolution, city_name=city_name, year=year)
    merged_gdf = merge_gdf(accidents_gdf, hex_and_features_gdf)

    # Binary target: 1 where at least one accident was counted, else 0.
    # Built in one step so the column has a single integer dtype.
    merged_gdf['binary_accidents'] = (merged_gdf['num_accidents'] > 0).astype(int)

    distribution_records.append({
        'Resolution': resolution,
        'Num_Hexes': len(merged_gdf),
        'Accidents_Mean': merged_gdf['num_accidents'].mean(),
        'Accidents_Std': merged_gdf['num_accidents'].std(),
        'Binary_Accidents_Mean': merged_gdf['binary_accidents'].mean(),
        'Binary_Accidents_Std': merged_gdf['binary_accidents'].std(),
    })

    # Summary statistics of numerical columns
    summary_statistics = merged_gdf.describe()
    mean_row = summary_statistics.loc['mean']
    sorted_columns = mean_row.sort_values().index
    summary_statistics_sorted = summary_statistics[sorted_columns]
    print("Summary Statistics of Numerical Columns (Sorted by Mean):")
    display(summary_statistics_sorted)

    # Correlation
    best_features, correlation_metrics = correlation_analysis(merged_gdf)
    correlation_metrics_per_resolution.append(correlation_metrics)
    best_features_based_on_correlation[resolution] = best_features

    shap_analysis(h3_resolution=resolution, merged_gdf=merged_gdf, selected_features=best_features)

    # Calculate Local Moran's I
    moran_i, p_sim, q = local_moran(merged_gdf, column='num_accidents')
    local_moran_results[resolution] = {'moran_i': moran_i, 'p_sim': p_sim, 'q': q}
    # The Moran summary goes into its own table and 'Max' uses np.max; it must
    # not be appended to the accident-distribution records.
    moran_i_records.append({
        'Resolution': resolution,
        'Mean': np.mean(moran_i),
        'Std': np.std(moran_i),
        'Min': np.min(moran_i),
        'Max': np.max(moran_i),
    })

# Build the result tables once all resolutions have been processed.
distribution_results = pd.DataFrame(distribution_records, columns=distribution_columns)
moran_i_results = pd.DataFrame(moran_i_records, columns=moran_i_columns)
# One column of describe() statistics per resolution; an empty frame is kept
# for an empty `resolutions` list because pd.concat rejects an empty input.
df_correlation_metrics = (
    pd.concat(correlation_metrics_per_resolution, axis=1)
    if correlation_metrics_per_resolution
    else pd.DataFrame()
)

In [None]:
# Name each column of the correlation summary after its resolution, then show the table.
resolution_labels = [f'resolution={resolution}' for resolution in resolutions]
df_correlation_metrics.columns = resolution_labels

print("Correlation Metrics of Binary Accidents with All Features:")
display(df_correlation_metrics)

### Accident distribution

In [None]:
# Show the `distribution_results` table filled in by the analysis loop.
print("Distribution of Accidents in Hexes:")
display(distribution_results)

### Significant features

In [None]:
# List, resolution by resolution, the features stored in `best_features_based_on_correlation`.
# Note: the loop variables rebind the module-level names `resolution` and `best_features`.
print("Best Features Based on Correlation:\n")
for resolution, best_features in best_features_based_on_correlation.items():
    print(f"\nResolution {resolution}:")
    for feature in best_features:
        print(f"{feature}")

In [None]:
# Features selected at every analysed resolution. The sets come from whatever
# resolutions are present in `best_features_based_on_correlation`, so this
# cell keeps working when the list of resolutions changes.
feature_sets = [set(features) for features in best_features_based_on_correlation.values()]

# Find common features (an empty result when no resolution was analysed)
common_features = set.intersection(*feature_sets) if feature_sets else set()

print("Common Features Across All Resolutions:")
# Sorted so the printed order is the same on every run; set iteration order is not.
for feature in sorted(common_features):
    print(feature)

### Moran's I

In [None]:
# Print the raw values stored in `local_moran_results` for every resolution.
# NOTE(review): these are the unsummarised outputs of local_moran(), presumably
# one entry per analysed row, so the printout can be very long — confirm that
# the full dump is wanted here.
print("Local Moran's I Results:")
for resolution, results in local_moran_results.items():
    print(f"Resolution {resolution}: Moran's I = {results['moran_i']}, p_sim = {results['p_sim']}, q = {results['q']}")

In [None]:
# For every resolution, flatten mean / std / min / max of each stored Local
# Moran output (in storage order) into one list of summary values.
res = {
    resolution: [
        statistic(value)
        for value in results.values()
        for statistic in (np.mean, np.std, np.min, np.max)
    ]
    for resolution, results in local_moran_results.items()
}

# Row labels follow the same output-then-statistic order as the values above;
# the output names are taken from the first stored resolution.
local_moran_metrics = list(local_moran_results.values())[0].keys()
id_names = [
    f'{key}_{stat}'
    for key in local_moran_metrics
    for stat in ['mean', 'std', 'min', 'max']
]

df = pd.DataFrame(res, index=id_names)
df.columns = [f'resolution={resolution}' for resolution in df.columns]
display(df)