### Compare different weighting schemes for emission average at unique time-steps to determine spatial correlation characteristics  

- city-center distance weighting (CCDW): monitoring sites at a similar distance from the city center are weighted higher
- inverse distance weighting (IDW): nearby locations are weighted higher
- no spatial weighting: unweighted average of all other stations at the same time step

In [2]:
import geopandas as gpd
print(gpd.__version__)

1.0.1


In [2]:
# load data
import geopandas as gpd
import pandas as pd

# Monitoring stations: keep only id and geometry, then normalise every id to
# lower case without spaces, truncated to its first five characters.
sites = gpd.read_file('../data/monitoring_station/monitoring_station.shp')
sites = sites[['id', 'geometry']]
sites['id'] = sites['id'].map(lambda station_id: station_id.lower().replace(' ', '')[:5])

# Stations mc014 and mc085 are excluded from the analysis.
sites = sites[~sites['id'].isin(['mc014', 'mc085'])]

# Pollution table: one row per station and time step, reduced to the columns
# needed for the spatial-similarity comparison.
main = pd.read_csv('../datasets/df_pollution_2023_berlin_imputed.csv')
main = main[['time_step', 'id', 'NO2']]
main.head(5)

Unnamed: 0,time_step,id,NO2
0,2023010100,mc010,7.0
1,2023010101,mc010,19.0
2,2023010102,mc010,9.0
3,2023010103,mc010,5.0
4,2023010104,mc010,6.0


### spatial similarity estimation

In [8]:
# city-center distance weight (CCDW)
from sklearn.feature_selection import mutual_info_regression
from shapely.geometry import Point
import numpy as np


# 1. add distance to the Fernsehturm (TV tower) as distance to the Berlin city center

def return_distance_center(point, latitude=52.520803, longitude=13.40945):
    """Return the distance between ``point`` and a reference location.

    The reference location is given by ``latitude`` / ``longitude`` in
    EPSG:4326 and is re-projected to EPSG:25833 before the distance is taken.

    Parameters
    ----------
    point : geometry
        Geometry to measure from.
        NOTE(review): assumed to already be in EPSG:25833 (metre units), so
        that the result is in metres - confirm against the shapefile CRS.
    latitude, longitude : float
        Reference location in EPSG:4326.

    Returns
    -------
    float
        Distance in the units of EPSG:25833.
    """
    reference = gpd.GeoSeries([Point(longitude, latitude)], crs='EPSG:4326')
    projected_reference = reference.to_crs('EPSG:25833').iloc[0]
    return projected_reference.distance(point)

# Distance of every station to the city center; the division by 1000 converts
# the value returned by return_distance_center (metres) to kilometres.
sites['distance_city'] = sites['geometry'].apply(return_distance_center) / 1000

# Attach the station distances to the pollution table; the geometry column is
# not needed there and is dropped again.
main = main.merge(sites, on='id', how='outer').drop(columns=['geometry'])

# 2. calculate weighted mean pollution based on similarity in distance to the city center

def cc_weighted_mean(row, data=None, time_col='time_step'):
    """Similarity-weighted mean NO2 of the other stations at the same time step.

    Every other station observed at the same time step as ``row`` is weighted
    by ``1 / (1 + |distance_city_other - distance_city_row|)``, so stations
    whose distance to the city center is similar to the current station's
    contribute most.

    Parameters
    ----------
    row : pandas.Series
        One observation; must provide ``time_col``, ``'id'`` and
        ``'distance_city'``.
    data : pandas.DataFrame, optional
        Table with the columns ``time_col``, ``'id'``, ``'NO2'`` and
        ``'distance_city'``. Defaults to the module-level ``main`` frame so the
        function can still be passed directly to ``main.apply(..., axis=1)``.
    time_col : str, default 'time_step'
        Name of the time-step column. The table is loaded with a
        ``'time_step'`` column, so that is the default.

    Returns
    -------
    float
        Weighted mean NO2, or ``np.nan`` when no other station has a record at
        the same time step.
    """
    if data is None:
        data = main  # fall back to the notebook-level pollution table

    # Other stations observed at the same time step.
    same_time = data[data[time_col] == row[time_col]]
    other_stations = same_time[same_time['id'] != row['id']]

    if other_stations.empty:
        return np.nan

    # The "+ 1" keeps the weight finite when two stations are equally far
    # from the center.
    distance_gap = np.abs(
        other_stations['distance_city'].to_numpy(dtype=float) - row['distance_city']
    )
    weights = 1 / (1 + distance_gap)

    return np.average(other_stations['NO2'].to_numpy(dtype=float), weights=weights)

# New feature: for every row, the similarity-weighted NO2 mean of the other
# stations at the same time step.
main['cc_weighted_mean_pollution'] = main.apply(cc_weighted_mean, axis='columns')

# 3. Pearson correlation between measured NO2 and the weighted mean.
#    No significance test is run here, so p_value is just a None placeholder.
correlation = main['NO2'].corr(main['cc_weighted_mean_pollution'])
p_value = None
print(f"Pearson correlation for city center weighted average: {correlation}")

# 4. Mutual information between measured NO2 (single-column feature matrix)
#    and the weighted mean.
mi_score = mutual_info_regression(main[['NO2']], main['cc_weighted_mean_pollution'])
print(f"Mutual Information for city center weighted average: {mi_score}.")



Pearson correlation for city center weighted average: 0.717581091107383
Mutual Information for city center weighted average: [0.35371728].


In [10]:
# city-center distance weight, root-scaled variant: weights = 1 / |distance difference|**(1/4) (fourth root, despite the "sqrt" naming)
from sklearn.feature_selection import mutual_info_regression
from shapely.geometry import Point
import numpy as np

# 1. calculate weighted mean pollution based on similarity in distance to the city center

def cc_weighted_mean(row, data=None, time_col='time_step', exponent=1/4):
    """Root-scaled similarity-weighted mean NO2 of the other stations.

    Every other station observed at the same time step as ``row`` is weighted
    by ``1 / |distance_city_other - distance_city_row| ** exponent``. With the
    default ``exponent`` of 1/4 this is the inverse *fourth* root of the
    difference (the result column is nevertheless named ``..._sqrt_...``).

    Parameters
    ----------
    row : pandas.Series
        One observation; must provide ``time_col``, ``'id'`` and
        ``'distance_city'``.
    data : pandas.DataFrame, optional
        Table with the columns ``time_col``, ``'id'``, ``'NO2'`` and
        ``'distance_city'``. Defaults to the module-level ``main`` frame so the
        function can still be passed directly to ``main.apply(..., axis=1)``.
    time_col : str, default 'time_step'
        Name of the time-step column.
    exponent : float, default 1/4
        Power applied to the absolute distance difference before inversion.

    Returns
    -------
    float
        Weighted mean NO2, or ``np.nan`` when no other station has a record at
        the same time step.
    """
    if data is None:
        data = main  # fall back to the notebook-level pollution table

    # Other stations observed at the same time step.
    same_time = data[data[time_col] == row[time_col]]
    other_stations = same_time[same_time['id'] != row['id']]

    if other_stations.empty:
        return np.nan

    pollution = other_stations['NO2'].to_numpy(dtype=float)
    distance_gap = np.abs(
        other_stations['distance_city'].to_numpy(dtype=float) - row['distance_city']
    )

    # A zero difference would produce an infinite weight and a NaN average.
    # In the limit those stations dominate completely, so return their mean.
    exact_match = distance_gap == 0
    if exact_match.any():
        return pollution[exact_match].mean()

    weights = 1 / distance_gap ** exponent
    return np.average(pollution, weights=weights)

# New feature: root-scaled similarity-weighted NO2 mean of the other stations
# at the same time step, computed row by row.
main['cc_sqrt_weighted_mean_pollution'] = main.apply(cc_weighted_mean, axis='columns')

# 3. Pearson correlation between measured NO2 and the weighted mean.
#    No significance test is run here, so p_value is just a None placeholder.
correlation = main['NO2'].corr(main['cc_sqrt_weighted_mean_pollution'])
p_value = None
print(f"Pearson correlation for sqrt city center weighted average: {correlation}")

# 4. Mutual information between measured NO2 (single-column feature matrix)
#    and the weighted mean.
mi_score = mutual_info_regression(main[['NO2']], main['cc_sqrt_weighted_mean_pollution'])
print(f"Mutual Information for sqrt city center weighted average: {mi_score}.")


Pearson correlation for sqrt city center weighted average: 0.6654973088619369
Mutual Information for sqrt city center weighted average: [0.27772331].


In [18]:
# inverse distance weighting (IDW)
import numpy as np
# 1. calculate distance between all station
def calculate_distance_matrix(sites):
    """Calculate the pairwise distance matrix for sites.

    Parameters
    ----------
    sites : DataFrame-like
        Must provide a ``'geometry'`` column whose elements expose
        ``.distance(other)`` and an ``'id'`` column with the station labels.
        The frame's index may be arbitrary (e.g. non-contiguous after
        filtering); geometries are accessed by position, not by label.

    Returns
    -------
    pandas.DataFrame
        Square matrix indexed and labelled by station id. Each entry is the
        geometry distance divided by 1000.
        NOTE(review): this is kilometres only if the geometries use a
        metre-based CRS - confirm.
    """
    # Materialise the geometries once so lookups are positional and do not
    # depend on the index labels of ``sites``.
    points = list(sites['geometry'])
    station_ids = pd.Index(sites['id'])

    num_sites = len(points)
    dist_matrix = np.zeros((num_sites, num_sites))

    for i, origin in enumerate(points):
        for j, target in enumerate(points):
            dist_matrix[i, j] = origin.distance(target) / 1000  # convert to km

    return pd.DataFrame(dist_matrix, index=station_ids, columns=station_ids)

# reset_index() gives the filtered `sites` frame a contiguous 0..n-1 index
# (rows were dropped when mc014/mc085 were removed) before the matrix is built.
dist_matrix = calculate_distance_matrix(sites= sites.reset_index())

# 2. calculate inverse distance weighted average of pollution

def inverse_distance_weighting(row, main, dist_matrix, time_col='time_step'):
    """Inverse-distance-weighted mean NO2 of the other stations.

    Every other station observed at the same time step as ``row`` is weighted
    by the inverse of its distance to the current station.

    Parameters
    ----------
    row : pandas.Series
        One observation; must provide ``time_col`` and ``'id'``.
    main : pandas.DataFrame
        Table with the columns ``time_col``, ``'id'`` and ``'NO2'``.
    dist_matrix : pandas.DataFrame
        Square station-to-station distance matrix whose index and columns are
        station ids.
    time_col : str, default 'time_step'
        Name of the time-step column. The table is loaded with a
        ``'time_step'`` column, so that is the default.

    Returns
    -------
    float
        Weighted mean NO2, or ``np.nan`` when no other station has a record at
        the same time step.

    Raises
    ------
    KeyError
        If a station id is missing from ``dist_matrix``.
    """
    # Other stations observed at the same time step.
    same_time = main[main[time_col] == row[time_col]]
    other_stations = same_time[same_time['id'] != row['id']]

    if other_stations.empty:
        return np.nan

    # Look the distances up in the same order as the pollution values so that
    # weights and values line up position by position.
    neighbour_ids = other_stations['id'].tolist()
    distances = dist_matrix.loc[row['id'], neighbour_ids].to_numpy(dtype=float)
    pollution = other_stations['NO2'].to_numpy(dtype=float)

    # A zero distance would produce an infinite weight and a NaN average.
    # In the limit co-located stations dominate completely, so return their mean.
    colocated = distances == 0
    if colocated.any():
        return pollution[colocated].mean()

    # np.average normalises by the sum of the weights, so no explicit
    # normalisation of the inverse distances is required.
    return np.average(pollution, weights=1 / distances)

# New feature: inverse-distance-weighted NO2 mean of the other stations at the
# same time step; the pollution table and distance matrix are forwarded to the
# row function as extra positional arguments.
main['inverse_distance_pollution'] = main.apply(
    inverse_distance_weighting,
    axis='columns',
    args=(main, dist_matrix),
)

# 3. Pearson correlation between measured NO2 and the IDW mean.
correlation = main['NO2'].corr(main['inverse_distance_pollution'])
print(f"Pearson correlation for inverse distance average: {correlation}")

# 4. Mutual information between measured NO2 (single-column feature matrix)
#    and the IDW mean.
mi_score = mutual_info_regression(main[['NO2']], main['inverse_distance_pollution'])
print(f"Mutual Information for inverse distance average: {mi_score}.")



Pearson correlation for inverse distance average: 0.648084059409781
Mutual Information for inverse distance average: [0.25721471].


In [19]:
# non- weighted average

def non_weighted_average(row, main, time_col='time_step'):
    """Plain (unweighted) mean NO2 of the other stations at the same time step.

    Parameters
    ----------
    row : pandas.Series
        One observation; must provide ``time_col`` and ``'id'``.
    main : pandas.DataFrame
        Table with the columns ``time_col``, ``'id'`` and ``'NO2'``.
    time_col : str, default 'time_step'
        Name of the time-step column. The table is loaded with a
        ``'time_step'`` column, so that is the default.

    Returns
    -------
    float
        Mean NO2 of all other stations at the same time step, or ``np.nan``
        when no other station has a record at that time step.
    """
    # Other stations observed at the same time step.
    same_time = main[main[time_col] == row[time_col]]
    other_stations = same_time[same_time['id'] != row['id']]

    if other_stations.empty:
        return np.nan

    # Ordering is irrelevant for an unweighted mean, so no sorting is needed.
    return np.average(other_stations['NO2'].to_numpy(dtype=float))

# New feature: unweighted NO2 mean of the other stations at the same time
# step; the pollution table is forwarded to the row function as an extra
# positional argument.
main['non_weighted_mean_pollution'] = main.apply(
    non_weighted_average,
    axis='columns',
    args=(main,),
)

# Pearson correlation between measured NO2 and the unweighted mean.
correlation = main['NO2'].corr(main['non_weighted_mean_pollution'])
print(f"Pearson correlation for non-weighted average in pollution: {correlation}")

# 4. Mutual information between measured NO2 (single-column feature matrix)
#    and the unweighted mean.
mi_score = mutual_info_regression(main[['NO2']], main['non_weighted_mean_pollution'])
print(f"Mutual Information for non- weighted average: {mi_score}.")



Pearson correlation for non-weighted average in pollution: 0.623769773853018
Mutual Information for non- weighted average: [0.29397268].


### Results p. 15 of the Thesis:

"For empirical evaluation, a Pearson correlation test and a mutual information (MI) estimation are conducted to compare the true emission values with the weighted averages calculated by each method. The inverse distance method achieves a correlation of 0.648, surpassing the unweighted average. The proposed difference in distance approach achieves the best correlation score of 0.718. The entropy-based MI exceeds covariance (Kraskov et al., 2004, p. 1) and demonstrates the increased informative value of the difference in distance calculation,
with a 40% increase compared to the inverse distance. Nevertheless, the simplified distance-based similarity assumption and small sample of stations limit this evaluation and the different spatial proximity estimations are further compared through model performance."