# 3. Bias Correction Model

In [4]:
!pip install h3



In [2]:
!pip install h3 matplotlib seaborn pandas

Collecting seaborn
  Using cached seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Using cached seaborn-0.13.2-py3-none-any.whl (294 kB)
Installing collected packages: seaborn
Successfully installed seaborn-0.13.2


In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import h3

# Sample DataFrame structure: replace with your actual data loading
data = pd.DataFrame({
    'hex_id': [
        '8928308280fffff',
        '89283082807ffff',
        '89283082877ffff',
        '89283082803ffff',
    ],
    'demographic_similarity': [0.8, 0.6, 0.9, 0.4]
})

# Aggregate demographic similarity by hex_id (if needed)
agg = data.groupby('hex_id')['demographic_similarity'].mean().reset_index()

# h3 v4 renamed `h3_to_geo` to `cell_to_latlng`; use whichever this install
# provides so the cell runs on both major versions. Both return (lat, lon).
hex_center = getattr(h3, 'cell_to_latlng', None) or h3.h3_to_geo

# Look up each hex center once, then split the (lat, lon) pairs into columns
centers = [hex_center(hex_id) for hex_id in agg['hex_id']]
agg['lat'] = [lat for lat, _ in centers]
agg['lon'] = [lon for _, lon in centers]

# Plot demographic similarity as a colour-coded scatter of hex centers
fig, ax = plt.subplots(figsize=(10, 8))
scatter = ax.scatter(
    agg['lon'], agg['lat'],
    c=agg['demographic_similarity'],
    cmap='viridis',
    s=100, edgecolor='k'
)
fig.colorbar(scatter, ax=ax, label='Demographic Similarity')
ax.set_title('Demographic Similarity by H3 Hex')
ax.set_xlabel('Longitude')
ax.set_ylabel('Latitude')
ax.grid(True)
plt.show()

ModuleNotFoundError: No module named 'h3'

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import h3

# Example: aggregate demographic similarity by hex_id
# Assumes a DataFrame `data` with columns 'hex_id' and 'demographic_similarity'
# has already been defined by an earlier cell.

agg = data.groupby('hex_id')['demographic_similarity'].mean().reset_index()

# h3 v4 renamed `h3_to_geo` to `cell_to_latlng`; use whichever this install
# provides so the cell runs on both major versions. Both return (lat, lon).
hex_center = getattr(h3, 'cell_to_latlng', None) or h3.h3_to_geo

# Look up each hex center once, then split the (lat, lon) pairs into columns
centers = [hex_center(hex_id) for hex_id in agg['hex_id']]
agg['lat'] = [lat for lat, _ in centers]
agg['lon'] = [lon for _, lon in centers]

# Share one normalisation between the scatter colours and the colorbar so the
# bar is guaranteed to describe the plotted points.
similarity_norm = plt.Normalize(
    agg['demographic_similarity'].min(),
    agg['demographic_similarity'].max(),
)

fig, ax = plt.subplots(figsize=(10, 8))
sns.scatterplot(
    x='lon', y='lat', hue='demographic_similarity', data=agg,
    palette='viridis', hue_norm=similarity_norm,
    legend=False,  # the colorbar below replaces the discrete hue legend
    ax=ax,
)
ax.set_title('Demographic Similarity by Hex')
ax.set_xlabel('Longitude')
ax.set_ylabel('Latitude')

# seaborn does not register a mappable on the axes, so a bare
# plt.colorbar(label=...) raises "No mappable was found"; build one explicitly.
similarity_mappable = plt.cm.ScalarMappable(norm=similarity_norm, cmap='viridis')
fig.colorbar(similarity_mappable, ax=ax, label='Demographic Similarity')
plt.show()

ModuleNotFoundError: No module named 'h3'

In [None]:
import pandas as pd
import geopandas as gpd
import h3
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import time
import logging
import os

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Overlay the distributions of the EcoCounter and Strava weekly counts
# (histogram plus KDE for each) on the current axes.
ax = plt.gca()
count_sources = (
    ('EcoCntr_weekly_SUM', 'blue', 'EcoCounter'),
    ('SUM_total_trip_count', 'orange', 'Strava'),
)
for column, colour, source_label in count_sources:
    sns.histplot(truth_df[column], kde=True, color=colour, label=source_label, ax=ax)

ax.legend()
ax.set_title('Distribution of EcoCounter and Strava Counts')
ax.set_xlabel('Weekly Count')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# List every column name of truth_df, one per line, wrapped in single quotes
# so that leading/trailing whitespace in a name is visible.
for column_name in truth_df.columns:
    print("'{}'".format(column_name))

### 3b. Covariate Selection & Model Setup

In [None]:

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Define covariates and response
# NOTE(review): `df` is not created anywhere in the visible notebook, and the
# earlier cells use `truth_df` with different column names. Confirm which
# frame and columns are intended before running.
features = ['strava_count', 'trail_access', 'pop_density', 'median_income', 'age_18_34', 'bike_infra']
X = df[features]
y = df['eco_count']

# Split into train/test (fixed seed so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit model
model = LinearRegression()
model.fit(X_train, y_train)

# Predict on the held-out rows
y_pred = model.predict(X_test)

# Evaluation metrics
print("MAE:", mean_absolute_error(y_test, y_pred))
# `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6;
# taking the square root of the MSE yields the same RMSE on every version.
print("RMSE:", mean_squared_error(y_test, y_pred) ** 0.5)
print("R^2:", r2_score(y_test, y_pred))


### 3c. Correction Model Application

In [None]:

# Score every row of the dataset with the fitted model and keep the result
# alongside the observations.
df['eco_pred'] = model.predict(X)

# Endpoints of the identity line: the observed range of EcoCounter counts
observed_span = [df['eco_count'].min(), df['eco_count'].max()]

# Predicted vs observed, with a dashed red y = x reference line
ax = sns.scatterplot(data=df, x='eco_count', y='eco_pred')
ax.plot(observed_span, observed_span, '--', color='red')
ax.set_xlabel("Observed EcoCounter")
ax.set_ylabel("Predicted EcoCounter")
ax.set_title("Bias Corrected Predictions vs Observations")
plt.show()


### 3d. Spatial Validation Across Demographic Clusters

In [None]:

# Load hex cluster and spatial data
hex_df = pd.read_csv('processed_data/hex_with_clusters.csv')

# Merge with predictions (inner join on hex_id: rows without a match are dropped)
# NOTE(review): presumably hex_df has one row per hex_id; duplicates would
# repeat observations in `merged`. Verify against the file.
merged = pd.merge(df, hex_df, on='hex_id')

# Calculate R² per cluster.
# Only the two columns the metric needs are handed to `apply`; applying to the
# whole frame also passes the grouping column, which pandas >= 2.2 deprecates.
cluster_scores = (
    merged.groupby('cluster_label')[['eco_count', 'eco_pred']]
    .apply(lambda g: r2_score(g['eco_count'], g['eco_pred']))
)

print(cluster_scores)

# Plot cluster scores
cluster_scores.plot(kind='bar', title='R² by Demographic Cluster')
plt.ylabel("R² Score")
plt.show()


### 3e. Time Series & Seasonal Comparison

In [None]:

# Weekly means of the observed counts, the corrected predictions and the raw
# Strava counts, plotted together on one time axis.
trend_columns = ['eco_count', 'eco_pred', 'strava_count']
weekly = df.groupby('week')[trend_columns].mean().reset_index()

trend_labels = {
    'eco_count': 'Observed EcoCounter',
    'eco_pred': 'Corrected Prediction',
    'strava_count': 'Raw Strava',
}

plt.figure(figsize=(12, 4))
for column in trend_columns:
    plt.plot(weekly['week'], weekly[column], label=trend_labels[column])
plt.legend()
plt.title("Weekly Trends: Observed vs Corrected vs Raw Strava")
plt.xlabel("Week")
plt.ylabel("Mean Weekly Count")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
