# 3. Bias Correction Model

In [None]:
import os
import pandas as pd
import geopandas as gpd
import h3
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.cluster import KMeans
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Location of the matched weekly Strava / EcoCounter CSV, relative to the
# directory this notebook is run from.
data_path = '../processed_data/matched_weekly_strava_ecocounter_extended.csv'

# Short names for the two count columns used in the rest of this section.
short_names = {
    'SUM_total_trip_count': 'strava_count',
    'EcoCntr_weekly_SUM': 'eco_count',
}

# Read the table (parsing `week_start` as dates) and apply the short names.
df = pd.read_csv(data_path, parse_dates=['week_start']).rename(columns=short_names)

# Quick look at the first rows.
print(df.head())

# Overlay the two weekly-count distributions on a single set of axes.
hist_fig, hist_ax = plt.subplots(figsize=(10, 6))
for column, colour, series_label in (
    ('eco_count', 'blue', 'EcoCounter'),
    ('strava_count', 'orange', 'Strava'),
):
    sns.histplot(df[column], kde=True, color=colour, label=series_label, ax=hist_ax)
hist_ax.set_title('Distribution of Weekly Counts')
hist_ax.set_xlabel('Weekly Count')
hist_ax.set_ylabel('Frequency')
hist_ax.legend()
hist_ax.grid(True)
plt.show()

# Scatter of one source against the other, one point per row of df.
scatter_fig, scatter_ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(data=df, x='strava_count', y='eco_count', ax=scatter_ax)
scatter_ax.set_title('Strava vs. EcoCounter Weekly Counts')
scatter_ax.set_xlabel('Strava Weekly Count')
scatter_ax.set_ylabel('EcoCounter Weekly Count')
scatter_ax.grid(True)
plt.show()

# Off-diagonal entry of the 2x2 correlation matrix, looked up by label.
pair_corr = df[['strava_count', 'eco_count']].corr()
correlation = pair_corr.loc['strava_count', 'eco_count']
print(f"Correlation between Strava and EcoCounter counts: {correlation:.2f}")

# Single-predictor baseline: fit eco_count as a linear function of
# strava_count, plot the fit, and report the RMSE.
from sklearn.metrics import mean_squared_error

# Drop rows with missing values. `.copy()` makes df_model an independent
# frame, so adding the `eco_pred` column below cannot trigger pandas'
# SettingWithCopyWarning.
df_model = df[['strava_count', 'eco_count']].dropna().copy()

if df_model.empty:
    # LinearRegression.fit() raises on zero samples; report instead of crashing.
    print("No valid rows for modeling after cleaning.")
else:
    X = df_model[['strava_count']]
    y = df_model['eco_count']

    model = LinearRegression()
    model.fit(X, y)
    df_model['eco_pred'] = model.predict(X)

    # Plot
    plt.figure(figsize=(8, 6))
    sns.scatterplot(x='strava_count', y='eco_count', data=df_model, label='Observed')
    # eco_pred is a linear function of strava_count, so this traces the fitted line.
    sns.lineplot(x='strava_count', y='eco_pred', data=df_model, color='red', label='Regression Line')
    plt.title('Linear Regression: EcoCounter ~ Strava')
    plt.xlabel('Strava Count')
    plt.ylabel('EcoCounter Count')
    plt.legend()
    plt.grid(True)
    plt.show()

    # In-sample error: computed on the same rows the model was fitted on.
    rmse = np.sqrt(mean_squared_error(y, df_model['eco_pred']))
    print(f"📉 RMSE: {rmse:.2f}")

# 📦 Imports
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# ✅ Set up plotting
sns.set(style="whitegrid")

# --- 📁 Load Data ---
# Use the same notebook-relative location as the other sections of this file;
# an absolute /home/<user>/... path only resolves on one specific machine.
data_path = "../processed_data/weekly_with_covariates.csv"
df = pd.read_csv(data_path, parse_dates=["week_start"])

# ✅ Rename for consistency
df = df.rename(columns={
    "SUM_total_trip_count": "strava_count",
    "EcoCntr_weekly_SUM": "ecocounter_count"
})
# Row-wise sum of the two sources; NaN whenever either count is missing.
df["total_count"] = df["strava_count"] + df["ecocounter_count"]

# The three count columns summarised below.
count_cols = ["strava_count", "ecocounter_count", "total_count"]

# 🧭 Check a sample
print("🔍 Sample rows:")
print(df[count_cols].head())

# 📊 Summary stats
print("\n📊 Summary Statistics:")
print(df[count_cols].describe())

# 📈 Correlation matrix
print("\n📈 Correlation Matrix:")
print(df[count_cols].corr())

# --- 📉 Visualize Distributions ---
plt.figure(figsize=(10, 6))
sns.histplot(df["total_count"], bins=50, kde=True)
plt.title("Distribution of Total Weekly Counts (Strava + EcoCounter)")
plt.xlabel("Total Weekly Count")
plt.ylabel("Frequency")
plt.show()

# --- 🎯 Covariate Analysis ---
covariate_cols = [
    "MAX_slopePct",
    "Minority LandClass Pct",
    "2024 Median Household Income",
    "2024 Diversity Index"
]
count_cols = ["strava_count", "ecocounter_count", "total_count"]

# Filter to existing columns
covariate_cols = [col for col in covariate_cols if col in df.columns]

print("\n🧭 Found covariates:", covariate_cols)

# DataFrame.corr() and regplot need numeric input, but a covariate may be
# stored as text (e.g. category labels). Work on a numeric copy so `df` itself
# is left untouched; values that cannot be parsed become NaN.
df_cov = df[count_cols + covariate_cols].copy()
for col in covariate_cols:
    df_cov[col] = pd.to_numeric(df_cov[col], errors="coerce")

# A covariate with no parseable value at all cannot be correlated or plotted.
numeric_covs = [col for col in covariate_cols if df_cov[col].notna().any()]
skipped_covs = [col for col in covariate_cols if col not in numeric_covs]
if skipped_covs:
    print("⚠️ Skipping non-numeric covariates:", skipped_covs)

# Correlation with counts
correlation_results = df_cov[count_cols + numeric_covs].corr()
print("\n📈 Covariate correlations with counts:")
print(correlation_results.loc[numeric_covs, count_cols])

# --- 📊 Visualize Covariates vs. Count ---
for cov in numeric_covs:
    plt.figure(figsize=(8, 5))
    sns.scatterplot(data=df_cov, x=cov, y="total_count")
    # scatter=False: draw only the fitted trend line on top of the scatter.
    sns.regplot(data=df_cov, x=cov, y="total_count", scatter=False, color='red', label='Trend')
    plt.title(f"Total Count vs. {cov}")
    plt.xlabel(cov)
    plt.ylabel("Total Count")
    plt.legend()
    plt.tight_layout()
    plt.show()

# --- Load Libraries ---
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# --- Display / plotting defaults ---
pd.set_option('display.max_columns', None)
sns.set(style="whitegrid")

# --- Read the weekly table with covariates ---
data_path = '../processed_data/weekly_with_covariates.csv'
column_aliases = {
    'SUM_total_trip_count': 'strava_count',
    'EcoCntr_weekly_SUM': 'ecocounter_count',
}
df = pd.read_csv(data_path, parse_dates=['week_start']).rename(columns=column_aliases)

# Combined count per row (NaN when either source is missing).
df['total_count'] = df['strava_count'] + df['ecocounter_count']

# --- Numeric covariates: every numeric column that is not an id, date or count ---
excluded_cols = ['GRID_ID', 'week_start', 'strava_count', 'ecocounter_count', 'total_count']
covariate_cols = []
for col in df.columns:
    if col in excluded_cols:
        continue
    if pd.api.types.is_numeric_dtype(df[col]):
        covariate_cols.append(col)

# --- Grid cells ranked by their mean total count, highest first ---
per_grid_means = df.groupby('GRID_ID')[['strava_count', 'ecocounter_count', 'total_count']].mean()
top_locations = per_grid_means.sort_values('total_count', ascending=False).reset_index()
print("🌍 Top 10 Grid Cells by Average Total Count:")
print(top_locations.head(10))

# --- Covariate correlation matrix ---
count_cols = ['strava_count', 'ecocounter_count', 'total_count']
print("\n📈 Covariate Correlations with Total Counts:")
correlations = df[covariate_cols + count_cols].corr()
print(correlations.loc[covariate_cols, count_cols].sort_values('total_count', ascending=False))

# --- Pairplot for top correlated covariates ---
# Rank only the covariate rows. Ranking the whole matrix would always put
# total_count itself (|r| = 1) and usually the other two counts at the top,
# duplicating total_count in `vars` and crowding out the real covariates.
# Covariates whose correlation is undefined (NaN) are not ranked.
top_covs = (
    correlations.loc[covariate_cols, 'total_count']
    .dropna()
    .abs()
    .sort_values(ascending=False)
    .head(5)
    .index.tolist()
)
if top_covs:
    sns.pairplot(df, vars=top_covs + ['total_count'])
    plt.suptitle("📊 Pairplot of Top Covariates with Total Count", y=1.02)
    plt.show()
else:
    print("⚠️ No strong covariates found for pairplot.")

# --- Weekly trend: mean of each count column across all rows sharing a week_start ---
df_weekly = df.groupby('week_start')[['strava_count', 'ecocounter_count', 'total_count']].mean()

trend_fig, trend_ax = plt.subplots(figsize=(12, 6))
df_weekly.plot(ax=trend_ax)
trend_ax.set_title("📆 Weekly Mean Counts: Strava vs EcoCounter")
trend_ax.set_xlabel("Week Start Date")
trend_ax.set_ylabel("Average Count")
trend_ax.grid(True)
trend_fig.tight_layout()
plt.show()

# --- Boxplot of counts by slope class (if available) ---
# The rest of this file refers to the slope covariate as 'MAX_slopePct', so
# accept that name as well as 'slopePct'; the first one present is used.
slope_col = next((name for name in ('slopePct', 'MAX_slopePct') if name in df.columns), None)

if slope_col is not None:
    plt.figure(figsize=(10, 6))
    sns.boxplot(data=df, x=slope_col, y='total_count')
    plt.title("📦 Total Count by Slope Category")
    plt.xlabel("Slope Category")
    plt.ylabel("Total Count")
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()
else:
    print("⚠️ No slope column ('slopePct' or 'MAX_slopePct') found in the data.")

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# --- 📁 Read the weekly table and shorten the count column names ---
data_path = '../processed_data/weekly_with_covariates.csv'
raw_weekly = pd.read_csv(data_path, parse_dates=['week_start'])
df = raw_weekly.rename(columns={
    'SUM_total_trip_count': 'strava_count',
    'EcoCntr_weekly_SUM': 'ecocounter_count',
})
# Combined count per row (NaN when either source is missing).
df['total_count'] = df['strava_count'] + df['ecocounter_count']

# --- 🔍 Preview of the identifying and count columns ---
print("✅ File loaded successfully.")
preview_cols = ['GRID_ID', 'week_start', 'strava_count', 'ecocounter_count', 'total_count']
print(df[preview_cols].head())

# --- 📈 Descriptive statistics and pairwise correlations of the counts ---
counts_only = df[['strava_count', 'ecocounter_count', 'total_count']]
print("\n📊 Summary Statistics:")
print(counts_only.describe())
print("\n📈 Correlation Matrix:")
print(counts_only.corr())

# --- 🗺️ Ten grid cells with the highest mean total count ---
grouped = (
    df.groupby('GRID_ID')[['strava_count', 'ecocounter_count', 'total_count']]
    .mean()
    .reset_index()
)
top_grids = grouped.sort_values('total_count', ascending=False).head(10)
print("\n🌍 Top 10 Grid Cells by Average Total Count:")
print(top_grids)

# --- 📉 Weekly series for those ten cells, one line per GRID_ID ---
top_ids = top_grids['GRID_ID'].tolist()
in_top = df['GRID_ID'].isin(top_ids)
df_top = df[in_top]

top_fig, top_ax = plt.subplots(figsize=(14, 6))
sns.lineplot(data=df_top, x='week_start', y='total_count', hue='GRID_ID', ax=top_ax)
top_ax.set_title('📈 Weekly Total Count for Top 10 Grid Cells')
top_ax.set_xlabel('Week Start')
top_ax.set_ylabel('Total Count')
top_ax.legend(title='GRID_ID')
top_fig.tight_layout()
plt.show()

# --- 🧭 Covariate Exploration ---
covariates = [
    'MAX_slopePct', 'Minority LandClass Pct',
    '2024 Median Household Income', '2024 Diversity Index'
]
covariates = [col for col in covariates if col in df.columns]
print("\n🧭 Found covariates:", covariates)

count_cols = ['strava_count', 'ecocounter_count', 'total_count']

# DataFrame.corr() and regplot need numeric input, but a covariate may be
# stored as text (e.g. category labels). Work on a numeric copy so `df` itself
# is left untouched; values that cannot be parsed become NaN.
df_cov = df[count_cols + covariates].copy()
for col in covariates:
    df_cov[col] = pd.to_numeric(df_cov[col], errors='coerce')

# A covariate with no parseable value at all cannot be correlated or plotted.
numeric_covariates = [col for col in covariates if df_cov[col].notna().any()]
skipped_covariates = [col for col in covariates if col not in numeric_covariates]
if skipped_covariates:
    print("⚠️ Skipping non-numeric covariates:", skipped_covariates)

# --- 📊 Covariate Correlations ---
correlations = df_cov[numeric_covariates + count_cols].corr()
print("\n📈 Covariate correlations with counts:")
print(correlations.loc[numeric_covariates, count_cols])

# --- 🖼️ Visualize Covariate Relationships ---
for cov in numeric_covariates:
    plt.figure(figsize=(6, 4))
    sns.scatterplot(data=df_cov, x=cov, y='total_count', alpha=0.5)
    # scatter=False: draw only the fitted trend line on top of the scatter.
    sns.regplot(data=df_cov, x=cov, y='total_count', scatter=False, color='red')
    plt.title(f'Total Count vs. {cov}')
    plt.tight_layout()
    plt.show()

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import folium
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.cluster import KMeans

# --- 📁 Read the weekly table and shorten the count column names ---
weekly_csv = '../processed_data/weekly_with_covariates.csv'
df = pd.read_csv(weekly_csv, parse_dates=['week_start']).rename(columns={
    'SUM_total_trip_count': 'strava_count',
    'EcoCntr_weekly_SUM': 'ecocounter_count',
})
# Combined count per row (NaN when either source is missing).
df['total_count'] = df['strava_count'] + df['ecocounter_count']

# --- 🧹 Slope category -> representative number ---
# Each bin label maps to its midpoint; the open-ended top bin maps to 30.0.
slope_bins = {
    '0 to 1%': 0.5,
    '1 to 3%': 2.0,
    '3 to 6%': 4.5,
    '6 to 8%': 7.0,
    '8 to 12%': 10.0,
    '12 to 25%': 18.5,
    'Above 25%': 30.0
}
# Normalise the label text first (cast to str, en dash -> hyphen, trim), then
# substitute the bin values. Whatever is still not a number becomes NaN.
slope_labels = df['MAX_slopePct'].astype(str).str.replace('–', '-', regex=False).str.strip()
df['MAX_slopePct'] = pd.to_numeric(slope_labels.replace(slope_bins), errors='coerce')

# --- 🔢 Force the remaining covariates to numeric (unparseable -> NaN) ---
for numeric_col in ('2024 Median Household Income', '2024 Diversity Index'):
    df[numeric_col] = pd.to_numeric(df[numeric_col], errors='coerce')

# --- ❓ NaN count per modelling column ---
report_cols = [
    'strava_count',
    'MAX_slopePct',
    '2024 Median Household Income',
    '2024 Diversity Index',
    'ecocounter_count',
]
print("\n❓ Missing values before modeling:")
print(df[report_cols].isna().sum())

# --- 🧠 Linear regression of ecocounter_count on Strava count + covariates ---
features = ['strava_count', 'MAX_slopePct', '2024 Median Household Income', '2024 Diversity Index']
# Only rows with every predictor and the target present can be used.
df_model = df.dropna(subset=features + ['ecocounter_count'])

if df_model.empty:
    print("\n🚫 No valid rows for modeling after cleaning.")
else:
    X = df_model[features]
    y = df_model['ecocounter_count']

    model = LinearRegression()
    model.fit(X, y)
    y_pred = model.predict(X)

    # Both metrics are in-sample: predictions are for the rows used to fit.
    fit_r2 = r2_score(y, y_pred)
    rmse = np.sqrt(mean_squared_error(y, y_pred))

    print("\n🔢 Multivariate Regression Results:")
    print(f"  R² Score: {fit_r2:.3f}")
    print(f"  RMSE: {rmse:.2f}")
    print("  Coefficients:")
    for position, feat in enumerate(features):
        print(f"    {feat}: {model.coef_[position]:.2f}")
    print(f"  Intercept: {model.intercept_:.2f}")

# --- 📈 Seasonal profile: mean of each count column per calendar month ---
df['month'] = df['week_start'].dt.month
monthly_avg = df.groupby('month')[['strava_count', 'ecocounter_count', 'total_count']].mean()

# DataFrame.plot creates the figure and returns the axes it drew on.
month_ax = monthly_avg.plot(marker='o', figsize=(10, 5))
month_ax.set_title('📆 Average Monthly Bicycle Counts (2019–2023)')
month_ax.set_ylabel('Average Weekly Count')
month_ax.set_xlabel('Month')
month_ax.grid(True)
month_ax.figure.tight_layout()
plt.show()

# --- 🌍 Spatial Clustering & Mapping ---
count_cols = ['strava_count', 'ecocounter_count', 'total_count']
df_grid = df.groupby('GRID_ID')[count_cols].mean().reset_index()

# KMeans raises on NaN input and a NaN total would give an undefined marker
# radius, so keep only grid cells whose three mean counts are all defined.
df_grid = df_grid.dropna(subset=count_cols).reset_index(drop=True)

if df_grid.empty:
    print("\n🚫 No grid cells with complete counts; skipping clustering and map.")
else:
    # Cluster grid cells by usage. KMeans cannot form more clusters than there
    # are cells, so cap at the number of cells. n_init is pinned because its
    # default differs between scikit-learn releases.
    kmeans = KMeans(n_clusters=min(3, len(df_grid)), n_init=10, random_state=42)
    df_grid['cluster'] = kmeans.fit_predict(df_grid[['strava_count', 'ecocounter_count']])

    # Assign mock coordinates for demo purposes: cells are placed on a
    # 0.01-degree lattice in row order (20 latitude steps per longitude step).
    # These are NOT the real grid-cell locations.
    df_grid['lat'] = 36.3 + (np.arange(len(df_grid)) % 20) * 0.01
    df_grid['lon'] = -94.3 + (np.arange(len(df_grid)) // 20) * 0.01

    # Create folium map: one circle per grid cell, radius growing with total count.
    m = folium.Map(location=[36.37, -94.21], zoom_start=10)
    for _, row in df_grid.iterrows():
        folium.CircleMarker(
            location=[row['lat'], row['lon']],
            radius=5 + row['total_count'] / 5000,
            color='blue',
            fill=True,
            fill_opacity=0.6,
            popup=(f"GRID: {row['GRID_ID']}<br>"
                   f"Strava: {row['strava_count']:.0f}<br>"
                   f"EcoCounter: {row['ecocounter_count']:.0f}<br>"
                   f"Cluster: {row['cluster']}")
        ).add_to(m)

    m.save('../processed_data/spatial_clusters_map.html')
    print("\n🗺️ Interactive map saved: spatial_clusters_map.html")

# --- 💾 Save Cleaned Data ---
# Writes the full row-level frame `df`, not the per-grid summary above.
df.to_csv('../processed_data/df_cleaned.csv', index=False)
print("✅ Cleaned dataset saved as df_cleaned.csv")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import folium
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# --- Read the weekly table and shorten the count column names ---
raw_weekly = pd.read_csv('../processed_data/weekly_with_covariates.csv', parse_dates=['week_start'])
df = raw_weekly.rename(columns={
    'SUM_total_trip_count': 'strava_count',
    'EcoCntr_weekly_SUM': 'ecocounter_count',
})
# Combined count per row (NaN when either source is missing).
df['total_count'] = df['strava_count'] + df['ecocounter_count']

# --- Slope category -> representative number ---
# Each bin label maps to its midpoint; the open-ended top bin maps to 30.0.
slope_bins = {
    '0 to 1%': 0.5,
    '1 to 3%': 2.0,
    '3 to 6%': 4.5,
    '6 to 8%': 7.0,
    '8 to 12%': 10.0,
    '12 to 25%': 18.5,
    'Above 25%': 30.0
}
# Normalise the label text (cast to str, en dash -> hyphen, trim), substitute
# the bin values, then coerce anything that is still not a number to NaN.
slope_as_text = df['MAX_slopePct'].astype(str)
slope_as_text = slope_as_text.str.replace('–', '-', regex=False).str.strip()
slope_mapped = slope_as_text.replace(slope_bins)
df['MAX_slopePct'] = pd.to_numeric(slope_mapped, errors='coerce')

# --- Force the remaining covariates to numeric (unparseable -> NaN) ---
income_col = '2024 Median Household Income'
diversity_col = '2024 Diversity Index'
df[income_col] = pd.to_numeric(df[income_col], errors='coerce')
df[diversity_col] = pd.to_numeric(df[diversity_col], errors='coerce')

# --- Modelling frame: rows with every predictor and the target present ---
features = ['strava_count', 'MAX_slopePct', '2024 Median Household Income', '2024 Diversity Index']
required_cols = features + ['ecocounter_count']
df_model = df.dropna(subset=required_cols)

# --- Linear regression on every usable row ---
if df_model.empty:
    print("No valid rows for modeling after cleaning.")
else:
    X = df_model[features]
    y = df_model['ecocounter_count']
    model = LinearRegression()
    model.fit(X, y)
    y_pred = model.predict(X)

    # Both metrics are in-sample: predictions are for the rows used to fit.
    fit_r2 = r2_score(y, y_pred)
    fit_rmse = np.sqrt(mean_squared_error(y, y_pred))

    print("Multivariate Regression Results (All Data):")
    print(f"  R² Score: {fit_r2:.3f}")
    print(f"  RMSE: {fit_rmse:.2f}")
    print("  Coefficients:")
    for position, feat in enumerate(features):
        print(f"    {feat}: {model.coef_[position]:.2f}")
    print(f"  Intercept: {model.intercept_:.2f}")

# --- Seasonal profile: mean of each count column per calendar month ---
df['month'] = df['week_start'].dt.month
monthly_avg = df.groupby('month')[['strava_count', 'ecocounter_count', 'total_count']].mean()

# DataFrame.plot creates the figure and returns the axes it drew on.
season_ax = monthly_avg.plot(marker='o', figsize=(10, 5))
season_ax.set_title('Average Monthly Bicycle Counts (2019–2023)')
season_ax.set_ylabel('Average Weekly Count')
season_ax.set_xlabel('Month')
season_ax.grid(True)
season_ax.figure.tight_layout()
plt.show()

# --- Spatial Clustering ---
count_cols = ['strava_count', 'ecocounter_count', 'total_count']
df_grid = df.groupby('GRID_ID')[count_cols].mean().reset_index()

# KMeans raises on NaN input and a NaN total would give an undefined marker
# radius, so keep only grid cells whose three mean counts are all defined.
df_grid = df_grid.dropna(subset=count_cols).reset_index(drop=True)

if df_grid.empty:
    print("No grid cells with complete counts; skipping clustering and map.")
else:
    # KMeans cannot form more clusters than there are cells, so cap at the
    # number of cells. n_init is pinned because its default differs between
    # scikit-learn releases.
    kmeans = KMeans(n_clusters=min(3, len(df_grid)), n_init=10, random_state=42)
    df_grid['cluster'] = kmeans.fit_predict(df_grid[['strava_count', 'ecocounter_count']])

    # Placeholder coordinates: cells are placed on a 0.01-degree lattice in row
    # order (20 latitude steps per longitude step). These are NOT the real
    # grid-cell locations.
    df_grid['lat'] = 36.3 + (np.arange(len(df_grid)) % 20) * 0.01
    df_grid['lon'] = -94.3 + (np.arange(len(df_grid)) // 20) * 0.01

    # One circle per grid cell, radius growing with the mean total count.
    m = folium.Map(location=[36.37, -94.21], zoom_start=10)
    for _, row in df_grid.iterrows():
        folium.CircleMarker(
            location=[row['lat'], row['lon']],
            radius=5 + row['total_count'] / 5000,
            color='blue',
            fill=True,
            fill_opacity=0.6,
            popup=(f"GRID: {row['GRID_ID']}<br>"
                   f"Strava: {row['strava_count']:.0f}<br>"
                   f"EcoCounter: {row['ecocounter_count']:.0f}<br>"
                   f"Cluster: {row['cluster']}")
        ).add_to(m)
    m.save('../processed_data/spatial_clusters_map.html')
    print("Interactive map saved: spatial_clusters_map.html")