In [1]:

import pandas as pd
from datetime import datetime, timedelta
import warnings
%load_ext autoreload
%autoreload 2
from scripts.extract_data import get_wpdx_kenya , collapse_zinc_csv

# Load the water-point table through the project helper (scripts.extract_data).
# NOTE(review): the result is used as a pandas DataFrame with a 'report_date'
# column below — confirm against get_wpdx_kenya().
wpdx = get_wpdx_kenya()
wpdx['report_date'] = pd.to_datetime(wpdx['report_date'])

# Environmental window for each point: the 30 days ending on its report date.
wpdx['env_start_date'] = wpdx['report_date'] - timedelta(days=30)
wpdx['env_end_date'] = wpdx['report_date']

# Reproducible sample (fixed random_state). DataFrame.sample raises ValueError
# when n exceeds the row count (replace=False), so cap n at what is available
# instead of crashing on a smaller extract.
sample_size = 1000
n_to_sample = min(sample_size, len(wpdx))
if n_to_sample < sample_size:
    print(f"Warning: Less than {sample_size} points available for sampling. Using all {len(wpdx)} points.")
kenya_sample = wpdx.sample(n=n_to_sample, random_state=42).reset_index(drop=True)

print(f"Processing {len(kenya_sample)} water points with temporal accuracy")

✅ || wpdx_kenya.csv already exists with 21953 rows, skipping fetch.
Processing 1000 water points with temporal accuracy


In [2]:
# --- Standardize and Rename Columns ---
# Target the exact column names used by this dataset instead of generic
# column-name cleaning.
# Every step rebinds `wpdx` to a new frame (no inplace=True) and an
# already-renamed 'pt_id' column is accepted, so re-running this cell on the
# frame it produced does not raise.

# Rename 'wpdx_id' to 'pt_id'
if 'wpdx_id' in wpdx.columns:
    wpdx = wpdx.rename(columns={'wpdx_id': 'pt_id'})
elif 'pt_id' not in wpdx.columns:
    # Neither the raw nor the renamed ID column is present.
    raise KeyError("'wpdx_id' column not found. Please verify the actual ID column name.")

# Ensure 'pt_id' is string type and derive numeric coordinates rounded to
# 6 decimals; values that cannot be parsed become NaN (errors='coerce') and
# those rows are dropped.
wpdx = wpdx.assign(
    pt_id=wpdx['pt_id'].astype(str),
    latitude=pd.to_numeric(wpdx['lat_deg'], errors='coerce').round(6),
    longitude=pd.to_numeric(wpdx['lon_deg'], errors='coerce').round(6),
).dropna(subset=['latitude', 'longitude'])

# Keep only rows whose country name is Kenya (case-insensitive match).
if 'clean_country_name' not in wpdx.columns:
    # The message names the column that is actually checked.
    raise KeyError("'clean_country_name' column not found. Please verify the country column name.")
wpdx = wpdx[wpdx['clean_country_name'].str.lower() == 'kenya']

# Sample 5000 points
DESIRED_SAMPLED_POINTS_COUNT = 5000
if len(wpdx) >= DESIRED_SAMPLED_POINTS_COUNT:
    wpdx = wpdx.sample(n=DESIRED_SAMPLED_POINTS_COUNT, random_state=42).reset_index(drop=True)
else:
    print(f"Warning: Less than {DESIRED_SAMPLED_POINTS_COUNT} points available for sampling. Using all {len(wpdx)} points.")
    # Reset the index here too, so both branches yield a 0..n-1 index.
    wpdx = wpdx.reset_index(drop=True)

print(f"Sampled {len(wpdx)} water points for GEE processing.")

# --- Prepare for GEE (select only necessary columns with correct names for geemap) ---
# .copy() makes this an independent frame, so later edits to it neither warn
# about nor depend on `wpdx`.
wpdx_for_gee = wpdx[['pt_id', 'latitude', 'longitude']].copy()
# Note: geemap.pandas_to_ee expects 'latitude' and 'longitude' by default if no other names specified.
# So, we map lat_deg/lon_deg to latitude/longitude for clean passing.

Sampled 5000 water points for GEE processing.


In [3]:
# Preview the first rows of the GEE-ready frame (pt_id, latitude, longitude).
wpdx_for_gee.head()

Unnamed: 0,pt_id,latitude,longitude
0,6GGP5HF3+VR8,0.174668,34.554572
1,6GGP6HC8+85Q,0.22083,34.565483
2,6GGP28VQ+MWR,0.044235,34.339775
3,6GFPHQ2V+2MF,-0.449933,34.794163
4,6GGPFRWJ+VH8,0.49716,34.831463
