In [9]:
# -------------------------
# Drought Risk Model Training with XGBClassifier
# -------------------------
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
import pickle

# -------------------------
# Load your dataset
# -------------------------
df = pd.read_csv("train_timeseries.csv")

# Drop rows with missing score
df = df.dropna(subset=['score'])

# -------------------------
# Feature selection
# -------------------------
# Use numeric columns only (exclude 'fips', 'date', 'score')
features = [c for c in df.columns if c not in ['fips','date','score']]
X = df[features]

# -------------------------
# Create categorical labels from 'score'
# -------------------------
# Assuming score is 0..1 normalized
y_raw = pd.cut(df['score'], bins=[-np.inf, 0.33, 0.66, np.inf], labels=['Low', 'Moderate', 'High'])

# Encode labels to integers for XGBClassifier
le = LabelEncoder()
y = le.fit_transform(y_raw)  # Low=0, Moderate=1, High=2 (mapping may vary)

print("Class mapping:", dict(zip(le.classes_, le.transform(le.classes_))))

# -------------------------
# Train/test split (stratified)
# -------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# -------------------------
# Handle class imbalance using SMOTE
# -------------------------
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

print("Class distribution after SMOTE:", np.bincount(y_train_res))

# -------------------------
# Initialize XGBClassifier
# -------------------------
clf = XGBClassifier(
    n_estimators=200,
    max_depth=6,
    learning_rate=0.1,
    eval_metric='mlogloss',
    random_state=42
)

# Train the classifier
clf.fit(X_train_res, y_train_res)

# -------------------------
# Predictions & Probabilities
# -------------------------
y_pred = clf.predict(X_test)
y_proba = clf.predict_proba(X_test)  # Real probabilities per class

# -------------------------
# Evaluation
# -------------------------
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred, target_names=le.classes_))

print("\nConfusion Matrix:\n")
print(confusion_matrix(y_test, y_pred))

# Sample probabilities
print("\nSample Probabilities for first 5 test samples:")
for i in range(5):
    print(f"Sample {i+1}: {y_proba[i]}")

# -------------------------
# Save model and LabelEncoder
# -------------------------
with open("drought_model_xgb.pkl", "wb") as f:
    pickle.dump(clf, f)

with open("label_encoder.pkl", "wb") as f:
    pickle.dump(le, f)

print("\nModel and LabelEncoder saved successfully.")


Class mapping: {'High': np.int64(0), 'Low': np.int64(1), 'Moderate': np.int64(2)}
Class distribution after SMOTE: [1291121 1291121 1291121]

Classification Report:

              precision    recall  f1-score   support

        High       0.63      0.52      0.57    213792
         Low       0.74      0.70      0.72    322781
    Moderate       0.04      0.18      0.06     14787

    accuracy                           0.62    551360
   macro avg       0.47      0.47      0.45    551360
weighted avg       0.68      0.62      0.64    551360


Confusion Matrix:

[[111717  72191  29884]
 [ 60134 225348  37299]
 [  5384   6812   2591]]

Sample Probabilities for first 5 test samples:
Sample 1: [0.2856576  0.29146287 0.4228795 ]
Sample 2: [0.34099838 0.4461195  0.2128821 ]
Sample 3: [0.04271686 0.8144609  0.1428223 ]
Sample 4: [0.6182695  0.19463164 0.18709888]
Sample 5: [0.20935132 0.5989744  0.19167434]

Model and LabelEncoder saved successfully.


In [10]:
import pickle
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import requests

# -------------------------
# Load trained model & label encoder
# -------------------------
with open("drought_model_xgb.pkl", "rb") as f:
    clf = pickle.load(f)

with open("label_encoder.pkl", "rb") as f:
    le = pickle.load(f)

# -------------------------
# Example countries with lat/lon
# -------------------------
COUNTRY_COORDS = {
    "Sri Lanka": (7.8731, 80.7718),
    "India": (20.5937, 78.9629)
}

# -------------------------
# Weather API helpers (Open-Meteo)
# -------------------------
DAILY_VARS = "precipitation_sum,temperature_2m_mean,et0_fao_evapotranspiration"

def _safe_get(d, *keys, default=None):
    cur = d
    for k in keys:
        if k not in cur:
            return default
        cur = cur[k]
    return cur

def fetch_daily_timeseries(lat, lon, start_date, end_date):
    """Fetch daily weather data for given coordinates and date range."""
    frames = []

    # Past data (archive API)
    url = (
        f"https://archive-api.open-meteo.com/v1/archive?"
        f"latitude={lat}&longitude={lon}"
        f"&start_date={start_date}&end_date={end_date}"
        f"&daily={DAILY_VARS}&timezone=auto"
    )
    try:
        r = requests.get(url, timeout=15)
        r.raise_for_status()
        data = r.json()
        times = _safe_get(data, "daily", "time", [])
        prcp = _safe_get(data, "daily", "precipitation_sum", [])
        tmean = _safe_get(data, "daily", "temperature_2m_mean", [])
        et0 = _safe_get(data, "daily", "et0_fao_evapotranspiration", [])

        if times and len(times) == len(prcp) == len(tmean) == len(et0):
            df = pd.DataFrame({
                "date": pd.to_datetime(times),
                "precipitation_sum": prcp,
                "temperature_2m_mean": tmean,
                "et0_fao_evapotranspiration": et0
            })
            frames.append(df)
    except Exception as e:
        print(f"Failed to fetch data: {e}")
        return pd.DataFrame()  # empty

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames).reset_index(drop=True)

# -------------------------
# Feature engineering
# -------------------------
def compute_features(df):
    df = df.sort_values("date").reset_index(drop=True)
    df["RainfallLast30Days"] = df["precipitation_sum"].rolling(30, min_periods=1).sum()
    df["TemperatureAvg7"] = df["temperature_2m_mean"].rolling(7, min_periods=1).mean()
    df["ET0Last30Days"] = df["et0_fao_evapotranspiration"].rolling(30, min_periods=1).sum()
    return df[["date", "RainfallLast30Days", "TemperatureAvg7", "ET0Last30Days"]]

# -------------------------
# Prediction for two countries
# -------------------------
today = datetime.utcnow().date()
start_date = (today - timedelta(days=60)).isoformat()
end_date = today.isoformat()

for country, (lat, lon) in COUNTRY_COORDS.items():
    print(f"\n=== {country} ===")
    df_weather = fetch_daily_timeseries(lat, lon, start_date, end_date)
    if df_weather.empty:
        print("No weather data available.")
        continue

    feats = compute_features(df_weather)
    X_pred = feats.drop(columns=["date"])

    # Predict probabilities
    probs = clf.predict_proba(X_pred)
    last_probs = probs[-1]  # latest day

    # Map to class labels
    class_probs = dict(zip(le.classes_, last_probs))
    print("Probabilities for today:")
    for k, v in class_probs.items():
        print(f"  {k}: {v:.2f}")


  today = datetime.utcnow().date()



=== Sri Lanka ===
No weather data available.

=== India ===
No weather data available.


In [None]:
import requests
lat, lon = 7.8731, 80.7718  # Sri Lanka
url = f"https://archive-api.open-meteo.com/v1/archive?latitude={lat}&longitude={lon}&start_date=2025-07-01&end_date=2025-08-26&daily=precipitation_sum,temperature_2m_mean,et0_fao_evapotranspiration&timezone=auto"
r = requests.get(url)
print(r.status_code, r.json())



200 {'latitude': 7.838313, 'longitude': 80.79284, 'generationtime_ms': 3.256678581237793, 'utc_offset_seconds': 19800, 'timezone': 'Asia/Colombo', 'timezone_abbreviation': 'GMT+5:30', 'elevation': 232.0, 'daily_units': {'time': 'iso8601', 'precipitation_sum': 'mm', 'temperature_2m_mean': '°C', 'et0_fao_evapotranspiration': 'mm'}, 'daily': {'time': ['2025-07-01', '2025-07-02', '2025-07-03', '2025-07-04', '2025-07-05', '2025-07-06', '2025-07-07', '2025-07-08', '2025-07-09', '2025-07-10', '2025-07-11', '2025-07-12', '2025-07-13', '2025-07-14', '2025-07-15', '2025-07-16', '2025-07-17', '2025-07-18', '2025-07-19', '2025-07-20', '2025-07-21', '2025-07-22', '2025-07-23', '2025-07-24', '2025-07-25', '2025-07-26', '2025-07-27', '2025-07-28', '2025-07-29', '2025-07-30', '2025-07-31', '2025-08-01', '2025-08-02', '2025-08-03', '2025-08-04', '2025-08-05', '2025-08-06', '2025-08-07', '2025-08-08', '2025-08-09', '2025-08-10', '2025-08-11', '2025-08-12', '2025-08-13', '2025-08-14', '2025-08-15', '2025

In [None]:
try:
    r = requests.get(url, timeout=15)
    r.raise_for_status()
    data = r.json()
    print(data.keys())  # debug output
