In [3]:
# Predictive modeling script (fast / notebook-friendly)
# Save as `train_nfhs_model.py` or run in a notebook cell.

import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split, RandomizedSearchCV, KFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import joblib
import warnings
warnings.filterwarnings("ignore")
np.random.seed(42)

# --------- CONFIG ----------
INPUT_CSV = "/Users/sahitipotini/Downloads/obesity/NFHS_5_cleaned_data.csv"   # change path if needed
TARGET = "women_overweight_obese_pct"                     # target variable
STATE_COL = "state"    # high-cardinality -> we drop for modeling to avoid overfitting
# ---------------------------

# Load data (attempt utf-8 then latin-1)
try:
    df = pd.read_csv(INPUT_CSV, encoding='utf-8')
except Exception:
    df = pd.read_csv(INPUT_CSV, encoding='latin-1')

print("Loaded:", df.shape)

# Verify target is present
if TARGET not in df.columns:
    raise ValueError(f"Target {TARGET} not found. Columns: {df.columns.tolist()}")

# Feature selection: use all except state and target
features = [c for c in df.columns if c not in (TARGET, STATE_COL)]
# drop any constant columns
features = [c for c in features if df[c].nunique() > 1]

X = df[features].copy()
y = df[TARGET].copy()

# Quick train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocessing: numeric scaling + one-hot for categorical (area)
numeric_features = X.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = X.select_dtypes(include=['object', 'category']).columns.tolist()

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])


preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])

# Simple pipelines
lr_pipe = Pipeline(steps=[('pre', preprocessor), ('model', LinearRegression())])
rf_pipe = Pipeline(steps=[('pre', preprocessor), ('model', RandomForestRegressor(random_state=42))])
gbr_pipe = Pipeline(steps=[('pre', preprocessor), ('model', GradientBoostingRegressor(random_state=42))])

# Lightweight randomized search grids (quick)
rf_param_dist = {
    "model__n_estimators": [50, 100, 150],
    "model__max_depth": [None, 5, 10],
    "model__min_samples_split": [2, 4, 6]
}
gbr_param_dist = {
    "model__n_estimators": [50, 100],
    "model__learning_rate": [0.05, 0.1, 0.2],
    "model__max_depth": [3, 4, 5]
}

cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Fast RandomizedSearch (n_iter small to save time)
from sklearn.model_selection import RandomizedSearchCV
rf_search = RandomizedSearchCV(rf_pipe, rf_param_dist, n_iter=6, scoring='neg_mean_squared_error', cv=cv, random_state=42, n_jobs=-1)
gbr_search = RandomizedSearchCV(gbr_pipe, gbr_param_dist, n_iter=6, scoring='neg_mean_squared_error', cv=cv, random_state=42, n_jobs=-1)

print("Fitting Linear Regression (no hyperparams)...")
lr_pipe.fit(X_train, y_train)
lr_cv_rmse = np.sqrt(-cross_val_score(lr_pipe, X_train, y_train, cv=cv, scoring='neg_mean_squared_error').mean())
print("LR CV RMSE:", lr_cv_rmse)

print("\nRunning RandomForest search (fast)...")
rf_search.fit(X_train, y_train)
rf_best_rmse = np.sqrt(-rf_search.best_score_)
print("RF best (cv) RMSE:", rf_best_rmse)
print("RF best params:", rf_search.best_params_)

print("\nRunning GradientBoosting search (fast)...")
gbr_search.fit(X_train, y_train)
gbr_best_rmse = np.sqrt(-gbr_search.best_score_)
print("GBR best (cv) RMSE:", gbr_best_rmse)
print("GBR best params:", gbr_search.best_params_)

# Compare and select best
results = {"LinearRegression": lr_cv_rmse, "RandomForest": rf_best_rmse, "GradientBoosting": gbr_best_rmse}
best_name = min(results, key=results.get)
print("\nCV RMSE summary:", results)
print("Selected:", best_name)

best_model = {"LinearRegression": lr_pipe, "RandomForest": rf_search.best_estimator_, "GradientBoosting": gbr_search.best_estimator_}[best_name]

# Fit best model on full training data (ensures final fit)
best_model.fit(X_train, y_train)

# Evaluate on test set
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

y_pred = best_model.predict(X_test)
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred))   # fixed
mae_test = mean_absolute_error(y_test, y_pred)
r2_test = r2_score(y_test, y_pred)

print(f"\nTest metrics: RMSE={rmse_test:.4f}, MAE={mae_test:.4f}, R2={r2_test:.4f}")


# Feature importances for tree models
def get_feature_names(preprocessor):
    num_feats = numeric_features
    cat_feats = []
    if categorical_features:
        ohe = preprocessor.named_transformers_['cat'].named_steps['onehot']
        cat_feats = ohe.get_feature_names_out(categorical_features).tolist()
    return num_feats + cat_feats

if best_name in ("RandomForest", "GradientBoosting"):
    model_obj = best_model.named_steps['model']
    feat_names = get_feature_names(best_model.named_steps['pre'])
    importances = model_obj.feature_importances_
    fi = pd.DataFrame({'feature': feat_names, 'importance': importances}).sort_values('importance', ascending=False)
    print("\nTop features:\n", fi.head(15))
    fi.to_csv("/mnt/data/feature_importances.csv", index=False)

# Save model and predictions
joblib.dump(best_model, "/Users/sahitipotini/Downloads/obesity/best_nfhs_model.pkl")
print("Saved model to /Users/sahitipotini/Downloads/obesity/best_nfhs_model.pkl")

test_out = X_test.copy()
test_out[TARGET] = y_test.values
test_out["pred_" + TARGET] = y_pred
test_out.to_csv("/Users/sahitipotini/Downloads/obesity/predictions_test_set.csv", index=False)
print("Saved test predictions to /Users/sahitipotini/Downloads/obesity/predictions_test_set.csv")

# Full-data predictions
df_full = df.copy()
df_full["pred_" + TARGET] = best_model.predict(df_full[features])
df_full.to_csv("/Users/sahitipotini/Downloads/obesity/all_predictions.csv", index=False)
print("Saved all rows predictions to /Users/sahitipotini/Downloads/obesity/all_predictions.csv")

# Quick residual diagnostics (print)
resid = y_test - y_pred
print("\nResiduals: mean:", resid.mean(), "std:", resid.std())
print("Done.")


Loaded: (107, 28)
Fitting Linear Regression (no hyperparams)...
LR CV RMSE: 2.9044041598203365

Running RandomForest search (fast)...
RF best (cv) RMSE: 4.105862046811917
RF best params: {'model__n_estimators': 100, 'model__min_samples_split': 4, 'model__max_depth': 5}

Running GradientBoosting search (fast)...
GBR best (cv) RMSE: 3.9993716689582395
GBR best params: {'model__n_estimators': 100, 'model__max_depth': 3, 'model__learning_rate': 0.2}

CV RMSE summary: {'LinearRegression': 2.9044041598203365, 'RandomForest': 4.105862046811917, 'GradientBoosting': 3.9993716689582395}
Selected: LinearRegression

Test metrics: RMSE=3.4133, MAE=2.7454, R2=0.8973
Saved model to /Users/sahitipotini/Downloads/obesity/best_nfhs_model.pkl
Saved test predictions to /Users/sahitipotini/Downloads/obesity/predictions_test_set.csv
Saved all rows predictions to /Users/sahitipotini/Downloads/obesity/all_predictions.csv

Residuals: mean: -0.4979669189639321 std: 3.456218377231901
Done.


In [4]:
# Final Predictive Modeling Script for NFHS-5 dataset
# Includes: LinearRegression, Ridge, Lasso, RandomForest, GradientBoosting
# Selects best based on CV RMSE and evaluates on test set

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV, KFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import joblib
import warnings
warnings.filterwarnings("ignore")
np.random.seed(42)

# --------- CONFIG ----------
INPUT_CSV = "/Users/sahitipotini/Downloads/obesity/NFHS_5_cleaned_data.csv"
TARGET = "women_overweight_obese_pct"            # target variable
STATE_COL = "state"                              # drop this (too many categories)
# ---------------------------

# Load data
try:
    df = pd.read_csv(INPUT_CSV, encoding="utf-8")
except:
    df = pd.read_csv(INPUT_CSV, encoding="latin-1")

print("Loaded:", df.shape)

# Features (exclude state + target, drop constants)
features = [c for c in df.columns if c not in (TARGET, STATE_COL)]
features = [c for c in features if df[c].nunique() > 1]

X = df[features].copy()
y = df[TARGET].copy()

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Preprocessing
numeric_features = X.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = X.select_dtypes(include=["object", "category"]).columns.tolist()

numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features)
])

# Pipelines
lr_pipe   = Pipeline([("pre", preprocessor), ("model", LinearRegression())])
ridge_pipe= Pipeline([("pre", preprocessor), ("model", Ridge())])
lasso_pipe= Pipeline([("pre", preprocessor), ("model", Lasso(max_iter=10000))])
rf_pipe   = Pipeline([("pre", preprocessor), ("model", RandomForestRegressor(random_state=42))])
gbr_pipe  = Pipeline([("pre", preprocessor), ("model", GradientBoostingRegressor(random_state=42))])

# Param grids
ridge_params = {"model__alpha": [0.01, 0.1, 1, 10, 50, 100]}
lasso_params = {"model__alpha": [0.01, 0.1, 1, 10, 50, 100]}
rf_params    = {"model__n_estimators": [50, 100, 150],
                "model__max_depth": [None, 5, 10],
                "model__min_samples_split": [2, 4, 6]}
gbr_params   = {"model__n_estimators": [50, 100],
                "model__learning_rate": [0.05, 0.1, 0.2],
                "model__max_depth": [3, 4, 5]}

cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Fit & evaluate all models
def evaluate_model(name, pipe, params=None):
    if params:
        search = RandomizedSearchCV(pipe, params, n_iter=min(len(list(params.values())[0]), 6),
                                    cv=cv, scoring="neg_mean_squared_error", n_jobs=-1, random_state=42)
        search.fit(X_train, y_train)
        best = search.best_estimator_
        rmse_cv = np.sqrt(-search.best_score_)
        print(f"{name} best params: {search.best_params_}")
    else:
        best = pipe.fit(X_train, y_train)
        scores = cross_val_score(pipe, X_train, y_train, cv=cv,
                                 scoring="neg_mean_squared_error", n_jobs=-1)
        rmse_cv = np.sqrt(-scores.mean())
    print(f"{name} CV RMSE: {rmse_cv:.4f}")
    return best, rmse_cv

models = {}
models["LinearRegression"], rmse_lr = evaluate_model("LinearRegression", lr_pipe)
models["Ridge"], rmse_ridge = evaluate_model("Ridge", ridge_pipe, ridge_params)
models["Lasso"], rmse_lasso = evaluate_model("Lasso", lasso_pipe, lasso_params)
models["RandomForest"], rmse_rf = evaluate_model("RandomForest", rf_pipe, rf_params)
models["GradientBoosting"], rmse_gbr = evaluate_model("GradientBoosting", gbr_pipe, gbr_params)

cv_scores = {
    "LinearRegression": rmse_lr,
    "Ridge": rmse_ridge,
    "Lasso": rmse_lasso,
    "RandomForest": rmse_rf,
    "GradientBoosting": rmse_gbr,
}
print("\nCV RMSE summary:", cv_scores)

best_name = min(cv_scores, key=cv_scores.get)
best_model = models[best_name]
print("Selected best model:", best_name)

# Test evaluation
y_pred = best_model.predict(X_test)
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred))
mae_test = mean_absolute_error(y_test, y_pred)
r2_test = r2_score(y_test, y_pred)

print(f"\nTest metrics for {best_name}: RMSE={rmse_test:.4f}, MAE={mae_test:.4f}, R2={r2_test:.4f}")

# Feature importance (for RF/GBR)
def get_feature_names(preprocessor):
    num_feats = numeric_features
    cat_feats = []
    if categorical_features:
        ohe = preprocessor.named_transformers_["cat"].named_steps["onehot"]
        cat_feats = ohe.get_feature_names_out(categorical_features).tolist()
    return num_feats + cat_feats

if best_name in ("RandomForest", "GradientBoosting"):
    model_obj = best_model.named_steps["model"]
    feat_names = get_feature_names(best_model.named_steps["pre"])
    importances = model_obj.feature_importances_
    fi = pd.DataFrame({"feature": feat_names, "importance": importances}).sort_values("importance", ascending=False)
    fi.to_csv("feature_importances.csv", index=False)
    print("\nTop features:\n", fi.head(10))

# Save model and predictions
joblib.dump(best_model, "/Users/sahitipotini/Downloads/obesity/best_nfhs_model.pkl")
print("Model saved as best_nfhs_model.pkl")

test_out = X_test.copy()
test_out[TARGET] = y_test.values
test_out["pred_" + TARGET] = y_pred
test_out.to_csv("/Users/sahitipotini/Downloads/obesity/predictions_test_set.csv", index=False)

df_full = df.copy()
df_full["pred_" + TARGET] = best_model.predict(df_full[features])
df_full.to_csv("/Users/sahitipotini/Downloads/obesity/all_predictions.csv", index=False)
print("Predictions saved: predictions_test_set.csv & all_predictions.csv")


Loaded: (107, 28)
LinearRegression CV RMSE: 2.9044
Ridge best params: {'model__alpha': 1}
Ridge CV RMSE: 2.8398
Lasso best params: {'model__alpha': 0.01}
Lasso CV RMSE: 2.7830
RandomForest best params: {'model__n_estimators': 100, 'model__min_samples_split': 4, 'model__max_depth': 5}
RandomForest CV RMSE: 4.1059
GradientBoosting best params: {'model__n_estimators': 100, 'model__max_depth': 3, 'model__learning_rate': 0.05}
GradientBoosting CV RMSE: 4.0959

CV RMSE summary: {'LinearRegression': 2.9044041598203365, 'Ridge': 2.8397905312819396, 'Lasso': 2.78300734475657, 'RandomForest': 4.105862046811917, 'GradientBoosting': 4.0958943913618295}
Selected best model: Lasso

Test metrics for Lasso: RMSE=3.3608, MAE=2.7230, R2=0.9005
Model saved as best_nfhs_model.pkl
Predictions saved: predictions_test_set.csv & all_predictions.csv


In [6]:
# Final Predictive Modeling Script for NFHS-5 dataset
# Automatically trains on multiple health targets using multiple model types
# and saves all predictions to a single file.

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
import joblib
import warnings

warnings.filterwarnings("ignore")
np.random.seed(42)

# --------- CONFIGURATION ----------
INPUT_CSV = "/Users/sahitipotini/Downloads/obesity/NFHS_5_cleaned_data.csv"
OUTPUT_CSV = "/Users/sahitipotini/Downloads/obesity/all_predictions_multi.csv"
STATE_COL = "state"

# List of all target variables to model
TARGET_VARIABLES = [
    'women_overweight_obese_pct', 'men_overweight_obese_pct',
    'women_underweight_pct', 'men_underweight_pct',
    'women_high_blood_sugar_pct', 'men_high_blood_sugar_pct',
    'women_high_blood_pressure_pct', 'men_high_blood_pressure_pct',
    'women_tobacco_use_pct', 'men_tobacco_use_pct',
    'women_alcohol_use_pct', 'men_alcohol_use_pct'
]
# ------------------------------------

# Load data
try:
    df = pd.read_csv(INPUT_CSV, encoding="utf-8")
except FileNotFoundError:
    print(f"Error: Input file not found at {INPUT_CSV}")
    exit()
except Exception as e:
    df = pd.read_csv(INPUT_CSV, encoding="latin-1")

print(f"Loaded initial data with shape: {df.shape}")

# Define base features (exclude state col and ALL potential targets)
base_features = [c for c in df.columns if c not in TARGET_VARIABLES + [STATE_COL]]
base_features = [c for c in base_features if df[c].nunique() > 1] # Drop constant columns

# Define preprocessor
numeric_features = df[base_features].select_dtypes(include=np.number).columns.tolist()
categorical_features = df[base_features].select_dtypes(include=["object", "category"]).columns.tolist()

numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features)
])

# Dictionary to store the best trained model for each target
best_models = {}

# --- Main loop to iterate through each target variable ---
for target in TARGET_VARIABLES:
    print("\n" + "="*50)
    print(f"Processing Target Variable: {target}")
    print("="*50)

    df_loop = df.copy()
    df_loop.dropna(subset=[target], inplace=True)
    
    if df_loop.empty:
        print(f"Skipping '{target}' due to no available data after dropping NaNs.")
        continue

    X = df_loop[base_features]
    y = df_loop[target]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    
    # --- Model Selection for the current target ---
    # ADDED: Ridge, Lasso, and GradientBoostingRegressor to the comparison
    models_to_evaluate = {
        "LinearRegression": LinearRegression(),
        "Ridge": Ridge(random_state=42),
        "Lasso": Lasso(random_state=42, max_iter=10000),
        "RandomForest": RandomForestRegressor(random_state=42, n_jobs=-1),
        "GradientBoosting": GradientBoostingRegressor(random_state=42)
    }
    
    cv_scores = {}
    cv = KFold(n_splits=5, shuffle=True, random_state=42)

    for name, model in models_to_evaluate.items():
        pipe = Pipeline([("pre", preprocessor), ("model", model)])
        
        scores = cross_val_score(pipe, X_train, y_train, cv=cv,
                                 scoring="neg_mean_squared_error", n_jobs=-1)
        rmse_cv = np.sqrt(-scores.mean())
        cv_scores[name] = rmse_cv
        print(f"  - {name} CV RMSE: {rmse_cv:.4f}")

    best_model_name = min(cv_scores, key=cv_scores.get)
    print(f"  --> Best model for '{target}': {best_model_name}")

    best_model_pipeline = Pipeline([
        ("pre", preprocessor),
        ("model", models_to_evaluate[best_model_name])
    ])
    best_model_pipeline.fit(X_train, y_train)
    best_models[target] = best_model_pipeline

    predictions = best_model_pipeline.predict(df[base_features])
    df[f"pred_{target}"] = predictions

# --- Final Step: Save the consolidated predictions ---
df.to_csv(OUTPUT_CSV, index=False)

print("\n" + "="*50)
print("Pipeline complete.")
print(f"All predictions have been saved to '{OUTPUT_CSV}'")

# Display the head of the final dataframe with original and new prediction columns
final_columns_to_show = ['state', 'area'] + [t for t in TARGET_VARIABLES if f"pred_{t}" in df.columns] + [f"pred_{t}" for t in TARGET_VARIABLES if f"pred_{t}" in df.columns]
print("\n--- Sample of the Final Output File ---")
print(df[final_columns_to_show].head())

Loaded initial data with shape: (107, 28)

Processing Target Variable: women_overweight_obese_pct
  - LinearRegression CV RMSE: 5.9760
  - Ridge CV RMSE: 5.8868
  - Lasso CV RMSE: 6.2271
  - RandomForest CV RMSE: 6.3210
  - GradientBoosting CV RMSE: 6.5376
  --> Best model for 'women_overweight_obese_pct': Ridge

Processing Target Variable: men_overweight_obese_pct
  - LinearRegression CV RMSE: 4.8362
  - Ridge CV RMSE: 4.8240
  - Lasso CV RMSE: 5.2895
  - RandomForest CV RMSE: 5.6541
  - GradientBoosting CV RMSE: 5.7537
  --> Best model for 'men_overweight_obese_pct': Ridge

Processing Target Variable: women_underweight_pct
  - LinearRegression CV RMSE: 4.1385
  - Ridge CV RMSE: 4.0242
  - Lasso CV RMSE: 4.1475
  - RandomForest CV RMSE: 4.0929
  - GradientBoosting CV RMSE: 4.0175
  --> Best model for 'women_underweight_pct': GradientBoosting

Processing Target Variable: men_underweight_pct
  - LinearRegression CV RMSE: 3.5522
  - Ridge CV RMSE: 3.5522
  - Lasso CV RMSE: 3.9614
  - Ran