In [1]:
import os
import pathlib

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import rasterio
import seaborn as sns
import torch
from PIL import Image
# Written Utils Functions
from Utils.utils import build_embedding_model, compute_image_embeddings
from Utils.utils import compute_tabular_features, combine_features
# Written Model Functions
from Utils.computation import compute_pca_components
from Utils.computation import estimate_treatment_effect_tabular 
from Utils.computation import estimate_treatment_effect_with_embeddings
from Utils.computation import evaluate_ite, get_train_test_indices

IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html


In [2]:
# Collect file paths.
# The defaults are the original author's local Box folders. Set the
# HETWET_COV_DIR / HETWET_DATA_DIR environment variables to run the notebook
# on another machine without editing this cell.
COV_DIR = pathlib.Path(os.environ.get(
    'HETWET_COV_DIR',
    '/Users/sayedmorteza/Library/CloudStorage/Box-Box/Hetwet_Data',
))
DATA_DIR = pathlib.Path(os.environ.get(
    'HETWET_DATA_DIR',
    '/Users/sayedmorteza/Library/CloudStorage/Box-Box/Caltech Research/Scripts/ImageTRT/Synthetic Data Generation/Results',
))

# Folder for each variable, keyed by the short variable name. The same keys
# show up as column prefixes in the tabular features (e.g. `wet_mean`).
folders = {
    'wet': DATA_DIR / 'Treatment',
    'dem': COV_DIR / 'DEM',
    'cap': COV_DIR / 'CAPITAL_1996',
    'outcome': DATA_DIR / 'Outcome_Post',
    'ite': DATA_DIR / 'ITE_TOTAL',
}

# Scenes are numbered 1..N_SCENES; ids are kept as strings.
N_SCENES = 3565
scene_ids = [str(i) for i in range(1, N_SCENES + 1)]

In [3]:
# Tabular features: one row of scene-level summary features per scene id
# (the resulting columns are `<var>_mean` / `<var>_std` for each key in `folders`).
tab_df = compute_tabular_features(folders, scene_ids)

# Persist first so that the preview is the cell's last expression; a bare
# `tab_df.head()` in the middle of a cell is evaluated but never displayed.
tab_df.to_csv('tabular_features.csv', index=False)
tab_df.head()

Dataset has no geotransform, gcps, or rpcs. The identity matrix will be returned.


Saved features to: /Users/sayedmorteza/Library/CloudStorage/Box-Box/Caltech Research/Scripts/ImageTRT/Model/features.csv


In [5]:
# Summary statistics of the numeric feature columns (sanity check of the features).
# NOTE(review): the minimum of dem_mean is -9999, which looks like an unmasked
# nodata sentinel leaking into the DEM statistics — confirm and mask upstream.
tab_df.describe()

Unnamed: 0,wet_mean,wet_std,dem_mean,dem_std,cap_mean,cap_std,outcome_mean,outcome_std,ite_mean,ite_std
count,3565.0,3565.0,3565.0,3565.0,3565.0,3565.0,3565.0,3565.0,3565.0,3565.0
mean,0.008493,0.037655,-433.918692,216.401709,41972.41,252052.2,3.747508,0.715218,0.000519,0.001882
std,0.029382,0.078367,1861.773938,907.998179,172172.4,1280287.0,1.121531,1.639506,0.001915,0.005024
min,0.0,0.0,-9999.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,1.727104,0.834537,0.0,0.0,3.703125,0.390312,0.0,0.0
50%,0.0,0.0,9.806548,2.776222,2873.736,20009.96,3.78125,0.441665,0.0,0.0
75%,0.001251,0.03535,30.674374,7.810327,22786.76,75623.79,3.875,0.474959,0.0,0.0
max,0.418045,0.493238,103.384105,4999.444138,6145465.0,39609600.0,33.647849,49.727313,0.027906,0.034863


In [None]:
# Embeddings: choose the compute device (GPU when available) and build the
# embedding network on it.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

model = build_embedding_model(device=device, model_type='resnet18', conv_layer=4)

# Only the wetland images are embedded in this run. To embed every variable,
# wrap the call below in `for var in folders:`.
# The return value is discarded; presumably the helper writes
# `<var>_embeddings.csv`, which the PCA step reads — verify in Utils.utils.
var = 'wet'
_ = compute_image_embeddings(
    folders=folders, scene_ids=scene_ids, var=var,
    model=model, device=device, img_size=256,
)

In [None]:
# PCA features: reduce the wetland embeddings to 10 principal components.
# The dem / cap / claims_96 embeddings are not reduced in this run; the same
# helper was previously applied to `<var>_embeddings.csv` with n_components=2.
pca_wet = compute_pca_components(
    'wet_embeddings.csv',
    n_components=10,
)

# Cache the components on disk so later cells can reload them.
# NOTE(review): the index is written too; if it is an unnamed RangeIndex it
# comes back as an "Unnamed: 0" column on read — confirm what the helper returns.
pca_wet.to_csv("wet_pca10.csv")

In [None]:
# Combine the tabular features with the PCA components.
# Only pca_wet is computed in this notebook; pca_dem, pca_cap and pca_claims_96
# are never defined, so passing them raised NameError on a fresh kernel.
# NOTE(review): assumes the remaining pca_* arguments of combine_features are
# optional — confirm against Utils.utils.
combined_df = combine_features(
    tab_df,
    pca_wet=pca_wet,
)

In [None]:
# Reload the cached tables so the modelling cells can run without recomputing
# the features and embeddings.
tab_df = pd.read_csv('tabular_features.csv')
pca_wet = pd.read_csv('wet_pca10.csv')

# If the PCA table was saved with an unnamed index, pandas reads it back as an
# "Unnamed: 0" column; drop any such column so it cannot leak into the merge.
pca_wet = pca_wet.loc[:, ~pca_wet.columns.str.startswith('Unnamed')]

# Outcome variable.
# The feature table built in this notebook has no claims_96_mean /
# claims_16_mean columns (see the describe() output), so the former difference
# raised KeyError. The scene-mean of the outcome variable is used instead.
# NOTE(review): confirm this is the intended outcome definition.
tab_df['outcome'] = tab_df['outcome_mean']

# Pick outcome, treatment and covariates.
OUTCOME = 'outcome'
TREAT = 'wet_mean'

# Covariates exclude the id, the treatment, the outcome, and every column
# derived from the outcome or the ground-truth ITE (these would leak the
# target or the truth into the model).
LEAKY_PREFIXES = ('outcome', 'ite')
COVS = [
    c for c in tab_df.columns
    if c not in ('scene_id', OUTCOME, TREAT) and not c.startswith(LEAKY_PREFIXES)
]

# Merge PCA back onto tab_df. `validate` makes pandas raise if scene_id is
# duplicated on either side, which would otherwise silently add rows and
# misalign the split indices.
df = tab_df.merge(pca_wet, on='scene_id', how='left', validate='one_to_one')

# Split once, on the frame that is actually modelled.
train_idx, test_idx = get_train_test_indices(df, test_size=0.2, random_state=42)

In [None]:
# Tabular-only baseline: the treatment is the single column named by TREAT and
# the covariates are COVS. The helper's result unpacks into the CATE table,
# the ATE summary and the fitted model.
tabular_fit = estimate_treatment_effect_tabular(
    df=df, train_idx=train_idx, test_idx=test_idx,
    outcome_col=OUTCOME, treatment_col=TREAT, covariate_cols=COVS,
)
cate_tab, ate_tab, model_tab = tabular_fit

In [None]:
# Include image embeddings in the treatment: the PCA component columns are
# passed as a multi-column treatment.
N_PCA_COMPONENTS = 10  # must match n_components used when wet_pca10.csv was built
PCA_COLS = [f'PC{i}' for i in range(1, N_PCA_COMPONENTS + 1)]

cate_img, ate_img, model_img = estimate_treatment_effect_with_embeddings(
    df,
    train_idx,
    test_idx,
    outcome_col=OUTCOME,
    treatment_col=PCA_COLS,
    covariate_cols=COVS,
)

print("Tabular ATE (train/test):", ate_tab)
print("Image‐augmented ATE (train/test):", ate_img)

# Compare out-of-sample CATE distributions of the two models.
# (seaborn / matplotlib are imported with the other dependencies at the top.)
fig, ax = plt.subplots()
sns.kdeplot(cate_tab.query("dataset=='test'")['CATE'], label='tabular', ax=ax)
sns.kdeplot(cate_img.query("dataset=='test'")['CATE_wet'], label='with embeddings', ax=ax)
# Explicit x label: otherwise seaborn labels the axis with the name of the
# last plotted series.
ax.set(title="Test‐set CATE distributions", xlabel="Estimated CATE")
ax.legend()
plt.show()

In [None]:
# Actual ITE per scene.
# The feature table exposes the ground truth as `ite_mean` (there is no `ite`
# column, so the former selection raised KeyError); rename it to `ite`, the
# column name the evaluation cells refer to.
cate_true = (
    tab_df[['scene_id', 'ite_mean']]
    .rename(columns={'ite_mean': 'ite'})
)

In [None]:
# Evaluate the tabular model's CATE estimates against the actual ITE.
tabular_eval_kwargs = dict(
    id_col="scene_id",
    actual_col="ite",
    estimated_col="CATE",
    output_dir="results/ite_plots",
)
metrics = evaluate_ite(cate_true, cate_tab, **tabular_eval_kwargs)
print(metrics)

In [None]:
# Evaluate the image-augmented model's CATE estimates against the actual ITE.
# `cate_true` carries the ground truth in a column named "ite" (as used by the
# tabular evaluation); the former actual_col="actual_ite" names a column that
# does not exist in it.
# NOTE(review): this writes to the same output_dir as the tabular evaluation —
# confirm evaluate_ite does not overwrite the earlier plots.
metrics = evaluate_ite(
    cate_true,
    cate_img,
    id_col="scene_id",
    actual_col="ite",
    estimated_col="CATE_wet",
    output_dir="results/ite_plots"
)
print(metrics)