In [82]:
import pandas as pd
import numpy as np
import os

from dotenv import load_dotenv
from PIL import Image

In [83]:
# Read the training table from the Excel workbook.
# Note: this loads every row of the file, not a subset.
train_path = "../data/train.xlsx"
df = pd.read_excel(train_path)

# Sanity check: preview the identifier and coordinate columns used for the image downloads
df.loc[:, ["id", "lat", "long"]].head()


Unnamed: 0,id,lat,long
0,9117000170,47.4362,-122.187
1,6700390210,47.4034,-122.187
2,7212660540,47.2704,-122.313
3,8562780200,47.5321,-122.073
4,7760400350,47.3715,-122.074


In [84]:
# Load variables from the .env file, then read the Sentinel Hub client credentials
load_dotenv()

CLIENT_ID, CLIENT_SECRET = (
    os.getenv(var_name) for var_name in ("SH_CLIENT_ID", "SH_CLIENT_SECRET")
)

# Report only whether each credential is present; never print the values themselves
print(*(value is not None for value in (CLIENT_ID, CLIENT_SECRET)))


True True


In [85]:
from sentinelhub import SHConfig

# Build the Sentinel Hub configuration from the credentials read from the environment
config = SHConfig()
config.sh_client_id = CLIENT_ID
config.sh_client_secret = CLIENT_SECRET

# Fail fast with a readable message. Truthiness (rather than `is not None`) also
# rejects empty strings, e.g. a `SH_CLIENT_ID=` line with no value in .env, which
# would otherwise only surface later as an authentication error on the first request.
assert config.sh_client_id, "SH_CLIENT_ID is missing or empty - check the .env file"
assert config.sh_client_secret, "SH_CLIENT_SECRET is missing or empty - check the .env file"


In [86]:
from sentinelhub import (
    BBox, CRS, SentinelHubRequest,
    DataCollection, MimeType, bbox_to_dimensions
)


In [87]:
# Evalscript (JavaScript, sent verbatim to Sentinel Hub with each request).
# It requests bands B04, B03, B02 plus the CLM band and declares a 3-band output.
# Per pixel: where CLM === 1 the pixel is returned as [0, 0, 0] (clouds masked to
# black); otherwise the B04, B03, B02 values are returned in that order.
# NOTE(review): B04/B03/B02 are presumably the Sentinel-2 red/green/blue bands,
# giving an RGB image - confirm against the Sentinel-2 band documentation.
evalscript = """
//VERSION=3
function setup() {
  return {
    input: [{
      bands: ["B04", "B03", "B02", "CLM"]
    }],
    output: { bands: 3 }
  };
}

function evaluatePixel(sample) {
  if (sample.CLM === 1) {
    return [0, 0, 0]; // mask clouds
  }
  return [sample.B04, sample.B03, sample.B02];
}
"""


In [88]:
def download_image(
    lat,
    lon,
    image_id,
    out_dir="../data/images",
    *,
    half_size_deg=0.004,
    resolution=10,
    time_interval=("2019-01-01", "2021-12-31"),
):
    """Download a Sentinel-2 L2A tile centred on a point and save it as a PNG.

    The tile is requested from Sentinel Hub using the notebook-level
    ``evalscript`` and ``config``, contrast-stretched between its 2nd and 98th
    percentiles, and written to ``{out_dir}/{image_id}.png``.

    Parameters
    ----------
    lat, lon : float
        Centre of the tile in WGS84 degrees.
    image_id : int or str
        Identifier used as the output file name (without extension).
    out_dir : str
        Directory for the PNG; created if it does not exist.
    half_size_deg : float, keyword-only
        Half the side of the bounding box, in degrees. The default 0.004 is
        about 445 m north-south; the east-west extent shrinks with
        cos(latitude), so the box is not square on the ground.
    resolution : int or float, keyword-only
        Ground resolution in metres passed to ``bbox_to_dimensions``. With the
        defaults the tile is well under 100 pixels per side.
    time_interval : tuple of str, keyword-only
        (start, end) dates; the least cloudy acquisition in the interval is
        preferred (``mosaicking_order="leastCC"``).

    Returns
    -------
    None
        The image is written to disk as a side effect.

    Raises
    ------
    Whatever the Sentinel Hub request or the file write raises; nothing is
    caught here.
    """
    # Bounding box of +/- half_size_deg around the point, as [min_x, min_y, max_x, max_y]
    bbox = BBox(
        bbox=[
            lon - half_size_deg,
            lat - half_size_deg,
            lon + half_size_deg,
            lat + half_size_deg,
        ],
        crs=CRS.WGS84,
    )

    # Pixel dimensions that give the requested ground resolution over this box
    size = bbox_to_dimensions(bbox, resolution=resolution)

    request = SentinelHubRequest(
        evalscript=evalscript,
        input_data=[
            SentinelHubRequest.input_data(
                data_collection=DataCollection.SENTINEL2_L2A,
                time_interval=time_interval,
                mosaicking_order="leastCC",
            )
        ],
        responses=[
            SentinelHubRequest.output_response("default", MimeType.PNG)
        ],
        bbox=bbox,
        size=size,
        config=config,
    )

    # Download image (one response is requested, so take the first array)
    tile = request.get_data()[0]

    # Contrast normalization: stretch the 2nd..98th percentile range to 0..255.
    lo, hi = np.percentile(tile, (2, 98))
    if hi > lo:
        stretched = np.clip((tile - lo) / (hi - lo), 0, 1)
        tile = (stretched * 255).astype(np.uint8)
    else:
        # Flat tile (e.g. every pixel cloud-masked to 0): the stretch would divide
        # by zero and produce NaN/inf, so the values are kept as they are.
        tile = np.asarray(tile, dtype=np.uint8)

    # Save
    os.makedirs(out_dir, exist_ok=True)
    Image.fromarray(tile).save(f"{out_dir}/{image_id}.png")


# Smoke test: fetch the tile for the first record only before running the full download
row = df.iloc[0]

download_image(row["lat"], row["long"], row["id"])


In [89]:
# Download one tile per record in the table.
# - Resumable: ids whose PNG already exists are skipped, so re-running the cell
#   (or the notebook) only fetches what is still missing.
# - Fault tolerant: a failed request is recorded and the loop moves on, but several
#   failures in a row are treated as systemic (bad credentials, no network) and abort.
# - itertuples keeps each column's own dtype; iterrows builds one Series per row and
#   can upcast integer ids to float, which would change the output file names.
IMAGE_DIR = "../data/images"
MAX_CONSECUTIVE_FAILURES = 5

failed_ids = []
consecutive_failures = 0

for rec in df[["id", "lat", "long"]].itertuples(index=False):
    if os.path.exists(f"{IMAGE_DIR}/{rec.id}.png"):
        continue  # already downloaded on a previous run

    try:
        download_image(
            lat=rec.lat,
            lon=rec.long,
            image_id=rec.id,
            out_dir=IMAGE_DIR,
        )
    except Exception as exc:
        failed_ids.append(rec.id)
        consecutive_failures += 1
        print(f"id {rec.id}: download failed ({exc!r})")
        if consecutive_failures >= MAX_CONSECUTIVE_FAILURES:
            raise RuntimeError(
                f"{consecutive_failures} consecutive downloads failed; aborting"
            ) from exc
    else:
        consecutive_failures = 0

print(f"Finished; {len(failed_ids)} download(s) failed")


In [90]:
# Display the training table; the rendering is truncated by pandas (elided rows/columns show as "...")
df

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,9117000170,20150505T000000,268643,4,2.25,1810,9240,2.0,0,0,...,7,1810,0,1961,0,98055,47.4362,-122.187,1660,9240
1,6700390210,20140708T000000,245000,3,2.50,1600,2788,2.0,0,0,...,7,1600,0,1992,0,98031,47.4034,-122.187,1720,3605
2,7212660540,20150115T000000,200000,4,2.50,1720,8638,2.0,0,0,...,8,1720,0,1994,0,98003,47.2704,-122.313,1870,7455
3,8562780200,20150427T000000,352499,2,2.25,1240,705,2.0,0,0,...,7,1150,90,2009,0,98027,47.5321,-122.073,1240,750
4,7760400350,20141205T000000,232000,3,2.00,1280,13356,1.0,0,0,...,7,1280,0,1994,0,98042,47.3715,-122.074,1590,8071
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16204,5272200045,20141113T000000,378000,3,1.50,1000,6914,1.0,0,0,...,7,1000,0,1947,0,98125,47.7144,-122.319,1000,6947
16205,9578500790,20141111T000000,399950,3,2.50,3087,5002,2.0,0,0,...,8,3087,0,2014,0,98023,47.2974,-122.349,2927,5183
16206,7202350480,20140930T000000,575000,3,2.50,2120,4780,2.0,0,0,...,7,2120,0,2004,0,98053,47.6810,-122.032,1690,2650
16207,1723049033,20140620T000000,245000,1,0.75,380,15000,1.0,0,0,...,5,380,0,1963,0,98168,47.4810,-122.323,1170,15000
