In [13]:
from pathlib import Path
import zipfile
import pandas as pd
import yaml
import csv
import os
import xarray as xr
import rioxarray as rxr
import subprocess

In [4]:
with open('config_chelsea.yaml', 'r') as f:
    config = yaml.safe_load(f)
input_path = config['input_path']
output_dir = config['output_dir']
target_period = config['target_period']
parquet_path=config["final_dataframe_path"]
tsv_path=config["final_dataframe_path_tsv"]

In [6]:
os.makedirs(output_dir, exist_ok=True)

In [16]:
command = [
    "wget.exe", 
    "-i", str(input_path), 
    "-P", str(output_dir), 
    "-nc"
]

In [17]:
try:
    subprocess.run(command, check=True)
    print("Download complete.")

except FileNotFoundError:
    print("ERROR: 'wget' is not recognized.")
    print("Make sure you have installed wget for Windows and added it to your System PATH.")
    
except subprocess.CalledProcessError as e:
    print(f"Wget stopped with an error (Code: {e.returncode}).")

Download complete.


In [7]:
!wget -i {input_path} -P {output_dir}

'wget' is not recognized as an internal or external command,
operable program or batch file.


In [19]:
with open('config_chelsea.yaml', 'r') as f:
    config = yaml.safe_load(f)

patterns = {}

switches = config['environmental_variables']['include_variables']
names = config['environmental_variables']['variable_names']

for code, is_active in switches.items():
    if is_active:
        nice_name = names.get(code, code)
        patterns[code] = nice_name

        if "bio" in code:
            alt_code = code.replace("bio", "bio10_")
            patterns[alt_code] = nice_name

            number = code.replace("bio", "")
            if len(number) == 1:
                patterns[f"bio0{number}"] = nice_name
                patterns[f"bio10_0{number}"] = nice_name
                

In [20]:
output_dir = Path(config['output_dir'])
datasets = []
for file_path in output_dir.glob("*.tif"):
    filename=file_path.name

    if target_period not in filename:
        continue
    found_var_name = None
    for code, nice_name in patterns.items():
        if code in filename:
            found_var_name = nice_name
            break
   
    if found_var_name:
        try:
            da=rxr.open_rasterio(file_path, masked=True)
            da = da.squeeze().drop_vars("band")
            da.name = found_var_name
            datasets.append(da)
        except Exception as e:
            print(f"   ❌ Error loading {filename}: {e}")


In [21]:
if datasets:
    env_stack = xr.merge(datasets)

  env_stack = xr.merge(datasets)


In [22]:
with open('config.yaml', 'r') as f:
    config = yaml.safe_load(f)
    data_path=config["output_dir"]
occ_df=pd.read_parquet(data_path)

In [23]:
target_lons = xr.DataArray(occ_df["Longitude"], dims="points")
target_lats = xr.DataArray(occ_df["Latitude"], dims="points")

In [24]:
sampled_data = env_stack.sel(
    x=target_lons, 
    y=target_lats, 
    method="nearest", 
    tolerance=0.1
)

In [25]:
env_df = sampled_data.to_dataframe().drop(columns=["spatial_ref", "x", "y"])

In [26]:
env_df

Unnamed: 0_level_0,max_temp,annual_precipitation
points,Unnamed: 1_level_1,Unnamed: 2_level_1
0,3082.0,2377.0
1,3081.0,1925.0
2,3108.0,1454.0
3,3120.0,1472.0
4,3143.0,827.0
...,...,...
318,3039.0,2757.0
319,3139.0,1455.0
320,3013.0,1012.0
321,3013.0,1012.0


In [27]:
env_df = env_df.reset_index(drop=True)
occ_df = occ_df.reset_index(drop=True)
env_df.columns = env_df.columns.str.strip()

In [28]:
final_df = pd.concat([occ_df, env_df], axis=1)
final_df.columns = final_df.columns.str.strip()

In [29]:
final_df

Unnamed: 0,Scientific Name,Kingdom,Class,Latitude,Longitude,Coordinate Uncertainty (m),Country Code,Year,Month,Basis of Record,Occurrence Status,Issue,max_temp,annual_precipitation
0,"Panthera tigris tigris (Linnaeus, 1758)",Animalia,Mammalia,27.546213,84.256294,250.0,NP,2025,12.0,HUMAN_OBSERVATION,PRESENT,COORDINATE_ROUNDED;GEODETIC_DATUM_ASSUMED_WGS8...,3082.0,2377.0
1,"Panthera tigris (Linnaeus, 1758)",Animalia,Mammalia,27.215220,84.985470,30.0,NP,2021,1.0,HUMAN_OBSERVATION,PRESENT,,3081.0,1925.0
2,"Panthera tigris (Linnaeus, 1758)",Animalia,Mammalia,29.843404,78.189916,30.0,IN,2016,6.0,HUMAN_OBSERVATION,PRESENT,,3108.0,1454.0
3,"Panthera tigris (Linnaeus, 1758)",Animalia,Mammalia,23.663864,81.013351,250.0,IN,2009,2.0,HUMAN_OBSERVATION,PRESENT,COORDINATE_ROUNDED;GEODETIC_DATUM_ASSUMED_WGS8...,3120.0,1472.0
4,"Panthera tigris (Linnaeus, 1758)",Animalia,Mammalia,25.996317,76.492310,250.0,IN,2007,2.0,HUMAN_OBSERVATION,PRESENT,COORDINATE_ROUNDED;GEODETIC_DATUM_ASSUMED_WGS8...,3143.0,827.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
318,"Panthera tigris (Linnaeus, 1758)",Animalia,Mammalia,26.659422,91.001129,,IN,2016,12.0,HUMAN_OBSERVATION,PRESENT,COORDINATE_ROUNDED;GEODETIC_DATUM_ASSUMED_WGS8...,3039.0,2757.0
319,"Panthera tigris (Linnaeus, 1758)",Animalia,Mammalia,20.248467,79.425378,,IN,2014,5.0,HUMAN_OBSERVATION,PRESENT,COORDINATE_ROUNDED;GEODETIC_DATUM_ASSUMED_WGS8...,3139.0,1455.0
320,"Panthera tigris tigris (Linnaeus, 1758)",Animalia,Mammalia,40.166700,-83.083300,,US,1996,9.0,HUMAN_OBSERVATION,PRESENT,GEODETIC_DATUM_ASSUMED_WGS84;CONTINENT_DERIVED...,3013.0,1012.0
321,"Panthera tigris sumatrae Pocock, 1929",Animalia,Mammalia,40.166700,-83.083300,,US,1995,2.0,HUMAN_OBSERVATION,PRESENT,GEODETIC_DATUM_ASSUMED_WGS84;CONTINENT_DERIVED...,3013.0,1012.0


In [30]:
final_df["max_temp"] = (final_df["max_temp"] * 0.1) - 273.15
final_df["max_temp"] = final_df["max_temp"].round(2)
final_df['annual_precipitation'] = final_df['annual_precipitation'] * 0.1
final_df["annual_precipitation"] = final_df["annual_precipitation"].round(1)

In [31]:
final_df.to_parquet(parquet_path)
final_df.to_csv(tsv_path, sep='\t')

In [32]:
final_df

Unnamed: 0,Scientific Name,Kingdom,Class,Latitude,Longitude,Coordinate Uncertainty (m),Country Code,Year,Month,Basis of Record,Occurrence Status,Issue,max_temp,annual_precipitation
0,"Panthera tigris tigris (Linnaeus, 1758)",Animalia,Mammalia,27.546213,84.256294,250.0,NP,2025,12.0,HUMAN_OBSERVATION,PRESENT,COORDINATE_ROUNDED;GEODETIC_DATUM_ASSUMED_WGS8...,35.049999,237.699997
1,"Panthera tigris (Linnaeus, 1758)",Animalia,Mammalia,27.215220,84.985470,30.0,NP,2021,1.0,HUMAN_OBSERVATION,PRESENT,,34.950001,192.500000
2,"Panthera tigris (Linnaeus, 1758)",Animalia,Mammalia,29.843404,78.189916,30.0,IN,2016,6.0,HUMAN_OBSERVATION,PRESENT,,37.650002,145.399994
3,"Panthera tigris (Linnaeus, 1758)",Animalia,Mammalia,23.663864,81.013351,250.0,IN,2009,2.0,HUMAN_OBSERVATION,PRESENT,COORDINATE_ROUNDED;GEODETIC_DATUM_ASSUMED_WGS8...,38.849998,147.199997
4,"Panthera tigris (Linnaeus, 1758)",Animalia,Mammalia,25.996317,76.492310,250.0,IN,2007,2.0,HUMAN_OBSERVATION,PRESENT,COORDINATE_ROUNDED;GEODETIC_DATUM_ASSUMED_WGS8...,41.150002,82.699997
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
318,"Panthera tigris (Linnaeus, 1758)",Animalia,Mammalia,26.659422,91.001129,,IN,2016,12.0,HUMAN_OBSERVATION,PRESENT,COORDINATE_ROUNDED;GEODETIC_DATUM_ASSUMED_WGS8...,30.750000,275.700012
319,"Panthera tigris (Linnaeus, 1758)",Animalia,Mammalia,20.248467,79.425378,,IN,2014,5.0,HUMAN_OBSERVATION,PRESENT,COORDINATE_ROUNDED;GEODETIC_DATUM_ASSUMED_WGS8...,40.750000,145.500000
320,"Panthera tigris tigris (Linnaeus, 1758)",Animalia,Mammalia,40.166700,-83.083300,,US,1996,9.0,HUMAN_OBSERVATION,PRESENT,GEODETIC_DATUM_ASSUMED_WGS84;CONTINENT_DERIVED...,28.150000,101.199997
321,"Panthera tigris sumatrae Pocock, 1929",Animalia,Mammalia,40.166700,-83.083300,,US,1995,2.0,HUMAN_OBSERVATION,PRESENT,GEODETIC_DATUM_ASSUMED_WGS84;CONTINENT_DERIVED...,28.150000,101.199997


In [33]:
n_records=len(final_df)
avg_temp=final_df["max_temp"].mean()
avg_precip=final_df["annual_precipitation"].mean()
year_min = final_df["Year"].min()
year_max = final_df["Year"].max()

In [34]:
with open("config_chelsea.yaml", "r") as f:
    config = yaml.safe_load(f)
    log_path = Path(config["log_dir"])
    with open(log_path, "w") as log:
        species_name = final_df["Scientific Name"].iloc[0]
        log.write(f"CHELSEA DATA INSPECTION LOG for {species_name}\n")
        log.write("=" * 35 + "\n\n")

        log.write(f"Total records: {n_records}\n")
        log.write(f"Average Max Temperature (°C): {avg_temp}\n")
        log.write(f"Average Annual Precipitation (mm): {avg_precip}\n")
        log.write(f"Year Range: {year_min} - {year_max}\n")