In [1]:
import h5py
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor

In [7]:
# Path to the challenge HDF5 file. It is relative, so it is resolved against
# the kernel's current working directory when the file is opened.
h5_file_path = "elucidata_ai_challenge_data.h5"

In [8]:
# Peek at the top-level entries of the HDF5 file (opened read-only).
with h5py.File(h5_file_path, "r") as h5f:
    top_level_keys = list(h5f.keys())
    print("Keys in the HDF5 file:", top_level_keys)

Keys in the HDF5 file: ['images', 'spots']


In [9]:
def explore_h5py_structure(h5obj, prefix=""):
    """Print every group and dataset reachable from ``h5obj``, depth first.

    Parameters
    ----------
    h5obj : h5py.File or h5py.Group
        Container to walk; it must provide ``keys()`` and item lookup by key.
    prefix : str, optional
        Path of ``h5obj`` inside the file. It is prepended (with a ``/``) to
        each member name; leave empty when starting at the root.

    Returns
    -------
    None
        The listing is written with ``print``. Groups are reported by path;
        datasets are reported with their path, shape and dtype. Members that
        are neither a group nor a dataset are skipped silently.
    """
    for name in h5obj.keys():
        node = h5obj[name]
        if prefix:
            full_path = f"{prefix}/{name}"
        else:
            full_path = name

        if isinstance(node, h5py.Group):
            print(f"Group: {full_path}")
            # Descend immediately so children are listed right under their parent.
            explore_h5py_structure(node, full_path)
            continue

        if isinstance(node, h5py.Dataset):
            print(f"Dataset: {full_path}, Shape: {node.shape}, Dtype: {node.dtype}")

# Print the full group/dataset hierarchy of the challenge file.
# The file is opened through h5_file_path, the notebook's path variable, so
# this cell runs on a fresh kernel without relying on any other name.
with h5py.File(h5_file_path, "r") as h5f:
    explore_h5py_structure(h5f)

Group: images
Group: images/Test
Dataset: images/Test/S_7, Shape: (1983, 2000, 3), Dtype: float32
Group: images/Train
Dataset: images/Train/S_1, Shape: (2000, 1974, 3), Dtype: float32
Dataset: images/Train/S_2, Shape: (2000, 1988, 3), Dtype: float32
Dataset: images/Train/S_3, Shape: (2000, 1966, 3), Dtype: float32
Dataset: images/Train/S_4, Shape: (2000, 1979, 3), Dtype: float32
Dataset: images/Train/S_5, Shape: (1985, 2000, 3), Dtype: float32
Dataset: images/Train/S_6, Shape: (2000, 1930, 3), Dtype: float32
Group: spots
Group: spots/Test
Dataset: spots/Test/S_7, Shape: (2088,), Dtype: [('x', '<i8'), ('y', '<i8'), ('Test_Set', '<i8')]
Group: spots/Train
Dataset: spots/Train/S_1, Shape: (2197,), Dtype: [('x', '<i8'), ('y', '<i8'), ('C1', '<f8'), ('C2', '<f8'), ('C3', '<f8'), ('C4', '<f8'), ('C5', '<f8'), ('C6', '<f8'), ('C7', '<f8'), ('C8', '<f8'), ('C9', '<f8'), ('C10', '<f8'), ('C11', '<f8'), ('C12', '<f8'), ('C13', '<f8'), ('C14', '<f8'), ('C15', '<f8'), ('C16', '<f8'), ('C17', '<f8'

In [11]:
def _take_rows(data, index):
    """Return the rows of ``data`` at the integer positions in ``index``."""
    # pandas objects need .iloc for positional selection: plain ``data[index]``
    # on a DataFrame treats the index array as column labels.
    if hasattr(data, "iloc"):
        return data.iloc[index]
    return data[index]


def train_multioutput_xgb(X, y, n_splits=5):
    """Fit one multi-output XGBoost model per K-fold split and print fold scores.

    Parameters
    ----------
    X : array-like, one row per sample
        Feature matrix. NumPy arrays and pandas DataFrames are both accepted;
        rows are always selected by position.
    y : array-like, one row per sample
        Target matrix, row-aligned with ``X``.
    n_splits : int, default=5
        Number of folds passed to ``KFold``.

    Returns
    -------
    list
        The fitted ``MultiOutputRegressor`` of every fold, in fold order.

    Notes
    -----
    The folds are shuffled with a fixed ``random_state`` of 42, so the split is
    reproducible. For every fold the validation MSE and R² are printed; both
    are computed with the scikit-learn defaults for multi-output targets.
    """
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    models = []

    # enumerate() numbers the folds from 1 for the progress line.
    for fold, (train_index, val_index) in enumerate(kf.split(X), start=1):
        X_train, X_val = _take_rows(X, train_index), _take_rows(X, val_index)
        y_train, y_val = _take_rows(y, train_index), _take_rows(y, val_index)

        # A fresh estimator per fold, so no state carries over between folds.
        model = MultiOutputRegressor(
            xgb.XGBRegressor(
                objective='reg:squarederror',
                n_estimators=1000,
                max_depth=6,
                learning_rate=0.1,
                subsample=0.8,
                colsample_bytree=0.8,
                random_state=42,
            )
        )
        model.fit(X_train, y_train)
        models.append(model)

        y_pred = model.predict(X_val)
        mse = mean_squared_error(y_val, y_pred)
        r2 = r2_score(y_val, y_pred)
        print(f"Fold {fold}: MSE={mse:.4f}, R²={r2:.4f}")

    return models

In [18]:
# Build one DataFrame per training slide from the records under "spots/Train".
with h5py.File(h5_file_path, "r") as f:
    train_spot_tables = {}
    train_spots = f["spots/Train"]

    for slide_name in train_spots.keys():
        # np.array() copies the dataset into memory, so each table remains
        # usable after the file handle is closed.
        train_spot_tables[slide_name] = pd.DataFrame(np.array(train_spots[slide_name]))

# Example: Display the spots table for slide 'S_1'
train_spot_tables['S_1']

Unnamed: 0,x,y,C1,C2,C3,C4,C5,C6,C7,C8,...,C26,C27,C28,C29,C30,C31,C32,C33,C34,C35
0,1554,1297,0.014401,0.057499,0.022033,0.001704,0.533992,1.511707,0.015313,0.020029,...,1.009572e-03,2.068237,0.121361,0.007344,0.000017,0.036891,0.035934,0.118937,0.001472,0.050057
1,462,1502,0.116196,0.197176,0.110600,0.042614,5.587681,0.006885,0.096346,0.001711,...,6.918171e-04,0.014442,0.000238,0.024071,0.000023,0.217589,0.100662,0.004027,0.004122,0.049491
2,1488,1548,0.133284,0.035880,0.061352,0.003073,1.104479,0.009174,0.009175,0.000114,...,9.577447e-05,0.149792,0.001401,0.000699,0.000009,0.024491,0.018810,0.004171,0.000425,0.015348
3,1725,1182,0.087715,0.235223,0.090382,0.013902,8.760482,0.140912,0.188859,0.010154,...,1.964150e-03,0.142549,0.002036,0.047165,0.000022,0.180372,0.202981,0.003709,0.001845,0.116022
4,581,1113,0.128468,0.066399,0.098982,0.047022,3.425771,0.001009,0.026881,0.000468,...,7.189078e-05,0.005920,0.000048,0.006359,0.000585,0.052661,0.032168,0.000107,0.000107,0.013103
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2192,1055,701,0.047236,0.112721,0.052490,0.002092,0.000913,0.393805,0.045220,0.013414,...,7.530558e-03,0.559967,0.154793,0.133587,0.000013,0.051041,0.113100,0.003147,0.004971,0.128265
2193,1225,862,0.070764,0.119310,0.193938,0.001239,0.003851,0.588956,0.037731,0.004072,...,3.219223e-02,0.667076,0.094247,0.136623,0.000009,0.050518,0.119685,0.027743,0.016413,0.287171
2194,765,1479,0.194491,0.048068,0.150061,0.002978,0.119206,0.000213,0.005332,0.000078,...,2.628421e-04,0.000273,0.000022,0.000027,0.000042,0.029386,0.019977,0.000039,0.000119,0.006778
2195,607,1525,0.002968,0.151899,0.015931,0.000071,1.354983,0.000152,0.019254,0.001267,...,1.377452e-03,0.003754,0.001420,0.088000,0.000536,0.073240,0.113631,0.009040,0.000090,0.025716


In [22]:
# Print each training slide's name followed by its image array.
with h5py.File(h5_file_path, "r") as f:
    train_image = f["images/Train"]
    # NOTE(review): this dict is created here but never filled in this cell.
    train_image_tables = {}

    for slide_name in train_image.keys():
        print(slide_name)
        # The whole image is read into memory before printing. The trailing
        # newlines reproduce the blank lines that separate the slides.
        print(np.array(train_image[slide_name]), end="\n\n\n")

S_1
[[[0.9647059  0.9607843  0.9882353 ]
  [0.9647059  0.9607843  0.9882353 ]
  [0.9647059  0.9607843  0.9882353 ]
  ...
  [0.9647059  0.9607843  0.9882353 ]
  [0.9647059  0.9607843  0.9882353 ]
  [0.9647059  0.9607843  0.9882353 ]]

 [[0.9647059  0.9607843  0.9882353 ]
  [0.9647059  0.9607843  0.9882353 ]
  [0.9647059  0.9607843  0.9882353 ]
  ...
  [0.9647059  0.9607843  0.9882353 ]
  [0.9647059  0.9607843  0.9882353 ]
  [0.9647059  0.9607843  0.9882353 ]]

 [[0.9647059  0.9607843  0.9882353 ]
  [0.9647059  0.9607843  0.9882353 ]
  [0.9647059  0.9607843  0.9882353 ]
  ...
  [0.9647059  0.9607843  0.9882353 ]
  [0.9647059  0.9607843  0.9882353 ]
  [0.9647059  0.9607843  0.9882353 ]]

 ...

 [[0.9607843  0.95686275 0.9843137 ]
  [0.9607843  0.95686275 0.9843137 ]
  [0.9607843  0.95686275 0.9843137 ]
  ...
  [0.9647059  0.9647059  0.98039216]
  [0.9647059  0.9647059  0.98039216]
  [0.9647059  0.9647059  0.98039216]]

 [[0.9607843  0.95686275 0.9843137 ]
  [0.9607843  0.95686275 0.984313