In [1]:
import pandas as pd
import polars as pl
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats
from tqdm import tqdm
import pickle

from sklearn.linear_model import Ridge
directory = r"G:\data\ariel-data-challenge-2024\test"

In [3]:
#f_read_and_preprocess

def f_read_and_preprocess(dataset, adc_info, planet_ids):
    """Read the FGS1 files for all planet_ids and extract the time series.
    
    Parameters
    dataset: 'train' or 'test'
    adc_info: metadata dataframe, either train_adc_info or test_adc_info
    planet_ids: list of planet ids
    
    Returns
    dataframe with one row per planet_id and 67500 values per row
    
    """
    f_raw_train = np.full((len(planet_ids), 67500), np.nan, dtype=np.float32)
    for i, planet_id in tqdm(list(enumerate(planet_ids))):
        f_signal = pl.read_parquet(r'G:\data\ariel-data-challenge-2024/{dataset}/{planet_id}/FGS1_signal.parquet')
        mean_signal = f_signal.cast(pl.Int32).sum_horizontal().cast(pl.Float32).to_numpy() / 1024 # mean over the 32*32 pixels
        net_signal = mean_signal[1::2] - mean_signal[0::2]
        f_raw_train[i] = net_signal
    return f_raw_train

In [4]:
# a_read_and_preprocess
def a_read_and_preprocess(dataset, adc_info, planet_ids):
    """Read the AIRS-CH0 files for all planet_ids and extract the time series.
    
    Parameters
    dataset: 'train' or 'test'
    adc_info: metadata dataframe, either train_adc_info or test_adc_info
    planet_ids: list of planet ids
    
    Returns
    dataframe with one row per planet_id and 5625 values per row
    
    """
    a_raw_train = np.full((len(planet_ids), 5625), np.nan, dtype=np.float32)
    for i, planet_id in tqdm(list(enumerate(planet_ids))):
        signal = pl.read_parquet(r'G:\data\ariel-data-challenge-2024/{dataset}/{planet_id}/AIRS-CH0_signal.parquet')
        mean_signal = signal.cast(pl.Int32).sum_horizontal().cast(pl.Float32).to_numpy() / (32*356) # mean over the 32*356 pixels
        net_signal = mean_signal[1::2] - mean_signal[0::2]
        a_raw_train[i] = net_signal
    return a_raw_train

In [5]:
# feature_engineering
def feature_engineering(f_raw, a_raw):
    """Create a dataframe with two features from the raw data.
    
    Parameters:
    f_raw: ndarray of shape (n_planets, 67500)
    a_raw: ndarray of shape (n_planets, 5625)
    
    Return value:
    df: DataFrame of shape (n_planets, 2)
    """
    obscured = f_raw[:, 23500:44000].mean(axis=1)
    unobscured = (f_raw[:, :20500].mean(axis=1) + f_raw[:, 47000:].mean(axis=1)) / 2
    f_relative_reduction = (unobscured - obscured) / unobscured
    
    half_obscured1 = f_raw[:, 20500:23500].mean(axis=1)
    half_obscured2 = f_raw[:, 44000:47000].mean(axis=1)
    f_half_reduction1 = (unobscured - half_obscured1) / unobscured
    f_half_reduction2 = (unobscured - half_obscured2) / unobscured
    
    obscured = a_raw[:, 1958:3666].mean(axis=1)
    unobscured = (a_raw[:, :1708].mean(axis=1) + a_raw[:, 3916:].mean(axis=1)) / 2
    a_relative_reduction = (unobscured - obscured) / unobscured
    
    half_obscured1 = a_raw[:, 1708:1958].mean(axis=1)
    half_obscured2 = a_raw[:, 3666:3916].mean(axis=1)
    a_half_reduction1 = (unobscured - half_obscured1) / unobscured
    a_half_reduction2 = (unobscured - half_obscured2) / unobscured

    df = pd.DataFrame({'a_relative_reduction': a_relative_reduction,
                       'f_relative_reduction': f_relative_reduction,
                      'f_half_reduction1': f_half_reduction1,
                       'f_half_reduction2': f_half_reduction2,
                       'a_half_reduction1': a_half_reduction1,
                       'a_half_reduction2': a_half_reduction2
                      
                      })
    
    return df

In [6]:
# 'postprocessing
def postprocessing(pred_array, index, sigma_pred):
    """Create a submission dataframe from its components
    
    Parameters:
    pred_array: ndarray of shape (n_samples, 283)
    index: pandas.Index of length n_samples with name 'planet_id'
    sigma_pred: float
    
    Return value:
    df: DataFrame of shape (n_samples, 566) with planet_id as index
    """
    return pd.concat([pd.DataFrame(pred_array.clip(0, None), index=index, columns=wavelengths.columns),
                      pd.DataFrame(sigma_pred, index=index, columns=[f"sigma_{i}" for i in range(1, 284)])],
                     axis=1)

In [7]:
# Load the data
wavelengths = pd.read_csv(r'G:\data\ariel-data-challenge-2024/wavelengths.csv')
test_adc_info = pd.read_csv(r'G:\data\ariel-data-challenge-2024/test_adc_info.csv',
                           index_col='planet_id')
f_raw_test = f_read_and_preprocess('test', test_adc_info, test_adc_info.index)
a_raw_test = a_read_and_preprocess('test', test_adc_info, test_adc_info.index)
test = feature_engineering(f_raw_test, a_raw_test)

# Load the model
with open(directory + 'model.pickle', 'rb') as f:
    model = pickle.load(f)
with open(directory + 'sigma_pred.pickle', 'rb') as f:
    sigma_pred = pickle.load(f)
    
# Predict
test_pred = model.predict(test)

# Package into submission file
sub_df = postprocessing(test_pred,
                        test_adc_info.index,
                        sigma_pred=np.tile(np.where(test_adc_info[['star']] <= 1, sigma_pred, 0.001), (1, 283)))
display(sub_df)
sub_df.to_csv('submission.csv')
#!head submission.csv

  0%|                                                                                            | 0/1 [00:00<?, ?it/s]


FileNotFoundError: The system cannot find the path specified. (os error 3): G:\data\ariel-data-challenge-2024/{dataset}/{planet_id}/FGS1_signal.parquet