In [None]:
import numpy as np 
import pandas as pd
import polars as pl
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import scipy.stats
import random
from tqdm import tqdm
import pickle

from sklearn.model_selection import cross_val_predict
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score, mean_squared_error

## 任务分析：

**训练数据**：
* 行星数量：673颗行星。
* 预测label：需要预测的目标数量为283个(光谱分解)——多输出回归。

**测试数据**：
* 约800颗隐藏的行星用于测试。

In [None]:
# Root folder of the competition input; every metadata file below lives in it.
_input_dir = '/kaggle/input/ariel-data-challenge-2024'

# Tables indexed by planet_id: ADC info and the training labels.
train_adc_info = pd.read_csv(f'{_input_dir}/train_adc_info.csv', index_col='planet_id')
# test_adc_info = pd.read_csv('/kaggle/input/ariel-data-challenge-2024/test_adc_info.csv',index_col='planet_id')
train_labels = pd.read_csv(f'{_input_dir}/train_labels.csv', index_col='planet_id')

# Auxiliary tables (read here; not indexed by planet).
wavelengths = pd.read_csv(f'{_input_dir}/wavelengths.csv')
axis_info = pd.read_parquet(f'{_input_dir}/axis_info.parquet')

## FGS1数据观察（暂时不用校准文件）

**数据描述**

* 每个文件包含135,000行图像，图像以0.1秒的时间间隔拍摄。每行是一个32x32的单波长图像。

取100468857号行星的FGS1数据进行观察

In [None]:
# Collect every planet ID listed in the training ADC metadata.
planet_ids = train_adc_info.index.tolist()

# Draw 9 distinct planet IDs from a dedicated, seeded generator so that a
# "Restart & Run All" always inspects the same planets (the global `random`
# state is left untouched). The sample size is capped so the cell cannot
# raise ValueError when fewer than 9 planets are available.
_sample_rng = random.Random(42)
random_ids = _sample_rng.sample(planet_ids, min(9, len(planet_ids)))
print("从现有行星 ID 中随机生成的 9 个 ID：", random_ids)

# Load and print the FGS1 signal table of every sampled planet.
for planet_id in random_ids:
    try:
        f_signal = pd.read_parquet(f'/kaggle/input/ariel-data-challenge-2024/train/{planet_id}/FGS1_signal.parquet')
        print(f"行星 {planet_id} 的数据：")
        print(f_signal)
    except FileNotFoundError:
        # A missing file is reported and skipped instead of aborting the loop.
        print(f"行星 {planet_id} 的数据文件未找到！")

将每一行数据恢复成32*32的像素点观察

In [None]:
# For every sampled planet, draw its first two FGS1 frames side by side.
for planet_id in random_ids:
    try:
        # FGS1 signal table of this planet; each row is reshaped to 32x32 below.
        f_signal = pd.read_parquet(f'/kaggle/input/ariel-data-challenge-2024/train/{planet_id}/FGS1_signal.parquet')

        _, frame_axes = plt.subplots(1, 2, figsize=(12, 4))
        for frame_no, frame_ax in enumerate(frame_axes):
            # Row `frame_no` of the table, restored to a 32x32 pixel grid.
            image = f_signal.iloc[frame_no].values.reshape(32, 32)
            sns.heatmap(image, ax=frame_ax, vmin=0, vmax=52000)
            frame_ax.set_aspect('equal')
            frame_ax.set_title(f"Planet {planet_id} - Time {frame_no}")

        plt.suptitle(f'Comparison of FGS1 Data for Planet {planet_id}')
        plt.show()

    except FileNotFoundError:
        print(f"行星 {planet_id} 的数据文件未找到！")

In [None]:


# Reference planet that the following cells inspect in detail.
planet_id = 100468857
_fgs1_path = f'/kaggle/input/ariel-data-challenge-2024/train/{planet_id}/FGS1_signal.parquet'
f_signal = pd.read_parquet(_fgs1_path)
# Bare last expression: lets the notebook render the table.
f_signal

将每一行数据恢复成32\*32的像素点观察

In [None]:
# Compare the first two rows (time 0 and time 1) of the currently loaded
# FGS1 table, each restored to a 32x32 image and drawn with a shared colour scale.
_, pair_axes = plt.subplots(1, 2, figsize=(12, 4))
for row_no, pair_ax in enumerate(pair_axes):
    sns.heatmap(f_signal.iloc[row_no].values.reshape(32, 32), ax=pair_ax, vmin=0, vmax=52000)
    pair_ax.set_aspect('equal')
plt.suptitle('A pair of FGS1 images')
plt.show()

观察时序变化

In [None]:
# 定义滑动窗口平滑函数
def smooth_signal(signal, window=800):
    """
    Smooth a cumulative signal with a sliding-window difference.

    Element ``k`` of the result is ``(signal[k + window] - signal[k]) / window``.
    When ``signal`` is the running sum of a series, this is the mean of
    ``window`` consecutive samples of that series.

    Parameters:
    - signal: array-like, cumulative signal (sliced along its first axis).
    - window: int, sliding-window size; must be positive.

    Returns:
    - ndarray with ``max(len(signal) - window, 0)`` entries along the first axis.

    Raises:
    - ValueError: if ``window`` is not positive (it would otherwise divide by
      zero or silently slice from the wrong end of the array).
    """
    if window <= 0:
        raise ValueError(f"window must be a positive integer, got {window}")
    # Accept lists/tuples as well; plain Python lists do not support `-`.
    signal = np.asarray(signal)
    return (signal[window:] - signal[:-window]) / window

# 定义处理单个行星信号的函数
def process_planet_signal(planet_id):
    """
    Load one planet's FGS1 signal and reduce it to two 1-D time series.

    Parameters:
    - planet_id: int, planet ID (used to build the parquet path under train/).

    Returns:
    - net_signal: ndarray, per-frame mean of each odd-indexed row minus the
      per-frame mean of the preceding even-indexed row.
    - smoothed_signal: ndarray, `smooth_signal` applied (default window) to
      the cumulative sum of `net_signal`.
    """
    fgs1_table = pd.read_parquet(f'/kaggle/input/ariel-data-challenge-2024/train/{planet_id}/FGS1_signal.parquet')
    # One brightness value per frame: plain mean over all columns of the row.
    frame_means = fgs1_table.values.mean(axis=1)
    odd_rows, even_rows = frame_means[1::2], frame_means[0::2]
    net_signal = odd_rows - even_rows
    # smooth_signal expects a cumulative series, hence the cumsum.
    smoothed_signal = smooth_signal(net_signal.cumsum())
    return net_signal, smoothed_signal


# 创建子图，根据行星数量动态调整布局
num_planets = len(random_ids)
fig, axes = plt.subplots(num_planets, 2, figsize=(12, 4 * num_planets))
if num_planets == 1:
    axes = [axes]  # 保证 axes 是二维列表形式，适配后续循环

# 遍历每个随机行星 ID
for i, planet_id in enumerate(random_ids):
    ax_signal, ax_smooth = axes[i]

    # 处理每个行星的信号
    net_signal, smoothed_signal = process_planet_signal(planet_id)

    # 绘制原始信号
    ax_signal.set_title(f'FGS1: time series of planet {planet_id} (raw)')
    ax_signal.plot(net_signal, label='raw signal', alpha=0.7)
    ax_signal.legend()

    # 绘制平滑信号
    ax_smooth.set_title(f'FGS1: time series of planet {planet_id} (smoothed)')
    ax_smooth.plot(smoothed_signal, color='c', label='smoothened signal')
    ax_smooth.legend()
    ax_smooth.set_xlabel('time step')

    # 可选：标注特定的时间点（这里根据 net_signal 自动计算关键点）
    for time_step in [20500, 23500, 44000, 47000]:  # 示例特定时间点
        ax_smooth.axvline(time_step, color='gray', linestyle='--', alpha=0.7)

# 调整布局并显示图像
plt.tight_layout()
plt.show()


In [None]:
# Compare a planet with a clearly visible signal against one with a weak
# signal: raw series on the top row, smoothed series on the bottom row.
#
# The per-planet pipeline (frame mean -> odd-minus-even difference -> cumsum
# -> window-800 smoothing) is exactly what `process_planet_signal` does, so it
# is reused here. The previous version stored the smoothed array in a
# variable called `smooth_signal`, which overwrote the `smooth_signal`
# function and broke every later call to it.
_, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(12, 4))

# (planet ID, wording used in the title, raw-signal axis, smoothed-signal axis)
comparison_panels = [
    (100468857, 'strong', ax1, ax3),   # planet with an obvious change
    (4249337798, 'weak', ax2, ax4),    # planet with a much less obvious change
]

for planet_id, strength, ax_raw, ax_smoothed in comparison_panels:
    # Open question kept from the original notes: is a plain mean over all
    # pixels of a frame a reasonable brightness estimate? Consecutive frames
    # appear to jump, hence the odd-minus-even difference.
    net_signal, smoothed_signal = process_planet_signal(planet_id)

    ax_raw.set_title(f'FGS1: time series of planet with {strength} signal')
    ax_raw.plot(net_signal, label='raw signal')
    ax_raw.legend()

    ax_smoothed.plot(smoothed_signal, color='c', label='smoothened signal')
    ax_smoothed.legend()
    ax_smoothed.set_xlabel('time step')
    # Fixed reference time steps (hard-coded).
    for time_step in [20500, 23500, 44000, 47000]:
        ax_smoothed.axvline(time_step, color='gray')

# plt.suptitle('FGS1 time series', y=0.96)
plt.show()

## AIRS 数据观察（暂时未校准）

**数据描述**

* 每个文件包含11,250行图像，按固定时间间隔拍摄（下文按 11250/135000 的比例换算时间步，即FGS1的12个时间步对应AIRS的1个时间步，因此采样间隔并非0.1秒）。每行是一个展平的32 x 356图像；AIRS是光谱仪，图像的一个方向对应波长，而不是单波长图像。

还是来观察100468857

In [None]:
# Same reference planet as in the FGS1 section; load its AIRS-CH0 signal table.
planet_id = 100468857
_airs_path = f'/kaggle/input/ariel-data-challenge-2024/train/{planet_id}/AIRS-CH0_signal.parquet'
a_signal = pd.read_parquet(_airs_path)
# Bare last expression: lets the notebook render the table.
a_signal

将观察random_ids 的数据

In [None]:
# 遍历每个行星 ID
for planet_id in random_ids:
    try:
        # 加载 A 信号
        a_signal = pd.read_parquet(f'/kaggle/input/ariel-data-challenge-2024/train/{planet_id}/AIRS-CH0_signal.parquet')
        print(f"Planet ID: {planet_id}")
        print(a_signal)  # 输出数据内容或结构信息
    except Exception as e:
        print(f"Failed to process Planet ID {planet_id}: {e}")

In [None]:
# 取100468857号行星0时刻和1时刻的AIRS数据进行比较
_, (ax1, ax2) = plt.subplots(2, 1, figsize=(15, 10))

sns.heatmap(a_signal.iloc[0].values.reshape(32, 356), ax=ax1, vmin=0, vmax=52000)
ax1.set_title('AIRS Image at Time 0')
ax1.set_aspect('equal')
ax1.set_ylim(32, 0)
ax1.set_aspect('auto')

sns.heatmap(a_signal.iloc[1].values.reshape(32, 356), ax=ax2, vmin=0, vmax=52000)
ax2.set_title('AIRS Image at Time 1')
ax2.set_aspect('equal')
ax2.set_ylim(32, 0)
ax2.set_aspect('auto') 

plt.suptitle('A Pair of AIRS Images')

plt.show()

对random_ids进行操作

In [None]:
# 遍历每个行星 ID
for planet_id in random_ids:
    try:
        # 加载 A 信号数据
        a_signal = pd.read_parquet(f'/kaggle/input/ariel-data-challenge-2024/train/{planet_id}/AIRS-CH0_signal.parquet')

        # 创建热力图进行比较
        _, (ax1, ax2) = plt.subplots(2, 1, figsize=(15, 10))

        sns.heatmap(a_signal.iloc[0].values.reshape(32, 356), ax=ax1, vmin=0, vmax=52000)
        ax1.set_title(f'AIRS Image at Time 0 (Planet ID: {planet_id})')
        ax1.set_ylim(32, 0)
        ax1.set_aspect('auto')

        sns.heatmap(a_signal.iloc[1].values.reshape(32, 356), ax=ax2, vmin=0, vmax=52000)
        ax2.set_title(f'AIRS Image at Time 1 (Planet ID: {planet_id})')
        ax2.set_ylim(32, 0)
        ax2.set_aspect('auto')

        plt.suptitle(f'A Pair of AIRS Images for Planet ID: {planet_id}')
        plt.show()
        
    except Exception as e:
        print(f"Failed to process Planet ID {planet_id}: {e}")

In [None]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import InputLayer, Conv2D
from astropy.stats import sigma_clip
# 构建 CNN 模型，用于对死点进行插值
def build_cnn_model():
    """
    Build and compile a small fully-convolutional Keras model.

    The network takes single-channel images of arbitrary height and width and,
    because every convolution uses 'same' padding, returns a single-channel
    map of the same spatial size. It is compiled with the Adam optimizer and
    a mean-squared-error loss. (Intended for dead-pixel interpolation
    according to the original note; it is not trained or called in the cells
    visible here.)

    Returns:
    - model: compiled `Sequential` model.
    """
    conv_stack = [
        InputLayer(input_shape=(None, None, 1)),
        Conv2D(32, kernel_size=(3, 3), padding='same', activation='relu'),
        Conv2D(64, kernel_size=(3, 3), padding='same', activation='relu'),
        Conv2D(1, kernel_size=(3, 3), padding='same'),
    ]
    model = Sequential(conv_stack)
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model

def mask_hot_dead_single_frame(frame, dead, dark):
    """
    Mask hot and dead pixels of a single 2-D frame and interpolate them.

    Hot pixels are the outliers of the dark frame under iterative 5-sigma
    clipping; dead pixels come from the supplied mask. Both sets are blanked
    (NaN) and then filled with the mean of their valid left/right neighbours
    in the same row. A blanked pixel with no valid horizontal neighbour stays
    NaN.

    Fix over the previous version: the hot-pixel mask was computed but never
    applied (only `dead` was used), contrary to the function's stated purpose.

    Parameters:
    - frame: array-like, single frame, e.g. shape (32, 356).
    - dead: array-like (ndarray or DataFrame), dead-pixel mask, same shape as frame.
    - dark: array-like (ndarray or DataFrame), dark frame, same shape as frame.

    Returns:
    - ndarray of floats with the same shape as `frame`; the input is not modified.
    """
    # getmaskarray always yields a full boolean array, even if nothing was clipped.
    clipped_dark = sigma_clip(np.asarray(dark, dtype=float), sigma=5, maxiters=5)
    hot = np.ma.getmaskarray(clipped_dark)
    dead = np.asarray(dead, dtype=bool)

    # Union of hot and dead pixels.
    combined_mask = hot | dead

    # Work on a float copy so NaN can mark the pixels to be filled.
    filled_frame = np.array(frame, dtype=float)
    filled_frame[combined_mask] = np.nan

    n_cols = filled_frame.shape[1]
    # Row-major order: a pixel filled earlier can serve as the left neighbour
    # of the next blanked pixel in the same row.
    for i, j in np.argwhere(np.isnan(filled_frame)):
        neighbors = []
        if j - 1 >= 0:  # left neighbour exists
            neighbors.append(filled_frame[i, j - 1])
        if j + 1 < n_cols:  # right neighbour exists
            neighbors.append(filled_frame[i, j + 1])

        # Average only the neighbours that are themselves valid.
        neighbors = [val for val in neighbors if not np.isnan(val)]
        if neighbors:
            filled_frame[i, j] = sum(neighbors) / len(neighbors)

    return filled_frame

In [None]:
# Reload the FGS1 signal table for whatever `planet_id` currently holds.
# NOTE(review): `planet_id` is left over from an earlier cell/loop here — confirm
# which planet is intended.
f_signal = pd.read_parquet(f'/kaggle/input/ariel-data-challenge-2024/train/{planet_id}/FGS1_signal.parquet')

In [None]:
# AIRS-CH0 calibration tables of planet 100468857: the dark frame and the dead-pixel map.
_calibration_dir = '/kaggle/input/ariel-data-challenge-2024/train/100468857/AIRS-CH0_calibration'
dark = pd.read_parquet(_calibration_dir + '/dark.parquet')
dead = pd.read_parquet(_calibration_dir + '/dead.parquet')

In [None]:
# Display the dark-frame calibration table loaded above.
dark

In [None]:
# Display the dead-pixel calibration table loaded above.
dead

In [None]:
# Zoom into rows 14-16, columns 210-229 of the dead-pixel table.
dead.iloc[14:17,210:230]

In [None]:
# Frame 1 of planet 100468857 as a float 32x356 image.
# The signal is reloaded explicitly: `a_signal` was last assigned inside the
# loop over `random_ids`, so it would otherwise belong to a different planet
# than the `dark`/`dead` calibration tables (planet 100468857) it is about to
# be combined with.
a_signal = pd.read_parquet('/kaggle/input/ariel-data-challenge-2024/train/100468857/AIRS-CH0_signal.parquet')
p1 = a_signal.iloc[1].values.reshape(32, 356).astype(float)

In [None]:
from astropy.stats import sigma_clip
# Replace p1 with its masked-and-interpolated version (see mask_hot_dead_single_frame).
p1=mask_hot_dead_single_frame(p1, dead, dark)

对random_ids进行相同操作

In [None]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import InputLayer, Conv2D
from astropy.stats import sigma_clip
# 构建 CNN 模型，用于对死点进行插值
# `build_cnn_model` and `mask_hot_dead_single_frame` are defined once in an
# earlier cell; the verbatim copies that used to live here only shadowed those
# definitions and have been removed.

# For every sampled planet: load ITS OWN AIRS-CH0 signal and calibration
# tables, clean frames 0 and 1, and plot them.
# Fixes over the previous version:
#   * `a_signal` was never reloaded inside the loop, so every planet reused
#     the stale signal from an earlier cell;
#   * both panels drew the same frame while being titled "Time 0" / "Time 1";
#   * the large FGS1 table was loaded but never used.
for planet_id in random_ids:
    try:
        planet_dir = f'/kaggle/input/ariel-data-challenge-2024/train/{planet_id}'
        a_signal = pd.read_parquet(f'{planet_dir}/AIRS-CH0_signal.parquet')
        dark = pd.read_parquet(f'{planet_dir}/AIRS-CH0_calibration/dark.parquet')
        dead = pd.read_parquet(f'{planet_dir}/AIRS-CH0_calibration/dead.parquet')

        _, (ax1, ax2) = plt.subplots(2, 1, figsize=(15, 10))

        for frame_no, ax in enumerate((ax1, ax2)):
            frame = a_signal.iloc[frame_no].values.reshape(32, 356).astype(float)
            frame = mask_hot_dead_single_frame(frame, dead, dark)

            sns.heatmap(frame, ax=ax, vmin=0, vmax=52000)
            ax.set_title(f'AIRS Image at Time {frame_no} (Planet ID: {planet_id})')
            ax.set_ylim(32, 0)
            ax.set_aspect('auto')

        # Keep the cleaned frame 1 available under the name later cells use.
        p1 = frame

        plt.suptitle(f'A Pair of AIRS Images for Planet ID: {planet_id}')
        plt.show()
    except Exception as e:
        # Best effort: report the failure and continue with the next planet.
        print(f"Failed to process Planet ID {planet_id}: {e}")

In [None]:
# 取100468857号行星0时刻和1时刻的AIRS数据进行比较
# Before/after view of the bad-pixel clean-up for the most recently processed
# planet. The previous version drew the same array `p1` in both panels while
# titling them "Time 0" and "Time 1", so the figure carried no comparison.
# NOTE(review): assumes `p1` is the cleaned version of row 1 of the current
# `a_signal`, as produced by the preceding cell — confirm if cells are reordered.
_, (ax1, ax2) = plt.subplots(2, 1, figsize=(15, 10))

raw_frame = a_signal.iloc[1].values.reshape(32, 356)
panels = [
    (ax1, raw_frame, 'AIRS Image at Time 1 (raw)'),
    (ax2, p1.reshape(32, 356), 'AIRS Image at Time 1 (bad pixels interpolated)'),
]

for ax, image, title in panels:
    sns.heatmap(image, ax=ax, vmin=0, vmax=52000)
    ax.set_title(title)
    ax.set_ylim(32, 0)
    ax.set_aspect('auto')

plt.suptitle('A Pair of AIRS Images')

plt.show()

这个图里面似乎就有一个坏点。

生成点线图

但是缺数据：没有未被遮掩的数据（即没有行星遮挡时的基准亮度）。

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from astropy.stats import sigma_clip

# 构建 CNN 模型，用于对死点进行插值
# `build_cnn_model` and `mask_hot_dead_single_frame` are defined once in an
# earlier cell; the copies that used to be repeated here have been removed.

# Rough per-planet Rp/Rs estimate from two AIRS-CH0 frames.
# Fixes over the previous version:
#   * the guarded Rp/Rs value was unconditionally recomputed afterwards, which
#     undid the guard and took the square root of negative ratios;
#   * the large FGS1 table was loaded but never used.
for planet_id in random_ids:
    planet_dir = f'/kaggle/input/ariel-data-challenge-2024/train/{planet_id}'
    a_signal = pd.read_parquet(f'{planet_dir}/AIRS-CH0_signal.parquet')
    dark = pd.read_parquet(f'{planet_dir}/AIRS-CH0_calibration/dark.parquet')
    dead = pd.read_parquet(f'{planet_dir}/AIRS-CH0_calibration/dead.parquet')

    # Frame 1 as a float image, with bad pixels masked and interpolated.
    p1 = a_signal.iloc[1].values.reshape(32, 356).astype(float)
    p1 = mask_hot_dead_single_frame(p1, dead, dark)

    # Total brightness of the cleaned frame (NaN if any pixel stayed unfilled).
    frame_brightness = np.sum(p1)

    # Simplification from the original notes: frame 0 (uncleaned) stands in
    # for the baseline flux. The notes flag this as problematic; the print
    # below exposes both numbers. Cast to float so the subtraction never
    # happens in unsigned integer arithmetic.
    baseline_flux = np.sum(a_signal.iloc[0].values.reshape(32, 356).astype(float))

    print(f"frame_bringhtness = {frame_brightness}, baseline_flux = {baseline_flux}")

    # Brightness drop relative to the baseline frame.
    flux_change = baseline_flux - frame_brightness

    # Rp/Rs = sqrt(relative flux drop); only defined for a positive drop.
    if flux_change > 0 and baseline_flux > 0:
        Rp_Rs = np.sqrt(flux_change / baseline_flux)
    else:
        Rp_Rs = np.nan
        print(f"Invalid flux values: flux_change = {flux_change}, baseline_flux = {baseline_flux}")

    print(f"Planet ID: {planet_id}, Rp/Rs: {Rp_Rs}")
    
    

和上面fgs1的数据处理基本一致，观察时序图

In [None]:
# AIRS-CH0 time series of the same two planets used in the FGS1 section:
# raw difference signal on the top row, smoothed signal on the bottom row.
#
# The smoothed array is stored as `smoothed_signal`; the previous version
# assigned it to `smooth_signal`, overwriting the function of that name.
_, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(12, 4))

window = 80  # sliding-window length in (paired) AIRS time steps

# (planet ID, raw-signal axis, smoothed-signal axis)
for planet_id, ax_raw, ax_smoothed in [(100468857, ax1, ax3), (4249337798, ax2, ax4)]:
    a_signal = pd.read_parquet(f'/kaggle/input/ariel-data-challenge-2024/train/{planet_id}/AIRS-CH0_signal.parquet')

    # Per-frame mean -> odd-minus-even difference -> cumulative sum.
    mean_signal = a_signal.values.mean(axis=1)
    net_signal = mean_signal[1::2] - mean_signal[0::2]
    cum_signal = net_signal.cumsum()
    # Sliding-window mean computed from differences of the cumulative sum.
    smoothed_signal = (cum_signal[window:] - cum_signal[:-window]) / window

    ax_raw.set_title(f'AIRS-CH0 time series of planet {planet_id}')
    ax_raw.plot(net_signal, label='raw signal')
    ax_raw.legend()

    ax_smoothed.plot(smoothed_signal, color='c', label='smoothened signal')
    ax_smoothed.legend()
    ax_smoothed.set_xlabel('time step')
    # The FGS1 reference time steps, rescaled by the row-count ratio 11250/135000.
    for time_step in [20500, 23500, 44000, 47000]:
        ax_smoothed.axvline(time_step * 11250 // 135000, color='gray')

plt.show()

与上文一样，用random_ids来替换

In [None]:
# 创建子图，根据行星数量动态调整布局
# AIRS-CH0 time series for every sampled planet: one row of panels per planet,
# raw difference signal on the left and smoothed signal on the right.
#
# Fixes over the previous version: the loop computed the signals but never
# drew them on the freshly created axes, and the code after the loop plotted
# two hard-coded planets onto axes (`ax1`..`ax4`) of an older figure. The
# smoothed array also no longer overwrites the `smooth_signal` function.
num_planets = len(random_ids)
# squeeze=False keeps `axes` two-dimensional even for a single planet.
fig, axes = plt.subplots(num_planets, 2, figsize=(12, 4 * num_planets), squeeze=False)

window = 80  # sliding-window length in (paired) AIRS time steps

for i, planet_id in enumerate(random_ids):
    ax_signal, ax_smooth = axes[i]

    a_signal = pd.read_parquet(f'/kaggle/input/ariel-data-challenge-2024/train/{planet_id}/AIRS-CH0_signal.parquet')

    # Per-frame mean -> odd-minus-even difference -> cumulative sum.
    mean_signal = a_signal.values.mean(axis=1)
    net_signal = mean_signal[1::2] - mean_signal[0::2]
    cum_signal = net_signal.cumsum()
    # Sliding-window mean computed from differences of the cumulative sum.
    smoothed_signal = (cum_signal[window:] - cum_signal[:-window]) / window

    ax_signal.set_title(f'AIRS-CH0 time series of planet {planet_id}')
    ax_signal.plot(net_signal, label='raw signal')
    ax_signal.legend()

    ax_smooth.plot(smoothed_signal, color='c', label='smoothened signal')
    ax_smooth.legend()
    ax_smooth.set_xlabel('time step')
    # The FGS1 reference time steps, rescaled by the row-count ratio 11250/135000.
    for time_step in [20500, 23500, 44000, 47000]:
        ax_smooth.axvline(time_step * 11250 // 135000, color='gray')

plt.tight_layout()
plt.show()

为所有673个训练行星读取FGS1数据和AIRS-CH0数据。

由于数据集无法完全放入RAM，我们仅保留每个行星的两个**一维时间序列**。即：

* 从FGS1数据中提取的每个行星67500步的时间序列

* 从AIRS-CH0数据中提取的每个行星5625步的时间序列


In [None]:
import numpy as np
import polars as pl
from tqdm import tqdm

def f_read_and_preprocess(dataset, adc_info, planet_ids):
    """
    Read the FGS1 file of every planet and reduce it to one time series.

    Parameters:
    - dataset: 'train' or 'test' (sub-folder of the competition input).
    - adc_info: metadata frame (train_adc_info or test_adc_info); accepted
      for interface symmetry but not used in this function.
    - planet_ids: sequence of planet IDs.

    Returns:
    - float32 ndarray of shape (len(planet_ids), 67500), one row per planet.
    """
    n_planets = len(planet_ids)
    # Pre-filled with NaN so that a row left unwritten would be noticeable.
    f_raw_train = np.full((n_planets, 67500), np.nan, dtype=np.float32)
    for row, planet_id in tqdm(enumerate(planet_ids), total=n_planets):
        frames = pl.read_parquet(f'/kaggle/input/ariel-data-challenge-2024/{dataset}/{planet_id}/FGS1_signal.parquet')
        # Row-wise pixel sum in Int32, then divided by 1024 (= 32 * 32 pixels)
        # to obtain the per-frame mean.
        pixel_sums = frames.cast(pl.Int32).sum_horizontal().cast(pl.Float32).to_numpy()
        frame_means = pixel_sums / 1024
        # Odd-indexed frame minus the preceding even-indexed frame.
        f_raw_train[row] = frame_means[1::2] - frame_means[0::2]
    return f_raw_train

In [None]:
%%time
# Build the FGS1 time-series matrix for all planets listed in train_labels
# and persist it next to the notebook so it can be reused without re-reading the raw files.
f_raw_train = f_read_and_preprocess('train', train_adc_info, train_labels.index)
with open('f_raw_train.pickle', 'wb') as f:
    pickle.dump(f_raw_train, f)

In [None]:
import numpy as np
import polars as pl
from tqdm import tqdm

def a_read_and_preprocess(dataset, adc_info, planet_ids):
    """
    Read the AIRS-CH0 file of every planet and reduce it to one time series.

    Parameters:
    - dataset: 'train' or 'test' (sub-folder of the competition input).
    - adc_info: metadata frame (train_adc_info or test_adc_info); accepted
      for interface symmetry but not used in this function.
    - planet_ids: sequence of planet IDs.

    Returns:
    - float32 ndarray of shape (len(planet_ids), 5625), one row per planet.
    """
    n_planets = len(planet_ids)
    # Pre-filled with NaN so that a row left unwritten would be noticeable.
    a_raw_train = np.full((n_planets, 5625), np.nan, dtype=np.float32)
    for row, planet_id in tqdm(enumerate(planet_ids), total=n_planets):
        frames = pl.read_parquet(f'/kaggle/input/ariel-data-challenge-2024/{dataset}/{planet_id}/AIRS-CH0_signal.parquet')
        # Row-wise pixel sum in Int32, then divided by the 32*356 pixels of a
        # frame to obtain the per-frame mean.
        pixel_sums = frames.cast(pl.Int32).sum_horizontal().cast(pl.Float32).to_numpy()
        frame_means = pixel_sums / (32*356)
        # Odd-indexed frame minus the preceding even-indexed frame.
        a_raw_train[row] = frame_means[1::2] - frame_means[0::2]
    return a_raw_train

In [None]:
%%time
# Build the AIRS-CH0 time-series matrix for all planets listed in train_labels
# and persist it next to the notebook so it can be reused without re-reading the raw files.
a_raw_train = a_read_and_preprocess('train', train_adc_info, train_labels.index)
with open('a_raw_train.pickle', 'wb') as f:
    pickle.dump(a_raw_train, f)

全数据观察

In [None]:
# Sanity check: (number of planets, time steps per planet).
f_raw_train.shape

In [None]:
# Mean time series over all planets, one small figure per instrument.
# Each entry: (matrix, figure title, numerator, denominator); the reference
# time steps are drawn at `time_step * numerator // denominator`, i.e.
# unchanged for FGS1 and rescaled by 11250/135000 for AIRS-CH0.
overall_mean_panels = [
    (f_raw_train, 'FGS1: Overall mean', 1, 1),
    (a_raw_train, 'AIRS-CH0: Overall mean', 11250, 135000),
]

for series_matrix, panel_title, scale_num, scale_den in overall_mean_panels:
    plt.figure(figsize=(6, 2))
    plt.plot(series_matrix.mean(axis=0))
    for time_step in (20500, 23500, 44000, 47000):
        plt.axvline(time_step * scale_num // scale_den, color='gray')
    plt.xlabel('time step')
    plt.title(panel_title)
    plt.show()