In [1]:
# Cell 1: Sunspot Group Coordinate Self-consistency Check

import pandas as pd
import numpy as np
import astropy.units as u
from astropy.time import Time
from astropy.coordinates import SkyCoord
from sunpy.coordinates import frames
from sunpy.coordinates.sun import angular_radius as solar_semidiameter_angular_size
import warnings
from erfa import ErfaWarning
import os

# Ignore specific types of warnings
warnings.filterwarnings('ignore', category=ErfaWarning)

# Define data directory
data_dir = '../../data/interm'

# Read filtered source data
input_file = os.path.join(data_dir, 'sg_gsn_ar_source_filtered.csv')
df = pd.read_csv(input_file)

print(f"Performing self-consistency check on {len(df)} records...")

# 1. Vectorized Calculation of Coordinates from dist_c and pa
df['date_dt'] = pd.to_datetime(df['date'], format='mixed')
times = Time(df['date_dt'].tolist())
angular_radii = solar_semidiameter_angular_size(times).to(u.arcsec)

radius = df['dist_c'].values
pa_rad = np.radians(df['pa'].values)
x_arcsec = -1 * radius * angular_radii.value * np.sin(pa_rad)
y_arcsec = radius * angular_radii.value * np.cos(pa_rad)

hpc_coords = SkyCoord(x_arcsec * u.arcsec, y_arcsec * u.arcsec, 
                      frame=frames.Helioprojective, 
                      observer="earth", 
                      obstime=times)

# Transform to HGS and HGC
hgs_coords = hpc_coords.transform_to(frames.HeliographicStonyhurst)
hgc_coords = hpc_coords.transform_to(frames.HeliographicCarrington)

# 2. Compare with original columns
# Original columns: hg_lon, hg_lat, hcc_lon
# Threshold for inconsistency (degrees)
threshold = 0.1

lon_diff = np.abs(df['hg_lon'] - hgs_coords.lon.deg)
lat_diff = np.abs(df['hg_lat'] - hgs_coords.lat.deg)
hcc_diff = np.abs(df['hcc_lon'] - hgc_coords.lon.deg)

# Handle 360 degree wrap-around for longitudes
lon_diff = np.minimum(lon_diff, 360 - lon_diff)
hcc_diff = np.minimum(hcc_diff, 360 - hcc_diff)

inconsistent_mask = (lon_diff > threshold) | (lat_diff > threshold) | (hcc_diff > threshold)
inconsistent_count = inconsistent_mask.sum()
total_count = len(df)

print(f"\nConsistency Check Results (Threshold: {threshold} deg):")
print(f"Total Records: {total_count}")
print(f"Inconsistent Records: {inconsistent_count}")
print(f"Inconsistency Rate: {inconsistent_count/total_count:.2%}")

if inconsistent_count > 0:
    print("\nSample of inconsistent records (Original vs Calculated):")
    check_df = df[inconsistent_mask].copy()
    check_df['calc_hg_lon'] = hgs_coords.lon.deg[inconsistent_mask]
    check_df['calc_hg_lat'] = hgs_coords.lat.deg[inconsistent_mask]
    check_df['calc_hcc_lon'] = hgc_coords.lon.deg[inconsistent_mask]
    print(check_df[['date', 'group_id', 'hg_lon', 'calc_hg_lon', 'hcc_lon', 'calc_hcc_lon']].head())

print("\nValidation complete. Proceed to Cell 2 for final data generation.")

Performing self-consistency check on 256861 records...

Consistency Check Results (Threshold: 0.1 deg):
Total Records: 256861
Inconsistent Records: 236821
Inconsistency Rate: 92.20%

Sample of inconsistent records (Original vs Calculated):
                  date  group_id  hg_lon  calc_hg_lon  hcc_lon  calc_hcc_lon
0  1874-05-09 11:55:40      8600   -30.7   -30.548356    171.6    171.805220
1  1874-05-09 11:55:40      8700   -47.2   -46.983239    155.1    155.370337
2  1874-05-09 11:55:40      8800    46.8    46.564835    249.1    248.918411
3  1874-05-10 00:00:00      8600   -16.7   -16.602151    172.2    179.099845
4  1874-05-10 00:00:00      8500    59.7    59.449574    248.5    255.151570

Validation complete. Proceed to Cell 2 for final data generation.


In [2]:
# Cell 2: Final Base Data Generation (High Precision)

import pandas as pd
import numpy as np
import astropy.units as u
from astropy.time import Time
from astropy.coordinates import SkyCoord, HeliocentricMeanEcliptic
from sunpy.coordinates import frames
from sunpy.coordinates.sun import angular_radius as solar_semidiameter_angular_size
import warnings
from erfa import ErfaWarning
import os

# Ignore specific types of warnings
warnings.filterwarnings('ignore', category=ErfaWarning)

# Define data directory
data_dir = '../../data/interm'

# Read filtered source data
input_file = os.path.join(data_dir, 'sg_gsn_ar_source_filtered.csv')
df = pd.read_csv(input_file)

print(f"Generating high-precision base data for {len(df)} records...")

# 1. Vectorized Calculation
df['date_dt'] = pd.to_datetime(df['date'], format='mixed')
times = Time(df['date_dt'].tolist())
angular_radii = solar_semidiameter_angular_size(times).to(u.arcsec)

radius = df['dist_c'].values
pa_rad = np.radians(df['pa'].values)
x_arcsec = -1 * radius * angular_radii.value * np.sin(pa_rad)
y_arcsec = radius * angular_radii.value * np.cos(pa_rad)

hpc_coords = SkyCoord(x_arcsec * u.arcsec, y_arcsec * u.arcsec, 
                      frame=frames.Helioprojective, 
                      observer="earth", 
                      obstime=times)

# 2. Transform to all required frames
print("Transforming to HGS, HGC, and HME...")
hgs_coords = hpc_coords.transform_to(frames.HeliographicStonyhurst)
hgc_coords = hpc_coords.transform_to(frames.HeliographicCarrington)
hme_coords = hgs_coords.transform_to(HeliocentricMeanEcliptic)

# 3. Update DataFrame with calculated values
# We replace original coordinates with calculated ones for consistency across the project
df['hg_lon'] = hgs_coords.lon.deg
df['hg_lat'] = hgs_coords.lat.deg
df['hcc_lon'] = hgc_coords.lon.deg
df['hcc_lon_calc'] = hgc_coords.lon.deg # Keep for backward compatibility
df['hme_lon'] = hme_coords.lon.deg
df['hme_lat'] = hme_coords.lat.deg

# Drop temporary column
df.drop(columns=['date_dt'], inplace=True)

# 4. Save to base data file
output_file = os.path.join(data_dir, 'sg_base_data.csv')
df.to_csv(output_file, index=False)

print(f'Task completed. Saved to {output_file}')
print(df[['date', 'group_id', 'hg_lon', 'hg_lat', 'hcc_lon', 'hme_lon']].head())

Generating high-precision base data for 256861 records...
Transforming to HGS, HGC, and HME...
Task completed. Saved to ../../data/interm/sg_base_data.csv
                  date  group_id     hg_lon     hg_lat     hcc_lon     hme_lon
0  1874-05-09 11:55:40      8600 -30.548356   7.263551  171.805220  200.788266
1  1874-05-09 11:55:40      8700 -46.983239  -7.091279  155.370337  183.466083
2  1874-05-09 11:55:40      8800  46.564835  20.340915  248.918411  279.457868
3  1874-05-10 00:00:00      8600 -16.602151   7.078445  179.099845  215.385935
4  1874-05-10 00:00:00      8500  59.449574  -5.255530  255.151570  289.754839
