# Classification Extraction: Start, End, Full Lifecycle, Single Day

- **Onset**: The position (Carrington longitude/latitude) on the first day of appearance was visible on the solar disk the previous day but had no record.
- **Diss**: The position on the last day of appearance was visible on the solar disk the following day but had no record.
- **Dur**: Sunspots where both start and end were observed, meaning their entire lifespan was recorded.
- **Daily**: Full lifecycle sunspots with a lifespan of only one day.

In [1]:
# Mark Duplicates, Continuity, and Lifespan

import pandas as pd
import os

# Define data directory
data_dir = '../../data/interm'

# Read CSV file (output from 06)
input_file = os.path.join(data_dir, 'sg_base_data.csv')
df = pd.read_csv(input_file)

# Check and report duplicate records
# Note: 06 output uses 'date' and 'group_id'
date_col = 'date'
id_col = 'group_id'

duplicates = df[df.duplicated(subset=[date_col, id_col], keep=False)]
if not duplicates.empty:
    duplicate_count = len(duplicates)
    print(f"Warning: Found {duplicate_count} duplicate records (same {date_col} and {id_col})")
    duplicates.to_csv(os.path.join(data_dir, 'duplicate_records.csv'), index=False)

# Save original index
df['original_index'] = df.index
# Mark duplicate dates and IDs
df['is_duplicate'] = df.duplicated(subset=[date_col, id_col], keep=False).astype(int)

# Save original date string
df['original_date_format'] = df[date_col].copy()

# Convert date column to datetime type
df[date_col] = pd.to_datetime(df[date_col], format='mixed')
# Create a date-only column for calculations
df['date_only'] = df[date_col].dt.normalize()

# Sort by group_id and date
df = df.sort_values(by=[id_col, date_col])
# Calculate date difference within each group_id to check if it exceeds 2 Carrington cycles (28 days * 2)
df['date_diff'] = df.groupby(id_col)['date_only'].diff().gt(pd.Timedelta(days=56))
# Apply cumulative sum to date differences within each group_id to form continuity markers
df['temp_continuous'] = df.groupby(id_col)['date_diff'].cumsum()
# Calculate number of rows for each group_id
counts = df.groupby(id_col)['temp_continuous'].transform('size')
# If a group_id has only one row, mark as 101, otherwise use "temp_continuous"
df['continuity_id'] = df['temp_continuous']
df.loc[counts == 1, 'continuity_id'] = 101

# Calculate lifespan based on dates (ignoring time), add 1 because both start and end dates are inclusive
df['lifespan'] = df.groupby([id_col, 'continuity_id'])['date_only'].transform(lambda x: (x.max() - x.min()).days + 1)

# Re-sort by original index
df.sort_values('original_index', inplace=True)

# Restore original date string format
df[date_col] = df['original_date_format']

# Drop temporary columns
df.drop(columns=['temp_continuous', 'date_diff', 'original_index', 'original_date_format', 'date_only'], inplace=True)

# Save the modified DataFrame to a new CSV file
output_file = os.path.join(data_dir, 'sg_base_data_marked.csv')
df.to_csv(output_file, index=False)

print('Task completed')

Task completed


In [2]:
# Calculate Pre-visibility and Post-visibility

import pandas as pd
import numpy as np
import astropy.units as u
from sunpy.coordinates import frames
from astropy.coordinates import SkyCoord
from astropy.time import Time
import warnings
from erfa import ErfaWarning
from tqdm import tqdm
import os

# Ignore specific types of warnings
warnings.filterwarnings('ignore', category=ErfaWarning)

def is_sun_disk(date, L, B):
    """Check if a Carrington coordinate (L, B) is visible on the solar disk from Earth at a given date"""
    # Create time object
    time = Time(date, scale='utc')
    # Create Carrington coordinate object
    hgc_coord = SkyCoord(L * u.deg, B * u.deg, radius=1 * u.solRad, obstime=time, observer='earth', frame=frames.HeliographicCarrington)
    # Transform to Helioprojective coordinates
    hpc_coord = hgc_coord.transform_to(frames.Helioprojective)
    return hpc_coord.is_visible()

# Define data directory
data_dir = '../../data/interm'

# Read CSV file
input_file = os.path.join(data_dir, 'sg_base_data_marked.csv')
df = pd.read_csv(input_file)

# Define column names
date_col = 'date'
id_col = 'group_id'
lon_col = 'hcc_lon' if 'hcc_lon' in df.columns else 'hcc_lon_calc'
lat_col = 'hg_lat'

# Save original date string
original_date = df[date_col].copy()
# Convert date column format for calculation
df[date_col] = pd.to_datetime(df[date_col], format='mixed')

# Initialize 'pre_visible' and 'post_visible' columns to 0
df['pre_visible'] = 0
df['post_visible'] = 0

# Process by group_id and continuity_id
print("Processing all sunspot groups...")
for (number, continuous), group in tqdm(df.groupby([id_col, 'continuity_id'])):
    # Get all rows for the minimum date
    min_date = group[date_col].min()
    min_indices = group[group[date_col] == min_date].index
    
    # Calculate pre-visibility for all rows of the minimum date
    for min_idx in min_indices:
        try:
            result = is_sun_disk(
                (group.loc[min_idx, date_col] - pd.DateOffset(days=1)).replace(hour=0, minute=0, second=0),
                group.loc[min_idx, lon_col], group.loc[min_idx, lat_col]
            )
            df.loc[min_idx, 'pre_visible'] = 1 if result else -1
        except Exception as e:
            print(f"Error processing pre-visibility for ID {number} Continuity {continuous} Date {group.loc[min_idx, date_col]}: {e}")
            df.loc[min_idx, 'pre_visible'] = 99

    # Get all rows for the maximum date
    max_date = group[date_col].max()
    max_indices = group[group[date_col] == max_date].index
    
    # Calculate post-visibility for all rows of the maximum date
    for max_idx in max_indices:
        try:
            result = is_sun_disk(
                (group.loc[max_idx, date_col] + pd.DateOffset(days=2)).replace(hour=0, minute=0, second=0),
                group.loc[max_idx, lon_col], group.loc[max_idx, lat_col]
            )
            df.loc[max_idx, 'post_visible'] = 1 if result else -1
        except Exception as e:
            print(f"Error processing post-visibility for ID {number} Continuity {continuous} Date {group.loc[max_idx, date_col]}: {e}")
            df.loc[max_idx, 'post_visible'] = 99

# Restore original date string
df[date_col] = original_date

# Save to CSV
output_file = os.path.join(data_dir, 'sg_1874_2025_all_raw.csv')
df.to_csv(output_file, index=False)
print('Task completed')

Processing all sunspot groups...


100%|█████████████████████████████████████| 42941/42941 [13:50<00:00, 51.70it/s]


Task completed


In [3]:
# Extraction: Onset, Diss, Dur, Daily
# Discard: Earliest ID of onset, latest ID of diss, and the year 1982 when ID rules changed. These are handled separately.

import pandas as pd
import warnings
from erfa import ErfaWarning
import os

# Ignore specific types of warnings
warnings.filterwarnings('ignore', category=ErfaWarning)

# Define data directory
data_dir = '../../data/interm'

# Configuration parameters
INPUT_FILE = os.path.join(data_dir, 'sg_1874_2025_all_raw.csv')
OUTPUT_FILES = {
    'onset': os.path.join(data_dir, 'sg_1874_2025_onset_raw.csv'),
    'diss': os.path.join(data_dir, 'sg_1874_2025_diss_raw.csv'),
    'dur': os.path.join(data_dir, 'sg_1874_2025_dur_raw.csv'),
    'daily': os.path.join(data_dir, 'sg_1874_2025_daily_raw.csv')
}

try:
    # Read INPUT_FILE
    print(f"Reading data file: {INPUT_FILE}")
    df = pd.read_csv(INPUT_FILE)
    print(f"Successfully read data, total {len(df)} rows")
    
    # Define column names
    id_col = 'group_id'
    
    # Check if necessary columns exist
    required_columns = ['pre_visible', 'post_visible', id_col, 'continuity_id', 'lifespan']
    if not all(col in df.columns for col in required_columns):
        missing = [col for col in required_columns if col not in df.columns]
        raise ValueError(f"Missing required columns in data file: {missing}")

    # Process onset data
    print("Processing onset data...")
    onset_data = df[df['pre_visible'] == 1].copy()
    onset_data.to_csv(OUTPUT_FILES['onset'], index=False)
    print(f"Onset data processing complete, total {len(onset_data)} rows")

    # Process diss data
    print("Processing diss data...")
    diss_data = df[df['post_visible'] == 1].copy()
    diss_data.to_csv(OUTPUT_FILES['diss'], index=False)
    print(f"Diss data processing complete, total {len(diss_data)} rows")

    # Process dur data: find sunspot groups with both pre_visible and post_visible markers
    print("Processing dur data...")
    # Get pre_visible and post_visible status for each sunspot group
    group_status = df.groupby([id_col, 'continuity_id']).agg({
        'pre_visible': 'max',  # 1 if any row in group has 1
        'post_visible': 'max'   # 1 if any row in group has 1
    }).reset_index()

    # Find groups with both pre_visible and post_visible
    valid_groups = group_status.query('pre_visible == 1 and post_visible == 1')
    
    # Efficiently get all data for these groups
    mask = df.set_index([id_col, 'continuity_id']).index.isin(
        valid_groups.set_index([id_col, 'continuity_id']).index)
    dur_data = df[mask].copy()

    dur_data.to_csv(OUTPUT_FILES['dur'], index=False)
    print(f"Dur data processing complete, contains {len(valid_groups)} sunspot groups, total {len(dur_data)} rows")

    # Process daily data
    print("Processing daily data...")
    daily_data = dur_data[dur_data['lifespan'] == 1].copy()
    daily_data.to_csv(OUTPUT_FILES['daily'], index=False)
    print(f"Daily data processing complete, total {len(daily_data)} rows")

    print("All data processing complete!")

except FileNotFoundError:
    print(f"Error: File {INPUT_FILE} not found, please check path and filename")
except Exception as e:
    print(f"Error occurred during processing: {str(e)}")

Reading data file: ../../data/interm/sg_1874_2025_all_raw.csv
Successfully read data, total 256861 rows
Processing onset data...
Onset data processing complete, total 33295 rows
Processing diss data...
Diss data processing complete, total 27888 rows
Processing dur data...
Dur data processing complete, contains 22976 sunspot groups, total 75679 rows
Processing daily data...
Daily data processing complete, total 8305 rows
All data processing complete!


## Remove Sunspot Groups Mixed into Onset or Diss Boundaries

- **Daily**: Remove duplicate rows with -1.
- **Database Start**: 1874-05-09, 1874-05-10 (4 groups)
- **Database End**: 2025-04-02 (7 groups)
- **1977-01-01**: 1 group before and 1 after
- **1982-01-01**: 9 groups before, 12 after

In [4]:
import pandas as pd
import os

# Define data directory
data_dir = '../../data/interm'

# Mapping of stages to their English names
stages = ['onset', 'diss', 'daily']

for stage in stages:
    print(f"\nProcessing stage: {stage}")
    try:
        # Read boundary processing data
        excel_path = os.path.join(data_dir, 'sg_boundary_processing.xlsx')
        df_boundary = pd.read_excel(excel_path, sheet_name=stage)
        
        # Read raw lifecycle data
        input_csv = os.path.join(data_dir, f'sg_1874_2025_{stage}_raw.csv')
        df_raw = pd.read_csv(input_csv)
        
        # Define column names
        date_col = 'date'
        id_col = 'group_id'
        
        # Check if key columns exist
        if date_col not in df_boundary.columns or id_col not in df_boundary.columns:
            raise ValueError(f"Sheet '{stage}' is missing '{date_col}' or '{id_col}' column")
        
        # Standardize date format (convert to datetime)
        df_boundary[date_col] = pd.to_datetime(df_boundary[date_col], format='mixed', errors='coerce')
        df_raw[date_col] = pd.to_datetime(df_raw[date_col], format='mixed', errors='coerce')
        
        # Check for invalid dates
        if df_boundary[date_col].isna().any() or df_raw[date_col].isna().any():
            print(f"Warning: Malformed dates found in {stage}, please check source data!")
        
        # Create merge keys (standardized as strings for comparison)
        df_boundary['merge_key'] = df_boundary[date_col].dt.strftime('%Y%m%d') + '_' + df_boundary[id_col].astype(str)
        df_raw['merge_key'] = df_raw[date_col].dt.strftime('%Y%m%d') + '_' + df_raw[id_col].astype(str)
        
        # Filter and save
        df_filtered = df_raw[~df_raw['merge_key'].isin(df_boundary['merge_key'])].drop(columns=['merge_key'])
        output_csv = os.path.join(data_dir, f'sg_1874_2025_{stage}.csv')
        df_filtered.to_csv(output_csv, index=False)
        
        print(f"Successfully processed: deleted {len(df_boundary)} records, {len(df_filtered)} records remaining")
        
    except Exception as e:
        print(f"Processing failed for {stage}: {str(e)}")


Processing stage: onset
Successfully processed: deleted 16 records, 33279 records remaining

Processing stage: diss
Successfully processed: deleted 18 records, 27870 records remaining

Processing stage: daily
Successfully processed: deleted 3 records, 8301 records remaining


In [5]:
# Date Formatting and Cleanup

import pandas as pd
import os

# Define directories
data_dir = '../../data/interm'
final_dir = '../../data/ready'

# Ensure final directory exists
os.makedirs(final_dir, exist_ok=True)

# Stages to format and move to final
stages = ['onset', 'diss', 'dur', 'daily', 'all']

for stage in stages:
    # Determine input filename
    if stage in ['dur', 'all']:
        input_filename = f'sg_1874_2025_{stage}_raw.csv'
    else:
        input_filename = f'sg_1874_2025_{stage}.csv'
        
    input_path = os.path.join(data_dir, input_filename)
    
    if os.path.exists(input_path):
        # Read CSV file
        df = pd.read_csv(input_path)
        
        # 1. Date formatting
        date_col = 'date'
        df[date_col] = pd.to_datetime(df[date_col], format='mixed').dt.strftime('%Y-%m-%d %H:%M:%S')
        
        # 2. Round numeric columns (angles, etc.) to 4 decimal places
        float_cols = df.select_dtypes(include=['float']).columns
        df[float_cols] = df[float_cols].round(4)
        
        # 3. Drop internal marker columns after hme_lat
        cols_to_drop = ['is_duplicate', 'continuity_id', 'lifespan', 'pre_visible', 'post_visible']
        df.drop(columns=[c for c in cols_to_drop if c in df.columns], inplace=True, errors='ignore')
        
        # Save to final directory with clean name
        output_filename = f'sg_1874_2025_{stage}.csv'
        output_path = os.path.join(final_dir, output_filename)
        
        df.to_csv(output_path, index=False)
        print(f'{stage} formatting and cleanup complete, saved to {output_path}')
    else:
        print(f'Input file not found: {input_path}')

onset formatting and cleanup complete, saved to ../../data/ready/sg_1874_2025_onset.csv
diss formatting and cleanup complete, saved to ../../data/ready/sg_1874_2025_diss.csv
dur formatting and cleanup complete, saved to ../../data/ready/sg_1874_2025_dur.csv
daily formatting and cleanup complete, saved to ../../data/ready/sg_1874_2025_daily.csv
all formatting and cleanup complete, saved to ../../data/ready/sg_1874_2025_all.csv
