In [None]:
# Download Flare Data

import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin

# Directory listing that links to the GOES XRS flare report text files
url = 'https://www.ngdc.noaa.gov/stp/space-weather/solar-data/solar-features/solar-flares/x-rays/goes/xrs/'

# Fetch the index page. The timeout keeps the cell from hanging indefinitely,
# and raise_for_status() stops an HTTP error page from being parsed as if it
# were the listing (which would silently yield zero links).
response = requests.get(url, timeout=100)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')
links = soup.find_all('a')

# NOTE(review): files are written to the current working directory, whereas the
# extraction cell globs ../../data/00_raw/flares_1975-2017/goes-xrs-report_*.txt.
# Confirm whether the downloads are expected to be moved there by hand.
for link in links:
    href = link.get('href')
    # Only follow anchors that have a target and point at a .txt file
    if not href or not href.endswith('.txt'):
        continue

    # Resolve relative links against the listing URL
    href = urljoin(url, href)
    try:
        r = requests.get(href, timeout=100)
        if r.status_code != 200:
            print(f"Failed to download {href}: Status code {r.status_code}")
            continue

        # The last path segment of the URL becomes the local file name
        filename = urlparse(href).path.split('/')[-1]
        if not filename:
            print(f"Invalid filename for URL: {href}")
            continue

        with open(filename, 'wb') as f:
            f.write(r.content)
        print(f"Downloaded {filename}")
    except requests.RequestException as e:
        # Network-level failures are reported and the loop moves on to the next file
        print(f"Request failed for {href}: {e}")

In [1]:
# Extract Data Columns from Raw GOES Reports

import pandas as pd
import glob
import os

# Path to raw txt files
file_pattern = "../../data/00_raw/flares_1975-2017/goes-xrs-report_*.txt"
# glob returns files in arbitrary order; sort so the output row order is reproducible
files = sorted(glob.glob(file_pattern))
if not files:
    # Fail loudly instead of writing an empty CSV that breaks later cells obscurely
    raise FileNotFoundError(f"No GOES report files match {file_pattern}")

data = []

# Read and process each file
for file_name in files:
    with open(file_name, "r") as file:
        for line in file:
            # Skip short lines
            if len(line) < 70:
                continue

            try:
                # Parse each line of data (fixed-width format).
                # The leading fields must be integers; a ValueError here marks
                # the line as a non-data line and it is skipped below.
                data_code = int(line[0:2])
                station_code = int(line[2:5])
                year_short = int(line[5:7])
                month = int(line[7:9])
                day = int(line[9:11])

                # Expand the two-digit year: a leading 7, 8 or 9 means 19xx, anything else 20xx
                year_str = line[5:7]
                if year_str[0] in '789':
                    year_full = '19' + year_str
                else:
                    year_full = '20' + year_str

                asterisks = line[11:13].strip()
                start_time = line[13:17].strip()
                end_time = line[18:22].strip()
                max_time = line[23:27].strip()
                lat_lon = line[28:34].strip()
                sxi = line[34:37].strip()

                # X-ray class and intensity
                xray_class = line[59:60].strip()
                xray_intensity = line[60:63].strip()

                # Trailing fields; slices past the end of a short line yield ''
                station_name = line[67:71].strip()
                integrated_flux = line[72:80].strip()
                group_id = line[80:85].strip()
                cmp_year = line[86:88].strip()
                cmp_month = line[88:90].strip()
                cmp_day = line[90:94].strip()
                region_area = line[95:102].strip()
                total_intensity = line[103:110].strip()

                # Add to data list (order must match the DataFrame columns below)
                data.append([
                    year_full, data_code, station_code, year_short, month, day, asterisks,
                    start_time, end_time, max_time, lat_lon, sxi, xray_class, xray_intensity,
                    station_name, integrated_flux, group_id, cmp_year, cmp_month, cmp_day,
                    region_area, total_intensity
                ])
            except ValueError:
                # Skip invalid lines (headers, etc.)
                continue

# Convert to DataFrame
df = pd.DataFrame(data, columns=[
    "year", "data_code", "station_code", "year_short", "month", "day", "asterisks",
    "start_time", "end_time", "max_time", "lat_lon", "sxi", "xray_class", "xray_intensity",
    "station_name", "integrated_flux", "group_id", "cmp_year", "cmp_month", "cmp_day",
    "region_area", "total_intensity"
])

# Ensure output directory exists (exist_ok avoids the check-then-create race)
output_dir = "../../data/interm"
os.makedirs(output_dir, exist_ok=True)

# Save as intermediate CSV
output_path = os.path.join(output_dir, "flare_1975_2017_source.csv")
df.to_csv(output_path, index=False)

print(f"Task completed. Processed {len(df)} lines. Saved to {output_path}")

Task completed. Processed 77597 lines. Saved to ../../data/interm/flare_1975_2017_source.csv


In [2]:
# Process Date, Duration, and Heliographic Coordinates

import pandas as pd
import numpy as np
import os

# Function to format time (HHMM -> HH:MM)
def format_time(time_str):
    """Format an HHMM time value as 'HH:MM'.

    Args:
        time_str: Time as a string or number (e.g. '0054', 54, 54.0).
            Values shorter than four characters are left-padded with zeros.

    Returns:
        The text split after the first two characters and joined with ':'.
        An empty value or NaN yields '00:00'.
    """
    text = str(time_str).strip()
    if not text or text == 'nan':
        return "00:00"
    # A column containing blanks is read back from CSV as float, so 54 arrives
    # as 54.0; drop the '.0' so it is not mistaken for part of the time digits.
    if text.endswith('.0') and text[:-2].isdigit():
        text = text[:-2]
    if len(text) <= 3:
        text = text.zfill(4)
    return text[:2] + ':' + text[2:]

# Ensure month and day are 2 digits
def pad_date_component(component):
    """Return the component as text, zero-padded on the left to at least two characters."""
    as_text = str(component)
    padded = as_text.zfill(2)
    return padded

# Parse latitude and longitude (e.g., N10W20)
def parse_lat_long(value):
    """Parse a heliographic position string such as 'N10W20'.

    Args:
        value: Text laid out as hemisphere letter, two-digit latitude,
            hemisphere letter, longitude digits. May be NaN.

    Returns:
        (latitude, longitude) as integers, with north and west positive and
        south and east negative. Returns (nan, nan) when the value is missing,
        shorter than six characters, has a hemisphere letter other than
        N/S or E/W, or has non-numeric degrees.
    """
    if pd.isna(value):
        return np.nan, np.nan

    text = str(value)
    if len(text) < 6:
        return np.nan, np.nan

    lat_hemisphere, lon_hemisphere = text[0], text[3]
    # Reject unknown hemisphere letters rather than silently treating them as
    # south/east, which would turn garbage into plausible-looking coordinates.
    if lat_hemisphere not in ('N', 'S') or lon_hemisphere not in ('E', 'W'):
        return np.nan, np.nan

    try:
        latitude = int(text[1:3])
        longitude = int(text[4:])
    except ValueError:
        return np.nan, np.nan

    if lat_hemisphere == 'S':
        latitude = -latitude
    if lon_hemisphere == 'E':
        longitude = -longitude
    return latitude, longitude

# Handle times exceeding 24 hours
def handle_special_time(date_str, time_str):
    """Normalise an 'HH:MM' time whose hour is 24 or more onto the following day.

    Args:
        date_str: Date as 'YYYY-MM-DD'.
        time_str: Time as 'HH:MM'; the hour is read from the first two
            characters and the minute from everything after the separator.

    Returns:
        (date, time) as strings. When the hour is >= 24, 24 is subtracted and
        the date is advanced by one day. If anything fails to parse, the
        inputs are returned unchanged.
    """
    try:
        hour = int(time_str[:2])
        minute = int(time_str[3:])
        if hour >= 24:
            hour -= 24
            next_day = pd.to_datetime(date_str) + pd.Timedelta(days=1)
            date_str = next_day.strftime('%Y-%m-%d')
        return date_str, f'{hour:02d}:{minute:02d}'
    except Exception:
        # Best-effort: leave unparseable values for the later to_datetime(errors='coerce').
        # Catching Exception (not a bare except) lets KeyboardInterrupt/SystemExit
        # through, so a long row-wise apply can still be interrupted.
        return date_str, time_str

# Read intermediate CSV
input_path = '../../data/interm/flare_1975_2017_source.csv'
# Use low_memory=False to avoid DtypeWarning for mixed types in columns like max_time
df = pd.read_csv(input_path, low_memory=False)

# Format times (HHMM -> HH:MM)
df['formatted_start'] = df['start_time'].apply(format_time)
df['formatted_end'] = df['end_time'].apply(format_time)

# Build the YYYY-MM-DD base date from the separate year/month/day columns
df['date_base'] = df['year'].astype(str) + '-' + \
                  df['month'].astype(str).apply(pad_date_component) + '-' + \
                  df['day'].astype(str).apply(pad_date_component)

# Roll times with hour >= 24 onto the next day, separately for start and end
df[['start_date', 'start_time_final']] = df.apply(
    lambda row: handle_special_time(row['date_base'], row['formatted_start']), axis=1, result_type='expand')
df[['end_date', 'end_time_final']] = df.apply(
    lambda row: handle_special_time(row['date_base'], row['formatted_end']), axis=1, result_type='expand')

# Unparseable date/time combinations become NaT
df['datetime_start'] = pd.to_datetime(df['start_date'] + ' ' + df['start_time_final'], format='%Y-%m-%d %H:%M', errors='coerce')
df['datetime_end'] = pd.to_datetime(df['end_date'] + ' ' + df['end_time_final'], format='%Y-%m-%d %H:%M', errors='coerce')

# Parse Heliographic coordinates
df['hg_lat'], df['hg_lon'] = zip(*df['lat_lon'].apply(parse_lat_long))

# Cross-day flares: an end before the start means the flare ended on the next day.
# Move the end timestamp forward one day so datetime_end agrees with duration;
# the resulting duration equals the previous "negative duration + 1440" correction.
ends_next_day = df['datetime_end'] < df['datetime_start']
df.loc[ends_next_day, 'datetime_end'] = df.loc[ends_next_day, 'datetime_end'] + pd.Timedelta(days=1)

# Calculate duration in minutes
df['duration'] = (df['datetime_end'] - df['datetime_start']).dt.total_seconds() / 60

# Save to next intermediate file
output_path = '../../data/interm/flare_1975_2017_source_pos_dur.csv'
df.to_csv(output_path, index=False)

print(f"Task completed. Saved to {output_path}")

Task completed. Saved to ../../data/interm/flare_1975_2017_source_pos_dur.csv


In [None]:
# Calculate Astronomical Parameters and Carrington Coordinates

import pandas as pd
import numpy as np
import astropy.units as u
from astropy.time import Time
from astropy.coordinates import SkyCoord, HeliocentricMeanEcliptic
from sunpy.coordinates import frames
from tqdm.notebook import tqdm
from sunpy.coordinates.sun import angular_radius as solar_semidiameter_angular_size
import warnings
from erfa import ErfaWarning
import os

# Register tqdm with pandas so DataFrame.progress_apply is available, labelled "Processing"
tqdm.pandas(desc="Processing")
# Suppress warnings of category ErfaWarning for the rest of the session
warnings.filterwarnings('ignore', category=ErfaWarning)

def calculate_astronomical_parameters(time_str):
    """Calculate the solar angular radius for a given time.

    Args:
        time_str: Timestamp accepted by astropy's Time constructor.

    Returns:
        (time, angular_radius_arcsec): the Time object and the angular radius
        converted to arcseconds as a plain number, or (None, None) if the
        timestamp cannot be parsed or the computation fails.
    """
    try:
        time = Time(time_str)
        angular_radius_arcsec = solar_semidiameter_angular_size(time).to(u.arcsec).value
        return time, angular_radius_arcsec
    except Exception:
        # Best-effort per timestamp; Exception (not a bare except) keeps
        # KeyboardInterrupt/SystemExit able to stop the long-running loop.
        return None, None
    
def hgs_to_all(time, angular_radius_arcsec, lon, lat):
    """Derive disk-centre distance, position angle, Carrington and ecliptic coordinates.

    Args:
        time: Observation time, or None when it could not be computed.
        angular_radius_arcsec: Solar angular radius in arcseconds at that time.
        lon: Heliographic Stonyhurst longitude in degrees (may be NaN).
        lat: Heliographic Stonyhurst latitude in degrees (may be NaN).

    Returns:
        (dist_c, pa, hgc_lon, hme_lon, hme_lat). dist_c is the helioprojective
        offset sqrt(Tx^2 + Ty^2) divided by the angular radius; pa is
        atan2(-Tx, Ty) in degrees wrapped to [0, 360); the remaining values are
        the Carrington longitude and the heliocentric mean ecliptic
        longitude/latitude in degrees. All five are NaN when time is None or
        either coordinate is missing.
    """
    inputs_missing = time is None or pd.isna(lon) or pd.isna(lat)
    if inputs_missing:
        return (np.nan,) * 5

    # Source position in the Heliographic Stonyhurst frame, as seen from Earth
    stonyhurst = SkyCoord(
        lon * u.deg,
        lat * u.deg,
        frame=frames.HeliographicStonyhurst,
        observer='earth',
        obstime=time,
    )

    # Helioprojective view gives the on-disk offsets Tx/Ty in arcseconds
    earth_view = frames.Helioprojective(observer='earth', obstime=time)
    projected = stonyhurst.transform_to(earth_view)
    tx = projected.Tx.to(u.arcsec).value
    ty = projected.Ty.to(u.arcsec).value

    # Offset from disk centre, expressed in units of the angular radius
    radial_offset = np.sqrt(tx**2 + ty**2)
    dist_c = radial_offset / angular_radius_arcsec

    # Angle from the +Ty axis, increasing towards -Tx, wrapped to [0, 360)
    pa = np.degrees(np.arctan2(-tx, ty)) % 360

    # Carrington longitude
    carrington = stonyhurst.transform_to(frames.HeliographicCarrington)

    # Heliocentric mean ecliptic longitude and latitude
    ecliptic = stonyhurst.transform_to(HeliocentricMeanEcliptic)

    return dist_c, pa, carrington.lon.deg, ecliptic.lon.deg, ecliptic.lat.deg
   
# Load the table written by the date/duration/coordinate step
input_path = '../../data/interm/flare_1975_2017_source_pos_dur.csv'
df = pd.read_csv(input_path, low_memory=False)

# Rows without a start timestamp cannot be transformed, so discard them
df = df.dropna(subset=['datetime_start'])

# Compute (time, angular radius) once per distinct start timestamp and reuse it for every row
unique_datetimes = df['datetime_start'].unique()
date_params = {}
for dt in unique_datetimes:
    date_params[dt] = calculate_astronomical_parameters(dt)


def _transform_row(row):
    """Look up the cached parameters for this row's start time and transform its coordinates."""
    time, angular_radius_arcsec = date_params[row['datetime_start']]
    return hgs_to_all(time, angular_radius_arcsec, row['hg_lon'], row['hg_lat'])


# Row-wise transformation with a progress bar; each result is a 5-tuple
results = df.progress_apply(_transform_row, axis=1)

# Spread the tuples into their own columns, aligned on the existing index
derived_columns = ['dist_c', 'pa', 'hgc_lon', 'hme_lon', 'hme_lat']
df[derived_columns] = pd.DataFrame(results.tolist(), index=df.index)

# Write the last intermediate file
output_path = '../../data/interm/flare_1975_2017_final_interm.csv'
df.to_csv(output_path, index=False)

print(f"Task completed. Saved to {output_path}")

Processing:   0%|          | 0/77597 [00:00<?, ?it/s]

Task completed. Saved to ../../data/raw_interm/processing/flare_1975_2017_final_interm.csv


In [3]:
# Final Filtering and Save to Base Data

import pandas as pd
import os

# Read final intermediate file
input_path = '../../data/interm/flare_1975_2017_final_interm.csv'
df = pd.read_csv(input_path, low_memory=False)

# Filter for major flare classes (X, M, C, B)
filtered_df = df[df['xray_class'].isin(['X', 'M', 'C', 'B'])].copy()

# Drop rows whose disk-centre distance could not be computed
# (kept so the row count matches the results of the earlier version of this pipeline)
filtered_df = filtered_df.dropna(subset=['dist_c'])

# Sort by datetime_start to ensure 1975 data is at the top
filtered_df['datetime_start'] = pd.to_datetime(filtered_df['datetime_start'])
filtered_df = filtered_df.sort_values('datetime_start')

# Select and reorder relevant columns for the final dataset
final_columns = [
    'datetime_start', 'max_time', 'duration', 'lat_lon', 'sxi',
    'xray_class', 'xray_intensity', 'group_id',
    'hg_lat', 'hg_lon', 'dist_c', 'pa', 'hgc_lon',
    'hme_lon', 'hme_lat'
]

# Keep only the requested columns that are actually present
existing_columns = [col for col in final_columns if col in filtered_df.columns]
final_df = filtered_df[existing_columns]

# Save to the "ready" data directory, creating it first so a fresh checkout
# does not fail with FileNotFoundError on to_csv
output_path = '../../data/ready/flare_1975_2017.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
# utf-8-sig writes a BOM at the start of the file
final_df.to_csv(output_path, index=False, encoding='utf-8-sig')

print(f"Task completed. Final dataset contains {len(final_df)} flares.")
print(f"Saved to {output_path}")

Task completed. Final dataset contains 39267 flares.
Saved to ../../data/ready/flare_1975_2017.csv
