In [12]:
import os
import pandas as pd
import time
from sunpy.net import Fido, attrs as a
from sunpy.net.jsoc import JSOCClient
from pandas.tseries.offsets import DateOffset
from astropy.utils.data import conf as astropy_conf

# =========================
# 1. CONFIGURATION
# =========================
astropy_conf.remote_timeout = 90.0

EMAIL_FOR_JSOC = 'your_email@gmail.com'
TIME_RANGE_START = '2014-06-14T00:00:00'
TIME_RANGE_END = '2014-06-16T23:59:59'

SHARP_DIR_OUTPUT = 'raw_sharp_data_parquet'
FLARE_DIR_OUTPUT = 'raw_flare_data_parquet'
os.makedirs(SHARP_DIR_OUTPUT, exist_ok=True)
os.makedirs(FLARE_DIR_OUTPUT, exist_ok=True)

MAX_RETRIES = 3

SHARP_FEATURES = [
    'USFLUX', 'MEANGAM', 'MEANGBT', 'MEANGBZ', 'MEANGBH', 'TOTUSJH',
    'TOTPOT', 'MEANPOT', 'TOTUSJZ', 'SAVNCPP', 'ABSNJZH', 'AREA_ACR',
    'MEANJZH', 'R_VALUE', 'LAT_FWT', 'TOTUSI', 'TOTBSQ', 'TOTFX', 'TOTFY',
    'TOTFZ', 'EPSX', 'EPSY', 'EPSZ', 'MEANALP', 'MEANSHR'
]

# =========================
# 2. DATA COLLECTION
# =========================
jsoc_client = JSOCClient()
start_time = pd.to_datetime(TIME_RANGE_START)
end_time = pd.to_datetime(TIME_RANGE_END)

current_start = start_time
while current_start < end_time:
    current_end = current_start + DateOffset(days=1)
    if current_end > end_time:
        current_end = end_time

    print(f"\n--- Processing chunk: {current_start.date()} ---")
    
    sharp_output_file = f"{SHARP_DIR_OUTPUT}/{current_start.date()}.parquet"
    flare_output_file = f"{FLARE_DIR_OUTPUT}/{current_start.date()}.parquet"

    for attempt in range(MAX_RETRIES):
        try:
            print(f"  Attempt {attempt + 1} of {MAX_RETRIES}...")
            start_str = current_start.isoformat()
            end_str = current_end.isoformat()
            
            # --- Fetch SHARP data ---
            if not os.path.exists(sharp_output_file):
                print("    Querying JSOC for SHARP data...")
                sharp_response = jsoc_client.search(
                    a.Time(start_str, end_str),
                    a.jsoc.Series('hmi.sharp_720s'),
                    a.jsoc.Notify(EMAIL_FOR_JSOC)
                )

                if len(sharp_response) > 0:
                    sharp_df_chunk = sharp_response.to_pandas()
                    identifiers = ['T_REC', 'HARPNUM']
                    features_that_exist = [col for col in SHARP_FEATURES if col in sharp_df_chunk.columns]
                    sharp_df_filtered = sharp_df_chunk[identifiers + features_that_exist]
                    sharp_df_filtered.to_parquet(sharp_output_file)
                    print(f"    Saved {len(sharp_df_filtered)} SHARP records.")
                else:
                    print("    No SHARP data found for this day.")
            else:
                print("    SHARP data already exists. Skipping SHARP.")

            # --- Fetch Flare data ---
            if not os.path.exists(flare_output_file):
                print("    Querying for SWPC GOES flare catalog...")
                flare_response = Fido.search(
                    a.Time(start_str, end_str),
                    a.hek.EventType('FL'),
                    a.hek.FRM.Name == 'SWPC'
                )

                if flare_response:
                    flare_table = flare_response[0]
                    names = [n for n in flare_table.colnames if len(flare_table[n].shape) <= 1]
                    flare_table = flare_table[names]

                    flares_df_chunk = flare_table.to_pandas()

                    keep_cols = [c for c in ['event_starttime', 'fl_goescls', 'ar_noaanum'] if c in flares_df_chunk.columns]
                    flares_df_chunk = flares_df_chunk[keep_cols]

                    if 'event_starttime' in flares_df_chunk.columns:
                        flares_df_chunk['event_starttime'] = pd.to_datetime(
                            flares_df_chunk['event_starttime'], errors='coerce'
                        )

                    flares_df_chunk.to_parquet(flare_output_file)
                    print(f"    Saved {len(flares_df_chunk)} flare events.")
                else:
                    print("    No flare data found for this day.")
            else:
                print("    Flare data already exists. Skipping flares.")

            break # Success

        except Exception as e:
            if '[status=6]' in str(e):
                print("    Server response: No data found for this day.")
                break
            else:
                print(f"    Attempt {attempt + 1} failed with error: {e}")
                if attempt < MAX_RETRIES - 1:
                    wait_time = (attempt + 1) * 15
                    print(f"    Waiting {wait_time} seconds before retrying...")
                    time.sleep(wait_time)
    
    current_start = current_end
    time.sleep(1)

print(f"\n✅ Raw data collection complete!")



--- Processing chunk: 2014-06-14 ---
  Attempt 1 of 3...
    Querying JSOC for SHARP data...
    Saved 1828 SHARP records.
    Querying for SWPC GOES flare catalog...
    Saved 5 flare events.

--- Processing chunk: 2014-06-15 ---
  Attempt 1 of 3...
    Querying JSOC for SHARP data...
    Saved 1518 SHARP records.
    Querying for SWPC GOES flare catalog...
    Saved 9 flare events.

--- Processing chunk: 2014-06-16 ---
  Attempt 1 of 3...
    Querying JSOC for SHARP data...
    Saved 1275 SHARP records.
    Querying for SWPC GOES flare catalog...
    Saved 10 flare events.

✅ Raw data collection complete!


In [14]:
import pandas as pd
import glob
from datetime import timedelta

# =========================
# CONFIGURATION
# =========================
SHARP_DIR_INPUT = 'raw_sharp_data_parquet'
FLARE_DIR_INPUT = 'raw_flare_data_parquet'
FINAL_OUTPUT_FILE = 'final_labeled_solar_dataset.parquet'
PREDICTION_WINDOW_HOURS = 24

# =========================
# 1. LOAD AND COMBINE RAW DATA
# =========================
print("--- Step 1: Loading all raw data chunks ---")
sharp_files = glob.glob(f'{SHARP_DIR_INPUT}/*.parquet')
flare_files = glob.glob(f'{FLARE_DIR_INPUT}/*.parquet')

if not sharp_files:
    raise FileNotFoundError(f"No SHARP files found in '{SHARP_DIR_INPUT}' directory.")

sharp_df = pd.concat([pd.read_parquet(f) for f in sharp_files], ignore_index=True)
print(f"Loaded {len(sharp_df)} SHARP records.")

if flare_files:
    flares_df = pd.concat([pd.read_parquet(f) for f in flare_files], ignore_index=True)
    print(f"Loaded {len(flares_df)} flare records.")
else:
    flares_df = pd.DataFrame()
    print("No flare data files found.")

# =========================
# 2. CLEAN AND PREPARE DATA
# =========================
print("\n--- Step 2: Cleaning and Preparing Data ---")

# --- Fix T_REC datetime ---
sharp_df['T_REC'] = (
    sharp_df['T_REC']
    .astype(str)
    .str.replace('_TAI', '', regex=False)   # remove trailing _TAI
)
sharp_df['T_REC'] = pd.to_datetime(
    sharp_df['T_REC'], 
    format='%Y.%m.%d_%H:%M:%S', 
    errors='coerce'
)

sharp_df.sort_values(by=['HARPNUM', 'T_REC'], inplace=True)
sharp_df.drop_duplicates(subset=['HARPNUM', 'T_REC'], inplace=True)
sharp_df['HARPNUM'] = sharp_df['HARPNUM'].astype(int)

# Initialize label columns
sharp_df['classification'] = 'Non-flare'
sharp_df['flare'] = 0

# Clean flare data (if available)
if not flares_df.empty:
    flares_df['event_starttime'] = pd.to_datetime(flares_df['event_starttime'], errors='coerce')
    flares_df['ar_noaanum'] = pd.to_numeric(flares_df['ar_noaanum'], errors='coerce')
    flares_df.dropna(subset=['event_starttime', 'ar_noaanum', 'fl_goescls'], inplace=True)
    flares_df = flares_df[flares_df['ar_noaanum'] > 0].copy()
    flares_df['ar_noaanum'] = flares_df['ar_noaanum'].astype(int)
    flares_df.sort_values('event_starttime', inplace=True)
    print("All data cleaned and sorted.")

# =========================
# 3. CREATE LABELS
# =========================
if not flares_df.empty:
    print(f"\n--- Step 3: Labeling SHARP data using a {PREDICTION_WINDOW_HOURS}-hour prediction window ---")
    prediction_window = pd.Timedelta(hours=PREDICTION_WINDOW_HOURS)
    flare_classes = ['X', 'M', 'C', 'B']  # strongest to weakest

    for f_class in flare_classes:
        class_flares = flares_df[flares_df['fl_goescls'].str.startswith(f_class, na=False)]
        for _, flare in class_flares.iterrows():
            mask = (
                (sharp_df['HARPNUM'] == flare['ar_noaanum']) &
                (sharp_df['T_REC'] >= flare['event_starttime'] - prediction_window) &
                (sharp_df['T_REC'] < flare['event_starttime'])
            )
            sharp_df.loc[mask, 'classification'] = f_class

    # Binary flare label: 1 if X or M flare, else 0
    sharp_df['flare'] = sharp_df['classification'].isin(['X', 'M']).astype(int)

# =========================
# 4. SAVE FINAL DATASET
# =========================
print("\n--- Step 4: Saving final labeled dataset ---")
sharp_df.to_parquet(FINAL_OUTPUT_FILE, index=False)

# =========================
# 5. SUMMARY
# =========================
print("\n--- Dataset Summary ---")
print(f"Total SHARP records: {len(sharp_df)}")
print("\nBinary label counts ('flare'):")
print(sharp_df['flare'].value_counts())
print("\nMulti-class label counts ('classification'):")
print(sharp_df['classification'].value_counts())
print(f"\n✅ Dataset saved to {FINAL_OUTPUT_FILE}")


--- Step 1: Loading all raw data chunks ---
Loaded 965310 SHARP records.
Loaded 4380 flare records.

--- Step 2: Cleaning and Preparing Data ---
All data cleaned and sorted.

--- Step 3: Labeling SHARP data using a 24-hour prediction window ---

--- Step 4: Saving final labeled dataset ---

--- Dataset Summary ---
Total SHARP records: 957265

Binary label counts ('flare'):
flare
0    957265
Name: count, dtype: int64

Multi-class label counts ('classification'):
classification
Non-flare    957265
Name: count, dtype: int64

✅ Dataset saved to final_labeled_solar_dataset.parquet


In [2]:
import pandas as pd
import glob
from datetime import timedelta

# =========================
# CONFIGURATION
# =========================
SHARP_DIR_INPUT = 'raw_sharp_data_parquet'
FLARE_DIR_INPUT = 'raw_flare_data_parquet'
FINAL_OUTPUT_FILE = 'final_labeled_solar_dataset.parquet'
PREDICTION_WINDOW_HOURS = 24

# =========================
# 1. LOAD AND COMBINE RAW DATA
# =========================
print("--- Step 1: Loading all raw data chunks ---")
sharp_files = glob.glob(f'{SHARP_DIR_INPUT}/*.parquet')
flare_files = glob.glob(f'{FLARE_DIR_INPUT}/*.parquet')

if not sharp_files:
    raise FileNotFoundError(f"No SHARP files found in '{SHARP_DIR_INPUT}' directory.")

sharp_df = pd.concat([pd.read_parquet(f) for f in sharp_files], ignore_index=True)
print(f"Loaded {len(sharp_df)} SHARP records.")

if flare_files:
    flares_df = pd.concat([pd.read_parquet(f) for f in flare_files], ignore_index=True)
    print(f"Loaded {len(flares_df)} flare records.")
else:
    flares_df = pd.DataFrame()
    print("No flare data files found.")

# =========================
# 2. CLEAN AND PREPARE DATA
# =========================
print("\n--- Step 2: Cleaning and Preparing Data ---")

# --- Clean SHARP data ---
sharp_df['T_REC'] = (
    sharp_df['T_REC']
    .astype(str)
    .str.replace('_TAI', '', regex=False)
)
sharp_df['T_REC'] = pd.to_datetime(
    sharp_df['T_REC'], 
    format='%Y.%m.%d_%H:%M:%S', 
    errors='coerce'
)
sharp_df.dropna(subset=['T_REC'], inplace=True)
sharp_df.sort_values(by=['HARPNUM', 'T_REC'], inplace=True)
sharp_df.drop_duplicates(subset=['HARPNUM', 'T_REC'], inplace=True)
sharp_df['HARPNUM'] = sharp_df['HARPNUM'].astype(int)

# Initialize label columns
sharp_df['classification'] = 'Non-flare'
sharp_df['flare'] = 0

# Clean flare data (if available)
if not flares_df.empty:
    print("Cleaning flare data...")
    flares_df['event_starttime'] = pd.to_datetime(flares_df['event_starttime'], errors='coerce')
    flares_df['ar_noaanum'] = pd.to_numeric(flares_df['ar_noaanum'], errors='coerce')
    flares_df.dropna(subset=['event_starttime', 'ar_noaanum', 'fl_goescls'], inplace=True)
    
    # --- THE FIX: Filter out flares with no valid AR number (like 0) ---
    flares_df = flares_df[flares_df['ar_noaanum'] > 0].copy()
    
    flares_df['ar_noaanum'] = flares_df['ar_noaanum'].astype(int)
    flares_df.sort_values('event_starttime', inplace=True)
    print("All data cleaned and sorted.")

# =========================
# 3. CREATE LABELS
# =========================
# (The rest of your script is correct and remains the same)
if not flares_df.empty:
    print(f"\n--- Step 3: Labeling SHARP data using a {PREDICTION_WINDOW_HOURS}-hour prediction window ---")
    prediction_window = pd.Timedelta(hours=PREDICTION_WINDOW_HOURS)
    flare_classes = ['X', 'M', 'C', 'B']

    for f_class in flare_classes:
        class_flares = flares_df[flares_df['fl_goescls'].str.startswith(f_class, na=False)]
        for _, flare in class_flares.iterrows():
            mask = (
                (sharp_df['HARPNUM'] == flare['ar_noaanum']) &
                (sharp_df['T_REC'] >= flare['event_starttime'] - prediction_window) &
                (sharp_df['T_REC'] < flare['event_starttime'])
            )
            sharp_df.loc[mask, 'classification'] = f_class

    sharp_df['flare'] = sharp_df['classification'].isin(['X', 'M']).astype(int)

# =========================
# 4. SAVE FINAL DATASET
# =========================
print("\n--- Step 4: Saving final labeled dataset ---")
sharp_df.to_parquet(FINAL_OUTPUT_FILE, index=False)

# =========================
# 5. SUMMARY
# =========================
print("\n--- Dataset Summary ---")
print(f"Total SHARP records: {len(sharp_df)}")
print("\nBinary label counts ('flare'):")
print(sharp_df['flare'].value_counts())
print("\nMulti-class label counts ('classification'):")
print(sharp_df['classification'].value_counts())
print(f"\n✅ Dataset saved to {FINAL_OUTPUT_FILE}")

--- Step 1: Loading all raw data chunks ---
Loaded 965310 SHARP records.
Loaded 4380 flare records.

--- Step 2: Cleaning and Preparing Data ---
Cleaning flare data...
All data cleaned and sorted.

--- Step 3: Labeling SHARP data using a 24-hour prediction window ---

--- Step 4: Saving final labeled dataset ---

--- Dataset Summary ---
Total SHARP records: 957265

Binary label counts ('flare'):
flare
0    957265
Name: count, dtype: int64

Multi-class label counts ('classification'):
classification
Non-flare    957265
Name: count, dtype: int64

✅ Dataset saved to final_labeled_solar_dataset.parquet


In [3]:
import pandas as pd
import glob
from datetime import timedelta

# =========================
# CONFIGURATION
# =========================
SHARP_DIR_INPUT = 'raw_sharp_data_parquet'
FLARE_DIR_INPUT = 'raw_flare_data_parquet'
PREDICTION_WINDOW_HOURS = 24

# =========================
# 1. LOAD AND COMBINE RAW DATA
# =========================
print("--- Step 1: Loading all raw data chunks ---")
sharp_files = glob.glob(f'{SHARP_DIR_INPUT}/*.parquet')
flare_files = glob.glob(f'{FLARE_DIR_INPUT}/*.parquet')

if not sharp_files:
    raise FileNotFoundError(f"No SHARP files found in '{SHARP_DIR_INPUT}' directory.")

sharp_df = pd.concat([pd.read_parquet(f) for f in sharp_files], ignore_index=True)
print(f"Loaded {len(sharp_df)} SHARP records.")

if flare_files:
    flares_df = pd.concat([pd.read_parquet(f) for f in flare_files], ignore_index=True)
    print(f"Loaded {len(flares_df)} flare records.")
else:
    raise FileNotFoundError(f"No flare files found in '{FLARE_DIR_INPUT}' directory.")

# =========================
# 2. CLEAN AND PREPARE DATA
# =========================
print("\n--- Step 2: Cleaning and Preparing Data ---")
sharp_df['T_REC'] = pd.to_datetime(sharp_df['T_REC'], format="%Y.%m.%d_%H:%M:%S_TAI")
sharp_df['HARPNUM'] = sharp_df['HARPNUM'].astype(int)

flares_df['event_starttime'] = pd.to_datetime(flares_df['event_starttime'], errors='coerce')
flares_df.dropna(subset=['event_starttime', 'ar_noaanum', 'fl_goescls'], inplace=True)
flares_df['ar_noaanum'] = pd.to_numeric(flares_df['ar_noaanum'], errors='coerce')
flares_df = flares_df[flares_df['ar_noaanum'] > 0].copy()
flares_df['ar_noaanum'] = flares_df['ar_noaanum'].astype(int)
flares_df.sort_values('event_starttime', inplace=True)
print("All data cleaned and sorted.")

# ===================================================================
# 3. DIAGNOSTIC BLOCK
# ===================================================================
print("\n--- Step 3: Running Diagnostics on a Single Flare ---")

# Find the first powerful flare (M or X class)
powerful_flares = flares_df[flares_df['fl_goescls'].str.startswith(('M', 'X'), na=False)]

if powerful_flares.empty:
    print("\n>> DIAGNOSTIC RESULT: No M-class or X-class flares were found in your flare data files.")
    print(">> This is the reason no labels are being created.")
else:
    # Get the details of the first powerful flare
    first_flare = powerful_flares.iloc[0]
    flare_time = first_flare['event_starttime']
    ar_num = first_flare['ar_noaanum']
    
    print("\n[1] Details of the first powerful flare found:")
    print(first_flare.to_string())
    
    # Define the 24-hour window *before* this flare
    prediction_window = pd.Timedelta(hours=PREDICTION_WINDOW_HOURS)
    start_window = flare_time - prediction_window
    end_window = flare_time
    
    print(f"\n[2] Checking for SHARP data for HARPNUM {ar_num} within the time window:")
    print(f"    Start of window: {start_window}")
    print(f"    End of window:   {end_window}")
    
    # Find all SHARP records that match the Active Region number
    sharp_subset = sharp_df[sharp_df['HARPNUM'] == ar_num]
    
    if sharp_subset.empty:
        print(f"\n>> DIAGNOSTIC RESULT: Found a flare from AR {ar_num}, but there is NO SHARP data at all for this HARPNUM.")
    else:
        print(f"\n[3] Found {len(sharp_subset)} total SHARP records for HARPNUM {ar_num}.")
        print(f"    Earliest SHARP record for this AR is at: {sharp_subset['T_REC'].min()}")
        print(f"    Latest SHARP record for this AR is at:   {sharp_subset['T_REC'].max()}")
        
        # Now, find the records within the specific 24-hour window
        potential_matches = sharp_subset[
            (sharp_subset['T_REC'] >= start_window) & (sharp_subset['T_REC'] < end_window)
        ]
        
        print("\n[4] Searching for SHARP records within the 24-hour pre-flare window...")
        if potential_matches.empty:
            print("\n>> DIAGNOSTIC RESULT: Found SHARP data for this AR, but NONE of it falls within the 24-hour window before the flare.")
            print(">> This could be due to a data gap right before the flare.")
        else:
            print(f"\n>> DIAGNOSTIC SUCCESS: Found {len(potential_matches)} matching SHARP records that should be labeled!")
            print(">> First 5 matching records:")
            print(potential_matches.head())
            
# ===================================================================

--- Step 1: Loading all raw data chunks ---
Loaded 965310 SHARP records.
Loaded 4380 flare records.

--- Step 2: Cleaning and Preparing Data ---
All data cleaned and sorted.

--- Step 3: Running Diagnostics on a Single Flare ---

[1] Details of the first powerful flare found:
event_starttime    2013-01-05 09:26:00
fl_goescls                        M1.7
ar_noaanum                       11652

[2] Checking for SHARP data for HARPNUM 11652 within the time window:
    Start of window: 2013-01-04 09:26:00
    End of window:   2013-01-05 09:26:00

>> DIAGNOSTIC RESULT: Found a flare from AR 11652, but there is NO SHARP data at all for this HARPNUM.


In [2]:
import pandas as pd
import os

# --- CONFIGURATION ---
# The folder where your daily SHARP files are saved
SHARP_DIR = 'raw_sharp_data_parquet'
# The specific daily file you want to inspect
FILE_TO_INSPECT = '2014-01-01.parquet' 
# ---------------------

# Construct the full path to the file
file_path = os.path.join(SHARP_DIR, FILE_TO_INSPECT)

try:
    # Load the Parquet file into a pandas DataFrame
    # Note: Parquet is read differently than CSV, so we don't need nrows.
    # We just load the whole (small) daily file.
    sharp_df = pd.read_parquet(file_path)

    print(f"The actual column names in '{file_path}' are:")
    print(sharp_df.columns)

except FileNotFoundError:
    print(f"Error: The file '{file_path}' was not found.")
    print("Please make sure you have run the data collector script and the file exists.")

The actual column names in 'raw_sharp_data_parquet\2014-01-01.parquet' are:
Index(['T_REC', 'HARPNUM', 'USFLUX', 'MEANGAM', 'MEANGBT', 'MEANGBZ',
       'MEANGBH', 'TOTUSJH', 'TOTPOT', 'MEANPOT', 'TOTUSJZ', 'SAVNCPP',
       'ABSNJZH', 'AREA_ACR', 'MEANJZH', 'R_VALUE', 'LAT_FWT', 'MEANALP',
       'MEANSHR'],
      dtype='object')


In [3]:
import pandas as pd
import glob

# --- Configuration ---
FLARE_DIR_INPUT = 'raw_flare_data_parquet'
# ---------------------

print(f"--- Verifying data in '{FLARE_DIR_INPUT}' ---")

# 1. Load all raw flare data from the folder
try:
    flare_files = glob.glob(f'{FLARE_DIR_INPUT}/*.parquet')
    if not flare_files:
        raise FileNotFoundError
    flares_df = pd.concat([pd.read_parquet(f) for f in flare_files], ignore_index=True)
    print(f"\nStep 1: Successfully loaded a total of {len(flares_df)} raw flare records.")
except FileNotFoundError:
    print(f"Error: No files found in '{FLARE_DIR_INPUT}'. Please run the collector for the 2013-2014 period first.")
    exit()

# 2. Clean the data step-by-step and report the counts

# Initial cleaning for missing values in key columns
initial_count = len(flares_df)
flares_df.dropna(subset=['event_starttime', 'ar_noaanum', 'fl_goescls'], inplace=True)
print(f"Step 2: After dropping rows with any missing values, {len(flares_df)} records remain (removed {initial_count - len(flares_df)}).")

# Convert ar_noaanum to a numeric type, marking non-numbers as NaN
flares_df['ar_noaanum'] = pd.to_numeric(flares_df['ar_noaanum'], errors='coerce')
initial_count = len(flares_df)
flares_df.dropna(subset=['ar_noaanum'], inplace=True) # Drop the new NaNs
print(f"Step 3: After removing non-numeric AR numbers, {len(flares_df)} records remain (removed {initial_count - len(flares_df)}).")

# --- THIS IS THE CRITICAL STEP ---
# Filter for only the flares with a valid, positive Active Region number.
# "Orphan" flares often have an AR number of 0 or a negative placeholder.
initial_count = len(flares_df)
flares_df_cleaned = flares_df[flares_df['ar_noaanum'] > 0]
print(f"Step 4: After filtering for positive AR numbers (> 0), {len(flares_df_cleaned)} records remain (removed {initial_count - len(flares_df_cleaned)}).")
# ---------------------------------

print("\n--- VERIFICATION COMPLETE ---")
if len(flares_df_cleaned) == 0:
    print("RESULT: ❌ Confirmed. The dataset contains 0 flares with a valid, linkable Active Region number.")
else:
    print(f"RESULT: ✅ Found {len(flares_df_cleaned)} usable flare records with valid Active Region numbers.")

--- Verifying data in 'raw_flare_data_parquet' ---

Step 1: Successfully loaded a total of 4380 raw flare records.
Step 2: After dropping rows with any missing values, 4380 records remain (removed 0).
Step 3: After removing non-numeric AR numbers, 4380 records remain (removed 0).
Step 4: After filtering for positive AR numbers (> 0), 3846 records remain (removed 534).

--- VERIFICATION COMPLETE ---
RESULT: ✅ Found 3846 usable flare records with valid Active Region numbers.


In [5]:
import glob
sharp_files = glob.glob('raw_sharp_data_parquet/*.parquet')
print(f"Found {len(sharp_files)} daily SHARP data files.")

Found 730 daily SHARP data files.


In [6]:
import pandas as pd
# Load the first file found
df_sharp_sample = pd.read_parquet(sharp_files[0])
# Print a summary of the columns and their data types
print(df_sharp_sample.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 973 entries, 0 to 972
Data columns (total 19 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   T_REC     973 non-null    object 
 1   HARPNUM   973 non-null    int64  
 2   USFLUX    973 non-null    float64
 3   MEANGAM   958 non-null    float64
 4   MEANGBT   958 non-null    float64
 5   MEANGBZ   958 non-null    float64
 6   MEANGBH   958 non-null    float64
 7   TOTUSJH   973 non-null    float64
 8   TOTPOT    973 non-null    float64
 9   MEANPOT   958 non-null    float64
 10  TOTUSJZ   973 non-null    float64
 11  SAVNCPP   973 non-null    float64
 12  ABSNJZH   973 non-null    float64
 13  AREA_ACR  973 non-null    float64
 14  MEANJZH   958 non-null    float64
 15  R_VALUE   973 non-null    float64
 16  LAT_FWT   973 non-null    float64
 17  MEANALP   958 non-null    float64
 18  MEANSHR   958 non-null    float64
dtypes: float64(17), int64(1), object(1)
memory usage: 144.6+ KB
None


In [7]:
import glob
flare_files = glob.glob('raw_flare_data_parquet/*.parquet')
print(f"Found {len(flare_files)} daily flare data files.")

Found 730 daily flare data files.


In [8]:
import pandas as pd
# Load a flare file (make sure it's one that isn't empty)
# You might need to try a few different dates from your folder
df_flare_sample = pd.read_parquet('raw_flare_data_parquet/2014-10-24.parquet') 
# Print the first few rows
print(df_flare_sample.head())

      event_starttime fl_goescls  ar_noaanum
0 2014-10-24 02:35:00       C4.2       12192
1 2014-10-24 02:55:00       C3.4       12192
2 2014-10-24 03:56:00       C3.6       12192
3 2014-10-24 07:37:00       M4.0       12192
4 2014-10-24 09:58:00       C3.6       12192


In [9]:
# --- NEW: Drop any rows with missing feature values ---
sharp_df.dropna(inplace=True)

In [2]:
import pandas as pd
import glob
from datetime import timedelta

# =========================
# CONFIGURATION
# =========================
SHARP_DIR_INPUT = 'raw_sharp_data_parquet'
FLARE_DIR_INPUT = 'raw_flare_data_parquet'
FINAL_OUTPUT_FILE = 'final_labeled_solar_dataset.parquet'
PREDICTION_WINDOW_HOURS = 24

# =========================
# 1. LOAD AND COMBINE RAW DATA
# =========================
print("--- Step 1: Loading all raw data chunks ---")
sharp_files = glob.glob(f'{SHARP_DIR_INPUT}/*.parquet')
flare_files = glob.glob(f'{FLARE_DIR_INPUT}/*.parquet')

if not sharp_files:
    raise FileNotFoundError(f"No SHARP files found in '{SHARP_DIR_INPUT}'.")

sharp_df = pd.concat([pd.read_parquet(f) for f in sharp_files], ignore_index=True)
print(f"Loaded {len(sharp_df)} SHARP records.")

if flare_files:
    flares_df = pd.concat([pd.read_parquet(f) for f in flare_files], ignore_index=True)
    print(f"Loaded {len(flares_df)} flare records.")
else:
    flares_df = pd.DataFrame()
    print("No flare data files found.")

# =========================
# 2. CLEAN AND PREPARE DATA
# =========================
print("\n--- Step 2: Cleaning and preparing the complete dataset ---")

# --- THE FIX: Added the format code to correctly parse the T_REC date string ---
sharp_df['T_REC'] = pd.to_datetime(sharp_df['T_REC'], format="%Y.%m.%d_%H:%M:%S_TAI", errors='coerce')
sharp_df.dropna(subset=['T_REC'], inplace=True) # Drop rows that failed to parse

sharp_df.sort_values(by=['HARPNUM', 'T_REC'], inplace=True)
sharp_df.drop_duplicates(subset=['HARPNUM', 'T_REC'], inplace=True)
sharp_df['HARPNUM'] = sharp_df['HARPNUM'].astype(int)

# Drop any rows with missing feature values
sharp_df.dropna(inplace=True)
print(f"After cleaning, {len(sharp_df)} SHARP records remain.")

sharp_df['classification'] = 'Non-flare'
sharp_df['flare'] = 0

if not flares_df.empty:
    flares_df['event_starttime'] = pd.to_datetime(flares_df['event_starttime'], errors='coerce')
    flares_df.dropna(subset=['event_starttime', 'ar_noaanum', 'fl_goescls'], inplace=True)
    flares_df['ar_noaanum'] = pd.to_numeric(flares_df['ar_noaanum'], errors='coerce')
    flares_df.dropna(subset=['ar_noaanum'], inplace=True)
    flares_df = flares_df[flares_df['ar_noaanum'] > 0].copy()
    flares_df['ar_noaanum'] = flares_df['ar_noaanum'].astype(int)
    flares_df.sort_values('event_starttime', inplace=True)
    print("All data cleaned and sorted.")

# =========================
# 3. CREATE LABELS
# =========================
if not flares_df.empty:
    print(f"\n--- Step 3: Creating labels based on a {PREDICTION_WINDOW_HOURS}-hour window ---")
    prediction_window = pd.Timedelta(hours=PREDICTION_WINDOW_HOURS)
    flare_classes = ['X', 'M', 'C', 'B']

    for f_class in flare_classes:
        class_flares = flares_df[flares_df['fl_goescls'].str.startswith(f_class, na=False)]
        for _, flare in class_flares.iterrows():
            mask = ((sharp_df['HARPNUM'] == flare['ar_noaanum']) &
                    (sharp_df['T_REC'] >= flare['event_starttime'] - prediction_window) &
                    (sharp_df['T_REC'] < flare['event_starttime']))
            sharp_df.loc[mask, 'classification'] = f_class

    sharp_df['flare'] = sharp_df['classification'].isin(['X', 'M']).astype(int)

# =========================
# 4. SAVE FINAL DATASET
# =========================
print("\n--- Step 4: Saving final labeled dataset ---")
sharp_df.to_parquet(FINAL_OUTPUT_FILE, index=False)

print("\n--- Dataset Summary ---")
print(f"Total SHARP records: {len(sharp_df)}")
print("\nBinary Label Counts ('flare' column):")
print(sharp_df['flare'].value_counts())
print("\nMulti-class Label Counts ('classification' column):")
print(sharp_df['classification'].value_counts())
print("-------------------------")
print(f"✅ Successfully saved final labeled dataset to {FINAL_OUTPUT_FILE}")

--- Step 1: Loading all raw data chunks ---
Loaded 965310 SHARP records.
Loaded 4380 flare records.

--- Step 2: Cleaning and preparing the complete dataset ---
After cleaning, 927593 SHARP records remain.
All data cleaned and sorted.

--- Step 3: Creating labels based on a 24-hour window ---

--- Step 4: Saving final labeled dataset ---

--- Dataset Summary ---
Total SHARP records: 927593

Binary Label Counts ('flare' column):
flare
0    927593
Name: count, dtype: int64

Multi-class Label Counts ('classification' column):
classification
Non-flare    927593
Name: count, dtype: int64
-------------------------
✅ Successfully saved final labeled dataset to final_labeled_solar_dataset.parquet


In [6]:
import pandas as pd
import glob
from datetime import timedelta

# =========================
# CONFIGURATION
# =========================
FINAL_DATASET_FILE = 'final_labeled_solar_dataset.parquet'
FLARE_DIR_INPUT = 'raw_flare_data_parquet'

# =========================
# 1. LOAD THE DATA
# =========================
print("--- Loading Final Labeled Dataset and Raw Flare Data ---")
try:
    # Load the final dataset we want to check
    final_df = pd.read_parquet(FINAL_DATASET_FILE)
    
    # Load the raw flare files to find a known event
    flare_files = glob.glob(f'{FLARE_DIR_INPUT}/*.parquet')
    flares_df = pd.concat([pd.read_parquet(f) for f in flare_files], ignore_index=True)
except FileNotFoundError as e:
    print(f"Error: Could not find necessary files. {e}")
    exit()

# --- Clean the flare data to find our target ---
flares_df['event_starttime'] = pd.to_datetime(flares_df['event_starttime'], errors='coerce')
flares_df.dropna(subset=['event_starttime', 'ar_noaanum', 'fl_goescls'], inplace=True)
flares_df['ar_noaanum'] = pd.to_numeric(flares_df['ar_noaanum'], errors='coerce')
flares_df = flares_df[flares_df['ar_noaanum'] > 0].copy()
flares_df['ar_noaanum'] = flares_df['ar_noaanum'].astype(int)
flares_df.sort_values('event_starttime', inplace=True)

# =========================
# 2. FIND A TARGET FLARE
# =========================
print("\n--- Finding a powerful flare to verify ---")
# Look for the strongest flare class available (X first, then M)
target_flares = flares_df[flares_df['fl_goescls'].str.startswith('X', na=False)]
if target_flares.empty:
    target_flares = flares_df[flares_df['fl_goescls'].str.startswith('M', na=False)]

if target_flares.empty:
    print("No M-class or X-class flares found in the raw data to verify against.")
else:
    # Pick the first strong flare from the list
    target_flare = target_flares.iloc[0]
    flare_time = target_flare['event_starttime']
    ar_num = target_flare['ar_noaanum']
    flare_class = target_flare['fl_goescls']
    
    print(f"Found a {flare_class} flare from AR #{ar_num} at {flare_time}")

    # =========================
    # 3. VERIFY THE LABELS IN THE FINAL DATASET
    # =========================
    print("\n--- Checking labels in the 24-hour window before the flare ---")
    prediction_window = pd.Timedelta(hours=24)
    start_window = flare_time - prediction_window
    end_window = flare_time

    # Filter the final dataset for the records that SHOULD be labeled
    verification_df = final_df[
        (final_df['HARPNUM'] == ar_num) &
        (final_df['T_REC'] >= start_window) &
        (final_df['T_REC'] < end_window)
    ]

    if verification_df.empty:
        print("\nRESULT: ❌ Verification FAILED. No SHARP records were found in the pre-flare window in the final dataset.")
    else:
        print(f"Found {len(verification_df)} records in the pre-flare window. Checking their labels...")
        print("First 5 records in window:")
        print(verification_df[['T_REC', 'HARPNUM', 'flare', 'classification']].head())
        print("\nLast 5 records in window:")
        print(verification_df[['T_REC', 'HARPNUM', 'flare', 'classification']].tail())

        # Check if ALL 'flare' labels in this window are 1
        if verification_df['flare'].all() == 1:
            print("\nRESULT: ✅ Verification successful! All records in the window are correctly labeled as '1'.")
        else:
            print("\nRESULT: ❌ Verification FAILED. Some records in the window have an incorrect '0' label.")

--- Loading Final Labeled Dataset and Raw Flare Data ---

--- Finding a powerful flare to verify ---
Found a X1.7 flare from AR #11748 at 2013-05-13 01:53:00

--- Checking labels in the 24-hour window before the flare ---

RESULT: ❌ Verification FAILED. No SHARP records were found in the pre-flare window in the final dataset.


In [9]:
import pandas as pd
import glob
from datetime import timedelta

# =========================
# CONFIGURATION
# =========================
FINAL_DATASET_FILE = 'final_labeled_solar_dataset.parquet'
FLARE_DIR_INPUT = 'raw_flare_data_parquet'

# =========================
# 1. LOAD THE DATA
# =========================
print("--- Loading Final Labeled Dataset and Raw Flare Data ---")
try:
    final_df = pd.read_parquet(FINAL_DATASET_FILE)
    flare_files = glob.glob(f'{FLARE_DIR_INPUT}/*.parquet')
    flares_df = pd.concat([pd.read_parquet(f) for f in flare_files], ignore_index=True)
except FileNotFoundError as e:
    print(f"Error: Could not find necessary files. {e}")
    exit()

# --- Convert date columns first ---
final_df['T_REC'] = pd.to_datetime(final_df['T_REC'])
flares_df['event_starttime'] = pd.to_datetime(flares_df['event_starttime'], errors='coerce')

# --- THE FIX: Filter both dataframes to only include dates from 2014 onwards ---
print("\nFiltering data to only include records from 2014 onwards...")
final_df = final_df[final_df['T_REC'] > '2013-12-31'].copy()
flares_df = flares_df[flares_df['event_starttime'] > '2013-12-31'].copy()
print(f"Found {len(final_df)} SHARP records and {len(flares_df)} flare records from 2014 onwards.")
# --------------------------------------------------------------------------

# --- Clean the flare data to find our target ---
flares_df.dropna(subset=['event_starttime', 'ar_noaanum', 'fl_goescls'], inplace=True)
flares_df['ar_noaanum'] = pd.to_numeric(flares_df['ar_noaanum'], errors='coerce')
flares_df = flares_df[flares_df['ar_noaanum'] > 0].copy()
flares_df['ar_noaanum'] = flares_df['ar_noaanum'].astype(int)
flares_df.sort_values('event_starttime', inplace=True)

# =========================
# 2. FIND A TARGET FLARE
# =========================
print("\n--- Finding a powerful flare to verify ---")
target_flares = flares_df[flares_df['fl_goescls'].str.startswith('X', na=False)]
if target_flares.empty:
    target_flares = flares_df[flares_df['fl_goescls'].str.startswith('M', na=False)]

if target_flares.empty:
    print("No M-class or X-class flares found in the 2014+ data to verify against.")
else:
    target_flare = target_flares.iloc[0]
    flare_time = target_flare['event_starttime']
    ar_num = target_flare['ar_noaanum']
    flare_class = target_flare['fl_goescls']
    
    print(f"Found a {flare_class} flare from AR #{ar_num} at {flare_time}")

    # =========================
    # 3. VERIFY THE LABELS IN THE FINAL DATASET
    # =========================
    print("\n--- Checking labels in the 24-hour window before the flare ---")
    prediction_window = pd.Timedelta(hours=24)
    start_window = flare_time - prediction_window
    end_window = flare_time

    verification_df = final_df[
        (final_df['HARPNUM'] == ar_num) &
        (final_df['T_REC'] >= start_window) &
        (final_df['T_REC'] < end_window)
    ]

    if verification_df.empty:
        print("\nRESULT: ❌ Verification FAILED. No SHARP records were found in the pre-flare window in the final dataset.")
    else:
        print(f"Found {len(verification_df)} records in the pre-flare window. Checking their labels...")
        print("First 5 records in window:")
        print(verification_df[['T_REC', 'HARPNUM', 'flare', 'classification']].head())
        print("\nLast 5 records in window:")
        print(verification_df[['T_REC', 'HARPNUM', 'flare', 'classification']].tail())

        if verification_df['flare'].all() == 1:
            print("\nRESULT: ✅ Verification successful! All records in the window are correctly labeled as '1'.")
        else:
            print("\nRESULT: ❌ Verification FAILED. Some records in the window have an incorrect '0' label.")

--- Loading Final Labeled Dataset and Raw Flare Data ---

Filtering data to only include records from 2014 onwards...
Found 488247 SHARP records and 2308 flare records from 2014 onwards.

--- Finding a powerful flare to verify ---
Found a X1.2 flare from AR #11944 at 2014-01-07 18:04:00

--- Checking labels in the 24-hour window before the flare ---

RESULT: ❌ Verification FAILED. No SHARP records were found in the pre-flare window in the final dataset.


In [14]:
import pandas as pd
df = pd.read_parquet("raw_sharp_data_parquet/2015-01-01.parquet")
print(df.head())

                     T_REC  HARPNUM  NOAA_AR        USFLUX  MEANGAM  MEANGBT  \
0  2015.01.01_00:00:00_TAI     4973    12246  3.272105e+21   59.454   68.212   
1  2015.01.01_00:12:00_TAI     4973    12246  3.347525e+21   58.861   67.663   
2  2015.01.01_00:24:00_TAI     4973    12246  3.310049e+21   58.952   67.954   
3  2015.01.01_00:36:00_TAI     4973    12246  3.313561e+21   58.703   68.925   
4  2015.01.01_00:48:00_TAI     4973    12246  3.316598e+21   58.692   68.839   

   MEANGBZ  MEANGBH  TOTUSJH        TOTPOT   MEANPOT       TOTUSJZ  \
0   66.337   47.056  185.674  1.066154e+23  17729.69  2.905874e+12   
1   66.407   46.233  191.828  1.074211e+23  17342.77  3.059147e+12   
2   66.976   46.539  183.896  1.075039e+23  17624.43  2.960252e+12   
3   68.356   46.733  188.307  1.060242e+23  17686.05  2.902295e+12   
4   67.741   46.193  187.325  1.049565e+23  17350.36  3.035269e+12   

        SAVNCPP  ABSNJZH    AREA_ACR   MEANJZH  R_VALUE    LAT_FWT   MEANALP  \
0  7.099509e+11   

In [15]:
import pandas as pd
df = pd.read_parquet("raw_sharp_data_parquet/2014-01-01.parquet")
print(df.head())


                     T_REC  HARPNUM        USFLUX  MEANGAM  MEANGBT  MEANGBZ  \
0  2014.01.01_00:00:00_TAI     3520  2.425337e+22   30.447   59.207   63.758   
1  2014.01.01_00:12:00_TAI     3520  2.397443e+22   29.694   58.999   62.370   
2  2014.01.01_00:24:00_TAI     3520  2.368803e+22   29.602   59.602   64.187   
3  2014.01.01_00:36:00_TAI     3520  2.362252e+22   30.041   58.261   61.272   
4  2014.01.01_00:48:00_TAI     3520  2.307921e+22   29.986   59.347   64.168   

   MEANGBH  TOTUSJH        TOTPOT   MEANPOT       TOTUSJZ       SAVNCPP  \
0   28.449  782.811  1.239458e+23  2609.887  1.545143e+13  4.359763e+12   
1   28.882  769.198  1.144742e+23  2455.562  1.488627e+13  5.104859e+12   
2   28.698  766.378  1.136556e+23  2467.099  1.510670e+13  4.399965e+12   
3   27.703  744.015  1.123239e+23  2421.855  1.438206e+13  4.098449e+12   
4   27.976  736.776  1.122267e+23  2467.163  1.450569e+13  4.130472e+12   

   ABSNJZH    AREA_ACR   MEANJZH  R_VALUE    LAT_FWT   MEANALP  MEAN

In [16]:
from sunpy.net import attrs as a
from sunpy.net.jsoc import JSOCClient

# The Active Region that had a huge X-class flare
AR_NUMBER = 11748

# The time range around the flare
START_TIME = "2013-05-12T00:00:00"
END_TIME = "2013-05-14T23:59:59"

print(f"Querying JSOC for SHARP data for AR #{AR_NUMBER} between {START_TIME} and {END_TIME}...")

try:
    jsoc_client = JSOCClient()
    
    # This is a direct query for the specific data we need
    response = jsoc_client.search(
        a.Time(START_TIME, END_TIME),
        a.jsoc.Series('hmi.sharp_720s'),
        a.jsoc.Keyword('NOAA_AR') == AR_NUMBER
    )
    
    print("\n--- QUERY COMPLETE ---")
    print(f"Result: Found {len(response)} records.")
    
    if len(response) > 0:
        print("✅ Data was found.")
    else:
        print("\n❌ This is the definitive proof: The final, science-quality SHARP data for this major, flaring active region is MISSING from the archive.")

except Exception as e:
    print(f"\nAn error occurred: {e}")

Querying JSOC for SHARP data for AR #11748 between 2013-05-12T00:00:00 and 2013-05-14T23:59:59...

--- QUERY COMPLETE ---
Result: Found 213 records.
✅ Data was found.


In [23]:
import pandas as pd
df = pd.read_parquet("raw_sharp_data_parquet_new/2013-01-01.parquet")
print(df.head())

                     T_REC  HARPNUM  NOAA_AR        USFLUX  MEANGAM  MEANGBT  \
0  2013.01.01_00:00:00_TAI     2322    11636  4.509503e+21   35.303  141.572   
1  2013.01.01_00:12:00_TAI     2322    11636  4.483117e+21   34.914  141.825   
2  2013.01.01_00:24:00_TAI     2322    11636  4.500127e+21   34.246  142.994   
3  2013.01.01_00:36:00_TAI     2322    11636  4.474255e+21   33.887  142.560   
4  2013.01.01_00:48:00_TAI     2322    11636  4.420313e+21   33.728  144.255   

   MEANGBZ  MEANGBH  TOTUSJH        TOTPOT   MEANPOT       TOTUSJZ  \
0  141.166   68.409  377.037  2.285518e+22  2343.044  8.023545e+12   
1  141.105   68.116  378.145  2.243908e+22  2311.717  8.098751e+12   
2  142.356   67.673  378.075  2.221918e+22  2302.926  8.000354e+12   
3  142.544   67.399  377.511  2.183968e+22  2300.644  7.891649e+12   
4  144.067   68.401  377.976  2.157904e+22  2312.989  7.930061e+12   

        SAVNCPP  ABSNJZH    AREA_ACR   MEANJZH  R_VALUE    LAT_FWT   MEANALP  \
0  1.437391e+12   

In [44]:
import pandas as pd
import glob
from datetime import timedelta

# =========================
# CONFIGURATION
# =========================
SHARP_DIR_INPUT = 'raw_sharp_data_parquet_new'
FLARE_DIR_INPUT = 'raw_flare_data_parquet_new'
FINAL_OUTPUT_FILE = 'final_labeled_solar_dataset.parquet'
PREDICTION_WINDOW_HOURS = 24

# =========================
# 1. LOAD AND COMBINE RAW DATA
# =========================
print("--- Step 1: Loading all raw data chunks ---")
sharp_files = glob.glob(f'{SHARP_DIR_INPUT}/*.parquet')
flare_files = glob.glob(f'{FLARE_DIR_INPUT}/*.parquet')

if not sharp_files:
    raise FileNotFoundError(f"No SHARP files found in '{SHARP_DIR_INPUT}'.")

sharp_df = pd.concat([pd.read_parquet(f) for f in sharp_files if os.path.getsize(f) > 0], ignore_index=True)
print(f"Loaded {len(sharp_df)} SHARP records.")

if flare_files:
    flares_df = pd.concat([pd.read_parquet(f) for f in flare_files if os.path.getsize(f) > 0], ignore_index=True)
    print(f"Loaded {len(flares_df)} flare records.")
else:
    flares_df = pd.DataFrame()

# =========================
# 2. CLEAN AND PREPARE DATA
# =========================
print("\n--- Step 2: Cleaning and preparing the complete dataset ---")
sharp_df['T_REC'] = pd.to_datetime(sharp_df['T_REC'], format="%Y.%m.%d_%H:%M:%S_TAI", errors='coerce')
sharp_df.dropna(subset=['T_REC'], inplace=True)
sharp_df.sort_values(by=['HARPNUM', 'T_REC'], inplace=True)
sharp_df.drop_duplicates(subset=['HARPNUM', 'T_REC'], inplace=True)
sharp_df.dropna(inplace=True) # Drop rows with any missing feature values
sharp_df['HARPNUM'] = sharp_df['HARPNUM'].astype(int)
sharp_df['NOAA_AR'] = sharp_df['NOAA_AR'].astype(int)

sharp_df['classification'] = 'Non-flare'
sharp_df['flare'] = 0

if not flares_df.empty:
    flares_df['event_starttime'] = pd.to_datetime(flares_df['event_starttime'], errors='coerce')
    flares_df.dropna(subset=['event_starttime', 'ar_noaanum', 'fl_goescls'], inplace=True)
    flares_df['ar_noaanum'] = pd.to_numeric(flares_df['ar_noaanum'], errors='coerce')
    flares_df = flares_df[flares_df['ar_noaanum'] > 0].copy()
    flares_df['ar_noaanum'] = flares_df['ar_noaanum'].astype(int)

# =========================
# 3. CREATE LABELS
# =========================
if not flares_df.empty:
    print(f"\n--- Step 3: Creating labels...")
    prediction_window = pd.Timedelta(hours=PREDICTION_WINDOW_HOURS)
    flare_classes = ['X', 'M', 'C', 'B']

    for f_class in flare_classes:
        class_flares = flares_df[flares_df['fl_goescls'].str.startswith(f_class, na=False)]
        for _, flare in class_flares.iterrows():
            mask = ((sharp_df['NOAA_AR'] == flare['ar_noaanum']) &
                    (sharp_df['T_REC'] >= flare['event_starttime'] - prediction_window) &
                    (sharp_df['T_REC'] < flare['event_starttime']))
            sharp_df.loc[mask, 'classification'] = f_class

    sharp_df['flare'] = sharp_df['classification'].isin(['X', 'M']).astype(int)

# =========================
# 4. SAVE FINAL DATASET
# =========================
print("\n--- Step 4: Saving final labeled dataset ---")
sharp_df.to_parquet(FINAL_OUTPUT_FILE, index=False)

print("\n--- Dataset Summary ---")
print(f"Total SHARP records: {len(sharp_df)}")
print("\nBinary Label Counts ('flare' column):")
print(sharp_df['flare'].value_counts())
print("\nMulti-class Label Counts ('classification' column):")
print(sharp_df['classification'].value_counts())
print(f"✅ Successfully saved to {FINAL_OUTPUT_FILE}")

--- Step 1: Loading all raw data chunks ---
Loaded 962956 SHARP records.
Loaded 548 flare records.

--- Step 2: Cleaning and preparing the complete dataset ---

--- Step 4: Saving final labeled dataset ---

--- Dataset Summary ---
Total SHARP records: 631316

Binary Label Counts ('flare' column):
flare
0    631316
Name: count, dtype: int64

Multi-class Label Counts ('classification' column):
classification
Non-flare    631316
Name: count, dtype: int64
✅ Successfully saved to final_labeled_solar_dataset.parquet


In [49]:
import pandas as pd
import glob
from datetime import timedelta

# =========================
# CONFIGURATION
# =========================
SHARP_DIR_INPUT = 'raw_sharp_data_parquet_new'
FLARE_DIR_INPUT = 'raw_flare_data_parquet'
FINAL_OUTPUT_FILE = 'final_labeled_solar_dataset.parquet'
PREDICTION_WINDOW_HOURS = 24

# =========================
# 1. LOAD AND COMBINE RAW DATA
# =========================
print("--- Step 1: Loading all raw data chunks ---")
sharp_files = glob.glob(f'{SHARP_DIR_INPUT}/*.parquet')
flare_files = glob.glob(f'{FLARE_DIR_INPUT}/*.parquet')

if not sharp_files:
    raise FileNotFoundError(f"No SHARP files found in '{SHARP_DIR_INPUT}'.")

sharp_df = pd.concat([pd.read_parquet(f) for f in sharp_files if os.path.getsize(f) > 0], ignore_index=True)
print(f"Loaded {len(sharp_df)} SHARP records.")

if flare_files:
    flares_df = pd.concat([pd.read_parquet(f) for f in flare_files if os.path.getsize(f) > 0], ignore_index=True)
    print(f"Loaded {len(flares_df)} flare records.")
else:
    flares_df = pd.DataFrame()

# =========================
# 2. ROBUST CLEANING AND VERIFICATION
# =========================
print("\n--- Step 2: Cleaning and Verifying Data Alignment ---")

# --- Clean SHARP data ---
sharp_df['T_REC'] = pd.to_datetime(sharp_df['T_REC'], format="%Y.%m.%d_%H:%M:%S_TAI", errors='coerce')
sharp_df.dropna(subset=['T_REC'], inplace=True)
sharp_df.sort_values(by=['HARPNUM', 'T_REC'], inplace=True)
sharp_df.drop_duplicates(subset=['HARPNUM', 'T_REC'], inplace=True)
sharp_df.dropna(inplace=True) # Drop rows with any missing feature values

# Robustly clean the NOAA_AR column
if 'NOAA_AR' in sharp_df.columns:
    sharp_df['NOAA_AR'] = pd.to_numeric(sharp_df['NOAA_AR'], errors='coerce')
    sharp_df.dropna(subset=['NOAA_AR'], inplace=True)
    sharp_df = sharp_df[sharp_df['NOAA_AR'] > 0].copy()
    sharp_df['NOAA_AR'] = sharp_df['NOAA_AR'].astype(int)
    sharp_ar_numbers = set(sharp_df['NOAA_AR'].unique())
else:
    raise KeyError("The 'NOAA_AR' column is missing from the raw SHARP data.")

# --- Clean Flare data ---
if not flares_df.empty:
    flares_df['event_starttime'] = pd.to_datetime(flares_df['event_starttime'], errors='coerce')
    flares_df.dropna(subset=['event_starttime', 'ar_noaanum', 'fl_goescls'], inplace=True)
    flares_df['ar_noaanum'] = pd.to_numeric(flares_df['ar_noaanum'], errors='coerce')
    flares_df.dropna(subset=['ar_noaanum'], inplace=True)
    flares_df = flares_df[flares_df['ar_noaanum'] > 0].copy()
    flares_df['ar_noaanum'] = flares_df['ar_noaanum'].astype(int)
    flare_ar_numbers = set(flares_df['ar_noaanum'].unique())
else:
    flare_ar_numbers = set()

# --- Run the Sanity Check ---
print("\n--- Sanity Check ---")
print(f"Found {len(sharp_ar_numbers)} unique, valid AR numbers in the SHARP data.")
print(f"Found {len(flare_ar_numbers)} unique, valid AR numbers in the Flare data.")
matching_ar_numbers = sharp_ar_numbers.intersection(flare_ar_numbers)
if not matching_ar_numbers:
    print("\n>> CRITICAL: No matching AR numbers found. Cannot proceed with labeling.")
    exit()
else:
    print(f"\n>> SUCCESS: Found {len(matching_ar_numbers)} matching AR numbers. Proceeding with labeling.")
print("--------------------")

# =========================
# 3. CREATE LABELS
# =========================
sharp_df['classification'] = 'Non-flare'
sharp_df['flare'] = 0

if not flares_df.empty:
    print(f"\n--- Step 3: Creating labels...")
    prediction_window = pd.Timedelta(hours=PREDICTION_WINDOW_HOURS)
    flare_classes = ['X', 'M', 'C', 'B']

    for f_class in flare_classes:
        class_flares = flares_df[flares_df['fl_goescls'].str.startswith(f_class, na=False)]
        for _, flare in class_flares.iterrows():
            mask = ((sharp_df['NOAA_AR'] == flare['ar_noaanum']) &
                    (sharp_df['T_REC'] >= flare['event_starttime'] - prediction_window) &
                    (sharp_df['T_REC'] < flare['event_starttime']))
            sharp_df.loc[mask, 'classification'] = f_class

    sharp_df['flare'] = sharp_df['classification'].isin(['X', 'M']).astype(int)

# =========================
# 4. SAVE FINAL DATASET
# =========================
print("\n--- Step 4: Saving final labeled dataset ---")
sharp_df.to_parquet(FINAL_OUTPUT_FILE, index=False)

print("\n--- Dataset Summary ---")
print(f"Total SHARP records: {len(sharp_df)}")
print("\nBinary Label Counts ('flare' column):")
print(sharp_df['flare'].value_counts())
print("\nMulti-class Label Counts ('classification' column):")
print(sharp_df['classification'].value_counts())
print("-------------------------")
print(f"✅ Successfully saved to {FINAL_OUTPUT_FILE}")

--- Step 1: Loading all raw data chunks ---
Loaded 962956 SHARP records.
Loaded 4380 flare records.

--- Step 2: Cleaning and Verifying Data Alignment ---

--- Sanity Check ---
Found 426 unique, valid AR numbers in the SHARP data.
Found 372 unique, valid AR numbers in the Flare data.

>> SUCCESS: Found 261 matching AR numbers. Proceeding with labeling.
--------------------

--- Step 3: Creating labels...

--- Step 4: Saving final labeled dataset ---

--- Dataset Summary ---
Total SHARP records: 346372

Binary Label Counts ('flare' column):
flare
0    344158
1      2214
Name: count, dtype: int64

Multi-class Label Counts ('classification' column):
classification
Non-flare    258691
C             61663
B             23804
M              2153
X                61
Name: count, dtype: int64
-------------------------
✅ Successfully saved to final_labeled_solar_dataset.parquet


In [54]:
import os
import glob
import pandas as pd
from datetime import timedelta
from astropy.time import Time

# =========================
# CONFIGURATION
# =========================
SHARP_DIR_INPUT = 'raw_sharp_data_parquet_new'
FLARE_DIR_INPUT = 'raw_flare_data_parquet'
FINAL_OUTPUT_FILE = 'final_labeled_solar_dataset.parquet'
PREDICTION_WINDOW_HOURS = 24

# =========================
# 1. LOAD AND COMBINE RAW DATA
# =========================
print("--- Step 1: Loading all raw data chunks ---")
sharp_files = glob.glob(f'{SHARP_DIR_INPUT}/*.parquet')
flare_files = glob.glob(f'{FLARE_DIR_INPUT}/*.parquet')

if not sharp_files:
    raise FileNotFoundError(f"No SHARP files found in '{SHARP_DIR_INPUT}'.")

sharp_df = pd.concat([pd.read_parquet(f) for f in sharp_files if os.path.getsize(f) > 0], ignore_index=True)
print(f"Loaded {len(sharp_df)} SHARP records.")

if flare_files:
    flares_df = pd.concat([pd.read_parquet(f) for f in flare_files if os.path.getsize(f) > 0], ignore_index=True)
    print(f"Loaded {len(flares_df)} flare records.")
else:
    flares_df = pd.DataFrame()

# =========================
# 2. CLEAN & TIMESTAMP CONVERSION
# =========================
print("\n--- Step 2: Cleaning and Converting Time ---")

# --- Clean SHARP ---
sharp_df['T_REC'] = pd.to_datetime(sharp_df['T_REC'], errors='coerce')
sharp_df.dropna(subset=['T_REC'], inplace=True)
sharp_df.sort_values(by=['HARPNUM', 'T_REC'], inplace=True)
sharp_df.drop_duplicates(subset=['HARPNUM', 'T_REC'], inplace=True)

print("Converting SHARP timestamps from TAI → UTC ...")
print("Before conversion:", sharp_df['T_REC'].min(), "to", sharp_df['T_REC'].max())

sharp_times = Time(list(sharp_df['T_REC']), format="datetime", scale="tai")
sharp_df['T_REC'] = sharp_times.utc.datetime

print("After conversion :", sharp_df['T_REC'].min(), "to", sharp_df['T_REC'].max())

# --- Clean NOAA_AR ---
if 'NOAA_AR' not in sharp_df.columns:
    raise KeyError("The 'NOAA_AR' column is missing from SHARP data.")

sharp_df['NOAA_AR'] = pd.to_numeric(sharp_df['NOAA_AR'], errors='coerce')
sharp_df.dropna(subset=['NOAA_AR'], inplace=True)
sharp_df = sharp_df[sharp_df['NOAA_AR'] > 0].copy()
sharp_df['NOAA_AR'] = sharp_df['NOAA_AR'].astype(int)
sharp_ar_numbers = set(sharp_df['NOAA_AR'].unique())

# --- Clean Flare data ---
if not flares_df.empty:
    flares_df['event_starttime'] = pd.to_datetime(flares_df['event_starttime'], errors='coerce')
    flares_df.dropna(subset=['event_starttime', 'ar_noaanum', 'fl_goescls'], inplace=True)
    flares_df['ar_noaanum'] = pd.to_numeric(flares_df['ar_noaanum'], errors='coerce')
    flares_df.dropna(subset=['ar_noaanum'], inplace=True)
    flares_df = flares_df[flares_df['ar_noaanum'] > 0].copy()
    flares_df['ar_noaanum'] = flares_df['ar_noaanum'].astype(int)
    flare_ar_numbers = set(flares_df['ar_noaanum'].unique())
else:
    flare_ar_numbers = set()

# --- Sanity Check ---
print("\n--- Sanity Check ---")
print(f"Found {len(sharp_ar_numbers)} unique AR numbers in SHARP.")
print(f"Found {len(flare_ar_numbers)} unique AR numbers in Flares.")
matching_ar_numbers = sharp_ar_numbers.intersection(flare_ar_numbers)
if not matching_ar_numbers:
    print(">> CRITICAL: No matching AR numbers found. Exiting.")
    exit()
else:
    print(f">> SUCCESS: Found {len(matching_ar_numbers)} matching AR numbers.")
print("--------------------")

# =========================
# 3. CREATE LABELS
# =========================
sharp_df['classification'] = 'Non-flare'
sharp_df['flare'] = 0

if not flares_df.empty:
    print(f"\n--- Step 3: Creating Labels (window={PREDICTION_WINDOW_HOURS}h) ---")
    prediction_window = pd.Timedelta(hours=PREDICTION_WINDOW_HOURS)
    flare_classes = ['X', 'M', 'C', 'B']

    labeled_rows = 0
    for f_class in flare_classes:
        class_flares = flares_df[flares_df['fl_goescls'].str.startswith(f_class, na=False)]
        for _, flare in class_flares.iterrows():
            mask = ((sharp_df['NOAA_AR'] == flare['ar_noaanum']) &
                    (sharp_df['T_REC'] >= flare['event_starttime'] - prediction_window) &
                    (sharp_df['T_REC'] < flare['event_starttime']))
            sharp_df.loc[mask, 'classification'] = f_class
            labeled_rows += mask.sum()

    sharp_df['flare'] = sharp_df['classification'].isin(['X', 'M']).astype(int)
    print(f"Labeled {labeled_rows} rows with flare classes.")

# =========================
# 4. SAVE + DEBUG PREVIEW
# =========================
print("\n--- Step 4: Saving Final Dataset ---")
sharp_df.to_parquet(FINAL_OUTPUT_FILE, index=False)

print("\n--- Dataset Summary ---")
print(f"Total SHARP records: {len(sharp_df)}")
print("Binary label counts:\n", sharp_df['flare'].value_counts())
print("Multi-class counts:\n", sharp_df['classification'].value_counts())
print("-------------------------")
print(f"✅ Saved to {FINAL_OUTPUT_FILE}")

# --- Debug Preview: Show first matches ---
debug_matches = sharp_df[sharp_df['classification'] != 'Non-flare'].head(20)
if not debug_matches.empty:
    print("\n--- Sample Flare Matches ---")
    print(debug_matches[['T_REC', 'NOAA_AR', 'classification', 'flare']])
else:
    print("\n⚠️ No flare matches found in preview. Check AR number alignment!")


--- Step 1: Loading all raw data chunks ---
Loaded 962956 SHARP records.
Loaded 4380 flare records.

--- Step 2: Cleaning and Converting Time ---
Converting SHARP timestamps from TAI → UTC ...
Before conversion: NaT to NaT
After conversion : nan to nan

--- Sanity Check ---
Found 0 unique AR numbers in SHARP.
Found 372 unique AR numbers in Flares.
>> CRITICAL: No matching AR numbers found. Exiting.
--------------------

--- Step 3: Creating Labels (window=24h) ---
Labeled 0 rows with flare classes.

--- Step 4: Saving Final Dataset ---

--- Dataset Summary ---
Total SHARP records: 0
Binary label counts:
 Series([], Name: count, dtype: int64)
Multi-class counts:
 Series([], Name: count, dtype: int64)
-------------------------
✅ Saved to final_labeled_solar_dataset.parquet

⚠️ No flare matches found in preview. Check AR number alignment!


In [7]:
import pandas as pd
import glob
from datetime import timedelta

# =========================
# CONFIGURATION
# =========================
SHARP_DIR_INPUT = 'raw_sharp_data_parquet_new'
FLARE_DIR_INPUT = 'raw_flare_data_parquet'

# =========================
# 1. LOAD AND CLEAN DATA
# =========================
print("--- Loading and cleaning data ---")
sharp_df = pd.concat([pd.read_parquet(f) for f in glob.glob(f'{SHARP_DIR_INPUT}/*.parquet') if os.path.getsize(f) > 0], ignore_index=True)
flares_df = pd.concat([pd.read_parquet(f) for f in glob.glob(f'{FLARE_DIR_INPUT}/*.parquet') if os.path.getsize(f) > 0], ignore_index=True)

# Clean SHARP data
sharp_df['T_REC_str'] = sharp_df['T_REC'] # Keep original string for inspection
sharp_df['T_REC'] = pd.to_datetime(sharp_df['T_REC'], format="%Y.%m.%d_%H:%M:%S_TAI", errors='coerce')
sharp_df.dropna(subset=['T_REC', 'NOAA_AR'], inplace=True)
sharp_df['NOAA_AR'] = sharp_df['NOAA_AR'].astype(int)

# Clean Flare data
flares_df['event_starttime'] = pd.to_datetime(flares_df['event_starttime'], errors='coerce')
flares_df.dropna(subset=['event_starttime', 'ar_noaanum', 'fl_goescls'], inplace=True)
flares_df = flares_df[flares_df['ar_noaanum'] > 0].copy()
flares_df['ar_noaanum'] = flares_df['ar_noaanum'].astype(int)
flares_df.sort_values('event_starttime', inplace=True)

# --- THE FIX: Filter to only include dates from 2014 onwards ---
flares_df = flares_df[flares_df['event_starttime'].dt.year >= 2014].copy()
print(f"Filtered to {len(flares_df)} usable flare records from 2014 onwards.")
# ---------------------------------------------------------------

# =========================
# 2. FIND A TARGET FLARE AND MATCHING SHARP DATA
# =========================
print("\n--- Finding a powerful flare to test from 2014 ---")
target_flares = flares_df[flares_df['fl_goescls'].str.startswith('X')]
if target_flares.empty:
    target_flares = flares_df[flares_df['fl_goescls'].str.startswith('M')]

if not target_flares.empty:
    target_flare = target_flares.iloc[0]
    flare_time = target_flare['event_starttime']
    ar_num = target_flare['ar_noaanum']
    
    print(f"\nFound target flare from AR #{ar_num} at (UTC): {flare_time}")
    
    # Find all SHARP records for this AR
    sharp_subset = sharp_df[sharp_df['NOAA_AR'] == ar_num].copy()
    
    if not sharp_subset.empty:
        print(f"Found {len(sharp_subset)} SHARP records for this AR.")
        
        # Define the 24-hour window
        prediction_window = pd.Timedelta(hours=24)
        start_window = flare_time - prediction_window
        end_window = flare_time
        
        print(f"Prediction window starts at (UTC): {start_window}")
        print(f"Prediction window ends at (UTC):   {end_window}")
        
        # Check the timestamps of the SHARP data
        print("\n--- Comparing SHARP T_REC to the UTC window ---")
        
        sharp_subset['in_window'] = (
            (sharp_subset['T_REC'] >= start_window) &
            (sharp_subset['T_REC'] < end_window)
        )
        
        num_matches = sharp_subset['in_window'].sum()
        
        if num_matches > 0:
            print(f"✅ SUCCESS: Found {num_matches} records that fall inside the time window.")
        else:
            print("❌ FAILURE: Found 0 records inside the time window.")
            print("\nFirst SHARP record for this AR:")
            print(sharp_subset.head(1)[['T_REC_str', 'T_REC']])
            print("\nLast SHARP record for this AR:")
            print(sharp_subset.tail(1)[['T_REC_str', 'T_REC']])
    else:
        print(f"Could not find any SHARP data for AR #{ar_num}.")
else:
    print("Could not find any M or X class flares to test in 2014+.")

--- Loading and cleaning data ---
Filtered to 2055 usable flare records from 2014 onwards.

--- Finding a powerful flare to test from 2014 ---

Found target flare from AR #11944 at (UTC): 2014-01-07 18:04:00
Found 1601 SHARP records for this AR.
Prediction window starts at (UTC): 2014-01-06 18:04:00
Prediction window ends at (UTC):   2014-01-07 18:04:00

--- Comparing SHARP T_REC to the UTC window ---
✅ SUCCESS: Found 182 records that fall inside the time window.


In [8]:
import pandas as pd
import glob
from datetime import timedelta
import os

# =========================
# CONFIGURATION
# =========================
SHARP_DIR_INPUT = 'raw_sharp_data_parquet_new'
FLARE_DIR_INPUT = 'raw_flare_data_parquet'
FINAL_OUTPUT_FILE = 'final_labeled_2014_dataset.parquet' # Renamed for the test
PREDICTION_WINDOW_HOURS = 24

# =========================
# 1. LOAD AND COMBINE RAW DATA (ONLY FOR 2014)
# =========================
print("--- Step 1: Loading all raw data chunks for 2014 ---")
# THE FIX: Added '/2014-' to glob to select only 2014 files
sharp_files = glob.glob(f'{SHARP_DIR_INPUT}/2014-*.parquet')
flare_files = glob.glob(f'{FLARE_DIR_INPUT}/2014-*.parquet')

if not sharp_files:
    raise FileNotFoundError(f"No 2014 SHARP files found in '{SHARP_DIR_INPUT}'.")

sharp_df = pd.concat([pd.read_parquet(f) for f in sharp_files if os.path.getsize(f) > 0], ignore_index=True)
print(f"Loaded {len(sharp_df)} SHARP records from 2014.")

if flare_files:
    flares_df = pd.concat([pd.read_parquet(f) for f in flare_files if os.path.getsize(f) > 0], ignore_index=True)
    print(f"Loaded {len(flares_df)} flare records from 2014.")
else:
    flares_df = pd.DataFrame()

# =========================
# 2. CLEAN AND PREPARE DATA
# =========================
print("\n--- Step 2: Cleaning and preparing the complete dataset ---")
sharp_df['T_REC'] = pd.to_datetime(sharp_df['T_REC'], format="%Y.%m.%d_%H:%M:%S_TAI", errors='coerce')
sharp_df.dropna(subset=['T_REC'], inplace=True)
sharp_df.sort_values(by=['HARPNUM', 'T_REC'], inplace=True)
sharp_df.drop_duplicates(subset=['HARPNUM', 'T_REC'], inplace=True)
sharp_df.dropna(inplace=True) 
sharp_df['HARPNUM'] = sharp_df['HARPNUM'].astype(int)
sharp_df['NOAA_AR'] = sharp_df['NOAA_AR'].astype(int)

sharp_df['classification'] = 'Non-flare'
sharp_df['flare'] = 0

if not flares_df.empty:
    flares_df['event_starttime'] = pd.to_datetime(flares_df['event_starttime'], errors='coerce')
    flares_df.dropna(subset=['event_starttime', 'ar_noaanum', 'fl_goescls'], inplace=True)
    flares_df['ar_noaanum'] = pd.to_numeric(flares_df['ar_noaanum'], errors='coerce')
    flares_df = flares_df[flares_df['ar_noaanum'] > 0].copy()
    flares_df['ar_noaanum'] = flares_df['ar_noaanum'].astype(int)

# =========================
# 3. CREATE LABELS
# =========================
if not flares_df.empty:
    print(f"\n--- Step 3: Creating labels...")
    prediction_window = pd.Timedelta(hours=PREDICTION_WINDOW_HOURS)
    flare_classes = ['X', 'M', 'C', 'B']

    for f_class in flare_classes:
        class_flares = flares_df[flares_df['fl_goescls'].str.startswith(f_class, na=False)]
        for _, flare in class_flares.iterrows():
            mask = ((sharp_df['NOAA_AR'] == flare['ar_noaanum']) &
                    (sharp_df['T_REC'] >= flare['event_starttime'] - prediction_window) &
                    (sharp_df['T_REC'] < flare['event_starttime']))
            sharp_df.loc[mask, 'classification'] = f_class

    sharp_df['flare'] = sharp_df['classification'].isin(['X', 'M']).astype(int)

# =========================
# 4. SAVE FINAL DATASET
# =========================
print("\n--- Step 4: Saving final labeled dataset ---")
sharp_df.to_parquet(FINAL_OUTPUT_FILE, index=False)

print("\n--- Dataset Summary for 2014 ---")
print(f"Total SHARP records: {len(sharp_df)}")
print("\nBinary Label Counts ('flare' column):")
print(sharp_df['flare'].value_counts())
print("\nMulti-class Label Counts ('classification' column):")
print(sharp_df['classification'].value_counts())
print(f"✅ Successfully saved to {FINAL_OUTPUT_FILE}")

--- Step 1: Loading all raw data chunks for 2014 ---
Loaded 493308 SHARP records from 2014.
Loaded 2299 flare records from 2014.

--- Step 2: Cleaning and preparing the complete dataset ---

--- Step 3: Creating labels...

--- Step 4: Saving final labeled dataset ---

--- Dataset Summary for 2014 ---
Total SHARP records: 343055

Binary Label Counts ('flare' column):
flare
0    341912
1      1143
Name: count, dtype: int64

Multi-class Label Counts ('classification' column):
classification
Non-flare    299971
C             37117
B              4824
M              1127
X                16
Name: count, dtype: int64
✅ Successfully saved to final_labeled_2014_dataset.parquet


In [1]:
import pandas as pd
import glob
from datetime import timedelta
import os

# =========================
# CONFIGURATION
# =========================
SHARP_DIR_INPUT = 'raw_sharp_data_parquet_new'
FLARE_DIR_INPUT = 'raw_flare_data_parquet'
FINAL_OUTPUT_FILE = 'final_labeled_2014-2015_dataset.parquet' # Renamed for clarity
PREDICTION_WINDOW_HOURS = 24

# =========================
# 1. LOAD AND COMBINE RAW DATA (ONLY FOR 2014-2015)
# =========================
print("--- Step 1: Loading all raw data chunks for 2014-2015 ---")

# --- THE FIX: Select only files from 2014 and 2015 ---
sharp_files_2014 = glob.glob(f'{SHARP_DIR_INPUT}/2014-*.parquet')
sharp_files_2015 = glob.glob(f'{SHARP_DIR_INPUT}/2015-*.parquet')
sharp_files = sharp_files_2014 + sharp_files_2015

flare_files_2014 = glob.glob(f'{FLARE_DIR_INPUT}/2014-*.parquet')
flare_files_2015 = glob.glob(f'{FLARE_DIR_INPUT}/2015-*.parquet')
flare_files = flare_files_2014 + flare_files_2015
# --------------------------------------------------------

if not sharp_files:
    raise FileNotFoundError(f"No 2014-2015 SHARP files found in '{SHARP_DIR_INPUT}'.")

sharp_df = pd.concat([pd.read_parquet(f) for f in sharp_files if os.path.getsize(f) > 0], ignore_index=True)
print(f"Loaded {len(sharp_df)} SHARP records from 2014-2015.")

if flare_files:
    flares_df = pd.concat([pd.read_parquet(f) for f in flare_files if os.path.getsize(f) > 0], ignore_index=True)
    print(f"Loaded {len(flares_df)} flare records from 2014-2015.")
else:
    flares_df = pd.DataFrame()

# =========================
# 2. CLEAN AND PREPARE DATA
# =========================
print("\n--- Step 2: Cleaning and preparing the complete dataset ---")
sharp_df['T_REC'] = pd.to_datetime(sharp_df['T_REC'], format="%Y.%m.%d_%H:%M:%S_TAI", errors='coerce')
sharp_df.dropna(subset=['T_REC'], inplace=True)
sharp_df.sort_values(by=['HARPNUM', 'T_REC'], inplace=True)
sharp_df.drop_duplicates(subset=['HARPNUM', 'T_REC'], inplace=True)
sharp_df.dropna(inplace=True) 
sharp_df['HARPNUM'] = sharp_df['HARPNUM'].astype(int)
sharp_df['NOAA_AR'] = sharp_df['NOAA_AR'].astype(int)

sharp_df['classification'] = 'Non-flare'
sharp_df['flare'] = 0

if not flares_df.empty:
    flares_df['event_starttime'] = pd.to_datetime(flares_df['event_starttime'], errors='coerce')
    flares_df.dropna(subset=['event_starttime', 'ar_noaanum', 'fl_goescls'], inplace=True)
    flares_df['ar_noaanum'] = pd.to_numeric(flares_df['ar_noaanum'], errors='coerce')
    flares_df = flares_df[flares_df['ar_noaanum'] > 0].copy()
    flares_df['ar_noaanum'] = flares_df['ar_noaanum'].astype(int)

# =========================
# 3. CREATE LABELS
# =========================
if not flares_df.empty:
    print(f"\n--- Step 3: Creating labels...")
    prediction_window = pd.Timedelta(hours=PREDICTION_WINDOW_HOURS)
    flare_classes = ['B', 'C', 'M', 'X']

    for f_class in flare_classes:
        class_flares = flares_df[flares_df['fl_goescls'].str.startswith(f_class, na=False)]
        for _, flare in class_flares.iterrows():
            mask = ((sharp_df['NOAA_AR'] == flare['ar_noaanum']) &
                    (sharp_df['T_REC'] >= flare['event_starttime'] - prediction_window) &
                    (sharp_df['T_REC'] < flare['event_starttime']))
            sharp_df.loc[mask, 'classification'] = f_class

    sharp_df['flare'] = sharp_df['classification'].isin(['X', 'M']).astype(int)

# =========================
# 4. SAVE FINAL DATASET
# =========================
print("\n--- Step 4: Saving final labeled dataset ---")
sharp_df.to_parquet(FINAL_OUTPUT_FILE, index=False)

print("\n--- Dataset Summary for 2014-2015 ---")
print(f"Total SHARP records: {len(sharp_df)}")
print("\nBinary Label Counts ('flare' column):")
print(sharp_df['flare'].value_counts())
print("\nMulti-class Label Counts ('classification' column):")
print(sharp_df['classification'].value_counts())
print(f"✅ Successfully saved to {FINAL_OUTPUT_FILE}")

--- Step 1: Loading all raw data chunks for 2014-2015 ---
Loaded 918328 SHARP records from 2014-2015.
Loaded 2299 flare records from 2014-2015.

--- Step 2: Cleaning and preparing the complete dataset ---

--- Step 3: Creating labels...

--- Step 4: Saving final labeled dataset ---

--- Dataset Summary for 2014-2015 ---
Total SHARP records: 681458

Binary Label Counts ('flare' column):
flare
0    673093
1      8365
Name: count, dtype: int64

Multi-class Label Counts ('classification' column):
classification
Non-flare    638374
C             32060
M              7269
B              2659
X              1096
Name: count, dtype: int64
✅ Successfully saved to final_labeled_2014-2015_dataset.parquet


In [1]:
import pandas as pd
import glob
from datetime import timedelta
import os

# =========================
# CONFIGURATION
# =========================
SHARP_DIR_INPUT = 'raw_sharp_data_parquet_new'
FLARE_DIR_INPUT = 'raw_flare_data_parquet'
FINAL_OUTPUT_FILE = 'final_labeled_2014-2015_dataset_old_logic.parquet' # Renamed for clarity
PREDICTION_WINDOW_HOURS = 24

# =========================
# 1. LOAD AND COMBINE RAW DATA (ONLY FOR 2014-2015)
# =========================
print("--- Step 1: Loading all raw data chunks for 2014-2015 ---")

# --- THE FIX: Select only files from 2014 and 2015 ---
sharp_files_2014 = glob.glob(f'{SHARP_DIR_INPUT}/2014-*.parquet')
sharp_files_2015 = glob.glob(f'{SHARP_DIR_INPUT}/2015-*.parquet')
sharp_files = sharp_files_2014 + sharp_files_2015

flare_files_2014 = glob.glob(f'{FLARE_DIR_INPUT}/2014-*.parquet')
flare_files_2015 = glob.glob(f'{FLARE_DIR_INPUT}/2015-*.parquet')
flare_files = flare_files_2014 + flare_files_2015
# --------------------------------------------------------

if not sharp_files:
    raise FileNotFoundError(f"No 2014-2015 SHARP files found in '{SHARP_DIR_INPUT}'.")

sharp_df = pd.concat([pd.read_parquet(f) for f in sharp_files if os.path.getsize(f) > 0], ignore_index=True)
print(f"Loaded {len(sharp_df)} SHARP records from 2014-2015.")

if flare_files:
    flares_df = pd.concat([pd.read_parquet(f) for f in flare_files if os.path.getsize(f) > 0], ignore_index=True)
    print(f"Loaded {len(flares_df)} flare records from 2014-2015.")
else:
    flares_df = pd.DataFrame()

# =========================
# 2. CLEAN AND PREPARE DATA
# =========================
print("\n--- Step 2: Cleaning and preparing the complete dataset ---")
sharp_df['T_REC'] = pd.to_datetime(sharp_df['T_REC'], format="%Y.%m.%d_%H:%M:%S_TAI", errors='coerce')
sharp_df.dropna(subset=['T_REC'], inplace=True)
sharp_df.sort_values(by=['HARPNUM', 'T_REC'], inplace=True)
sharp_df.drop_duplicates(subset=['HARPNUM', 'T_REC'], inplace=True)
sharp_df.dropna(inplace=True) 
sharp_df['HARPNUM'] = sharp_df['HARPNUM'].astype(int)
sharp_df['NOAA_AR'] = sharp_df['NOAA_AR'].astype(int)

sharp_df['classification'] = 'Non-flare'
sharp_df['flare'] = 0

if not flares_df.empty:
    flares_df['event_starttime'] = pd.to_datetime(flares_df['event_starttime'], errors='coerce')
    flares_df.dropna(subset=['event_starttime', 'ar_noaanum', 'fl_goescls'], inplace=True)
    flares_df['ar_noaanum'] = pd.to_numeric(flares_df['ar_noaanum'], errors='coerce')
    flares_df = flares_df[flares_df['ar_noaanum'] > 0].copy()
    flares_df['ar_noaanum'] = flares_df['ar_noaanum'].astype(int)

# =========================
# 3. CREATE LABELS
# =========================
if not flares_df.empty:
    print(f"\n--- Step 3: Creating labels...")
    prediction_window = pd.Timedelta(hours=PREDICTION_WINDOW_HOURS)
    flare_classes = ['X','M','C','B']

    for f_class in flare_classes:
        class_flares = flares_df[flares_df['fl_goescls'].str.startswith(f_class, na=False)]
        for _, flare in class_flares.iterrows():
            mask = ((sharp_df['NOAA_AR'] == flare['ar_noaanum']) &
                    (sharp_df['T_REC'] >= flare['event_starttime'] - prediction_window) &
                    (sharp_df['T_REC'] < flare['event_starttime']))
            sharp_df.loc[mask, 'classification'] = f_class

    sharp_df['flare'] = sharp_df['classification'].isin(['X', 'M']).astype(int)

# =========================
# 4. SAVE FINAL DATASET
# =========================
print("\n--- Step 4: Saving final labeled dataset ---")
sharp_df.to_parquet(FINAL_OUTPUT_FILE, index=False)

print("\n--- Dataset Summary for 2014-2015 ---")
print(f"Total SHARP records: {len(sharp_df)}")
print("\nBinary Label Counts ('flare' column):")
print(sharp_df['flare'].value_counts())
print("\nMulti-class Label Counts ('classification' column):")
print(sharp_df['classification'].value_counts())
print(f"✅ Successfully saved to {FINAL_OUTPUT_FILE}")

--- Step 1: Loading all raw data chunks for 2014-2015 ---
Loaded 918328 SHARP records from 2014-2015.
Loaded 2299 flare records from 2014-2015.

--- Step 2: Cleaning and preparing the complete dataset ---

--- Step 3: Creating labels...

--- Step 4: Saving final labeled dataset ---

--- Dataset Summary for 2014-2015 ---
Total SHARP records: 681458

Binary Label Counts ('flare' column):
flare
0    680315
1      1143
Name: count, dtype: int64

Multi-class Label Counts ('classification' column):
classification
Non-flare    638374
C             37117
B              4824
M              1127
X                16
Name: count, dtype: int64
✅ Successfully saved to final_labeled_2014-2015_dataset_old_logic.parquet
