In [1]:
# Format Sunspot Number data

import pandas as pd
# Daily sunspot number: raw semicolon-separated file -> tidy two-column CSV.
# The input has no header row, so its columns are addressed by position.
file_path = '../../data/00_raw/SN_d_tot_V2.0.csv'
data = pd.read_csv(file_path, delimiter=';', header=None)

# Positional columns 0/1/2 are joined into a YYYY-MM-DD string; month and day
# are zero-padded to two characters first.
year_txt = data[0].astype(str)
month_txt = data[1].astype(str).str.zfill(2)
day_txt = data[2].astype(str).str.zfill(2)
data['date_str'] = year_txt + '-' + month_txt + '-' + day_txt
data['date'] = pd.to_datetime(data['date_str'])

# Period to keep; both bounds are inclusive.
start_date = pd.to_datetime('1849-01-01')
end_date = pd.to_datetime('2025-07-31')
in_window = data['date'].between(start_date, end_date)
filtered_data = data[in_window]

# Keep the parsed date plus positional column 4 (the SSN value), renamed 'ssn'.
final_data = filtered_data[['date', 4]].rename(columns={4: 'ssn'})

# Write the result without the DataFrame index.
output_path = '../../data/ready/ssn_daily_1849_2025.csv'
final_data.to_csv(output_path, index=False)
print("Task completed.")

Task completed.


In [None]:
# Automatically download annual sunspot group position txt data

import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin

import os

# Index page that links to the yearly sunspot-group text files.
url = 'http://solarcyclescience.com/activeregions.html'
# Folder the merge and parsing cells read the g<year>.txt files from; saving
# here means a fresh run needs no manual file moving.
download_dir = '../../data/00_raw/solar_cycle_data'
# Seconds to wait on any single HTTP request before giving up.
request_timeout = 100

os.makedirs(download_dir, exist_ok=True)

# Fetch the index page. Without a timeout this call could block forever, and
# without the status check an error page would be parsed as if it were the
# index, silently downloading nothing.
response = requests.get(url, timeout=request_timeout)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')
links = soup.find_all('a')

for link in links:
    href = link.get('href')
    # Only anchors that carry an href ending in .txt are data files.
    if not href or not href.endswith('.txt'):
        continue

    # Resolve relative links against the index page URL.
    href = urljoin(url, href)

    # The last path segment of the URL is used as the local file name.
    filename = urlparse(href).path.split('/')[-1]
    if not filename:
        print(f"Invalid filename for URL: {href}")
        continue

    # A failure on one file is reported and the loop moves on to the next.
    try:
        r = requests.get(href, timeout=request_timeout)
    except requests.RequestException as e:
        print(f"Request failed for {href}: {e}")
        continue

    if r.status_code != 200:
        print(f"Failed to download {href}: Status code {r.status_code}")
        continue

    with open(os.path.join(download_dir, filename), 'wb') as f:
        f.write(r.content)
    print(f"Downloaded {filename}")

In [2]:
# Merge multiple txt files into one by writing directly to the output file

import os

# Define start and end years
# Inclusive range of years whose g<year>.txt files are merged, and the
# destination of the merged copy.
start = 1874
end = 2025
output_file = '../../data/interm/1874-2025.txt'

total_lines_written = 0
missing_years = []  # years for which no input file exists; reported below

# Make sure the destination folder exists before opening the output file.
os.makedirs(os.path.dirname(output_file), exist_ok=True)

# Stream line by line into a single open output file so that no yearly file
# has to be held in memory. Both sides are opened as UTF-8, the same encoding
# the parsing cell uses for these input files.
with open(output_file, 'w', encoding='utf-8') as outfile:
    for i in range(start, end + 1):
        file_path = f'../../data/00_raw/solar_cycle_data/g{i}.txt'
        if not os.path.exists(file_path):
            missing_years.append(i)
            continue
        with open(file_path, 'r', encoding='utf-8') as infile:
            for line in infile:
                # Blank and whitespace-only lines are dropped.
                if not line.strip():
                    continue
                # If a file's last record has no trailing newline it would be
                # glued to the first record of the next file; terminate it.
                if not line.endswith('\n'):
                    line += '\n'
                outfile.write(line)
                total_lines_written += 1

if missing_years:
    print(f"Warning: no input file found for years: {missing_years}")
print(f"Task completed. Total lines written: {total_lines_written}")

Task completed. Total lines written: 256992


In [3]:
# Extract data based on format.txt

import pandas as pd
import glob

# Dynamically adjust column specifications
def get_column_specs_for_year(year):
    """Return the fixed-width field layout for a record of the given year.

    Args:
        year: Calendar year of the record (int).

    Returns:
        A list of ``(start, end, name)`` tuples, where ``line[start:end]`` is
        the slice of the raw line holding the field ``name``. Only the
        group-ID field depends on the year: it spans 12:22 for years up to
        and including 1981 and 12:20 afterwards.
    """
    group_id_end = 22 if year <= 1981 else 20
    return [
        (12, group_id_end, 'group_id'),  # Sunspot group ID
        (40, 44, 'area'),                # Area
        (44, 50, 'dist_c'),              # Disk Center Distance
        (50, 56, 'pa'),                  # Position Angle
        (56, 62, 'hcc_lon'),             # Carrington Longitude
        (62, 68, 'hg_lat'),              # Heliographic Latitude
        (68, 74, 'hg_lon'),              # Heliographic Longitude
    ]


def parse_line(line):
    """Parse one fixed-width record into a dict of strings.

    The first 12 characters hold the observation time: year (0:4), month
    (4:6), day (6:8) and the time of day as a fraction of a day (8:12, e.g.
    ``.500`` for noon). They are combined into a ``"%Y-%m-%d %H:%M:%S"``
    string stored under ``"date"``; fractional seconds are truncated. The
    remaining fields are sliced according to ``get_column_specs_for_year``
    and stored as whitespace-stripped strings.

    Args:
        line: One raw text line of a yearly data file.

    Returns:
        dict mapping ``"date"`` and each field name to its string value.

    Raises:
        ValueError: If the year, month, day or time field is not numeric.
    """
    year = int(line[0:4])
    month = int(line[4:6])
    day = int(line[6:8])

    time_fraction = float(line[8:12])
    # Convert via rounded whole milliseconds. Truncating the float product
    # directly is off by one second whenever it lands just below a whole
    # second: 0.7 * 86400 evaluates to 60479.99999999999, which would be
    # reported as 16:47:59 instead of 16:48:00.
    total_milliseconds = round(time_fraction * 86400 * 1000)
    total_seconds = total_milliseconds // 1000  # drop fractional seconds
    hours, remainder = divmod(total_seconds, 3600)
    minutes, seconds = divmod(remainder, 60)

    record = {
        "date": f"{year}-{month:02d}-{day:02d} {hours:02d}:{minutes:02d}:{seconds:02d}"
    }
    # The remaining fields use the layout that applies to this record's year.
    for start, end, name in get_column_specs_for_year(year):
        record[name] = line[start:end].strip()
    return record

# Parse the entire file
def parse_data_file(filepath):
    """Read a UTF-8 text file and return one parsed record per non-blank line.

    Lines that are empty or whitespace-only are skipped; every other line is
    passed to ``parse_line`` and the results are returned as a list, in file
    order.
    """
    records = []
    with open(filepath, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            if not raw_line.strip():
                continue
            records.append(parse_line(raw_line))
    return records

# Combine all files and save
def combine_and_save_data(txt_files, output_csv_path):
    """Parse every file in ``txt_files`` and write all records to one CSV.

    Records are concatenated in the order the files are given. The CSV is
    written without the DataFrame index, using the 'utf-8-sig' encoding.
    """
    records = [
        record
        for source_path in txt_files
        for record in parse_data_file(source_path)
    ]
    table = pd.DataFrame(records)
    table.to_csv(output_csv_path, index=False, encoding='utf-8-sig')

# Run the extraction over every yearly file and write the combined CSV.
directory_path = '../../data/00_raw/solar_cycle_data/g*.txt'
output_csv_path = '../../data/interm/sg_gsn_ar_source.csv'

# glob returns matches in a filesystem-dependent order; sorting the paths
# makes the row order of the output CSV reproducible between runs and machines.
txt_files = sorted(glob.glob(directory_path))

# Stop here with a clear message rather than writing an empty CSV that the
# next step cannot read.
if not txt_files:
    raise FileNotFoundError(f"No input files match pattern: {directory_path}")

combine_and_save_data(txt_files, output_csv_path)
print("Task completed.")

Task completed.


In [4]:
import pandas as pd

# Read the per-record table written by the extraction step.
input_path = '../../data/interm/sg_gsn_ar_source.csv'
data = pd.read_csv(input_path)

# Coerce group_id to numbers (values that cannot be parsed become NaN), then
# keep every row whose group_id is not 0. NaN compares unequal to 0, so rows
# with an unparseable group_id are retained by this filter.
# NOTE(review): if unparseable IDs are meant to be dropped as well, an extra
# notna() condition is needed here — confirm the intent before changing it.
data['group_id'] = pd.to_numeric(data['group_id'], errors='coerce')
nonzero_group = data['group_id'].ne(0)
filtered_data = data[nonzero_group].copy()

# Order the rows chronologically when the date column is present; otherwise
# warn and leave the order untouched.
date_col = 'date'
if date_col not in filtered_data.columns:
    print(f"Warning: Column '{date_col}' not found. Skipping sort.")
else:
    filtered_data[date_col] = pd.to_datetime(filtered_data[date_col])
    filtered_data = filtered_data.sort_values(by=date_col).reset_index(drop=True)
    print(f"Dataset successfully sorted by {date_col}.")

# Write the filtered table without the DataFrame index.
output_path = '../../data/interm/sg_gsn_ar_source_filtered.csv'
filtered_data.to_csv(output_path, encoding='utf-8-sig', index=False)

print("Process completed: Data filtered and ordered.")

Dataset successfully sorted by date.
Process completed: Data filtered and ordered.
