In [45]:
import requests
import pandas as pd

# 1. Fetching data from NOAA API (X-ray Flux Data)
noaa_url = "https://services.swpc.noaa.gov/json/goes/primary/xrays-1-day.json"
noaa_response = requests.get(noaa_url)
noaa_data = noaa_response.json()

# Convert NOAA data to pandas DataFrame
noaa_df = pd.DataFrame(noaa_data)

# Ensure 'time_tag' is in datetime format with UTC timezone
noaa_df['time_tag'] = pd.to_datetime(noaa_df['time_tag'], utc=True)
print("NOAA data after conversion:")
print(noaa_df.head(), "\n")


# 2. Fetching data from Helioviewer API
helioviewer_url = "https://api.helioviewer.org/v2/getClosestImage/?date=2023-09-15T00:00:00Z&sourceId=14"
helioviewer_response = requests.get(helioviewer_url)
helioviewer_data = helioviewer_response.json()

# Convert Helioviewer response to DataFrame (assuming 'date' is present)
helioviewer_df = pd.DataFrame([helioviewer_data])

# Inspect what data is available in Helioviewer to find relevant date
print("Helioviewer data:")
print(helioviewer_df.head(), "\n")

# 3. Check if the 'date' field exists, and if not, handle appropriately
if 'date' in helioviewer_df.columns:
    # Ensure 'date' is in datetime format without timezone
    helioviewer_df['date'] = pd.to_datetime(helioviewer_df['date'])
else:
    # If 'date' is missing, print a message
    print("Date field not found in Helioviewer data. Creating mock 'date' for demo purposes.")
    # Create a mock 'date' column for merging (use the date from the API request for now)
    helioviewer_df['date'] = pd.to_datetime("2023-09-15")

print("Helioviewer data after date conversion:")
print(helioviewer_df.head(), "\n")

# 4. To match the formats, convert the Helioviewer date to UTC (timezone-aware)
helioviewer_df['date'] = helioviewer_df['date'].dt.tz_localize('UTC')

# 5. Merge the NOAA and Helioviewer DataFrames
combined_df = pd.merge(noaa_df, helioviewer_df, how='inner', left_on='time_tag', right_on='date')

# Print the combined DataFrame to verify the merge
print("Combined DataFrame:")
print(combined_df.head())

NOAA data after conversion:
                   time_tag  satellite          flux  observed_flux  \
0 2024-09-26 10:59:00+00:00         16  5.267604e-08   6.918628e-08   
1 2024-09-26 10:59:00+00:00         16  2.376826e-06   2.404217e-06   
2 2024-09-26 11:00:00+00:00         16  5.384534e-08   7.102810e-08   
3 2024-09-26 11:00:00+00:00         16  2.385958e-06   2.413543e-06   
4 2024-09-26 11:01:00+00:00         16  5.633739e-08   7.287300e-08   

   electron_correction  electron_contaminaton      energy  
0         1.651024e-08                  False  0.05-0.4nm  
1         2.739015e-08                  False   0.1-0.8nm  
2         1.718276e-08                  False  0.05-0.4nm  
3         2.758464e-08                  False   0.1-0.8nm  
4         1.653561e-08                  False  0.05-0.4nm   

Helioviewer data:
          id                 date     name     scale  scaleCorrection  width  \
0  148024150  2023-09-14 23:59:49  AIA 335  0.603527         0.994157   4096   

   h

In [46]:
# Extract features from the NOAA data (e.g., X-ray flux levels)
combined_df['flux_difference'] = combined_df['flux'] - combined_df['flux'].shift(1)

# Feature engineering: Add time-related features
combined_df['hour'] = combined_df['time_tag'].dt.hour
combined_df['day'] = combined_df['time_tag'].dt.day
combined_df['month'] = combined_df['time_tag'].dt.month

# Handle missing values (using the forward-fill method)
combined_df = combined_df.ffill()  # Forward fill missing values

# Drop irrelevant columns (check if they exist)
columns_to_drop = ['unnecessary_column_1', 'unnecessary_column_2']
combined_df = combined_df.drop(columns=[col for col in columns_to_drop if col in combined_df.columns], errors='ignore')

# Check the DataFrame after processing
print(combined_df.head())

Empty DataFrame
Columns: [time_tag, satellite, flux, observed_flux, electron_correction, electron_contaminaton, energy, id, date, name, scale, scaleCorrection, width, height, refPixelX, refPixelY, rotation, rsun, dsun, sunCenterOffsetParams, layeringOrder, flux_difference, hour, day, month]
Index: []

[0 rows x 25 columns]


In [50]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Check the size of the DataFrame
print("DataFrame shape:", combined_df.shape)

# If the DataFrame is empty, check if any filtering steps caused it
if combined_df.empty:
    print("Warning: The DataFrame is empty. Check your data loading or filtering steps.")

# Ensure your data is loaded correctly before train_test_split
if not combined_df.empty:
    X = combined_df[['flux_difference', 'hour', 'day', 'month', 'solar_event_1', 'solar_event_2']]
    y = combined_df['flux']

    # Split into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Scale the features if necessary
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    print("Training and test sets prepared successfully.")

DataFrame shape: (0, 25)


In [51]:
# Check if the data was loaded correctly
print("Initial DataFrame shape:", combined_df.shape)

# If the shape is (0, 27), it means there is no data
if combined_df.empty:
    print("The DataFrame is empty. Check data loading step.")
else:
    print("Data loaded successfully. Here's a preview:")
    print(combined_df.head())

# Check for any filtering or dropping operations that might have caused this
# Example: if you had filtering conditions
# combined_df = combined_df[combined_df['some_column'] > threshold]
# If such filters exist, verify that they didn't remove all rows.

# Also check for missing values
print("Missing data check:")
print(combined_df.isnull().sum())

# If missing values were an issue, try handling them
# Handle missing values using forward fill (or you can use backward fill as per your need)
combined_df.ffill(inplace=True)

# Continue with the rest of your code
print("Missing values handled successfully.")



Initial DataFrame shape: (0, 25)
The DataFrame is empty. Check data loading step.
Missing data check:
time_tag                 0
satellite                0
flux                     0
observed_flux            0
electron_correction      0
electron_contaminaton    0
energy                   0
id                       0
date                     0
name                     0
scale                    0
scaleCorrection          0
width                    0
height                   0
refPixelX                0
refPixelY                0
rotation                 0
rsun                     0
dsun                     0
sunCenterOffsetParams    0
layeringOrder            0
flux_difference          0
hour                     0
day                      0
month                    0
dtype: int64
Missing values handled successfully.


In [56]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Initialize the model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model
rf_model.fit(X_train, y_train)

# Make predictions
y_pred = rf_model.predict(X_test_scaled)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

NameError: name 'X_train' is not defined