# ACE Exploration


## Libraries and global variables

In [1]:
# Standard library imports
import sys
import os

# Third-party imports
from contextlib import suppress
import warnings
import pandas as pd
import numpy as np
from functools import reduce
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA, KernelPCA

# Local application imports
sys.path.append("../src/scripts")
from utilities import (
    parse_hdf_data,
    merge_dataframes,
    sort_columns_except_key,
    add_datetime_column,
)

# Set the warning filter to ignore all warnings
warnings.filterwarnings("ignore")

## Data Import

In [2]:
# Read in the ACE data
data_dir = "../data/ace/raw"
swics_1hr_dir = f"{data_dir}/swics_1hr"
swics_2hr_dir = f"{data_dir}/swics_2hr"

mag_df = parse_hdf_data(f"{data_dir}/MAG_data_1hr.txt")
swepam_df = parse_hdf_data(f"{data_dir}/SWEPAM_data_1hr.txt")
epam_df = parse_hdf_data(f"{data_dir}/EPAM_data_1hr.txt")

swics_dfs = []
for dir in [swics_1hr_dir, swics_2hr_dir]:
    for file in os.listdir(dir):
        swics_dfs.append(parse_hdf_data(f"{dir}/{file}"))
swics_df = pd.concat(swics_dfs)

In [3]:
# Global variables
ACE_DATASETS = [mag_df, swepam_df, epam_df, swics_df]
ACE_DATASETS_NAMES = ["MAG", "SWEPAM", "EPAM", "SWICS"]

In [4]:
# dtype conversion
for df in ACE_DATASETS:
    df[["year", "day", "hr", "min", "sec"]] = df[
        ["year", "day", "hr", "min", "sec"]
    ].astype(int)

    with suppress(KeyError):
        df['Quality'] = df['Quality'].astype(str)

In [5]:
# datetime conversion and drop redundant features
for df in ACE_DATASETS:
    add_datetime_column(df).drop(
        columns=["year", "day", "hr", "min", "sec", "fp_year", "fp_doy"],
        inplace=True,
        axis=1,
    )

# swics_df may contain duplicate records to nature of 1.0 and 2.0 data collection
swics_df.drop_duplicates(subset="datetime", inplace=True)

## Data Cleaning

### Descriptives

In [None]:
for df, df_name in zip(ACE_DATASETS, ACE_DATASETS_NAMES):
    print(f"Dataframe: {df_name}")
    display(df.info())
    display(df.describe())
    print("\n" + ("-" * 20))

### Retain *Good* Quality data

Good data is flagged by the researchers with a value of 0. 

In [6]:
# replace null flag with np.nan for quality flag analysis
for c, (df, df_name) in enumerate(zip(ACE_DATASETS, ACE_DATASETS_NAMES)):
    for flag in [-9999.9,-999.9]:
        df = df.replace(flag, np.nan)
    ACE_DATASETS[c] = df

mag_df, swepam_df, epam_df, swics_df = ACE_DATASETS

In [7]:
# assess quality flag provided in the datasets
for c, (df, df_name) in enumerate(zip(ACE_DATASETS, ACE_DATASETS_NAMES)):

    with suppress(KeyError):  # not all datasets have the quality flag
        if df_name != "SWICS":
            df = df[str(df["Quality"]) == "0.0"]
            df.drop(columns=["Quality"], inplace=True, axis=1)
        else:
            qf_cols = swics_df.filter(regex="^qf_").columns
            err_cols = swics_df.filter(regex="_err$").columns

            # drop rows that have no 0 or nan values due to two instrument versions
            df = swics_df[
                (swics_df[qf_cols].isna() | swics_df[qf_cols].eq(0)).any(axis=1)
            ]

            # drop rows that dont have good quality flag 0
            for col in df[qf_cols]:
                df = df[(df[col] == 0) | (df[col].isna())]

            # drop quality columns
            df.drop(columns=qf_cols, inplace=True)
            df.drop(columns=err_cols, inplace=True)

    ACE_DATASETS[c] = df

mag_df, swepam_df, epam_df, swics_df = ACE_DATASETS

### Join data

In [8]:
# find unique timestamps
mag_dates, swepam_dates, epam_dates, swics_dates = [
    df.datetime.unique() for df in ACE_DATASETS
]

# find the common dates for 1hr interval data
common_dates_1hr = reduce(
    np.intersect1d, (mag_dates, swepam_dates, epam_dates)
)

# find the common dates for 2hr interval data
common_date_2hr = reduce(
    np.intersect1d, (mag_dates, swepam_dates, epam_dates, swics_dates)
    )

print(len(common_dates_1hr))
print(len(common_date_2hr))

204768
132245


In [9]:
# join the 1hr to 2 hr interval datasets
insitu_df = merge_dataframes(ACE_DATASETS, "datetime")
df = sort_columns_except_key(insitu_df, "datetime")
df.shape

(132245, 175)

In [10]:
# df.to_csv("../data/ace/preprocessed/insitu_ace.csv", index=False)

### Handling Missing Values

Missing data has the value of -999.900. Assert that there are no longer missing values due to dropping data labeled as not of good quality.

In [None]:
features = [
    "datetime",
    "proton_speed",
    "proton_density",
    "proton_temp",
    "O7to6",
    "C6to5",
    "FetoO",
    "avqFe",
]

df_features = df[features].dropna()
X_timestamp = df_features["datetime"].astype(str)
X = df_features[df_features.select_dtypes(include="number").columns.tolist()].astype(
    float
)

In [None]:
neg_count = X[X < 0].count().sum()
na_count = X.isna().sum().sum()

print("Number of non-positive values:", neg_count)
print("Number of rows with NaN values:", na_count)

In [None]:
np.log10(X).min()

## Data Transformation

### Log transformation and Min Max Scaler

Log transformation is primarily used to reduce skewness in highly skewed data, where some values are much larger than others. This transformation can make the data more "normal-like" or symmetric. Applying log transformation before Min-Max scaling can dramatically change the distribution of the data, potentially pulling in large values and spreading out smaller ones. This makes the subsequent Min-Max scaling step distribute the scaled values more evenly across the range [0, 1].

In [None]:
# note that the log of zero or negative values is undefined
try:
    assert (X > 0).all().all(), "X contains non-positive values"
    X_log = np.log10(X)
except AssertionError:
    X_log = np.log10(X + 1) # add 1 to avoid log(0)

neg_count = X_log[X < 0].count().sum()
na_count = X_log.isna().sum().sum()

print("Number of non-positive values:", neg_count)
print("Number of rows with NaN values:", na_count)

In [None]:
X_timestamp.isna().count()

In [None]:
X_log_scaled = MinMaxScaler().fit_transform(X).astype(float)
df_numeric = pd.DataFrame(X_log_scaled, columns=X.columns)
df_datetime = pd.DataFrame(X_timestamp, columns=["datetime"])
df_log_scaled = pd.concat([df_datetime, df_numeric], axis=1)
df_log_scaled.dropna(inplace=True)

In [None]:
df_log_scaled.to_csv("../data/ace/preprocessed/insitu_ace_log_scaled.csv", index=False)