# ACE Exploration


## Libraries and global variables

In [None]:
# Standard library imports
import sys
import os

# Third-party imports
from contextlib import suppress
import warnings
import pandas as pd
import numpy as np
from functools import reduce
# import seaborn as sns
import matplotlib.pyplot as plt
# from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA, KernelPCA
# from sklearn.model_selection import cross_val_score
# from sklearn.pipeline import make_pipeline
# from keras.layers import Input, Dense
# from keras.models import Model

# Local application imports
sys.path.append("../src/scripts")
from utilities import (
    parse_hdf_data,
    merge_dataframes,
    sort_columns_except_key,
    add_datetime_column,
)

# Set the warning filter to ignore all warnings
warnings.filterwarnings("ignore")

In [None]:
# Notebook-wide constants.
MISSING_FLAG = -999.9  # sentinel value the raw data uses for missing measurements
N_SPLITS = 4  # split count (not referenced by the cells shown in this section)

## Data Import

In [None]:
# Read the raw ACE files. MAG, SWEPAM and EPAM each come as a single 1hr
# file; SWICS is spread over every file of a 1hr and a 2hr directory.
data_dir = "../data/ace/raw"
swics_1hr_dir = f"{data_dir}/swics_1hr"
swics_2hr_dir = f"{data_dir}/swics_2hr"

mag_df = parse_hdf_data(f"{data_dir}/MAG_data_1hr.txt")
swepam_df = parse_hdf_data(f"{data_dir}/SWEPAM_data_1hr.txt")
epam_df = parse_hdf_data(f"{data_dir}/EPAM_data_1hr.txt")

swics_dfs = []
for swics_dir in (swics_1hr_dir, swics_2hr_dir):
    # os.listdir() returns entries in arbitrary order. Sorting makes the
    # concatenation order reproducible, which matters because the later
    # drop_duplicates(subset="datetime") keeps the first record it sees.
    for file_name in sorted(os.listdir(swics_dir)):
        file_path = f"{swics_dir}/{file_name}"
        # Skip hidden files (e.g. .DS_Store, .ipynb_checkpoints) and
        # sub-directories, which are not data files.
        if file_name.startswith(".") or not os.path.isfile(file_path):
            continue
        swics_dfs.append(parse_hdf_data(file_path))
swics_df = pd.concat(swics_dfs)

In [None]:
# Parallel lists: ACE_DATASETS_NAMES[i] is the label of ACE_DATASETS[i].
ACE_DATASETS = [
    mag_df,
    swepam_df,
    epam_df,
    swics_df,
]
ACE_DATASETS_NAMES = [
    "MAG",
    "SWEPAM",
    "EPAM",
    "SWICS",
]

In [None]:
# Cast the timestamp component columns to integers and, where a dataset
# carries a "Quality" column, store that flag as text.
time_part_cols = ["year", "day", "hr", "min", "sec"]
for frame in ACE_DATASETS:
    frame[time_part_cols] = frame[time_part_cols].astype(int)

    # not every dataset has a Quality column
    if "Quality" in frame.columns:
        frame["Quality"] = frame["Quality"].astype(str)

In [None]:
# Add a datetime column to every dataset, then remove the raw time columns
# it makes redundant.
redundant_time_cols = ["year", "day", "hr", "min", "sec", "fp_year", "fp_doy"]
for frame in ACE_DATASETS:
    # NOTE(review): the in-place drop only affects `frame` if
    # add_datetime_column returns the same object it was given - TODO confirm.
    stamped = add_datetime_column(frame)
    stamped.drop(columns=redundant_time_cols, inplace=True)

# swics_df may contain duplicate records due to the nature of the 1.0 and 2.0
# data collection; keep the first record per timestamp
swics_df.drop_duplicates(subset=["datetime"], inplace=True)

## Data Cleaning

### Descriptives

In [None]:
# Print a structural summary and descriptive statistics for every dataset.
for df, df_name in zip(ACE_DATASETS, ACE_DATASETS_NAMES):
    print(f"Dataframe: {df_name}")
    # DataFrame.info() writes its report to stdout and returns None, so it is
    # called directly; wrapping it in display() only rendered a stray "None".
    df.info()
    display(df.describe())
    print("\n" + ("-" * 20))

### Retain *Good* Quality data

Good data is flagged by the researchers with a value of 0. 

In [None]:
# Swap the numeric null sentinels for np.nan ahead of the quality flag
# analysis, storing the cleaned copy back into ACE_DATASETS.
null_flags = [-9999.9, -999.9]
for idx, frame in enumerate(ACE_DATASETS):
    ACE_DATASETS[idx] = frame.replace(null_flags, np.nan)

# rebind the per-instrument names to the cleaned copies
mag_df, swepam_df, epam_df, swics_df = ACE_DATASETS

In [None]:
# # THE GOOD CODE
# qf_cols = swics_df.filter(regex="^qf_").columns
# swics_df = swics_df[(swics_df[qf_cols].isna() | swics_df[qf_cols].eq(0)).any(axis=1)]

# for col in swics_df[qf_cols]:
#     swics_df = swics_df[(swics_df[col] == 0) | (swics_df[col].isna())]

In [None]:
# Keep only the records flagged as good quality (flag value 0), then drop the
# flag columns, and store the filtered frame back into ACE_DATASETS.
for c, (df, df_name) in enumerate(zip(ACE_DATASETS, ACE_DATASETS_NAMES)):

    if df_name == "SWICS":
        qf_cols = df.filter(regex="^qf_").columns
        err_cols = df.filter(regex="_err$").columns

        # A flag counts as good when it is 0, or NaN (flags can be missing
        # because of the two instrument versions). A row is kept only when
        # every one of its quality flags is good.
        good_flags = df[qf_cols].isna() | df[qf_cols].eq(0)

        # Using .all() (rather than .any() followed by a per-column loop)
        # selects the same rows and keeps the cell safe to re-run: once the
        # qf_ columns are gone, nothing more is removed.
        df = df[good_flags.all(axis=1)].drop(columns=qf_cols.union(err_cols))

    elif "Quality" in df.columns:  # not all datasets have the quality flag
        # Compare element-wise and numerically so that 0, 0.0, "0" and "0.0"
        # are all recognised. The previous `str(df["Quality"]) == "0.0"`
        # compared the Series' printed representation, raised a KeyError that
        # was silently suppressed, and therefore never filtered anything.
        is_good = pd.to_numeric(df["Quality"], errors="coerce").eq(0)
        df = df[is_good].drop(columns=["Quality"])

    ACE_DATASETS[c] = df

# rebind the per-instrument names to the filtered frames
mag_df, swepam_df, epam_df, swics_df = ACE_DATASETS

### Join data

In [None]:
# Unique timestamps present in each dataset
mag_dates, swepam_dates, epam_dates, swics_dates = (
    frame["datetime"].unique() for frame in ACE_DATASETS
)

# Timestamps shared by the three 1hr-interval datasets
hourly_dates = (mag_dates, swepam_dates, epam_dates)
common_dates_1hr = reduce(np.intersect1d, hourly_dates)

# Timestamps shared by all four datasets: the 1hr intersection narrowed
# further by the SWICS timestamps
common_date_2hr = np.intersect1d(common_dates_1hr, swics_dates)

for shared_dates in (common_dates_1hr, common_date_2hr):
    print(len(shared_dates))

In [None]:
# Merge the 1hr and 2hr interval datasets on their shared timestamp column,
# then order the columns while leaving the key column aside.
join_key = "datetime"
insitu_df = merge_dataframes(ACE_DATASETS, join_key)
df = sort_columns_except_key(insitu_df, join_key)
df.shape

In [None]:
# drop additional descriptive columns
# drop_cols = ['Missing_Proportion', 'Missing_Count']
# df.drop(columns=drop_cols, inplace=True)

### Handling Missing Values

Missing data is flagged with the value -999.900, which was replaced with NaN above. After dropping the records not labeled as good quality, select the features of interest and drop any rows that still contain missing values.

In [None]:
# Columns carried into the analysis: the timestamp plus the measurements.
measurement_cols = [
    "proton_speed",
    "proton_density",
    "proton_temp",
    "O7to6",
    "C6to5",
    "FetoO",
    "avqFe",
]
features = ["datetime", *measurement_cols]

# Keep only complete rows, then take the numeric columns as the model matrix.
df_features = df.loc[:, features].dropna()
X = df_features.select_dtypes(include="number")

## Data Transformation

### Log transformation and Min Max Scaler

Log transformation is primarily used to reduce skewness in highly skewed data, where some values are much larger than others. This transformation can make the data more "normal-like" or symmetric. Applying log transformation before Min-Max scaling can dramatically change the distribution of the data, potentially pulling in large values and spreading out smaller ones. This makes the subsequent Min-Max scaling step distribute the scaled values more evenly across the range [0, 1].

In [None]:
# The logarithm is undefined for zero and negative values, so check that
# every entry is strictly positive before taking the base-10 log.
assert (X > 0).all(axis=None), "X contains non-positive values"
X = np.log10(X)

In [None]:
# Learn the per-feature minimum and maximum, then rescale the data with them
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

## Dimensionality Reduction

### Dimensionality Reduction Using PCA

In [None]:
# Fit a PCA that keeps every component
pca = PCA()
pca.fit(X)

# Project the data onto the components and keep the result as a DataFrame
PCA_components = pd.DataFrame(pca.transform(X))

# Bar chart of the variance ratio explained by each component.
# NOTE: this rebinds `features` from the list of column names to the
# component indices.
features = range(pca.n_components_)
plt.bar(features, pca.explained_variance_ratio_, color="black")
plt.xticks(features)
plt.xlabel("PCA features")
plt.ylabel("variance %")

plt.show()

In [None]:
# Running total of the explained variance ratio, component by component
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)

# Position of the first component at which the running total exceeds 95%,
# converted from a 0-based index to a component count
above_threshold = np.flatnonzero(cumulative_variance > 0.95)
num_components = above_threshold[0] + 1
print("Number of components to explain 95% Variance: ", num_components)

In [None]:
# Refit PCA keeping only the number of components selected above, with
# whitening enabled
pca = PCA(whiten=True, n_components=num_components)

# Fit and project in one step
X_pca = pca.fit_transform(X)

# Compare the shapes before and after the projection
print("original shape:   ", X.shape)
print("transformed shape:", X_pca.shape)

# The transformed data has been reduced to `num_components` dimensions.
# NOTE: this rebinds `df`, which previously held the joined dataset.
df = pd.DataFrame(X_pca)
print(df.head())

In [None]:
# Creating a DataFrame for better plotting
# pc_df = pd.DataFrame(X_pca, columns=[f"PC{i}" for i in range(1, num_components + 1)])
# sns.pairplot(pc_df)
# plt.show()

In [None]:
# Scatter plot of the first two principal components
plt.figure(figsize=(8, 6))
first_pc = X_pca[:, 0]
second_pc = X_pca[:, 1]
plt.scatter(first_pc, second_pc, alpha=0.5)
plt.title("PCA Result")
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.grid(True)
plt.show()

### Dimensionality Reduction Using Kernel PCA

In [None]:
# Fit Kernel PCA with n_components=None to compute all components
kpca = KernelPCA(n_components=None, kernel="rbf")
kpca.fit(X)

# Get eigenvalues. scikit-learn 1.0 renamed `lambdas_` to `eigenvalues_` and
# 1.2 removed the old name, so prefer the new attribute and fall back to the
# old one only on releases that predate the rename.
if hasattr(kpca, "eigenvalues_"):
    eigenvalues = kpca.eigenvalues_
else:
    eigenvalues = kpca.lambdas_

# Plot eigenvalues
plt.plot(eigenvalues, "bo-")
plt.xlabel("Index")
plt.ylabel("Eigenvalue")
plt.show()

In this plot, the x-axis represents the index of each component (in descending order of eigenvalue), and the y-axis represents the corresponding eigenvalue. You typically choose the number of components at the "elbow" of the plot: the point beyond which the eigenvalues level off, so that keeping another component adds little.

### Dimensionality Reduction Using Autoencoders

In [None]:
# # Define the size of the encoded representation
# encoding_dim = 2  # 2-dimensional encoded representation

# # Define the input layer
# input_img = Input(shape=(X.shape[1],))

# # Define the encoded layer
# encoded = Dense(encoding_dim, activation="relu")(input_img)

# # Define the decoded layer
# decoded = Dense(X.shape[1], activation="sigmoid")(encoded)

# # Define the autoencoder model
# autoencoder = Model(input_img, decoded)

# # Define the encoder model
# encoder = Model(input_img, encoded)

# # Define the decoder model
# encoded_input = Input(shape=(encoding_dim,))
# decoder_layer = autoencoder.layers[-1]
# decoder = Model(encoded_input, decoder_layer(encoded_input))

# # Compile the autoencoder
# autoencoder.compile(optimizer="adadelta", loss="binary_crossentropy")

# # Train the autoencoder
# autoencoder.fit(X, X, epochs=50, batch_size=256, shuffle=True)

# # Use the encoder to reduce the dimensionality of the data
# X_encoded = encoder.predict(X)

# print("original shape:   ", X.shape)
# print("transformed shape:", X_encoded.shape)

## K-Means cluster
