<a href="https://colab.research.google.com/github/RodolfoFerro/deep-solar/blob/main/notebooks/Data_exploration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data exploration

> This is an exploratory version; the final version will be uploaded soon.

## Current pipeline method

At present, the raw data are converted to physics units using a calibration table, summed over the sensor segments, and recast as phase space distribution functions. A variety of ad hoc operations are invoked in order to tweak the background subtraction, isolate the physically relevant sub-range in voltage, and remove transients. Then a Gaussian peak fitting is performed. The parameters of the Gaussian map directly to _(n, w, |v|)_. Finally, the ratios between the signal peak values are fed into a table lookup to estimate the flow angle and complete the velocity vector.

In [None]:
!pip install wget cdflib dtw-python -q

> **Notes:** 
> - Data from [_Wind dataset_](https://cdaweb.gsfc.nasa.gov/pub/data/wind/mfi/mfi_h2/2022/) is available from 1994 (1994-11-13) to 2022 (2022-09-17).
> - Data from [_DSCOVR magnetic field dataset_](https://cdaweb.gsfc.nasa.gov/pub/data/dscovr/h0/mag/2022/) is available from 2015 (2015-06-08) to 2022 (2022-09-17).

In [None]:
from datetime import datetime
from pathlib import Path
import re

import cdflib
from dtw import dtw
from dtw import rabinerJuangStepPattern
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import wget
import xarray as xr

# The bare 'seaborn' style name was deprecated in Matplotlib 3.6 and later
# removed; fall back to its renamed equivalent when it is not available.
_seaborn_style = 'seaborn' if 'seaborn' in plt.style.available else 'seaborn-v0_8'
plt.style.use(_seaborn_style)

In [None]:
# Observation day to analyse, written as an ISO date (YYYY-MM-DD).
raw_date = '2022-09-16'

# Parse the date once: this validates it and yields both the compact stamp
# used in the file names and the year used in the archive directory.
obs_day = datetime.strptime(raw_date, '%Y-%m-%d')
date = obs_day.strftime('%Y%m%d')
year = obs_day.year

# NOTE(review): the version suffixes (_v03 / _v01) are fixed here and may
# differ for other dates -- confirm against the archive listing.
wind_url = f'https://cdaweb.gsfc.nasa.gov/pub/data/wind/mfi/mfi_h2/{year}/wi_h2_mfi_{date}_v03.cdf'
mfield_url = f'https://cdaweb.gsfc.nasa.gov/pub/data/dscovr/h0/mag/{year}/dscovr_h0_mag_{date}_v01.cdf'


def _download_once(url):
    """Return the local file name for `url`, downloading only if it is missing.

    The local name is the last path component of the URL, looked up in the
    current working directory, so re-running the cell does not fetch the
    same file again.
    """
    local_name = url.rsplit('/', 1)[-1]
    if Path(local_name).exists():
        return local_name
    return wget.download(url)


wind_filename = _download_once(wind_url)
mfield_filename = _download_once(mfield_url)

print(f'Files from {raw_date} to be used:')
print(' wind ->', wind_filename)
print(' magnetic field ->', mfield_filename)

In [None]:
# Read the downloaded Wind CDF file into an xarray dataset, asking cdflib to
# convert epochs to datetimes and fill values to NaN.
wind_cdf_data = cdflib.cdf_to_xarray(
    wind_filename,
    fillval_to_nan=True,
    to_datetime=True,
)
wind_cdf_data

In [None]:
# Tabular view of the dataset: the three 'BGSE' columns renamed to x, y, z,
# plus the 'BF1' variable as a fourth column.
wind_data = (
    wind_cdf_data['BGSE']
    .to_pandas()
    .set_axis(['x', 'y', 'z'], axis=1)
    .assign(BF1=wind_cdf_data['BF1'].to_pandas())
)

print('Wind data:')
wind_data

In [None]:
# Quick look at the 'BF1' column plotted against the frame's index
wind_data['BF1'].plot()

In [None]:
# Euclidean length of every (x, y, z) row, stored as a new 'norm' column
wind_data['norm'] = np.linalg.norm(
    wind_data.loc[:, ['x', 'y', 'z']].to_numpy(),
    axis=1,
)
wind_data

In [None]:
from sklearn.preprocessing import MinMaxScaler


# Fill values were converted to NaN when the CDF file was loaded. Drop any
# row that still holds a NaN so missing samples are not passed through the
# scaler into model training, where they can turn the loss into NaN.
model_features = wind_data[['norm', 'BF1']].dropna()

# Rescale each feature column to the [0, 1] range.
wind_data_model = MinMaxScaler().fit_transform(model_features)

In [None]:
# Display the scaled feature array returned by fit_transform
wind_data_model

In [None]:
# Model and performance
import tensorflow as tf
from tensorflow.keras import layers, losses
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report


# Seed TensorFlow's global generator so that weight initialisation is more
# repeatable between runs (same seed value as the split below).
tf.random.set_seed(42)

# Number of features per sample, taken from the scaled array instead of
# being hard-coded.
data_length = wind_data_model.shape[1]

w_train, w_test = train_test_split(wind_data_model, test_size=0.2, random_state=42)

# Named `input_layer` so the built-in `input` function is not shadowed.
input_layer = tf.keras.layers.Input(shape=(data_length,))

# Encoder layers: widen to 16 units, then compress to a 4-unit code
encoder = tf.keras.Sequential([
    layers.Dense(16, activation='relu'),
    layers.Dense(8, activation='relu'),
    layers.Dense(4, activation='relu')])(input_layer)

# Decoder layers: mirror the encoder; the sigmoid output keeps values in
# (0, 1), the same range as the min-max scaled inputs
decoder = tf.keras.Sequential([
    layers.Dense(8, activation='relu'),
    layers.Dense(16, activation='relu'),
    layers.Dense(data_length, activation='sigmoid')])(encoder)

# Create the autoencoder
autoencoder = tf.keras.Model(inputs=input_layer, outputs=decoder)

In [None]:
# Configure training: Adam optimiser, MSLE as the loss, MSE tracked as a metric
autoencoder.compile(loss='msle', optimizer='adam', metrics=['mse'])

# Train the network to reproduce its own input; the held-out split is used
# the same way for validation.
history = autoencoder.fit(
    x=w_train,
    y=w_train,
    validation_data=(w_test, w_test),
    batch_size=64,
    epochs=10,
)

In [None]:
# Training and validation loss per epoch
for curve_key in ('loss', 'val_loss'):
    plt.plot(history.history[curve_key])
plt.legend(['loss', 'val_loss'])
plt.ylabel('MSLE Loss')
plt.xlabel('Epochs')
plt.show()

In [None]:
def find_threshold(model, x_train_scaled, n_std=3):
    """Derive an anomaly threshold from reconstruction errors.

    The threshold is the mean of the per-sample MSLE reconstruction errors
    plus `n_std` standard deviations.

    Args:
        model: Object exposing `predict(x)` that returns reconstructions of
            its input.
        x_train_scaled: Samples to reconstruct, already scaled.
        n_std: Number of standard deviations added to the mean error.
            Defaults to 3.

    Returns:
        The threshold as a NumPy floating-point scalar.
    """
    reconstructions = model.predict(x_train_scaled)
    # Loss of each individual instance. Arguments follow the documented
    # (y_true, y_pred) order; MSLE is symmetric, so values match the swapped
    # call up to floating-point precision.
    reconstruction_errors = tf.keras.losses.msle(
        x_train_scaled, reconstructions).numpy()
    # Threshold for anomaly scores
    return np.mean(reconstruction_errors) + n_std * np.std(reconstruction_errors)

def get_predictions(model, x_test_scaled, threshold):
    """Flag samples whose reconstruction error exceeds `threshold`.

    Args:
        model: Object exposing `predict(x)` that returns reconstructions of
            its input.
        x_test_scaled: Samples to score, already scaled.
        threshold: Error value above which a sample counts as an anomaly.

    Returns:
        A tuple `(errors, preds)`: `errors` holds the per-sample MSLE values
        as returned by `tf.keras.losses.msle`, and `preds` is a pandas
        Series of integers where 1 marks an anomaly (error > threshold) and
        0 marks a normal sample.
    """
    predictions = model.predict(x_test_scaled)
    # Loss of each individual instance, with arguments in the documented
    # (y_true, y_pred) order
    errors = tf.keras.losses.msle(x_test_scaled, predictions)
    # 1 = anomaly (error above threshold), 0 = normal
    anomaly_mask = pd.Series(errors.numpy()) > threshold
    preds = anomaly_mask.astype(int)
    return errors, preds

In [None]:
# Anomaly threshold derived from the reconstruction errors on the training split
threshold = find_threshold(autoencoder, w_train)
print(f"Threshold: {threshold}")

In [None]:
# Per-sample errors and 0/1 flags (1 when the error exceeds the threshold)
# for the held-out split
errors, predictions = get_predictions(autoencoder, w_test, threshold)

In [None]:
# Reconstruction error of the first 1000 test samples against the threshold.
# Labels, title and legend make the figure readable on its own; plt.show()
# keeps the Line2D repr out of the cell output.
fig = plt.figure(figsize=(20, 5))
plt.plot(errors[:1000], label='reconstruction error')
plt.axhline(y=threshold, color='r', linestyle='-', label='threshold')
plt.xlabel('Test sample')
plt.ylabel('MSLE')
plt.title('Reconstruction error on the test split (first 1000 samples)')
plt.legend()
plt.show()

In [None]:
# Number of scored samples and how many of them were flagged
len(predictions), predictions.sum()

In [None]:
# Reconstruct the training split and compute the per-sample MSLE, passing
# the arguments in the documented (y_true, y_pred) order.
reconstructions = autoencoder.predict(w_train)
reconstruction_errors = tf.keras.losses.msle(w_train, reconstructions)

In [None]:
# First training sample, its reconstruction, and the corresponding error value
w_train[0], reconstructions[0], reconstruction_errors[0].numpy()

In [None]:
# Histogram of the first 100 training reconstruction errors
n, bins, patches = plt.hist(reconstruction_errors[:100], density=True)

# Labels describe what is plotted (they were left over from a template)
plt.xlabel('Reconstruction error (MSLE)')
plt.ylabel('Probability density')
plt.title('Histogram of reconstruction errors')
plt.grid(True)
plt.show()

In [None]:
# Histogram of the reconstructed values, one series per feature column.
# The decoder ends in a sigmoid, so values lie in (0, 1): the former fixed
# axis limits (x in 40..160) and the mu/sigma annotation were template
# leftovers that pushed all the data out of view, so the axes now autoscale.
# NOTE(review): labels assume the two feature columns are ('norm', 'BF1'),
# in that order -- adjust if the feature set changes.
n, bins, patches = plt.hist(
    reconstructions, density=True, alpha=0.75, label=['norm', 'BF1'])

plt.xlabel('Reconstructed value (scaled)')
plt.ylabel('Probability density')
plt.title('Histogram of reconstructions')
plt.legend()
plt.grid(True)
plt.show()

## Proposals

- Recurrent neural network
- ARIMA Models

In [None]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import numpy as np

# Window of training samples to inspect (defined once, reused for every trace)
start, stop = 50, 450
x = np.arange(start, stop)

# Top panel: second feature column of the input against its reconstruction.
# Bottom panel: per-sample reconstruction error with the anomaly threshold.
# `add_trace` replaces the deprecated `append_trace`.
fig = make_subplots(rows=2, cols=1)
fig.add_trace(
    go.Scatter(
        x=x,
        y=w_train[start:stop, 1],
        name='input',
        line=dict(color='royalblue', width=4, dash='dot')),
    row=1, col=1)
fig.add_trace(
    go.Scatter(x=x, y=reconstructions[start:stop, 1], name='reconstruction'),
    row=1, col=1)
fig.add_trace(
    # Convert to a plain NumPy array so plotly does not receive a tensor slice
    go.Scatter(
        x=x,
        y=np.asarray(reconstruction_errors[start:stop]),
        name='reconstruction error'),
    row=2, col=1)
fig.add_hline(y=threshold, row=2, col=1)
fig.show()