# 04-normalize-transform.ipynb

Demonstrates:

- Data cleaning and normalization (Min-Max, Z-score, Log)
- Fourier transform (FFT) to inspect frequency content
- Wavelet transform (continuous wavelet via `scipy.signal.cwt` with Ricker wavelet)

It uses a synthetic example time series and shows how to apply these transforms to real data.

In [None]:
# Imports and synthetic dataset
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

np.random.seed(42)

# Synthetic CME-like data: one shared time axis (in hours) and three noisy series.
t = np.linspace(0, 10, 1000)
n_samples = t.size
two_pi = 2 * np.pi

# Two sinusoids (1.5 and 7 cycles per unit time) plus Gaussian noise.
# The three noise vectors are drawn in this fixed order so the seeded values are reproducible.
signal = (
    0.6 * np.sin(two_pi * 1.5 * t)
    + 0.3 * np.sin(two_pi * 7 * t)
    + 0.2 * np.random.randn(n_samples)
)

# Slowly varying speed and density; np.abs keeps both series non-negative.
cme_speed = np.abs(1000 + 300 * np.sin(0.2 * t) + 100 * np.random.randn(n_samples))
cme_density = np.abs(5 + 2 * np.cos(0.1 * t) + 0.5 * np.random.randn(n_samples))

df = pd.DataFrame(
    dict(time=t, signal=signal, cme_speed=cme_speed, cme_density=cme_density)
)

# preview the first rows
df.head()

## Cleaning and normalization functions

In [None]:
# Cleaning and normalization functions
def clean_data(df, method='median'):
    """Return a copy of `df` with missing values handled.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean; it is not modified.
    method : {'median', 'mean', 'drop'}
        'drop' removes every row that contains a missing value.
        'mean' / 'median' fill missing values with the per-column mean /
        median computed over the numeric columns.

    Raises
    ------
    ValueError
        If `method` is not one of the supported names.
    """
    if method == 'drop':
        return df.dropna()

    # Both remaining strategies are "compute per-column statistic, then fill".
    if method == 'mean':
        fill_values = df.mean(numeric_only=True)
    elif method == 'median':
        fill_values = df.median(numeric_only=True)
    else:
        raise ValueError("method must be one of ['mean','median','drop']")
    return df.fillna(fill_values)

def minmax_normalize(series):
    """Rescale `series` linearly so its minimum maps to 0 and its maximum to 1.

    Parameters
    ----------
    series : pandas.Series or numpy.ndarray
        Numeric values to rescale.

    Returns
    -------
    Same container type as `series`, with float values in [0, 1].
    A constant input (max == min) is returned as all zeros instead of the
    NaNs that a 0/0 division would produce.
    """
    lowest = series.min()
    span = series.max() - lowest
    if span == 0:
        # Constant input: avoid 0/0. Dividing by 1 keeps the float result type.
        span = 1
    return (series - lowest) / span

def zscore_normalize(series):
    """Standardize `series` to zero mean and unit (population) standard deviation.

    Parameters
    ----------
    series : pandas.Series or numpy.ndarray
        Numeric values to standardize.

    Returns
    -------
    Same container type as `series`, holding (x - mean) / std with ddof=0.
    When the standard deviation is exactly zero (constant input) the centered
    values, which are all zero, are returned instead of dividing by zero.
    """
    centered = series - series.mean()
    spread = series.std(ddof=0)
    if spread == 0:
        # Zero spread means every deviation is exactly zero; skip the 0/0 division.
        return centered
    return centered / spread

def log_normalize(series, eps=1e-6):
    """Log-transform `series` after shifting it to be strictly positive.

    The series is shifted so that its minimum equals `eps`, then the natural
    logarithm is applied. Callers should pass the raw series; no pre-shifting
    is needed.

    Parameters
    ----------
    series : pandas.Series or numpy.ndarray
        Numeric values to transform.
    eps : float, default 1e-6
        Positive offset given to the minimum value. The minimum maps to
        log(eps), so a very small `eps` turns the minimum into a large
        negative outlier; use a larger value to soften that.

    Raises
    ------
    ValueError
        If `eps` is not strictly positive (log would be undefined at the minimum).
    """
    if not eps > 0:
        raise ValueError("eps must be positive")
    # shift to positive then log
    shifted = series - series.min() + eps
    return np.log(shifted)

# apply cleaning (median fill; a no-op when the frame has no missing values)
df_clean = clean_data(df, method='median')

# apply normalizations on a copy so df_clean stays untouched
df_norm = df_clean.copy()
df_norm['speed_minmax'] = minmax_normalize(df_norm['cme_speed'])
df_norm['density_zscore'] = zscore_normalize(df_norm['cme_density'])
# log_normalize already shifts its input to be strictly positive,
# so the raw column is passed directly (no manual pre-shift).
df_norm['signal_log'] = log_normalize(df_norm['signal'])

df_norm[['time','signal','signal_log','cme_speed','speed_minmax','cme_density','density_zscore']].head()

## Plot original vs normalized

In [None]:
# Figure 1: raw signal overlaid with its log-normalized version
fig_signal, ax_signal = plt.subplots(figsize=(10, 4))
ax_signal.plot(df['time'], df['signal'], label='original signal')
ax_signal.plot(
    df_norm['time'],
    df_norm['signal_log'],
    label='log-normalized (shifted)',
    linewidth=1,
)
ax_signal.set_xlabel('time')
ax_signal.set_title('Signal: original vs log-normalized')
ax_signal.legend()
fig_signal.tight_layout()
plt.show()

# Figure 2: raw CME speed overlaid with its min-max scaled version
fig_speed, ax_speed = plt.subplots(figsize=(10, 4))
ax_speed.plot(df['time'], df['cme_speed'], label='cme_speed (original)')
ax_speed.plot(df_norm['time'], df_norm['speed_minmax'], label='cme_speed (min-max)')
ax_speed.set_xlabel('time')
ax_speed.set_title('CME Speed: original vs Min-Max normalized')
ax_speed.legend()
fig_speed.tight_layout()
plt.show()

## Fourier transform (FFT)
Compute amplitude spectrum to inspect dominant frequencies.

In [None]:
from numpy.fft import rfft, rfftfreq

# One-sided FFT of the raw signal column.
y = df['signal'].values
n = y.size
# Sampling interval from the first two time stamps (assumes uniformly spaced samples).
dt = df['time'].iloc[1] - df['time'].iloc[0]
yf = rfft(y)
xf = rfftfreq(n, d=dt)

plt.figure(figsize=(8,4))
plt.plot(xf, np.abs(yf))
plt.xlim(0, 20)
# The synthetic time axis is in hours, so rfftfreq yields cycles per hour, not Hz.
plt.xlabel('Frequency (cycles/hour)')
plt.ylabel('abs(FFT)')
plt.title('FFT amplitude spectrum of signal')
plt.tight_layout()
plt.show()

## Wavelet transform (CWT)
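Using `scipy.signal.cwt` with the Ricker wavelet (Mexican hat) to get a time-scale representation. Note that `scipy.signal.cwt` and `scipy.signal.ricker` were deprecated in SciPy 1.12 and removed in SciPy 1.15; PyWavelets (`pywt`) is the recommended replacement.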

In [None]:
# Aliased import: a bare `signal` would rebind the synthetic `signal` array to a module.
from scipy import signal as sp_signal


def _ricker_wavelet(points, a):
    """Sample a Ricker (Mexican hat) wavelet of width `a` at `points` centered points.

    Mirrors the formula of the former `scipy.signal.ricker`.
    """
    amplitude = 2 / (np.sqrt(3 * a) * np.pi ** 0.25)
    x_sq = (np.arange(points) - (points - 1.0) / 2) ** 2
    a_sq = a ** 2
    return amplitude * (1 - x_sq / a_sq) * np.exp(-x_sq / (2 * a_sq))


def _cwt_ricker(data, widths):
    """Continuous wavelet transform of 1-D `data` with Ricker wavelets, one row per width.

    NumPy-only fallback following the algorithm of the former `scipy.signal.cwt`:
    each row is `data` convolved with a wavelet of at most 10*width samples,
    cropped to the length of `data`.
    """
    data = np.asarray(data, dtype=float)
    out = np.empty((len(widths), data.size))
    for row, width in enumerate(widths):
        n_points = int(min(10 * width, data.size))
        kernel = _ricker_wavelet(n_points, width)[::-1]
        full = np.convolve(data, kernel, mode='full')
        # Crop the centered part; this offset matches SciPy's 'same' mode for even-length
        # kernels, where np.convolve(mode='same') would be shifted by one sample.
        start = (n_points - 1) // 2
        out[row] = full[start:start + data.size]
    return out


widths = np.arange(1, 128)
signal_values = df['signal'].values

# scipy.signal.cwt / ricker were deprecated in SciPy 1.12 and removed in 1.15:
# use them when present, otherwise fall back to the NumPy implementation above.
if hasattr(sp_signal, 'cwt') and hasattr(sp_signal, 'ricker'):
    cwtmatr = sp_signal.cwt(signal_values, sp_signal.ricker, widths)
else:
    cwtmatr = _cwt_ricker(signal_values, widths)

plt.figure(figsize=(10,6))
# extent puts time on x and scale on y (smallest width at the top row).
plt.imshow(np.abs(cwtmatr), extent=[df['time'].min(), df['time'].max(), widths.max(), widths.min()], aspect='auto')
plt.xlabel('time')
plt.ylabel('width (scale)')
plt.title('CWT (Ricker) amplitude')
plt.colorbar(label='abs(CWT)')
plt.tight_layout()
plt.show()

## Example: rolling FFT peak frequency
This shows how to generate a derived feature (peak frequency in rolling windows) for later modeling.

In [None]:
def rolling_fft_peak(signal_array, window_size=200, step=50, dt=0.01, remove_mean=False):
    """Dominant FFT frequency in each sliding window of a 1-D signal.

    Parameters
    ----------
    signal_array : array-like
        Uniformly sampled 1-D signal.
    window_size : int, default 200
        Number of samples per window; must be positive.
    step : int, default 50
        Hop between consecutive window starts, in samples.
    dt : float, default 0.01
        Sampling interval; returned frequencies are in cycles per unit of `dt`.
    remove_mean : bool, default False
        If True, subtract each window's mean before the FFT so a constant
        offset (the 0-frequency bin) cannot mask the oscillatory peak.
        The default keeps the original behavior, where a large offset makes
        every peak 0.0.

    Returns
    -------
    numpy.ndarray
        One peak frequency per full window; empty if the signal is shorter
        than `window_size`.

    Raises
    ------
    ValueError
        If `window_size` is not positive.
    """
    if window_size < 1:
        raise ValueError("window_size must be a positive integer")

    samples = np.asarray(signal_array)
    # Every window has exactly window_size samples, so the frequency axis is loop-invariant.
    freqs = np.fft.rfftfreq(window_size, d=dt)

    peaks = []
    for start in range(0, len(samples) - window_size + 1, step):
        seg = samples[start:start + window_size]
        if remove_mean:
            seg = seg - seg.mean()
        spectrum = np.abs(np.fft.rfft(seg))
        peaks.append(freqs[np.argmax(spectrum)])
    return np.array(peaks)

# Peak frequency per 256-sample window, hopping 64 samples at a time;
# `dt` is the sampling interval computed in the FFT cell.
rolling_kwargs = dict(window_size=256, step=64, dt=dt)
peaks = rolling_fft_peak(df['signal'].to_numpy(), **rolling_kwargs)
peaks[:8]

In [None]:
# Save normalized dataset to CSV for downstream notebooks
# NOTE(review): absolute path kept because downstream notebooks read this location;
# consider a configurable data directory.
out_path = '/mnt/data/normalized_cme_dataset.csv'
# DataFrame.to_csv does not create missing directories, so ensure the target folder exists.
Path(out_path).parent.mkdir(parents=True, exist_ok=True)
df_norm.to_csv(out_path, index=False)
print('Saved normalized dataset to', out_path)

### Notes & next steps
- Replace the synthetic `df` with your parsed DataFrame (from `01-parse-cactus-cmes`) by loading your CSV or exchanging the creation cell.
- For other wavelet families or finer analysis, consider PyWavelets (imported as `pywt`): `pip install PyWavelets`.
- Use the `df_norm` output CSV `/mnt/data/normalized_cme_dataset.csv` in downstream notebooks `02` & `03`.