In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tqdm
from datetime import datetime

from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score, confusion_matrix, ConfusionMatrixDisplay
from scipy.stats import ttest_ind, ks_2samp, levene
from scipy.stats import entropy, normaltest, jarque_bera
from statsmodels.tsa.stattools import adfuller
from scipy.signal import periodogram

In [2]:
# Read the two training parquet files (paths relative to the notebook's working directory)
# into the notebook-level frames X_train and y_train.
X_train = pd.read_parquet("data/X_train.parquet")
y_train = pd.read_parquet("data/y_train.parquet")

In [None]:
# X_test = pd.read_parquet("data/X_test.reduced.parquet")
# y_test = pd.read_parquet("data/y_test.reduced.parquet")

# df = X_test.loc[10001:10001]
# df = df.reset_index()

In [None]:
# df_features_list = []
# for n in tqdm.tqdm(windows):
#     mask_pre  = (pos >= -n) & (pos <= -1)
#     mask_post = (pos >= 0)  & (pos <=  n-1)

#     pre = (df[mask_pre]
#            .groupby('id')['value']
#            .agg(agg_funcs)
#            .sort_index())
#     pre.columns = [f'value_{c}_pre_{n}' for c in pre.columns]

#     post = (df[mask_post]
#             .groupby('id')['value']
#             .agg(agg_funcs)
#             .sort_index())
#     post.columns = [f'value_{c}_post_{n}' for c in post.columns]

#     # vectorized slopes
#     pre[f'value_slope_pre_{n}']  = (pre[f'value_last_pre_{n}']  - pre[f'value_first_pre_{n}'])  / pre[f'value_count_pre_{n}']
#     post[f'value_slope_post_{n}'] = (post[f'value_last_post_{n}'] - post[f'value_first_post_{n}']) / post[f'value_count_post_{n}']

#     df_features_list.extend([pre, post])

# df_features = pd.concat(df_features_list, axis=1)

## Ruptures

In [None]:
# ! pip install ruptures
import ruptures as rpt

In [None]:
# Take the label-based slice 10001..10010 of `data` as the working frame `df`, then display it.
# NOTE(review): `data` is not assigned in any cell shown above (only X_train / y_train are
# loaded), so this raises NameError on a fresh kernel — confirm which frame was intended.
df = data.loc[10001:10010]
df

In [None]:
# Synthetic-signal example from the ruptures API, kept for reference (disabled):
# n = 500  # number of samples
# n_bkps, sigma = 3, 5  # number of change points, noise standard deviation
# signal, bkps = rpt.pw_constant(n, dim, n_bkps, noise_std=sigma)

# Change point detection on the `value` column of the working frame `df`.
model = "rbf"  # cost model passed to the search method; alternatives: "l1", "linear", "normal", "ar", ...
signal = df[['value']]

# Several search methods are listed below; only Binary Segmentation is active,
# the others are left commented out as alternatives.

# Dynamic Programming (disabled)
# algo = rpt.Dynp(model=model, min_size=3, jump=5).fit(signal)
# bkps = algo.predict(n_bkps=1)

# Linearly penalized segmentation, Pelt (disabled)
# algo = rpt.Pelt(model=model, min_size=3, jump=5).fit(signal)
# bkps = algo.predict(pen=2)

# Binary Segmentation (active): fit on the signal and request a single breakpoint
algo = rpt.Binseg(model=model).fit(signal)
bkps = algo.predict(n_bkps=1)

# Bottom-Up segmentation (disabled)
# algo = rpt.BottomUp(model=model).fit(signal)
# bkps = algo.predict(n_bkps=1)

# Window-based change point detection (disabled)
# algo = rpt.Window(width=40, model=model).fit(signal)
# bkps = algo.predict(n_bkps=1)

# Show the breakpoint indices returned by the active method
print(bkps)

In [None]:
# Plot `signal` together with the breakpoints `bkps` computed in the previous cell.
rpt.show.display(signal, bkps, figsize=(10, 6))
plt.show()

## Changefinder

In [5]:
pip install changefinder==0.1

Collecting changefinder==0.1
  Downloading changefinder-0.1.tar.gz (3.8 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'error'
Note: you may need to restart the kernel to use updated packages.


  error: subprocess-exited-with-error
  
  python setup.py egg_info did not run successfully.
  exit code: 1
  
  [1 lines of output]
  ERROR: Can not execute `setup.py` since setuptools is not available in the build environment.
  [end of output]
  
  note: This error originates from a subprocess, and is likely not a problem with pip.
error: metadata-generation-failed

Encountered error while generating package metadata.

See above for output.

note: This is an issue with the package mentioned above, not pip.
hint: See above for details.


## Fast Fourier Transforms (FFT)

In [None]:
from scipy.signal import periodogram

def extract_main_frequencies(time_series_data, sampling_rate=1, num_main_freqs=3):
    """
    Extract the dominant frequencies of a time series from its periodogram.

    Args:
        time_series_data (array-like): The input time series (e.g. np.array or pd.Series).
        sampling_rate (float): The sampling rate of the series (samples per unit time).
        num_main_freqs (int): The number of strongest frequency bins to report.

    Returns:
        pd.DataFrame: A single-row frame (index 0) with, for each rank i in
        1..num_main_freqs, the columns ``freq_i``, ``period_i`` and ``power_i``,
        ordered from the highest to the lowest power spectral density.
        ``period_i`` is ``1 / freq_i`` (``inf`` for the zero-frequency bin).
        When the series yields fewer than ``num_main_freqs`` frequency bins, the
        missing ranks are filled with NaN instead of raising IndexError.
    """
    # Estimate power spectral density using a periodogram
    frequencies, power_spectral_density = periodogram(time_series_data, fs=sampling_rate)

    # Indices of the strongest bins, highest power first
    top_freq_indices = np.argsort(power_spectral_density)[::-1][:num_main_freqs]

    main_frequencies = frequencies[top_freq_indices]
    main_powers = power_spectral_density[top_freq_indices]

    # Short (or empty) series produce fewer bins than requested: pad with NaN so the
    # output always has the same columns and can be concatenated across groups.
    n_missing = num_main_freqs - len(top_freq_indices)
    if n_missing > 0:
        padding = np.full(n_missing, np.nan)
        main_frequencies = np.concatenate([main_frequencies, padding])
        main_powers = np.concatenate([main_powers, padding])

    # The zero-frequency bin has an infinite period; keep that value but do not
    # emit a divide-by-zero RuntimeWarning for it.
    with np.errstate(divide='ignore'):
        main_periods = 1 / main_frequencies

    results = {}
    for i in range(num_main_freqs):
        results[f'freq_{i+1}'] = main_frequencies[i]
        results[f'period_{i+1}'] = main_periods[i]
        results[f'power_{i+1}'] = main_powers[i]

    df_freq = pd.DataFrame(results, index=[0])
    return df_freq

# Build per-id frequency features for the pre-boundary (period == 0) and
# post-boundary (period == 1) parts of each series, then compare them.
period_features = []
for period in (0, 1):
    suffix = '_pre' if period == 0 else '_post'
    freq_features = (
        df[df['period'] == period]
        .groupby('id')
        .apply(lambda x: extract_main_frequencies(x['value']))
        .droplevel(1)                      # drop the constant inner index (0) added by the helper
        .filter(regex='^freq|^power')      # keep frequency and power columns, drop periods
        .add_suffix(suffix)
    )
    period_features.append(freq_features)

# Concatenate once, side by side, instead of growing a frame from an empty DataFrame
df_features = pd.concat(period_features, axis=1)

# Ranks 1..3 match the default num_main_freqs=3 of extract_main_frequencies
for n in range(1, 4):
    freq_pre = df_features[f'freq_{n}_pre']
    freq_post = df_features[f'freq_{n}_post']
    freq_diff = freq_post - freq_pre
    freq_avg = (freq_post + freq_pre) / 2

    df_features[f'diff_freq_{n}'] = freq_diff
    df_features[f'avg_freq_{n}'] = freq_avg
    # Relative change with respect to the pre/post average, rounded to 4 decimals;
    # a zero average is masked to NaN so the ratio is NaN rather than a division by zero.
    df_features[f'pct_change_freq_{n}'] = (freq_diff / freq_avg.where(freq_avg != 0)).round(4)

feature_cols = df_features.filter(regex='^pct_change.*').columns
df_features[feature_cols]

# main_freq_info, freqs, psd = extract_main_frequencies(signal_0, sampling_rate=1, num_main_freqs=3)
# print("Main Frequencies Information:")
# for key, value in main_freq_info.items():
#     print(f"{key}: {value:.4f}")
    
# main_freq_info, freqs, psd = extract_main_frequencies(signal_1, sampling_rate=1, num_main_freqs=3)
# print("Main Frequencies Information:")
# for key, value in main_freq_info.items():
#     print(f"{key}: {value:.4f}")


# Plotting the Periodogram (optional)
# plt.figure(figsize=(10, 6))
# plt.plot(freqs, psd)
# plt.title('Periodogram of Time Series Data')
# plt.xlabel('Frequency (Hz)')
# plt.ylabel('Power Spectral Density')
# plt.grid(True)
# plt.show()

In [None]:
# Named aggregations as (output_name, function) pairs, in the form accepted by
# groupby(...)[col].agg(...). String entries are pandas built-in reducers; the lambdas are
# written against a pandas Series (several call Series methods such as y.kurt() / y.autocorr()).
agg_funcs = [
    # Basic descriptive statistics
    ('mean', 'mean'),
    ('std', 'std'),
    ('max', 'max'),
    ('min', 'min'),
    ('median', 'median'),
    ('skew', 'skew'),
    ('kurtosis', lambda y: y.kurt()),
    ('count', 'count'),
    ('first', 'first'),
    ('last', 'last'),
    # Statistics of the first differences and overall spread
    ('trend', lambda y: np.mean(np.diff(y))),
    ('volatility', lambda y: np.std(np.diff(y))),
    ('range', lambda y: np.max(y) - np.min(y)),
    # Change dynamics
    ('mean_absolute_change', lambda y: np.mean(np.abs(np.diff(y)))),
    ('max_jump', lambda y: np.max(np.abs(np.diff(y)))),
    # Complexity
    ('num_turning_points', lambda y: np.sum(np.diff(np.sign(np.diff(y))) != 0)),
    ('upward_steps', lambda y: np.sum(np.diff(y) > 0)),
    ('downward_steps', lambda y: np.sum(np.diff(y) < 0)),
    # Distribution shape
    # Entropy of a 20-bin histogram of the values. This was previously a nested
    # `lambda y: lambda y: ...`, which returned a function object instead of a number.
    ('entropy', lambda y: entropy(np.histogram(y, bins=20, density=True)[0])),
    ('normality', lambda y: normaltest(y)[0]),  # test statistic only
    # Autocorrelation at lags 1-3 and the ADF test statistic
    ('auto_1', lambda y: y.autocorr(lag=1)),
    ('auto_2', lambda y: y.autocorr(lag=2)),
    ('auto_3', lambda y: y.autocorr(lag=3)),
    ('stationarity', lambda y: adfuller(y)[0]),
]
