In [109]:
import os
import numpy as np
import pandas as pd
from scipy.fft import fft, fftfreq
from tqdm import tqdm

from sklearn.decomposition import PCA

In [110]:
if not os.path.basename(os.getcwd()) == 'Seismic-Multilabel-Event-Classifier':
    os.chdir('..')
    print(f"Changed directory to {os.getcwd()}")

In [111]:
# Dataset path
DATA_PATH = 'data/interim/datosML.json'  
df_raw = pd.read_json(DATA_PATH, lines=True)
print('Loaded', df_raw.shape[0], 'rows')
df_raw.head()

Loaded 1696 rows


Unnamed: 0,Archivo,NPTS,Falla,Mag,Vs,Time,AccV,AccH2,AccH1
0,RSN8478_PARK2004,32169,1 Stiker Slip (SS),4-6,600-,"[0.0, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07...","[3.34e-08, 3.3700000000000004e-08, 3.39e-08, 3...","[-5.5800000000000003e-08, -5.65e-08, -5.730000...","[-1.93e-08, -1.93e-08, -1.93e-08, -1.93e-08, -..."
1,RSN8700_40204628,20001,1 Stiker Slip (SS),4-6,600-,"[0.0, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07...","[8.5e-09, 8.9e-09, 9.2e-09, 9.5e-09, 9.7000000...","[-2.2000000000000003e-09, -2.3e-09, -2.4e-09, ...","[8e-10, 1.2e-09, 1.6e-09, 2e-09, 2.4e-09, 2.80..."
2,RSN8459_PARK2004,32380,1 Stiker Slip (SS),4-6,600-,"[0.0, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07...","[5.86e-08, 5.94e-08, 6.02e-08, 6.1e-08, 6.17e-...","[2.7200000000000002e-08, 2.7e-08, 2.6800000000...","[1.5000000000000002e-08, 1.4900000000000001e-0..."
3,RSN2148_BEARCTY,8200,1 Stiker Slip (SS),4-6,600-,"[0.0, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07...","[-2.27684e-05, -2.11897e-05, -2.81075e-05, 1.7...","[7.438800000000001e-06, 5.522000000000001e-06,...","[1.0019400000000001e-05, 9.6351e-06, 1.25398e-..."
4,RSN8426_BEARCTY,14465,1 Stiker Slip (SS),4-6,600-,"[0.0, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07...","[3.1e-09, 3.1e-09, 3.1e-09, 3.1e-09, 3.1e-09, ...","[4e-10, 4e-10, 4e-10, 4e-10, 4e-10, 4e-10, 4e-...","[2.9e-09, 2.9e-09, 2.9e-09, 2.9e-09, 2.8000000..."


In [112]:
# Ensure signals are flat NumPy arrays
for col in ['AccV','AccH1','AccH2']:
    df_raw[col] = df_raw[col].apply(lambda x: np.array(x).flatten())

In [113]:
def extract_time_features(sig: np.ndarray, dt: float, prefix: str):
    """Compute max, rms of a signal"""
    feats = {
        f'max_{prefix}': np.max(sig),
        f'rms_{prefix}': np.sqrt(np.mean(sig**2)),
    }
    return feats

def zero_crossing_rate(sig: np.ndarray, dt: float):
    crossings = np.where(np.diff(np.signbit(sig)))[0]
    duration = len(sig) * dt
    return len(crossings) / duration if duration > 0 else 0.0

def fft_features(sig: np.ndarray, dt: float, prefix: str, n_fft: int = 512, band_split: float = 5.0):
    """Return dict with FFT magnitude vector and summary stats"""
    mag = np.abs(fft(sig, n=n_fft))[:n_fft//2 + 1]  # unilateral spectrum
    freqs = fftfreq(n_fft, d=dt)[:n_fft//2 + 1]
    dom_freq = freqs[np.argmax(mag)]
    centroid = (freqs * mag).sum() / (mag.sum() + 1e-12)
    bandwidth = np.sqrt(((freqs - centroid)**2 * mag).sum() / (mag.sum() + 1e-12))
    low_energy = mag[freqs < band_split].sum()
    high_energy = mag[freqs >= band_split].sum()
    spec_ratio = high_energy / (low_energy + 1e-12)
    feats = {
        f'dom_freq_{prefix}': dom_freq,
        f'centroid_{prefix}': centroid,
        f'bandwidth_{prefix}': bandwidth,
        f'spec_ratio_{prefix}': spec_ratio,
        f'FFTmag_{prefix}': mag  # store vector; will flatten later
    }
    return feats

In [114]:
feature_rows = []
n_fft = 512
for idx, row in tqdm(df_raw.iterrows(), total=df_raw.shape[0]):
    dt = row['DT'] if 'DT' in row else 0.01
    feats = {}

    # time‑domain stats
    for comp in ['V','H1','H2']:
        sig = row[f'Acc{comp}']
        feats.update(extract_time_features(sig, dt, comp))
    # duration
    feats['duration'] = (len(row['AccV']) - 1) * dt
    # zero‑crossing rate of vertical
    feats['zcr_V'] = zero_crossing_rate(row['AccV'], dt)

    # FFT based
    for comp in ['V','H1','H2']:
        feats.update(fft_features(row[f'Acc{comp}'], dt, comp, n_fft=n_fft))

    feature_rows.append(feats)

features_df = pd.DataFrame(feature_rows, index=df_raw.index)
print('Created feature matrix shape:', features_df.shape)

100%|██████████| 1696/1696 [00:00<00:00, 6140.21it/s]

Created feature matrix shape: (1696, 23)





In [115]:

# Aplanar FFT mags
fft_dfs = []
for comp in ['V','H1','H2']:
    arr = np.vstack(features_df[f'FFTmag_{comp}'].values)
    cols = [f'FFTmag_{comp}_{i}' for i in range(arr.shape[1])]
    fft_dfs.append(pd.DataFrame(arr, columns=cols, index=features_df.index))
flat_fft = pd.concat(fft_dfs, axis=1)
# Limpiar y unir
features_df = features_df.drop(columns=[f'FFTmag_{c}' for c in ['V','H1','H2']])
features_df = pd.concat([features_df, flat_fft], axis=1)
print('Feature matrix shape before PCA:', features_df.shape)


Feature matrix shape before PCA: (1696, 791)


In [116]:

# Aplicar PCA para 95% varianza
pca = PCA(n_components=0.95, svd_solver='full', random_state=42)
fft_cols = [c for c in features_df.columns if c.startswith('FFTmag_')]
other_cols = [c for c in features_df.columns if not c.startswith('FFTmag_')]
fft_data = features_df[fft_cols]
pca_transformed = pca.fit_transform(fft_data)
pca_cols = [f'FFT_PCA_{i+1}' for i in range(pca_transformed.shape[1])]
df_pca = pd.DataFrame(pca_transformed, columns=pca_cols, index=features_df.index)
print('PCA components:', len(pca_cols))


PCA components: 62


In [117]:

# Dataset final con PCA
df_features = pd.concat([features_df[other_cols], df_pca], axis=1)
print('Final feature shape:', df_features.shape)
df_features.head()


Final feature shape: (1696, 82)


Unnamed: 0,max_V,rms_V,max_H1,rms_H1,max_H2,rms_H2,duration,zcr_V,dom_freq_V,centroid_V,...,FFT_PCA_53,FFT_PCA_54,FFT_PCA_55,FFT_PCA_56,FFT_PCA_57,FFT_PCA_58,FFT_PCA_59,FFT_PCA_60,FFT_PCA_61,FFT_PCA_62
0,0.000342,5.2e-05,0.000587,0.000111,0.000617,0.000107,321.68,2.014362,0.0,5.447156,...,-0.010389,0.015152,-0.00162,-0.007796,-0.002894,0.008734,0.005754,-0.00561,0.001254,0.015319
1,0.003967,0.000347,0.006399,0.000501,0.004988,0.000475,200.0,9.034548,0.78125,17.55107,...,-0.010391,0.015147,-0.001616,-0.007771,-0.002902,0.008747,0.005756,-0.005632,0.001243,0.015314
2,0.000135,2.3e-05,0.000204,2.8e-05,0.000157,2.6e-05,323.79,1.429895,0.195312,2.573894,...,-0.01039,0.01515,-0.001616,-0.007797,-0.002894,0.008739,0.005757,-0.00562,0.001256,0.015312
3,0.003166,0.000453,0.00503,0.000657,0.004669,0.000725,41.0,21.189954,14.0625,14.019404,...,-0.04958,0.007665,-0.000685,-0.019198,0.014396,-0.024908,0.009837,0.006,0.004093,0.001835
4,1.1e-05,2e-06,2.1e-05,3e-06,1.8e-05,3e-06,180.8,3.943366,0.195312,10.936094,...,-0.010387,0.015149,-0.001617,-0.007796,-0.002892,0.008732,0.005761,-0.005615,0.00126,0.015316


In [118]:

# Unir etiquetas originales
df_final = pd.concat([df_raw[['Archivo','Falla','Mag','Vs']].reset_index(drop=True),
                      df_features.reset_index(drop=True)], axis=1)
# One-hot encode multilabels
df_final = pd.get_dummies(df_final, columns=['Falla','Mag','Vs'], prefix='', prefix_sep='')
df_final.head()


Unnamed: 0,Archivo,max_V,rms_V,max_H1,rms_H1,max_H2,rms_H2,duration,zcr_V,dom_freq_V,...,FFT_PCA_62,1 Stiker Slip (SS),2 Normal-Oblique,3 Reverse-Oblique,4-6,6-8,0-200,200-400,400-600,600-
0,RSN8478_PARK2004,0.000342,5.2e-05,0.000587,0.000111,0.000617,0.000107,321.68,2.014362,0.0,...,0.015319,True,False,False,True,False,False,False,False,True
1,RSN8700_40204628,0.003967,0.000347,0.006399,0.000501,0.004988,0.000475,200.0,9.034548,0.78125,...,0.015314,True,False,False,True,False,False,False,False,True
2,RSN8459_PARK2004,0.000135,2.3e-05,0.000204,2.8e-05,0.000157,2.6e-05,323.79,1.429895,0.195312,...,0.015312,True,False,False,True,False,False,False,False,True
3,RSN2148_BEARCTY,0.003166,0.000453,0.00503,0.000657,0.004669,0.000725,41.0,21.189954,14.0625,...,0.001835,True,False,False,True,False,False,False,False,True
4,RSN8426_BEARCTY,1.1e-05,2e-06,2.1e-05,3e-06,1.8e-05,3e-06,180.8,3.943366,0.195312,...,0.015316,True,False,False,True,False,False,False,False,True


In [121]:
df_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1696 entries, 0 to 1695
Data columns (total 92 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Archivo             1696 non-null   object 
 1   max_V               1696 non-null   float64
 2   rms_V               1696 non-null   float64
 3   max_H1              1696 non-null   float64
 4   rms_H1              1696 non-null   float64
 5   max_H2              1696 non-null   float64
 6   rms_H2              1696 non-null   float64
 7   duration            1696 non-null   float64
 8   zcr_V               1696 non-null   float64
 9   dom_freq_V          1696 non-null   float64
 10  centroid_V          1696 non-null   float64
 11  bandwidth_V         1696 non-null   float64
 12  spec_ratio_V        1696 non-null   float64
 13  dom_freq_H1         1696 non-null   float64
 14  centroid_H1         1696 non-null   float64
 15  bandwidth_H1        1696 non-null   float64
 16  spec_r

In [120]:

# Guardar dataset final
os.makedirs('data/processed', exist_ok=True)
df_final.to_json('data/processed/dataset_final.json', index=False)
print('Final dataset saved to data/processed/dataset_final.json')


Final dataset saved to data/processed/dataset_final.json
