## FEATURE EXTRACTION AND ANALYSIS

In [None]:
import pandas as pd
import numpy as np
from scipy.fft import fft
from tsfresh import extract_features
from tsfresh.feature_extraction import MinimalFCParameters

def load_and_preprocess(file_path):
    """Read a seismic CSV and parse its absolute-time column into datetimes.

    The file must contain a column literally named
    ``time_abs(%Y-%m-%dT%H:%M:%S.%f)``.  Its values are parsed with that
    same format string and stored in a new ``time_abs`` column, after which
    the raw text column is removed.

    Args:
        file_path: Location of the CSV, passed straight to ``pd.read_csv``.

    Returns:
        The loaded DataFrame with the raw timestamp column replaced by a
        datetime ``time_abs`` column.
    """
    raw_column = 'time_abs(%Y-%m-%dT%H:%M:%S.%f)'
    timestamp_format = '%Y-%m-%dT%H:%M:%S.%f'

    frame = pd.read_csv(file_path)
    frame['time_abs'] = pd.to_datetime(frame[raw_column], format=timestamp_format)
    # The parsed column supersedes the raw text column.
    del frame[raw_column]
    return frame

def extract_statistical_features(df):
    """Add expanding-window (running) statistics of ``velocity(m/s)``.

    For every row the statistic is computed over all samples from the start
    of the frame up to and including that row.  The following columns are
    added to ``df`` in place: ``mean_velocity``, ``std_velocity``,
    ``max_velocity``, ``min_velocity``, ``median_velocity``,
    ``q1_velocity`` (25th percentile) and ``q3_velocity`` (75th percentile).

    Args:
        df: DataFrame containing a numeric ``velocity(m/s)`` column.

    Returns:
        The same DataFrame object, with the feature columns added.
    """
    running = df['velocity(m/s)'].expanding()

    df['mean_velocity'] = running.mean()

    # The running standard deviation is undefined (NaN) where too few samples
    # are available, e.g. on the first row; those gaps are filled with the
    # median of the running std.  The filled Series is assigned back
    # explicitly: ``df[col].fillna(..., inplace=True)`` operates on a
    # temporary and does not update ``df`` under pandas Copy-on-Write.
    running_std = running.std()
    df['std_velocity'] = running_std.fillna(running_std.median())

    df['max_velocity'] = running.max()
    df['min_velocity'] = running.min()
    df['median_velocity'] = running.median()
    df['q1_velocity'] = running.quantile(0.25)
    df['q3_velocity'] = running.quantile(0.75)

    return df

def extract_time_series_features(df):
    """Add difference, rolling-window and cumulative features of velocity.

    Columns added to ``df`` in place, in this order:

    * ``velocity_diff`` - first difference of ``velocity(m/s)``.
    * ``velocity_acceleration`` - first difference of ``velocity_diff``.
    * ``rolling_mean_20`` / ``rolling_std_20`` / ``rolling_var_20`` -
      rolling mean, standard deviation and variance of the velocity.
    * ``velocity_cumsum`` - cumulative sum of the velocity.

    NaN values produced at the start of each differenced or rolling series
    are replaced by that series' median.

    NOTE(review): the rolling columns carry a ``_20`` suffix but are computed
    over a 10-sample window.  Both the window and the column names are kept
    as they were so existing outputs stay comparable - confirm which of the
    two was intended.

    Args:
        df: DataFrame containing a numeric ``velocity(m/s)`` column.

    Returns:
        The same DataFrame object, with the feature columns added.
    """
    velocity = df['velocity(m/s)']

    # Each filled Series is assigned back explicitly instead of calling
    # ``df[col].fillna(..., inplace=True)``, which acts on a temporary and
    # does not update ``df`` under pandas Copy-on-Write.
    velocity_diff = velocity.diff()
    velocity_diff = velocity_diff.fillna(velocity_diff.median())
    df['velocity_diff'] = velocity_diff

    # The second difference is taken from the already-filled first difference.
    acceleration = velocity_diff.diff()
    df['velocity_acceleration'] = acceleration.fillna(acceleration.median())

    # Rolling-window features (10 samples; see the note in the docstring).
    windowed = velocity.rolling(window=10)
    rolling_features = (
        ('rolling_mean_20', windowed.mean()),
        ('rolling_std_20', windowed.std()),
        ('rolling_var_20', windowed.var()),
    )
    for column, values in rolling_features:
        df[column] = values.fillna(values.median())

    # Cumulative sum of velocity
    df['velocity_cumsum'] = velocity.cumsum()

    return df


def extract_frequency_features(df):
    """Add whole-signal spectral summary features of ``velocity(m/s)``.

    Two columns are added to ``df`` in place; each holds one scalar repeated
    on every row:

    * ``dominant_frequency`` - the frequency at which the FFT magnitude is
      largest, in cycles per sample (``np.fft.fftfreq`` with its default
      unit sample spacing), reported as a non-negative value.
    * ``spectral_density_mean`` - the mean of the squared FFT magnitude over
      all bins.

    Args:
        df: DataFrame containing a numeric ``velocity(m/s)`` column.

    Returns:
        The same DataFrame object, with the two feature columns added.  For
        an empty frame both columns are added and filled with NaN.
    """
    velocity = df['velocity(m/s)'].to_numpy()
    n_samples = len(velocity)

    if n_samples == 0:
        # There is no spectrum to analyse; still add the columns so the
        # output schema does not depend on the input length.
        df['dominant_frequency'] = np.nan
        df['spectral_density_mean'] = np.nan
        return df

    magnitude = np.abs(fft(velocity))
    frequencies = np.fft.fftfreq(n_samples)

    # Translate the peak bin index into its frequency.  The spectrum of a
    # real signal is mirrored, so the peak may be found in the negative-
    # frequency half; the absolute value reports the same physical frequency.
    peak_index = int(np.argmax(magnitude))
    df['dominant_frequency'] = abs(frequencies[peak_index])
    df['spectral_density_mean'] = np.mean(magnitude ** 2)

    return df

def combine_features(file_path):
    """Run the feature pipeline on one CSV file.

    The file is loaded with ``load_and_preprocess`` and then passed through
    ``extract_statistical_features`` followed by
    ``extract_time_series_features``.

    Args:
        file_path: Path of the CSV file to process.

    Returns:
        The DataFrame returned by the last pipeline stage, carrying the
        loaded data plus every added feature column.
    """
    return extract_time_series_features(
        extract_statistical_features(
            load_and_preprocess(file_path)
        )
    )

# Example usage: run the feature pipeline on a single training CSV.
# The path is relative to the current working directory.
file_path = "space_apps_2024_seismic_detection/data/lunar/training/data/S12_GradeA/xa.s12.00.mhz.1970-01-19HR00_evid00002.csv"
result_df = combine_features(file_path)

# Save the result to a CSV file (index=False: the row index is not written).
result_df.to_csv('all_features_extracted.csv', index=False)

In [2]:
# Reload the exported feature table and display five randomly sampled rows.
# NOTE(review): the name `all` shadows Python's builtin all() for the rest of
# the session. Another cell reads this variable, so any rename has to be made
# in every cell that uses it.
all =pd.read_csv("all_features_extracted.csv")
all.sample(5)

Unnamed: 0,time_rel(sec),velocity(m/s),time_abs,mean_velocity,std_velocity,max_velocity,min_velocity,median_velocity,q1_velocity,q3_velocity,velocity_diff,velocity_acceleration,rolling_mean_20,rolling_std_20,rolling_var_20,velocity_cumsum
414442,62557.283019,2.628375e-10,1970-01-19 17:22:37.948019,-1.385123e-13,1.780599e-10,1.879303e-09,-1.685426e-09,-6.041686e-18,-2.437565e-11,2.495679e-11,2.857879e-10,4.722385e-11,-9.148493e-11,1.769288e-10,3.130381e-20,-5.740545e-08
17688,2669.886792,-1.365319e-11,1970-01-19 00:44:30.551792,-6.283794e-13,4.624415e-11,3.253251e-10,-2.899579e-10,-5.030551e-16,-7.646033e-12,7.717448e-12,-1.132022e-11,-1.707038e-12,-1.560277e-12,9.461977e-12,8.952901000000001e-23,-1.11154e-08
442607,66808.603774,4.847494e-10,1970-01-19 18:33:29.268774,-1.193853e-13,1.837163e-10,1.879303e-09,-1.685426e-09,-1.392544e-17,-3.26826e-11,3.219169e-11,1.261332e-10,-9.969789e-11,8.878676e-11,2.090178e-10,4.3688439999999995e-20,-5.284091e-08
516629,77981.735849,1.882779e-11,1970-01-19 21:39:42.400849,-7.910268e-13,3.621285e-10,7.874026e-09,-8.185283e-09,-2.505288e-17,-4.516782e-11,4.339024e-11,1.113543e-10,-5.572312e-11,-1.443429e-10,2.803651e-10,7.860457e-20,-4.086682e-07
39627,5981.433962,-1.016341e-10,1970-01-19 01:39:42.098962,9.788763e-13,9.173862e-11,8.078946e-10,-7.611013e-10,6.332099e-15,-1.509908e-11,1.571409e-11,-1.721688e-10,-1.312827e-10,5.071186e-11,1.645797e-10,2.708648e-20,3.879091e-08


In [3]:
# Count the missing values in each column of the reloaded feature table.
all.isna().sum()

time_rel(sec)            0
velocity(m/s)            0
time_abs                 0
mean_velocity            0
std_velocity             0
max_velocity             0
min_velocity             0
median_velocity          0
q1_velocity              0
q3_velocity              0
velocity_diff            0
velocity_acceleration    0
rolling_mean_20          0
rolling_std_20           0
rolling_var_20           0
velocity_cumsum          0
dtype: int64

## VISUALIZATION PLOTS

In [4]:
# Renders six diagnostic plots from the exported feature table and saves each
# one as a PNG file in the current working directory.
import numpy as np  # imported here so this cell does not depend on an earlier one
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.fft import fft

# Load the data
df = pd.read_csv('all_features_extracted.csv')

# Timestamps come back from the CSV as text; parse them into datetimes.
df['time_abs'] = pd.to_datetime(df['time_abs'])

# 1. Time Series Plot of Velocity
plt.figure(figsize=(12, 6))
plt.plot(df['time_abs'], df['velocity(m/s)'], label='Velocity')
# The 'rolling_mean_20' column is computed with a 10-sample window, so the
# legend states the window that was actually used.
plt.plot(df['time_abs'], df['rolling_mean_20'], label='10-point Rolling Mean')
plt.title('Seismic Velocity Over Time')
plt.xlabel('Time')
plt.ylabel('Velocity (m/s)')
plt.legend()
plt.tight_layout()
plt.savefig('velocity_time_series.png')
plt.close()

# 2. Histogram of Velocity Distribution
plt.figure(figsize=(10, 6))
sns.histplot(df['velocity(m/s)'], kde=True)
plt.title('Distribution of Seismic Velocity')
plt.xlabel('Velocity (m/s)')
plt.ylabel('Frequency')
plt.tight_layout()
plt.savefig('velocity_distribution.png')
plt.close()

# 3. Box Plot of Velocity
plt.figure(figsize=(10, 6))
sns.boxplot(y=df['velocity(m/s)'])
plt.title('Box Plot of Seismic Velocity')
plt.ylabel('Velocity (m/s)')
plt.tight_layout()
plt.savefig('velocity_boxplot.png')
plt.close()

# 4. Correlation Heatmap
correlation_matrix = df[['velocity(m/s)', 'mean_velocity', 'std_velocity', 'rolling_mean_20', 'rolling_std_20']].corr()
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1, center=0)
plt.title('Correlation Heatmap of Key Features')
plt.tight_layout()
plt.savefig('correlation_heatmap.png')
plt.close()

# 5. FFT Magnitude Spectrum
fft_result = fft(df['velocity(m/s)'])
n = len(df['velocity(m/s)'])
# Take the sample spacing from the data instead of assuming 1 s, so that the
# axis labelled "Frequency (Hz)" really is in hertz.  The median of the
# successive time differences is used as the spacing.
sample_spacing = df['time_rel(sec)'].diff().median()
if not np.isfinite(sample_spacing) or sample_spacing <= 0:
    # Fewer than two samples or unusable time stamps: fall back to unit
    # spacing (the axis is then in cycles per sample).
    sample_spacing = 1.0
freq = np.fft.fftfreq(n, d=sample_spacing)
nyquist = 0.5 / sample_spacing
plt.figure(figsize=(12, 6))
plt.plot(freq[:n//2], np.abs(fft_result[:n//2]))
plt.title('FFT Magnitude Spectrum')
plt.xlabel('Frequency (Hz)')
plt.ylabel('Magnitude')
plt.xlim(0, nyquist)  # Limit x-axis to positive frequencies up to Nyquist frequency
plt.tight_layout()
plt.savefig('fft_spectrum.png')
plt.close()

# 6. Scatter Plot: Velocity vs Acceleration
plt.figure(figsize=(10, 6))
plt.scatter(df['velocity(m/s)'], df['velocity_acceleration'], alpha=0.5)
plt.title('Velocity vs Acceleration')
plt.xlabel('Velocity (m/s)')
plt.ylabel('Acceleration (m/s²)')
plt.tight_layout()
plt.savefig('velocity_vs_acceleration.png')
plt.close()

print("All plots have been saved as PNG files.")

All plots have been saved as PNG files.
