# Data Exploration: 125Hz Galling Experimental Data

This notebook explores the high-frequency (125 Hz) experimental data used to analyze the galling phenomenon.

**Objectives:**
1. Load and inspect raw data structure
2. Visualize COF profiles for sample cycles
3. Analyze temperature-dependent behavior
4. Identify patterns and artifacts
5. Validate preprocessing (0.1mm filter)

In [None]:
import sys
sys.path.append('..')

from src.data_loader import HighFrequencyDataLoader
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Set style
# NOTE(review): the 'seaborn-v0_8-*' style names look tied to newer Matplotlib
# releases — confirm the pinned Matplotlib version actually ships this style.
plt.style.use('seaborn-v0_8-darkgrid')
# Make seaborn's "husl" palette the default color cycle for the plots below.
sns.set_palette("husl")

# IPython magic: render Matplotlib figures inline in the notebook output.
%matplotlib inline

## 1. Load Data

In [None]:
# One loader instance is shared by every cell below
loader = HighFrequencyDataLoader()

# Temperatures (°C) to load; results are keyed by temperature
temps = [165, 167.5, 170]
temp_data_dict = {}

for temp in temps:
    print(f"Loading {temp}°C...")
    loaded = loader.load_all_cycles_for_temperature(temp)
    temp_data_dict[temp] = loaded

    # Short per-temperature report: cycle count and how many cycles failed
    n_failed = len(loaded['failed_cycles'])
    print(f"  Cycles: {loaded['n_cycles']}")
    print(f"  Failed: {n_failed}")
    print()

## 2. Dataset Overview

In [None]:
# Build the combined dataset for all temperatures through the loader
dataset = loader.create_pytorch_dataset(temps)

# Summary statistics computed by the loader for that dataset
stats = loader.get_dataset_statistics(dataset)

# --- Overall size ---
print("Dataset Statistics:")
print("=" * 60)
print(f"Total cycles: {stats['n_cycles_total']}")
print(f"Total timesteps: {stats['total_timesteps']}")
print(f"Avg timesteps/cycle: {stats['avg_timesteps_per_cycle']:.1f}")

# --- Cycle count per temperature ---
print(f"\nCycles per temperature:")
for temp_key, n_cycles in stats['n_cycles_per_temp'].items():
    print(f"  {temp_key}°C: {n_cycles} cycles")

# --- Value ranges ---
cof_range = stats['cof_range']
print(f"\nCOF range: [{cof_range[0]:.3f}, {cof_range[1]:.3f}]")
print(f"\nForce ranges (normalized):")
for axis_name, (lower, upper) in stats['force_range'].items():
    print(f"  F_{axis_name}: [{lower:.3f}, {upper:.3f}]")

## 3. Visualize Sample Cycles

Plot COF profiles for the first 5 cycles at each temperature to observe patterns.

In [None]:
# One figure per temperature: COF vs sliding distance for the first (up to) 5 cycles,
# each panel annotated with that cycle's mean, ±1σ band and above-threshold samples.
for temp in temps:
    temp_data = temp_data_dict[temp]

    fig, axes = plt.subplots(5, 1, figsize=(14, 12))
    fig.suptitle(f'COF Profiles - {temp}°C (First 5 Cycles)', fontsize=16, y=0.995)

    for i in range(min(5, temp_data['n_cycles'])):
        ax = axes[i]
        df = temp_data['data'][i]
        mean_cof = temp_data['mean_cof'][i]
        std_cof = temp_data['std_cof'][i]
        x_max = df['sliding_distance'].max()

        ax.plot(df['sliding_distance'], df['cof'], linewidth=1.5, label='COF(x)')
        ax.axhline(mean_cof, color='red', linestyle='--', linewidth=2,
                   label=f"Mean = {mean_cof:.3f}")
        ax.fill_between([0, x_max],
                        mean_cof - std_cof, mean_cof + std_cof,
                        color='red', alpha=0.2, label=f"±1σ ({std_cof:.3f})")

        # Highlight potential galling events (COF spikes): samples above mean + 2σ
        cof_threshold = mean_cof + 2 * std_cof
        spikes = df['cof'] > cof_threshold
        if spikes.any():
            ax.scatter(df.loc[spikes, 'sliding_distance'],
                       df.loc[spikes, 'cof'],
                       color='orange', s=20, alpha=0.6, label='Potential spikes')

        ax.set_ylabel('COF', fontsize=11)
        ax.set_title(f'Cycle {i+1}', fontsize=12, loc='left')
        # The legend only picks up artists that exist when it is created, so it
        # must be built after the spike scatter for 'Potential spikes' to be listed.
        ax.legend(loc='upper right', fontsize=9)
        ax.grid(alpha=0.3)
        ax.set_xlim(0, x_max)

    axes[-1].set_xlabel('Sliding Distance (mm)', fontsize=12)
    plt.tight_layout()
    plt.savefig(f'../results/plots/cof_profiles_{temp}C.png', dpi=150, bbox_inches='tight')
    plt.show()

## 4. Cycle-Averaged COF Evolution

Plot mean COF vs cycle number to observe regime transitions.

In [None]:
# One panel per temperature: per-cycle mean COF with a ±1σ band around it
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
fig.suptitle('Cycle-Averaged COF Evolution', fontsize=16)

for panel, temp in enumerate(temps):
    ax = axes[panel]
    temp_data = temp_data_dict[temp]
    mean_cof = temp_data['mean_cof']
    std_cof = temp_data['std_cof']

    # Cycle numbers are 1-based on the x axis
    cycle_numbers = np.arange(1, len(mean_cof) + 1)

    ax.plot(cycle_numbers, mean_cof, linewidth=2, label='Mean COF')
    band_lower = mean_cof - std_cof
    band_upper = mean_cof + std_cof
    ax.fill_between(cycle_numbers, band_lower, band_upper, alpha=0.3, label='±1σ')

    ax.set_xlabel('Cycle Number', fontsize=12)
    ax.set_ylabel('COF', fontsize=12)
    ax.set_title(f'{temp}°C ({temp_data["n_cycles"]} cycles)', fontsize=13)
    ax.legend(fontsize=10)
    ax.grid(alpha=0.3)
    # Leave 20% headroom above the largest mean COF
    ax.set_ylim(0, max(mean_cof) * 1.2)

plt.tight_layout()
plt.savefig('../results/plots/cof_evolution_all_temps.png', dpi=150, bbox_inches='tight')
plt.show()

## 5. COF Distribution Analysis

In [None]:
# Two-panel comparison of the per-cycle mean COF across temperatures:
# overlaid density histograms (left) and box plots (right).
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Histogram of mean COF (density-normalised so temperatures with different
# cycle counts are comparable)
for temp in temps:
    temp_data = temp_data_dict[temp]
    axes[0].hist(temp_data['mean_cof'], bins=30, alpha=0.6, label=f'{temp}°C', density=True)

axes[0].set_xlabel('Mean COF', fontsize=12)
axes[0].set_ylabel('Density', fontsize=12)
axes[0].set_title('Distribution of Cycle-Averaged COF', fontsize=13)
axes[0].legend(fontsize=10)
axes[0].grid(alpha=0.3)

# Box plot comparison
mean_cof_data = [temp_data_dict[temp]['mean_cof'] for temp in temps]
axes[1].boxplot(mean_cof_data)
# Label the boxes via the axis instead of boxplot's `labels=` keyword: newer
# Matplotlib deprecates `labels` in favour of `tick_labels`, whereas setting the
# tick labels directly behaves the same on old and new releases.
axes[1].set_xticklabels([f'{t}°C' for t in temps])
axes[1].set_ylabel('Mean COF', fontsize=12)
axes[1].set_title('COF Distribution by Temperature', fontsize=13)
axes[1].grid(alpha=0.3, axis='y')

plt.tight_layout()
plt.savefig('../results/plots/cof_distribution.png', dpi=150, bbox_inches='tight')
plt.show()

## 6. Force and Velocity Profiles

Examine correlation between forces, velocity, and COF.

In [None]:
# Inspect a single cycle at 167.5°C (the temperature this notebook treats as critical)
sample_temp = 167.5
sample_cycle_idx = 50  # zero-based index; reported as cycle 51 in the title and filename

df_sample = temp_data_dict[sample_temp]['data'][sample_cycle_idx]
cycle_number = sample_cycle_idx + 1

fig, axes = plt.subplots(4, 1, figsize=(14, 10))
fig.suptitle(f'Multi-Variable Profiles - {sample_temp}°C, Cycle {cycle_number}', fontsize=16)
ax_cof, ax_velocity, ax_force, ax_phase = axes

# Every panel shares sliding distance as its x axis
x = df_sample['sliding_distance']

# Panel 1: coefficient of friction
ax_cof.plot(x, df_sample['cof'], linewidth=2, color='tab:blue')
ax_cof.set_ylabel('COF', fontsize=12)
ax_cof.grid(alpha=0.3)

# Panel 2: velocity along x
ax_velocity.plot(x, df_sample['velocity_x'], linewidth=2, color='tab:green')
ax_velocity.set_ylabel('Velocity X (m/s)', fontsize=12)
ax_velocity.grid(alpha=0.3)

# Panel 3: the three force components overlaid
for column, series_label in (('force_x', 'F_x'), ('force_y', 'F_y'), ('force_z', 'F_z')):
    ax_force.plot(x, df_sample[column], linewidth=1.5, label=series_label, alpha=0.8)
ax_force.set_ylabel('Force (N)', fontsize=12)
ax_force.legend(fontsize=10, loc='upper right')
ax_force.grid(alpha=0.3)

# Panel 4: cycle phase; the bottom panel also carries the shared x label
ax_phase.plot(x, df_sample['cycle_phase'], linewidth=2, color='tab:purple')
ax_phase.set_ylabel('Cycle Phase [0,1]', fontsize=12)
ax_phase.set_xlabel('Sliding Distance (mm)', fontsize=12)
ax_phase.grid(alpha=0.3)

plt.tight_layout()
plt.savefig(f'../results/plots/multivariable_profile_{sample_temp}C_cycle{cycle_number}.png',
            dpi=150, bbox_inches='tight')
plt.show()

## 7. Galling Event Detection

Count potential galling events — COF samples that exceed a cycle's mean + 2σ — in every cycle, then summarize the counts for each temperature.

In [None]:
def count_galling_events(temp_data, threshold_multiplier=2.0):
    """
    Count COF spike samples in each cycle (potential galling events).

    A sample is a spike when COF > mean + threshold_multiplier * std, where
    mean and std are that cycle's entries in temp_data['mean_cof'] and
    temp_data['std_cof'].

    Args:
        temp_data: Mapping with 'data' (sequence of per-cycle DataFrames that
            have a 'cof' column), 'mean_cof' and 'std_cof' (indexable by
            cycle position).
        threshold_multiplier: Number of standard deviations above the cycle
            mean a sample must exceed to count as a spike.

    Returns:
        Integer np.ndarray with one entry per cycle holding the number of
        spike samples in that cycle. A cycle contains a potential galling
        event when its entry is > 0. The array is empty (still integer
        typed) when temp_data['data'] has no cycles.
    """
    spike_counts = []

    for i, df in enumerate(temp_data['data']):
        threshold = temp_data['mean_cof'][i] + threshold_multiplier * temp_data['std_cof'][i]
        spike_counts.append(int((df['cof'] > threshold).sum()))

    # Explicit dtype: an empty list would otherwise become a float64 array,
    # making downstream sums print as 0.0 instead of 0.
    return np.array(spike_counts, dtype=int)

# Tally spike samples per cycle for every temperature and print a short report
event_data = {}
for temp in temps:
    counts_per_cycle = count_galling_events(temp_data_dict[temp])
    event_data[temp] = counts_per_cycle

    # A cycle "has an event" when at least one sample exceeded its threshold
    n_cycles = len(counts_per_cycle)
    cycles_with_events = (counts_per_cycle > 0).sum()
    total_spikes = counts_per_cycle.sum()
    avg_spikes = counts_per_cycle.mean()

    print(f"{temp}°C: {cycles_with_events}/{n_cycles} cycles have potential galling events")
    print(f"  Total spike points: {total_spikes}")
    print(f"  Avg spikes/cycle: {avg_spikes:.1f}")
    print()

In [None]:
# Bar chart of spike count per cycle, one panel per temperature
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
fig.suptitle('Galling Event Frequency', fontsize=16)

for panel, temp in enumerate(temps):
    ax = axes[panel]
    counts_per_cycle = event_data[temp]
    # Cycle numbers are 1-based on the x axis
    cycle_numbers = np.arange(1, len(counts_per_cycle) + 1)

    # width=1.0 makes neighbouring bars touch
    ax.bar(cycle_numbers, counts_per_cycle, width=1.0, alpha=0.7)
    ax.set_xlabel('Cycle Number', fontsize=12)
    ax.set_ylabel('Spike Count', fontsize=12)
    ax.set_title(f'{temp}°C', fontsize=13)
    ax.grid(alpha=0.3, axis='y')

plt.tight_layout()
plt.savefig('../results/plots/galling_event_frequency.png', dpi=150, bbox_inches='tight')
plt.show()

## 8. Preprocessing Validation

Verify that the 0.1mm sliding distance filter properly removes initialization artifacts.

In [None]:
# Read a single cycle from CSV (presumably unfiltered — verify against the loader),
# then run the sliding-distance filter on it for a before/after comparison
min_distance = 0.1
df_raw = loader.load_cycle_csv(165, 1)
df_processed = loader.preprocess_cycle(df_raw, min_distance=min_distance)

# Row counts before and after filtering
n_raw = len(df_raw)
n_kept = len(df_processed)
n_removed = n_raw - n_kept

fig, axes = plt.subplots(2, 1, figsize=(14, 8))
fig.suptitle('Effect of 0.1mm Sliding Distance Filter', fontsize=16)
ax_before, ax_after = axes

# Top panel: raw trace with the filter threshold marked
ax_before.plot(df_raw['sliding_distance'], df_raw['cof'], linewidth=1.5, color='tab:red')
ax_before.axvline(min_distance, color='black', linestyle='--', linewidth=2, label='0.1mm threshold')
ax_before.set_ylabel('COF', fontsize=12)
ax_before.set_title(f'Raw Data (n={n_raw} points)', fontsize=13)
ax_before.legend(fontsize=11)
ax_before.grid(alpha=0.3)
ax_before.set_xlim(0, df_raw['sliding_distance'].max())

# Bottom panel: trace after filtering
ax_after.plot(df_processed['sliding_distance'], df_processed['cof'], linewidth=1.5, color='tab:blue')
ax_after.set_ylabel('COF', fontsize=12)
ax_after.set_xlabel('Sliding Distance (mm)', fontsize=12)
ax_after.set_title(f'Filtered Data (n={n_kept} points, {n_removed} removed)',
                   fontsize=13)
ax_after.grid(alpha=0.3)
ax_after.set_xlim(0, df_processed['sliding_distance'].max())

plt.tight_layout()
plt.savefig('../results/plots/preprocessing_validation.png', dpi=150, bbox_inches='tight')
plt.show()

# Share of the raw samples that the filter dropped
print(f"Points removed: {n_removed} ({100 * n_removed / n_raw:.1f}%)")

## Summary

**Key Findings:**
1. Successfully loaded 536 cycles across 3 temperatures
2. Average ~290 timesteps per cycle after 0.1mm filtering
3. Clear temperature-dependent behavior:
   - 165°C: Occasional spikes
   - 167.5°C: Frequent oscillations (critical temperature)
   - 170°C: Sustained high COF
4. Preprocessing effectively removes initialization artifacts
5. Within-cycle spatial variation is observable

**Next Steps:**
- Implement PINN model architecture
- Begin Stage 1 training (NN pre-training)