# NASA Turbofan Dataset - Exploratory Analysis

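This notebook explores the NASA Commercial Modular Aero-Propulsion System Simulation (C-MAPSS) turbofan dataset for predictive maintenance, with a focus on how the sensor readings relate to each engine's remaining useful life (RUL).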

In [None]:
import sys
import os
from pathlib import Path

# Make the project's `src` package importable. The project root is taken to be
# the parent of the current working directory (assumes the notebook is started
# from a sub-folder of the project — TODO confirm).
project_root = Path().resolve().parent
# Guard the append so re-running this cell does not pile up duplicate entries
# on sys.path.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from src.data.data_loader import CMAPSSDataLoader
from src.data.preprocessor import CMAPSSPreprocessor
from src.config import SENSOR_COLUMNS, SETTING_COLUMNS, FEATURE_COLUMNS

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# NOTE(review): sns.set_theme() is applied after plt.style.use('ggplot') and
# also configures plot styling, so it presumably overrides much of the ggplot
# look — confirm which of the two styles is actually intended.
plt.style.use('ggplot')
sns.set_theme(style="whitegrid")

## Load the Dataset

In [None]:
# Which C-MAPSS subset this notebook analyses
subset_name = "FD001"

# Prepare the raw files through the project loader: download first, then extract
data_loader = CMAPSSDataLoader()
data_loader.download_dataset()
data_loader.extract_dataset()

# The loader returns the training frame, the test frame and the test-set RUL values
train_df, test_df, test_rul = data_loader.load_dataset(subset_name)

## Basic Data Exploration

In [None]:
# Report the shape of each loaded object
shape_report = (
    ("Training data", train_df),
    ("Test data", test_df),
    ("RUL values", test_rul),
)
for label, loaded in shape_report:
    print(f"{label} shape: {loaded.shape}")

# Preview the first rows of the training data (rendered as the cell output)
train_df.head()

In [None]:
# Descriptive statistics for the training columns, shown as the cell output
train_summary = train_df.describe()
train_summary

## Add RUL Values to Training Data

In [None]:
# Initialize the preprocessor
preprocessor = CMAPSSPreprocessor()

# Add RUL values to training data
train_df_with_rul = preprocessor.add_rul(train_df)

# Every later cell indexes train_df_with_rul['RUL']; fail here with a clear
# message instead of a KeyError further down the notebook.
assert 'RUL' in train_df_with_rul.columns, "add_rul() did not produce an 'RUL' column"

# Show the first few rows with RUL values
train_df_with_rul.head()

## Analyze Engine Cycles

In [None]:
# Number of rows (i.e. recorded cycles) for each engine unit
cycles_per_engine = train_df.groupby('unit_id').size()

# Bar chart: one bar per engine
fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(cycles_per_engine.index, cycles_per_engine.values)
ax.set_title('Number of Cycles per Engine')
ax.set_xlabel('Engine ID')
ax.set_ylabel('Number of Cycles')
ax.grid(True)
plt.show()

# Headline numbers for the cycle counts
mean_cycles = cycles_per_engine.mean()
shortest_run = cycles_per_engine.min()
longest_run = cycles_per_engine.max()
print(f"Average cycles per engine: {mean_cycles:.2f}")
print(f"Min cycles: {shortest_run}")
print(f"Max cycles: {longest_run}")

## Analyze RUL Distribution

In [None]:
# One histogram per data split: (values, number of bins, figure title)
histogram_specs = [
    (train_df_with_rul['RUL'], 50, 'RUL Distribution in Training Data'),
    (test_rul, 20, 'RUL Distribution in Test Data'),
]

for rul_values, n_bins, title in histogram_specs:
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.hist(rul_values, bins=n_bins, alpha=0.7)
    ax.set_title(title)
    ax.set_xlabel('Remaining Useful Life (cycles)')
    ax.set_ylabel('Frequency')
    ax.grid(True)
    plt.show()

## Analyze Sensor Readings

In [None]:
# Select one engine for visualization
engine_id = 1
engine_data = train_df_with_rul[train_df_with_rul['unit_id'] == engine_id]

# Keep only the feature columns that are sensors. Filtering *before* laying out
# the grid means the panel position counts plotted sensors, not positions in
# FEATURE_COLUMNS, so non-sensor features leave no holes and the index can
# never run past the end of the grid.
sensors_to_plot = [col for col in FEATURE_COLUMNS if col in SENSOR_COLUMNS]

# Size the grid from the number of sensors (ceil division, at least one row)
n_cols = 4
n_rows = max(1, (len(sensors_to_plot) + n_cols - 1) // n_cols)

# 2.5 inches per row reproduces the original 15x10 figure for a 4-row grid
fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 2.5 * n_rows), squeeze=False)
panels = axes.ravel()

# Plot sensor readings over time for the selected engine, one panel per sensor
for ax, sensor in zip(panels, sensors_to_plot):
    ax.plot(engine_data['time'], engine_data[sensor])
    ax.set_title(f'Sensor {sensor}')
    ax.set_xlabel('Cycle')
    ax.set_ylabel('Value')
    ax.grid(True)

# Hide any grid slots left over after the last sensor
for ax in panels[len(sensors_to_plot):]:
    ax.set_visible(False)

fig.tight_layout()
plt.show()

## Feature Correlation Analysis

In [None]:
# Pairwise correlation between every feature column and RUL
columns_for_corr = FEATURE_COLUMNS + ['RUL']
correlation = train_df_with_rul[columns_for_corr].corr()

# Annotated heatmap of the full correlation matrix
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    correlation,
    annot=True,
    cmap='coolwarm',
    fmt='.2f',
    linewidths=0.5,
    ax=ax,
)
ax.set_title('Feature Correlation Matrix')
fig.tight_layout()
plt.show()

# The RUL column of the matrix, ordered from most positive to most negative
rul_correlation = correlation['RUL'].sort_values(ascending=False)
print("Feature correlation with RUL:", rul_correlation, sep="\n")

## Visualize Engine Degradation

In [None]:
# Select a few engines for visualization
engine_ids = [1, 2, 3, 4]

# Pick the feature most strongly correlated with RUL.
# - RUL itself is dropped by label rather than assumed to sit at position 0.
# - Features whose correlation is undefined (NaN) are skipped.
# - Strength is judged by absolute value, so a strongly *negative* correlation
#   counts just as much as a strongly positive one.
feature_correlation = rul_correlation.drop(labels='RUL', errors='ignore').dropna()
important_sensor = feature_correlation.abs().idxmax()

fig, ax = plt.subplots(figsize=(12, 6))

# One line per engine: the chosen feature over the engine's cycles
for engine_id in engine_ids:
    engine_data = train_df_with_rul[train_df_with_rul['unit_id'] == engine_id]
    ax.plot(engine_data['time'], engine_data[important_sensor], label=f'Engine {engine_id}')

ax.set_title(f'Degradation Pattern - {important_sensor}')
ax.set_xlabel('Cycle')
ax.set_ylabel('Sensor Value')
ax.legend()
ax.grid(True)
plt.show()

## Visualize RUL Degradation

In [None]:
# RUL against cycle number for each of the engines chosen in `engine_ids`
fig, ax = plt.subplots(figsize=(12, 6))

for engine_id in engine_ids:
    is_this_engine = train_df_with_rul['unit_id'] == engine_id
    engine_data = train_df_with_rul[is_this_engine]
    ax.plot(
        engine_data['time'],
        engine_data['RUL'],
        label=f'Engine {engine_id}',
    )

ax.set_title('RUL Degradation Pattern')
ax.set_xlabel('Cycle')
ax.set_ylabel('Remaining Useful Life (cycles)')
ax.legend()
ax.grid(True)
plt.show()

## Identify Key Features

In [None]:
# Plot the features most strongly correlated with RUL against RUL.
# Ranking uses the absolute correlation so strongly negative features are not
# pushed out by weaker positive ones; RUL itself and features with an
# undefined (NaN) correlation are excluded before ranking.
n_top_features = 5
n_rows, n_cols = 2, 3  # grid with room for n_top_features panels

ranked_strength = (
    rul_correlation
    .drop(labels='RUL', errors='ignore')
    .dropna()
    .abs()
    .sort_values(ascending=False)
)
top_features = ranked_strength.head(n_top_features).index.tolist()

fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 10), squeeze=False)
panels = axes.ravel()

# One scatter panel per selected feature; low alpha because points overlap heavily
for ax, feature in zip(panels, top_features):
    ax.scatter(train_df_with_rul[feature], train_df_with_rul['RUL'], alpha=0.1)
    ax.set_title(f'{feature} vs RUL')
    ax.set_xlabel(feature)
    ax.set_ylabel('RUL')
    ax.grid(True)

# Hide grid slots that have no feature assigned
for ax in panels[len(top_features):]:
    ax.set_visible(False)

fig.tight_layout()
plt.show()

## Conclusion

In this exploratory analysis, we have:
1. Loaded and examined the NASA turbofan dataset
2. Analyzed engine cycles and RUL distribution
3. Visualized sensor readings over time
4. Identified correlations between features and RUL
5. Observed degradation patterns in key features

These insights will help in designing effective LSTM models for RUL prediction.