# Aircraft Predictive Maintenance - Exploratory Data Analysis

This notebook explores the NASA Turbofan Engine Degradation Simulation Dataset to understand the data characteristics and prepare for model development.

In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
from sklearn.preprocessing import MinMaxScaler
from scipy import stats

# Add the src directory to the path to import custom modules
sys.path.append('..')
from src.data_processing import load_data, clean_data, calculate_rul

# Set plot style
# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and later
# releases removed the old names, so pick whichever whitegrid style this install has.
_whitegrid_style = next(
    (name for name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
     if name in plt.style.available),
    None,
)
if _whitegrid_style is not None:
    plt.style.use(_whitegrid_style)
else:
    # Neither bundled style exists: fall back to seaborn's own whitegrid theme
    sns.set_style('whitegrid')
sns.set_context("notebook", font_scale=1.2)

# Pandas display: never truncate the column list, and allow wide console output
pd.options.display.max_columns = None
pd.options.display.width = 1000

## 1. Load the Dataset

The NASA Turbofan Engine Degradation Simulation Dataset consists of multiple files. We'll start with the FD001 dataset.

In [None]:
# Column names for the dataset: two identifiers, three operational settings
# and twenty-one sensor channels.
# NOTE(review): this list is not passed to load_data below; presumably load_data
# assigns its own column names — confirm they match.
NASA_COLUMNS = [
    'engine_id', 'cycle', 
    'setting1', 'setting2', 'setting3', 
    's1', 's2', 's3', 's4', 's5', 's6', 's7', 's8', 's9', 's10', 
    's11', 's12', 's13', 's14', 's15', 's16', 's17', 's18', 's19', 's20', 's21'
]

# Load the dataset
data_path = '../data/raw/train_FD001.txt'

# Fail fast: every later cell needs `data`, so stopping here with the download
# hint is clearer than a NameError in the next cell.
if not os.path.exists(data_path):
    raise FileNotFoundError(
        f"Dataset not found at {data_path}\n"
        "Please download the NASA Turbofan Engine Degradation Simulation Dataset from Kaggle:\n"
        "https://www.kaggle.com/datasets/behrad3d/nasa-cmaps"
    )

data = load_data(data_path)
print(f"Dataset loaded with shape: {data.shape}")

In [None]:
# Preview the first rows of the loaded frame (shown as this cell's output)
data.head()

## 2. Data Overview

Let's examine the basic statistics and structure of the dataset.

In [None]:
# Print a header, then the frame's column summary via DataFrame.info()
print("Dataset Info:")
data.info()

In [None]:
# Descriptive statistics, transposed so that each column of `data` becomes a row
data.describe().T

In [None]:
# Count nulls per column and report only the columns that actually have some
missing_values = data.isna().sum()
print("Missing values per column:")
if missing_values.sum() > 0:
    print(missing_values[missing_values > 0])
else:
    print("No missing values")

In [None]:
# How many distinct engines are present, and the highest cycle number seen for each
engine_column = data['engine_id']
n_engines = engine_column.nunique()
cycles_per_engine = data.groupby('engine_id')['cycle'].max()

print(f"Number of unique engines: {n_engines}")
print("\nCycles per engine statistics:")
print(cycles_per_engine.describe())

## 3. Calculate Remaining Useful Life (RUL)

For predictive maintenance, we need to calculate the Remaining Useful Life (RUL) for each engine at each cycle.

In [None]:
# Clean the data with the project helper imported from src.data_processing
# NOTE(review): the cleaning rules live inside clean_data and are not visible here — confirm what it drops or changes
cleaned_data = clean_data(data)

# Calculate RUL; later cells read a 'RUL' column from this result
rul_data = calculate_rul(cleaned_data)

# Display the first few rows with RUL
rul_data.head()

In [None]:
# Histogram (with KDE overlay) of the RUL target across all rows
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(rul_data['RUL'], kde=True, bins=30, ax=ax)
ax.set_title('Distribution of Remaining Useful Life (RUL)')
ax.set_xlabel('RUL (cycles)')
ax.set_ylabel('Frequency')
ax.grid(True, alpha=0.3)
plt.show()

## 4. Visualize RUL Degradation

Let's visualize how RUL decreases over time for a sample of engines.

In [None]:
# Sample a few engines for visualization.
# A seeded generator keeps the selection (and every later "sample engine" plot)
# identical across Restart & Run All; the sample size is capped so the cell also
# works when fewer than five engines are present.
engine_ids = rul_data['engine_id'].unique()
rng = np.random.default_rng(42)
sample_engines = rng.choice(engine_ids, size=min(5, len(engine_ids)), replace=False)

fig, ax = plt.subplots(figsize=(12, 8))

for engine_id in sample_engines:
    # Sort by cycle so the line is drawn in time order regardless of row order
    engine_data = rul_data[rul_data['engine_id'] == engine_id].sort_values('cycle')
    ax.plot(engine_data['cycle'], engine_data['RUL'], marker='o', linestyle='-', label=f'Engine {engine_id}')

ax.set_title('RUL Degradation Over Time for Sample Engines')
ax.set_xlabel('Cycle')
ax.set_ylabel('Remaining Useful Life (cycles)')
ax.legend()
ax.grid(True, alpha=0.3)
plt.show()

## 5. Sensor Analysis

Let's analyze the sensor readings to understand their behavior and relationship with RUL.

In [None]:
# Select sensor columns.
# 'setting1'..'setting3' also start with 's', so they must be excluded explicitly;
# otherwise the operational settings are reported and ranked as sensors.
sensor_cols = [col for col in rul_data.columns
               if col.startswith('s') and not col.startswith('setting')]

# Correlation of every sensor with RUL, sorted ascending
# (DataFrame.corr defaults to the Pearson coefficient)
correlations = rul_data[sensor_cols + ['RUL']].corr()['RUL'].sort_values()

# RUL's correlation with itself carries no information, so drop it once for all views below
sensor_correlations = correlations.drop('RUL')

# Plot correlation with RUL
plt.figure(figsize=(12, 8))
sensor_correlations.plot(kind='bar')
plt.title('Sensor Correlation with RUL')
plt.xlabel('Sensor')
plt.ylabel('Correlation Coefficient')
plt.grid(True, alpha=0.3)
plt.xticks(rotation=45)
plt.show()

# Print top correlated sensors
print("Top 5 positively correlated sensors:")
print(sensor_correlations.nlargest(5))
print("\nTop 5 negatively correlated sensors:")
print(sensor_correlations.nsmallest(5))

In [None]:
# Pairwise correlation heatmap of the selected columns plus RUL.
# The upper triangle (diagonal included) is masked, so each pair appears once.
correlation_matrix = rul_data[sensor_cols + ['RUL']].corr()
mask = np.triu(np.ones(correlation_matrix.shape, dtype=bool))

fig, ax = plt.subplots(figsize=(16, 12))
sns.heatmap(
    correlation_matrix,
    mask=mask,
    annot=False,
    cmap='coolwarm',
    linewidths=0.5,
    vmin=-1,
    vmax=1,
    ax=ax,
)
ax.set_title('Correlation Heatmap of Sensors and RUL')
fig.tight_layout()
plt.show()

## 6. Sensor Trends Over Time

Let's visualize how sensor readings change over time for a sample engine.

In [None]:
# Use the first sampled engine and put its rows in cycle order
sample_engine_id = sample_engines[0]
is_sample_engine = rul_data['engine_id'] == sample_engine_id
sample_engine_data = rul_data.loc[is_sample_engine].sort_values('cycle')

# Six columns with the largest absolute correlation with RUL (sign ignored)
top_sensors = correlations.drop('RUL').abs().nlargest(6).index.tolist()

# One line per selected column, all on a shared axis
fig, ax = plt.subplots(figsize=(14, 10))
cycles = sample_engine_data['cycle']
for sensor in top_sensors:
    ax.plot(cycles, sample_engine_data[sensor], marker='.', linestyle='-', label=sensor)

ax.set_title(f'Top Correlated Sensor Readings Over Time for Engine {sample_engine_id}')
ax.set_xlabel('Cycle')
ax.set_ylabel('Sensor Value')
ax.legend()
ax.grid(True, alpha=0.3)
plt.show()

## 7. Operational Settings Analysis

Let's analyze the operational settings and their impact on engine degradation.

In [None]:
# Select operational setting columns
setting_cols = [col for col in rul_data.columns if col.startswith('setting')]

# Distribution of operational settings.
# The grid is sized from the columns actually present, because the number of
# setting columns left after cleaning is not guaranteed to be exactly three.
if setting_cols:
    n_settings = len(setting_cols)
    # squeeze=False always returns a 2-D array, so the single-column case needs no special handling
    fig, axes = plt.subplots(1, n_settings, figsize=(6 * n_settings, 6), squeeze=False)
    axes = axes.ravel()

    for ax, col in zip(axes, setting_cols):
        sns.histplot(rul_data[col], kde=True, ax=ax)
        ax.set_title(f'Distribution of {col}')
        ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()
else:
    print("No operational setting columns found in rul_data")

In [None]:
# Relationship between operational settings and RUL.
# One scatter panel per setting column; the grid is sized from setting_cols so the
# cell works for any number of setting columns rather than assuming exactly three.
if setting_cols:
    n_settings = len(setting_cols)
    # squeeze=False always returns a 2-D array, so the single-column case needs no special handling
    fig, axes = plt.subplots(1, n_settings, figsize=(6 * n_settings, 6), squeeze=False)
    axes = axes.ravel()

    for ax, col in zip(axes, setting_cols):
        sns.scatterplot(x=col, y='RUL', data=rul_data, alpha=0.5, ax=ax)
        ax.set_title(f'RUL vs {col}')
        ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()
else:
    print("No operational setting columns found in rul_data")

## 8. Feature Selection

Based on the correlation analysis above, let's rank the features by the absolute value of their correlation with RUL as a first-pass selection of the most important predictors.

In [None]:
# Strength of association with RUL regardless of sign, strongest first
abs_corr = correlations.drop('RUL').abs().sort_values(ascending=False)

# Keep the ten strongest as candidate model features
top_features = abs_corr.nlargest(10).index.tolist()

# Report each candidate with its signed correlation
print("Top 10 features based on correlation with RUL:")
for rank, name in enumerate(top_features, start=1):
    signed_corr = correlations[name]
    print(f"{rank}. {name}: {signed_corr:.4f}")

In [None]:
# Scatter each of the six strongest features against RUL on a 2x3 grid
top_6_features = top_features[:6]
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
axes = axes.flatten()

for panel, feature in zip(axes, top_6_features):
    sns.scatterplot(x=feature, y='RUL', data=rul_data, alpha=0.5, ax=panel)
    panel.set_title(f'RUL vs {feature}')
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 9. Data Preparation for Modeling

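Let's prepare the data for modeling by normalizing the features, then preview one fixed-length window of a single engine to see what a model input sequence will look like. (Building the full set of sequences is left to the next notebook.)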

In [None]:
# Min-max scale every column except the identifiers and the target
non_feature_cols = ('engine_id', 'cycle', 'RUL')
features_to_normalize = [col for col in rul_data.columns if col not in non_feature_cols]

# Work on a copy so rul_data keeps its original values
normalized_data = rul_data.copy()

# Fit the scaler on the feature columns and write the scaled values into the copy
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(rul_data[features_to_normalize])
normalized_data[features_to_normalize] = scaled_values

# Preview the result
normalized_data.head()

In [None]:
# Create a sample sequence for visualization
sequence_length = 30
sample_engine_id = sample_engines[0]
sample_engine_data = normalized_data[normalized_data['engine_id'] == sample_engine_id].sort_values('cycle')

# Select a subset of features for visualization
features_to_plot = top_6_features

# Create a sequence.
# Start 50 rows into the engine's history for a more informative window, but clamp
# the start so a short history still yields a full (or the longest possible) window
# instead of an empty slice.
start_idx = max(0, min(50, len(sample_engine_data) - sequence_length))
sequence_data = sample_engine_data.iloc[start_idx:start_idx+sequence_length]

# Read the cycle range from the rows actually plotted, not from the positional index
first_cycle = int(sequence_data['cycle'].iloc[0])
last_cycle = int(sequence_data['cycle'].iloc[-1])

# Plot the sequence
plt.figure(figsize=(14, 8))

for feature in features_to_plot:
    plt.plot(sequence_data['cycle'], sequence_data[feature], marker='o', linestyle='-', label=feature)

plt.title(f'Sample Sequence for Engine {sample_engine_id} (Cycles {first_cycle} to {last_cycle})')
plt.xlabel('Cycle')
plt.ylabel('Normalized Feature Value')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

## 10. Summary and Next Steps

### Key Findings:

1. The dataset contains sensor readings from multiple engines over their operational cycles.
2. We calculated the Remaining Useful Life (RUL) for each engine at each cycle.
3. We identified the sensors that have the strongest correlation with RUL.
4. We normalized the data and prepared it for sequence-based modeling.

### Next Steps:

1. Create sequences for time series modeling.
2. Split the data into training, validation, and test sets.
3. Develop and train predictive models (LSTM, CNN-LSTM, XGBoost, etc.).
4. Evaluate and compare model performance.
5. Fine-tune the best performing model.

These steps will be covered in the next notebook: `02_model_development.ipynb`.