## Exploratory Data Analysis (EDA)

In [None]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem; the data-loading cell reads its
# CSV from a path beneath this mount point.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [None]:
# 1. Imports and Data Loading
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read the cleaned FD001 training data produced on Day 1 from the mounted Drive.
# NOTE(review): the output recorded for this cell is a FileNotFoundError for
# this exact path — confirm that Drive is mounted and that the folder really is
# spelled 'MileSton1' before relying on any of the cells below.
cleaned_train_csv = '/content/drive/MyDrive/Infosys/MileSton1/cmapss_cleaned_train_FD001.csv'
df = pd.read_csv(cleaned_train_csv)
df.head()  # last expression, so the first rows render as the cell output


FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/Infosys/MileSton1/cmapss_cleaned_train_FD001.csv'

In [None]:
# Dataset shape and brief info
print("Shape:", df.shape)
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
df.info()

# Engine and cycle range
print("Unique engines:", df['engine_id'].nunique())
print(f"Cycle range: {df['cycle'].min()} - {df['cycle'].max()}")


In [None]:
# Lifespan of each engine, taken as the largest cycle number recorded for it,
# shown as a 20-bin histogram with a KDE overlay.
cycle_counts = df.groupby('engine_id')['cycle'].max()

fig, ax = plt.subplots(figsize=(8, 4))
sns.histplot(cycle_counts, bins=20, kde=True, ax=ax)
ax.set_title("Distribution of Engine Lifespans (Cycle Counts)")
ax.set_xlabel("Cycles until failure")
ax.set_ylabel("Number of engines")
plt.show()


In [None]:
# Operational settings distributions: one histogram (with KDE) per setting,
# drawn side by side in a single row.
op_settings = ['op_setting_1', 'op_setting_2', 'op_setting_3']
fig, axs = plt.subplots(1, 3, figsize=(15, 4))
for panel, setting in zip(axs, op_settings):
    sns.histplot(df[setting], bins=30, ax=panel, kde=True)
    panel.set_title(f"Distribution: {setting}")
plt.tight_layout()
plt.show()


In [None]:
    # Histogram (with KDE) of each of the first 10 sensor columns on a 2x5 grid.
    # NOTE(review): this whole cell is indented one level. IPython appears to
    # tolerate a uniformly indented cell, but the code would be an
    # IndentationError if exported to a plain script — consider dedenting.
    sensor_cols = [name for name in df.columns if "sensor_" in name]
    sensor_fig = plt.figure(figsize=(16,8))
    for slot, name in enumerate(sensor_cols[:10], start=1):
        panel = sensor_fig.add_subplot(2, 5, slot)
        sns.histplot(df[name], bins=30, kde=True, color='skyblue', ax=panel)
        panel.set_title(name)
    sensor_fig.tight_layout()
    plt.show()


In [None]:
# Visualize one randomly chosen engine's sensor history.
# The engine is drawn from a generator with a fixed seed so that a fresh
# Restart & Run All plots the same engine every time; change SEED to inspect a
# different one.
SEED = 42
rng = np.random.default_rng(SEED)
engine_id = rng.choice(df['engine_id'].unique())
sample = df[df['engine_id'] == engine_id]

# sensor_cols is the list of sensor columns built in the sensor-distribution
# cell earlier in the notebook.
trace_fig, trace_ax = plt.subplots(figsize=(14, 8))
for col in sensor_cols[:5]:   # First 5 sensors for example
    trace_ax.plot(sample['cycle'], sample[col], label=col)
trace_ax.set_title(f"Sensor Traces for Engine {engine_id}")
trace_ax.set_xlabel("Cycle")
trace_ax.set_ylabel("Sensor Reading")
trace_ax.legend()
plt.show()


In [None]:
# Box plot of each of the first 10 sensor columns, one panel per sensor on a
# 2x5 grid.
sensor_cols = [name for name in df.columns if "sensor_" in name]

box_fig = plt.figure(figsize=(15, 10))
for slot, name in enumerate(sensor_cols[:10], start=1):
    panel = box_fig.add_subplot(2, 5, slot)
    sns.boxplot(y=df[name], color='skyblue', ax=panel)
    panel.set_title(name)
    panel.set_ylabel("Value")
box_fig.tight_layout()
plt.show()



In [None]:
# Pairwise correlations across the sensor columns and operational settings,
# rendered as an annotated heatmap with the diverging palette centred on zero.
corr_fig, corr_ax = plt.subplots(figsize=(25,20))
corr = df[sensor_cols + op_settings].corr()
sns.heatmap(corr, cmap='vlag', annot=True, center=0, ax=corr_ax)
corr_ax.set_title("Correlation Heatmap: Sensors & Operational Settings")
plt.show()


### EDA Summary Notes

#### Key Findings:
- Engine lifespans (cycle counts): **TODO** – describe the shape of the distribution seen in the lifespan histogram (e.g., right-skewed).
- Operational settings: **TODO** – name the settings of interest (X and Y) and describe any trends visible in their distributions.
- Sensors with constant values or outliers: **TODO** – list the affected sensors (e.g., sensor_N); these may not be informative and can be dropped or investigated further during preprocessing.
- Several sensors show significant correlation with each other.

#### Potential Issues/Next Steps:
- Some sensor readings may need normalization due to differing scales.
- Possible outliers or faulty sensors identified; confirm if these are expected or artifacts.
- Decide whether to drop uninformative or constant sensors before modeling.

