# Advanced Exploratory Data Analysis (EDA)
Dataset: realistic_threshold_noise_flow_data.csv

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from datetime import datetime
import warnings
# Install a global filter that ignores all warnings so library chatter does
# not clutter the cell outputs below.
# NOTE(review): this also hides deprecation/runtime warnings from pandas,
# seaborn, and matplotlib — confirm blanket suppression is intended.
warnings.filterwarnings('ignore')

# Load the sensor log, parse the `timestamp` column with pd.to_datetime,
# and order the rows by time so the plots below read chronologically.
df = (
    pd.read_csv('realistic_threshold_noise_flow_data.csv')
    .assign(timestamp=lambda frame: pd.to_datetime(frame['timestamp']))
    .sort_values('timestamp')
)

## Basic Info and Data Types

In [None]:
# Column names, dtypes, and non-null counts. `DataFrame.info()` prints its
# report directly and returns None, so it must not be wrapped in `display()`
# (doing so renders a stray "None" under the report).
df.info()

# Summary statistics for every column, numeric and non-numeric alike.
display(df.describe(include='all'))

# Number of missing values per column.
display(df.isna().sum())

## Correlation Heatmap

In [None]:
# Annotated heatmap of the pairwise correlations between numeric columns.
plt.figure(figsize=(10, 6))
numeric_corr = df.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()

## Time Series Analysis

In [None]:
# One interactive line chart per measurement, with lines colored by sensor_id.
for y_column, chart_title in (
    ('flow_rate', 'Flow Rate Over Time'),
    ('pressure', 'Pressure Over Time'),
    ('temperature_C', 'Temperature Over Time'),
):
    fig = px.line(df, x='timestamp', y=y_column, color='sensor_id', title=chart_title)
    fig.show()

## Categorical Features

In [None]:
# Row counts per pump status.
status_ax = sns.countplot(data=df, x='pump_status')
status_ax.set_title('Pump Status Distribution')
plt.show()

# Row counts per anomaly type, bars ordered by the value_counts() ranking
# (most frequent first).
anomaly_order = df['anomaly_type'].value_counts().index
anomaly_ax = sns.countplot(data=df, x='anomaly_type', order=anomaly_order)
anomaly_ax.set_title('Anomaly Type Distribution')
plt.xticks(rotation=45)
plt.show()

## Numerical Distribution by Anomaly Type

In [None]:
# Box plot of each measurement, split by anomaly type, one figure per column.
measurement_columns = ['flow_rate', 'pressure', 'temperature_C']
for col in measurement_columns:
    plt.figure(figsize=(10, 4))
    box_ax = sns.boxplot(data=df, x='anomaly_type', y=col)
    box_ax.set_title(f'{col} by Anomaly Type')
    plt.xticks(rotation=45)  # rotate category labels so they do not overlap
    plt.show()

## Sensor-wise Comparison

In [None]:
# Pairwise scatter/distribution grid of the three measurements, colored by
# sensor_id.
pair_columns = ['flow_rate', 'pressure', 'temperature_C']
sns.pairplot(
    df,
    vars=pair_columns,
    hue='sensor_id',
)
# y > 1 places the figure title above the top row of axes.
plt.suptitle('Sensor-wise Pairplot', y=1.02)
plt.show()