# Water Potability - Exploratory Data Analysis
This notebook explores the water potability dataset to understand the distribution of its features and their relationship with the target variable `Potability`.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Apply seaborn's "whitegrid" theme to every figure drawn below.
sns.set_theme(style="whitegrid")

In [None]:
# Load Data
# Reads the raw CSV into `df`, the frame every later cell works on.
# The path is relative, so it resolves against the kernel's working directory.
data_path = '../data/raw/water_potability.csv'
if not os.path.exists(data_path):
    # Stop here with a clear error. Merely printing a message would leave `df`
    # undefined, and every following cell would then fail with a NameError
    # that hides the real cause.
    raise FileNotFoundError(
        f"Data not found at {data_path}. Please ensure the data exists."
    )

df = pd.read_csv(data_path)
print("Data Loaded Successfully")
display(df.head())

In [None]:
# Basic Statistics
# Summary statistics for the columns that describe() covers.
display(df.describe())
# info() prints its report (dtypes, non-null counts, memory usage) itself and
# returns None, so it is called directly; wrapping it in display() would
# render a stray "None" below the report.
df.info()

In [None]:
# Target Distribution
# Bar chart of how many rows fall into each `Potability` value.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=df, x='Potability', ax=ax)
ax.set_title('Distribution of Potability')
plt.show()

In [None]:
# Correlation Matrix
# Pairwise correlations between the columns of `df`, drawn as an annotated
# heatmap with values formatted to two decimals.
corr_matrix = df.corr()
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

In [None]:
# Pairplot for key features
# Scatter matrix of a hand-picked column subset, coloured by `Potability`,
# with KDE curves on the diagonal.
features = ['ph', 'Sulfate', 'Hardness', 'Potability']
pair_data = df[features]
sns.pairplot(data=pair_data, hue='Potability', diag_kind='kde')
plt.show()