# Water Quality Data Exploration

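This notebook loads the raw water quality dataset and performs an initial exploratory analysis: basic structure and summary statistics, missing-value counts, the distribution of dissolved oxygen, correlations between the measured parameters, and box plots of key features.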

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Set plot style.
# The bare 'seaborn' style sheet name was deprecated in Matplotlib 3.6
# (renamed to 'seaborn-v0_8') and removed in later releases, where
# plt.style.use('seaborn') fails. Prefer the new name when this Matplotlib
# ships it and fall back to the legacy name on older installations.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)

# Make seaborn's 'Set2' palette the default color cycle for subsequent plots.
sns.set_palette('Set2')

## Load and Examine Data

In [None]:
# Read the raw water-quality CSV into a DataFrame. The file is decoded as
# Latin-1 instead of pandas' default UTF-8.
df = pd.read_csv('../data/raw/water_dataX.csv', encoding='latin1')

# Quick structural overview: dimensions, column names, then pandas' own
# per-column summary printed by DataFrame.info().
print(f"Dataset Shape: {df.shape}")
print(f"\nColumns: {list(df.columns)}")
print("\nData Info:")
df.info()

## Data Summary Statistics

In [None]:
# Print a heading, then finish the cell with the describe() table so the
# notebook renders it as the cell's rich output.
print("Summary Statistics:")
summary_stats = df.describe()
summary_stats

## Missing Value Analysis

In [None]:
# Count null entries per column, then report only the columns that have any.
missing_values = df.isna().sum()
print("Missing Values:")
print(missing_values.loc[missing_values.gt(0)])

## Data Visualization

In [None]:
# Histogram (30 bins) of the 'D.O. (mg/l)' column, drawn on an explicitly
# created 10x6 inch figure/axes pair.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=df, x='D.O. (mg/l)', bins=30, ax=ax)
ax.set_title('Distribution of Dissolved Oxygen Levels')
plt.show()

In [None]:
# Correlation heatmap
# Columns treated as numeric measurements; this list is reused by the
# box-plot cell further down.
numeric_columns = [
    'Temp', 'D.O. (mg/l)', 'PH', 'CONDUCTIVITY (µmhos/cm)',
    'B.O.D. (mg/l)', 'NITRATENAN N+ NITRITENANN (mg/l)',
    'FECAL COLIFORM (MPN/100ml)', 'TOTAL COLIFORM (MPN/100ml)Mean'
]

# Coerce the selected columns to numeric before correlating; values that
# cannot be parsed become NaN and are ignored pairwise by corr().
# NOTE(review): nothing earlier in this notebook casts these columns, so any
# of them that read_csv parsed as text can make DataFrame.corr() raise on
# pandas >= 2.0 (numeric_only defaults to False) or be silently left out of
# the matrix on older pandas. For columns that are already numeric this is a
# no-op. Confirm the actual dtypes against the df.info() output.
numeric_df = df[numeric_columns].apply(pd.to_numeric, errors='coerce')
corr_matrix = numeric_df.corr()

plt.figure(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()

## Feature Analysis

In [None]:
# One box plot per column for the first four entries of numeric_columns,
# laid out side by side in a single 1x4 row.
fig, axes = plt.subplots(1, 4, figsize=(15, 6))
for ax, column in zip(axes, numeric_columns[:4]):
    sns.boxplot(y=df[column], ax=ax)
    ax.set_title(column)
fig.tight_layout()
plt.show()

## Conclusions

Key findings from the data exploration (**TODO:** fill these in after running the notebook end to end):
1. [Add your conclusions here]
2. [Add your conclusions here]
3. [Add your conclusions here]