# Flood Prediction - Exploratory Data Analysis
## Playground Series S4E5

This notebook explores the flood prediction dataset to understand:
- Data structure and quality
- Feature distributions
- Correlations and relationships
- Target variable characteristics

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

# Set plotting style
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')

%matplotlib inline

## 1. Load Data

In [None]:
# Load datasets
# All competition CSVs live in one directory; change DATA_DIR here to point the
# notebook at a different location instead of editing every read_csv call.
DATA_DIR = Path('../data')

train = pd.read_csv(DATA_DIR / 'train.csv')
test = pd.read_csv(DATA_DIR / 'test.csv')
sample_submission = pd.read_csv(DATA_DIR / 'sample_submission.csv')

# Report (rows, columns) of each frame as a quick sanity check on the load
print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")
print(f"Sample submission shape: {sample_submission.shape}")

## 2. Basic Data Inspection

In [None]:
# Preview the first rows of the training set (head() returns 5 rows by default)
# to eyeball the column names and typical values.
train.head()

In [None]:
# Print the training frame's schema summary: column names, non-null counts,
# dtypes and memory usage.
train.info()

In [None]:
# pandas' default describe() summary of the training set
# (count, mean, std, min, quartiles and max for numeric columns).
train.describe()

## 3. Missing Values Analysis

In [None]:
# Per-column null counts for each split
missing_train = train.isna().sum()
missing_test = test.isna().sum()

# Report only the columns that actually contain nulls (header line, then the
# filtered counts on the following line)
print("Missing values in train:", missing_train.loc[missing_train.gt(0)], sep="\n")
print("\nMissing values in test:", missing_test.loc[missing_test.gt(0)], sep="\n")

## 4. Target Variable Analysis

In [None]:
# Summary statistics of the target column
target_values = train['FloodProbability']
print("FloodProbability Statistics:")
print(target_values.describe())

# Two views of the target side by side: histogram (left) and box plot (right)
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))
hist_ax, box_ax = axes

hist_ax.hist(target_values, bins=50, edgecolor='black')
hist_ax.set(
    xlabel='Flood Probability',
    ylabel='Frequency',
    title='Distribution of Flood Probability',
)

box_ax.boxplot(target_values)
box_ax.set(ylabel='Flood Probability', title='Box Plot of Flood Probability')

fig.tight_layout()
plt.show()

## 5. Feature Distributions

In [None]:
# Every column except the row identifier and the target is a model feature
non_feature_cols = {'id', 'FloodProbability'}
feature_cols = [name for name in train.columns if name not in non_feature_cols]

# Grid layout: fixed number of columns, as many rows as needed (ceiling division)
n_cols = 4
full_rows, leftover = divmod(len(feature_cols), n_cols)
n_rows = full_rows + (1 if leftover else 0)

fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, n_rows * 4))
axes = axes.flatten()

# One histogram per feature, filling the grid in row-major order
for ax, col in zip(axes, feature_cols):
    ax.hist(train[col], bins=30, edgecolor='black', alpha=0.7)
    ax.set(title=col, xlabel='Value', ylabel='Frequency')

# Blank out the grid cells that have no feature to show
for ax in axes[len(feature_cols):]:
    ax.axis('off')

fig.tight_layout()
plt.show()

## 6. Correlation Analysis

In [None]:
# Pairwise correlations between every feature and the target
corr_columns = [*feature_cols, 'FloodProbability']
correlation_matrix = train[corr_columns].corr()

# Annotated heatmap, colour scale centred on zero correlation
fig, ax = plt.subplots(figsize=(16, 14))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=0.5,
    ax=ax,
)
ax.set_title('Feature Correlation Heatmap', fontsize=16, pad=20)
fig.tight_layout()
plt.show()

In [None]:
# Top correlations with target
# target_corr keeps the target's own entry (self-correlation of 1.0) and the
# signed descending order, so cells that reuse it see the same Series as before.
target_corr = correlation_matrix['FloodProbability'].sort_values(ascending=False)

# Remove the target by label rather than by assuming it sorts into position 0,
# then rank by absolute value so strong negative correlations are not pushed
# to the bottom of the list.
feature_corr = target_corr.drop(labels='FloodProbability')
top_corr = feature_corr.sort_values(key=lambda s: s.abs(), ascending=False).head(10)

print("Top 10 features correlated with FloodProbability:")
print(top_corr)

# Visualize
fig, ax = plt.subplots(figsize=(10, 8))
top_corr.plot(kind='barh', ax=ax)
# barh draws the first row at the bottom; flip so the strongest feature is on top
ax.invert_yaxis()
ax.set_xlabel('Correlation with FloodProbability')
ax.set_title('Top 10 Features by Correlation with Target')
fig.tight_layout()
plt.show()

## 7. Feature Relationships with Target

In [None]:
# Scatter plots for top correlated features
# Exclude the target by label (its self-correlation is 1.0) instead of relying
# on it being the first entry of target_corr, and rank by absolute correlation
# so strongly negative features are considered too.
feature_corr = target_corr.drop(labels='FloodProbability', errors='ignore')
top_features = (
    feature_corr.sort_values(key=lambda s: s.abs(), ascending=False)
    .head(6)
    .index.tolist()
)  # Top 6 features

fig, axes = plt.subplots(2, 3, figsize=(18, 12))
axes = axes.flatten()

# Tiny, translucent markers keep dense regions readable on a large dataset
for ax, feature in zip(axes, top_features):
    ax.scatter(train[feature], train['FloodProbability'], alpha=0.3, s=1)
    ax.set_xlabel(feature)
    ax.set_ylabel('FloodProbability')
    ax.set_title(f'{feature} vs FloodProbability')

# Hide panels left empty when fewer than six features are available
for ax in axes[len(top_features):]:
    ax.axis('off')

plt.tight_layout()
plt.show()

## 8. Data Quality Checks

In [None]:
# Check for duplicates
print(f"Duplicate rows in train: {train.duplicated().sum()}")
print(f"Duplicate rows in test: {test.duplicated().sum()}")

# The exact-row check above also compares the 'id' column. If ids are unique
# per row (presumably they are - TODO confirm) it can never flag anything, so
# repeat the check on the remaining columns only.
train_content_cols = [col for col in train.columns if col != 'id']
test_content_cols = [col for col in test.columns if col != 'id']
# `or None` falls back to comparing all columns if a frame has nothing but 'id'
train_dupes_no_id = train.duplicated(subset=train_content_cols or None).sum()
test_dupes_no_id = test.duplicated(subset=test_content_cols or None).sum()
print(f"Duplicate rows in train (ignoring id): {train_dupes_no_id}")
print(f"Duplicate rows in test (ignoring id): {test_dupes_no_id}")

# Check value ranges
print("\nValue ranges for each feature:")
for col in feature_cols:
    print(f"{col}: [{train[col].min()}, {train[col].max()}]")

## 9. Key Insights & Next Steps

### Summary:
- Dataset size: 1.1M training samples, 745K test samples
- 20 input features + 1 target (FloodProbability)
- Target is continuous (regression task) with values between 0 and 1

### Next Steps:
1. Feature engineering (if needed)
2. Train baseline models (Linear Regression, Random Forest, XGBoost)
3. Hyperparameter tuning
4. Model evaluation and selection
5. Generate predictions for test set