# Exploratory Data Analysis (EDA) - Bridge Failure Prediction

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Show up to 50 columns when a DataFrame is displayed.
pd.options.display.max_columns = 50

## Load Data

In [2]:
# Relative path: resolved against the current working directory.
FEATURES_CSV = '../data/processed/features.csv'
df = pd.read_csv(FEATURES_CSV)
# Trailing expression so the notebook renders the first rows.
df.head()

## Data Overview

In [3]:
# Print the concise DataFrame summary produced by DataFrame.info().
df.info()

In [4]:
# include='all' summarizes every column, not only the numeric ones.
summary_stats = df.describe(include='all')
summary_stats

## Missing Values

In [5]:
# Number of null entries in each column.
null_counts = df.isnull().sum()
null_counts

## Class Balance

In [6]:
# Share of rows per value of the target column, drawn as a bar chart.
class_share = df['failure_within_1yr'].value_counts(normalize=True)
ax = class_share.plot(kind='bar')
ax.set_title('Class Balance: Failure Within 1 Year')
ax.set_xlabel('Failure (1=Yes, 0=No)')
ax.set_ylabel('Proportion')
plt.show()

## Correlation Matrix

In [7]:
# Pairwise correlations between the numeric columns only.
corr = df.corr(numeric_only=True)
plt.figure(figsize=(10, 8))
# Pin the color scale to the full correlation range [-1, 1] so the neutral
# midpoint of the diverging 'coolwarm' palette sits at zero. Without fixed
# limits the scale is fitted to the observed min/max, the midpoint drifts
# away from zero, and positive/negative strengths are not comparable.
sns.heatmap(corr, annot=True, fmt='.2f', cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Correlation Matrix')
plt.show()

## Feature Distributions

In [8]:
# Columns whose distributions are plotted below.
num_cols = [
    'avg_daily_traffic',
    'precipitation',
    'avg_temp',
    'corrosion_level',
    'previous_failures',
]
# One histogram per listed column, 20 bins each.
df.hist(column=num_cols, figsize=(12, 8), bins=20)
plt.tight_layout()
plt.show()

## Failure Rate by Bridge Condition

In [9]:
# Bar height is seaborn's default estimate (the mean) of the target column
# for each bridge condition.
# `errorbar=None` replaces `ci=None`, which seaborn deprecated in v0.12 and
# which emits a FutureWarning; the bars are still drawn without error bars.
sns.barplot(
    data=df, x='bridge_condition', y='failure_within_1yr', errorbar=None
)
plt.title('Failure Rate by Bridge Condition')
plt.show()

## Geographic Distribution

In [10]:
# One point per row at (longitude, latitude), colored by the target column;
# the low alpha keeps overlapping points distinguishable.
plt.scatter(
    df['longitude'],
    df['latitude'],
    c=df['failure_within_1yr'],
    cmap='coolwarm',
    alpha=0.3,
)
plt.title('Bridge Locations (Color = Failure)')
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.show()