# EDA

# XAI for House Price Prediction (Boston Housing)

This notebook performs **Exploratory Data Analysis (EDA)** to understand the dataset, the target (`medv`), feature distributions, relationships, and potential data issues (outliers, skew, multicollinearity).  
These findings guide **modeling and explainability** choices (what to standardize/transform and what patterns to expect from explanations).

In [2]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from pathlib import Path

# Plot defaults for the notebook: apply the seaborn theme first, then set the
# default figure size on top of it.
sns.set_theme(context="notebook", style="whitegrid")
plt.rcParams.update({"figure.figsize": (10, 5)})

# The CSV is located relative to the parent of the current working directory.
# NOTE(review): this presumably expects the kernel to start in a subfolder of
# the project root (e.g. a notebooks/ directory) — confirm.
ROOT = Path.cwd().parent
DATA_PATH = ROOT.joinpath("data", "processed_housing.csv")

df = pd.read_csv(DATA_PATH)

# Quick sanity check: preview the first rows, then report (rows, columns).
display(df.head())
print("Shape:", df.shape)

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


Shape: (506, 14)


In [None]:
# --- 2) Target analysis: medv ---
# Look at the shape of the target: histogram with KDE next to a boxplot,
# followed by summary statistics, skew/kurtosis, and the share of rows that
# sit exactly at 50.
target = "medv"
target_values = df[target]

fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(12, 4))
hist_ax, box_ax = ax

# Left panel: distribution of the target.
sns.histplot(target_values, kde=True, color="steelblue", ax=hist_ax)
hist_ax.set_title("Target distribution (medv)")

# Right panel: the same values as a boxplot.
sns.boxplot(x=target_values, color="lightgray", ax=box_ax)
box_ax.set_title("Target boxplot (medv)")

plt.tight_layout()
plt.show()

print("medv summary:")
display(target_values.describe())
print("medv skew:", float(target_values.skew()))
print("medv kurtosis:", float(target_values.kurt()))

# Check for the typical Boston Housing ceiling at 50.
at_ceiling = target_values.eq(50)
print("medv == 50 count:", int(at_ceiling.sum()))
print("medv == 50 proportion:", float(at_ceiling.mean()))