## Data Preprocessing

### ✅ Handle Missing Values

In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from scipy import stats

In [19]:
# Load the raw housing table from the project's data directory.
df = pd.read_csv(filepath_or_buffer="../data/boston_housing.csv")

In [20]:
# Impute gaps in the numeric columns with each column's median.
# Non-numeric columns are not part of the median table, so they are left as-is.
numeric_medians = df.select_dtypes(include="number").median()
df.fillna(value=numeric_medians, inplace=True)


In [21]:
# Remove outliers using Z-score: keep a row only if every numeric value lies
# within 3 standard deviations of its column mean.
numeric_part = df.select_dtypes(include=[np.number])
abs_z = np.abs(np.asarray(stats.zscore(numeric_part), dtype=float))

# A zero-variance (constant) column has an undefined z-score (NaN). `NaN < 3`
# is False, which would otherwise reject every row and empty the DataFrame,
# so undefined scores are treated as "not an outlier".
within_limits = (abs_z < 3) | np.isnan(abs_z)
df = df[within_limits.all(axis=1)]

In [22]:
# Separate the target column ('price') from the predictor columns.
y = df["price"]
X = df.drop("price", axis=1)

In [24]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split

def _encode_labels(column):
    """Return integer codes for one column using a freshly fitted LabelEncoder."""
    return LabelEncoder().fit_transform(column)


# X is expected to be a pandas DataFrame; pick out its object-dtype columns.
categorical_columns = X.select_dtypes(include="object").columns

# Replace each categorical column with its integer codes (skipped when there are none).
if not categorical_columns.empty:
    X[categorical_columns] = X[categorical_columns].apply(_encode_labels)

# Split dataset first, so the held-out rows never influence the scaler's
# mean/variance estimates (fitting on all rows leaks test-set statistics).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize numerical features: fit on the training rows only, then apply
# that same transformation to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Kept for any later cell that still reads X_scaled; it is the full feature
# matrix scaled with the training-set statistics.
X_scaled = scaler.transform(X)

# Bundle the four splits under their names and write them to a single .npz archive.
preprocessed_data = dict(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)
np.savez("../data/preprocessed_data.npz", **preprocessed_data)
