In [1]:
# Step 1: Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Step 2: Load the dataset
# The CSV location is relative to the notebook's working directory.
csv_path = '../data/BostonHousing.csv'
data = pd.read_csv(csv_path)

In [3]:
# Step 3: Handle missing values
# This cell only reports the number of nulls per column; it does not
# impute or drop anything.
print("Missing values before cleanup:")
null_counts = data.isnull().sum()
print(null_counts)


Missing values before cleanup:
crim       0
zn         0
indus      0
chas       0
nox        0
rm         0
age        0
dis        0
rad        0
tax        0
ptratio    0
b          0
lstat      0
medv       0
dtype: int64


In [None]:
# Step 4: Detect and treat outliers
# cap outliers using IQR
def cap_outliers(df, columns, factor=1.5):
    """Cap values lying outside the IQR fences of each listed column.

    For every column the fences are ``Q1 - factor * IQR`` and
    ``Q3 + factor * IQR``. Values beyond a fence are replaced by the fence
    value; values inside the fences are left as they are.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to treat. It is not modified.
    columns : iterable of str
        Names of the numeric columns to cap.
    factor : float, default 1.5
        Multiple of the IQR used to place the fences.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` in which the listed columns have been capped.
    """
    # Work on a copy so the caller's frame is left untouched.
    capped = df.copy()
    for col in columns:
        q1 = capped[col].quantile(0.25)
        q3 = capped[col].quantile(0.75)
        iqr = q3 - q1
        if iqr == 0:
            # Degenerate fences (lower == upper == Q1): capping would
            # overwrite every value with one constant, e.g. wiping out a
            # sparse 0/1 indicator. Leave such columns as they are.
            continue
        capped[col] = capped[col].clip(
            lower=q1 - factor * iqr,
            upper=q3 + factor * iqr,
        )
    return capped

# Cap only the continuous predictors:
#   - 'medv' is the prediction target; capping it would change the values the
#     model is trained and evaluated against.
#   - 'chas' is a 0/1 indicator; when both of its quartiles coincide the IQR
#     fences collapse to a single value and capping flattens the column to a
#     constant.
columns_not_capped = {'chas', 'medv'}
numeric_cols = [
    col
    for col in data.select_dtypes(include=['float64', 'int64']).columns
    if col not in columns_not_capped
]
data = cap_outliers(data, numeric_cols)

In [6]:
# Step 5: Encode categorical variables (chas is binary already)
# No one-hot encoding happens here; the column is only cast to an integer dtype.
data = data.astype({'chas': int})

In [7]:
# Step 6: Feature/target separation
# 'medv' becomes the target series; every other column stays in the feature frame.
y = data['medv']
X = data.drop(columns=['medv'])

In [8]:
# Step 7: Normalize/standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [9]:
# Step 8: Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

In [10]:
# Sanity check: show (rows, columns) of each feature matrix after the split.
for label, features in (
    ("Training feature shape:", X_train),
    ("Test feature shape:", X_test),
):
    print(label, features.shape)

Training feature shape: (404, 13)
Test feature shape: (102, 13)


In [11]:
# Step 9: Save preprocessed data (optional for scripting)
# The four split arrays and the fitted scaler are written as one tuple, in
# the order (X_train, X_test, y_train, y_test, scaler).
# NOTE: joblib files are pickle-based; only load them from trusted sources.
import joblib

preprocessed_bundle = (X_train, X_test, y_train, y_test, scaler)
joblib.dump(preprocessed_bundle, '../data/preprocessed_data.pkl')
print("Preprocessed data saved to ../data/preprocessed_data.pkl")

Preprocessed data saved to ../data/preprocessed_data.pkl
