In [1]:
import joblib
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the raw dataset into a DataFrame.
# NOTE(review): absolute path (looks like a Colab `/content` upload) -- it must
# be adjusted when the notebook runs in another environment.
csv_path = '/content/BostonHousing.csv'
df = pd.read_csv(csv_path)

In [2]:
# Outlier handling: clip every float64/int64 column to the Tukey fences
# [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR], overwriting the column in `df`.
# NOTE(review): the fences are computed on the whole frame, i.e. before the
# train/test split done later in the notebook, and the loop does not exclude
# the 'medv' column that is later used as the target -- confirm both are
# intended.
numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
for col in numeric_cols:
    q1 = df[col].quantile(0.25)
    q3 = df[col].quantile(0.75)
    iqr = q3 - q1
    # With a zero IQR both fences collapse onto Q1, so clipping would replace
    # every value in the column with that single number (this is what happens
    # to binary indicators or mostly-constant columns). Leave such columns
    # untouched. A NaN IQR (e.g. an all-missing column) is skipped as well;
    # clipping with NaN bounds would not have changed anything anyway.
    if not (iqr > 0):
        continue
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    df[col] = df[col].clip(lower_bound, upper_bound)

In [3]:
# Feature-target split
# 'medv' is the column to predict; every remaining column is used as a feature.
y = df['medv']
X = df.drop(columns='medv')

In [4]:
# Train-test split
# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Feature scaling
# Fit the scaler on the training features only, then apply that same fitted
# transform to both splits so no test-set statistics enter the fit.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [7]:
# Persist the fitted scaler so the identical transform can be reloaded later.
joblib.dump(scaler, 'scaler.pkl')

# Wrap the scaled feature matrices in DataFrames to attach the feature names
# from X before writing them out.
pd.DataFrame(X_train_scaled, columns=X.columns).to_csv('X_train.csv', index=False)
pd.DataFrame(X_test_scaled, columns=X.columns).to_csv('X_test.csv', index=False)

# index=False drops the row index from every file, so features and targets
# correspond by row position only.
y_train.to_csv('y_train.csv', index=False)
y_test.to_csv('y_test.csv', index=False)

In [8]:
# Final status message listing the artifacts written by the previous cell.
print(
    "Preprocessing completed!",
    "Saved files: X_train.csv, X_test.csv, y_train.csv, y_test.csv, scaler.pkl",
    sep="\n",
)

Preprocessing completed!
Saved files: X_train.csv, X_test.csv, y_train.csv, y_test.csv, scaler.pkl
