In [26]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split


In [27]:
# Load the dataset
data_path = '../data/BostonHousing.csv'
df = pd.read_csv(data_path)

print("First 5 rows of the dataset:")
display(df.head())


First 5 rows of the dataset:


Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [28]:
# Check for missing values
print("Missing Values:")
display(df.isnull().sum())

# Handle missing values (if any) - here we’ll impute with median for numerical columns
if df.isnull().sum().sum() > 0:
    df.fillna(df.median(), inplace=True)
    print("Missing values filled with median.")
else:
    print("No missing values found.")

Missing Values:


CRIM       0
ZN         0
INDUS      0
CHAS       0
NOX        0
RM         0
AGE        0
DIS        0
RAD        0
TAX        0
PTRATIO    0
B          0
LSTAT      0
MEDV       0
dtype: int64

No missing values found.


In [29]:
# Define function to cap outliers using IQR
def cap_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    df[column] = df[column].clip(lower_bound, upper_bound)
    return df
    

# Apply to key numerical columns (adjust as needed)
key_features = ['CRIM', 'RM', 'LSTAT', 'MEDV']
for feature in key_features:
    data = cap_outliers(data, feature)
    print(f"Outliers capped for {feature}")

print("Outliers capped using IQR method.")
display(df.describe())

Outliers capped for CRIM
Outliers capped for RM
Outliers capped for LSTAT
Outliers capped for MEDV
Outliers capped using IQR method.


Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.613524,11.363636,11.136779,0.06917,0.554695,6.284634,68.574901,3.795043,9.549407,408.237154,18.455534,356.674032,12.653063,22.532806
std,8.601545,23.322453,6.860353,0.253994,0.115878,0.702617,28.148861,2.10571,8.707259,168.537116,2.164946,91.294864,7.141062,9.197104
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73,5.0
25%,0.082045,0.0,5.19,0.0,0.449,5.8855,45.025,2.100175,4.0,279.0,17.4,375.3775,6.95,17.025
50%,0.25651,0.0,9.69,0.0,0.538,6.2085,77.5,3.20745,5.0,330.0,19.05,391.44,11.36,21.2
75%,3.677083,12.5,18.1,0.0,0.624,6.6235,94.075,5.188425,24.0,666.0,20.2,396.225,16.955,25.0
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97,50.0


In [30]:
# Check for categorical variables
categorical_cols = df.select_dtypes(include=['object', 'category']).columns
print("Categorical Columns:", list(categorical_cols))

# If categorical variables exist, encode them (e.g., one-hot encoding)
if len(categorical_cols) > 0:
    df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)
    print("Categorical variables encoded.")
else:
    print("No categorical variables found in the dataset.")

Categorical Columns: []
No categorical variables found in the dataset.


In [31]:
#Normalize the data

# Separate features and target
X = df.drop('MEDV', axis=1)  # Adjust 'PRICE' if target name differs
y = df['MEDV']

# Standardize numerical features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=X.columns)
print("\nFeatures standardized:")
print(X_scaled.describe())



Features standardized:
               CRIM            ZN         INDUS          CHAS           NOX  \
count  5.060000e+02  5.060000e+02  5.060000e+02  5.060000e+02  5.060000e+02   
mean  -1.123388e-16  7.898820e-17  2.106352e-16 -3.510587e-17 -1.965929e-16   
std    1.000990e+00  1.000990e+00  1.000990e+00  1.000990e+00  1.000990e+00   
min   -4.197819e-01 -4.877224e-01 -1.557842e+00 -2.725986e-01 -1.465882e+00   
25%   -4.109696e-01 -4.877224e-01 -8.676906e-01 -2.725986e-01 -9.130288e-01   
50%   -3.906665e-01 -4.877224e-01 -2.110985e-01 -2.725986e-01 -1.442174e-01   
75%    7.396560e-03  4.877224e-02  1.015999e+00 -2.725986e-01  5.986790e-01   
max    9.933931e+00  3.804234e+00  2.422565e+00  3.668398e+00  2.732346e+00   

                 RM           AGE           DIS           RAD         TAX  \
count  5.060000e+02  5.060000e+02  5.060000e+02  5.060000e+02  506.000000   
mean  -1.088282e-16 -1.474446e-16 -8.425408e-17 -1.123388e-16    0.000000   
std    1.000990e+00  1.000990e+00

In [32]:
# Split the dataset into training and testing sets

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
print("\nData split into training and testing sets:")
print("Training set shape:", X_train.shape)
print("Testing set shape:", X_test.shape)


Data split into training and testing sets:
Training set shape: (404, 13)
Testing set shape: (102, 13)


In [33]:
# Save preprocessed data for later use (optional)
X_train.to_csv('../data/X_train.csv', index=False)
X_test.to_csv('../data/X_test.csv', index=False)
y_train.to_csv('../data/y_train.csv', index=False)
y_test.to_csv('../data/y_test.csv', index=False)
print("Preprocessed data saved to '../data/' directory.")

Preprocessed data saved to '../data/' directory.
