## Data Preprocessing (Boston Housing Data)


In [1]:
# Import libraries
import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the data
# Reads BostonHousing.csv (path is relative to the current working directory)
# into `data`, then previews the first two rows as the cell output.
data = pd.read_csv('BostonHousing.csv')
data.head(2)

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6


In [3]:
# Column names, dtypes and non-null counts of the loaded frame.
data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   crim     506 non-null    float64
 1   zn       506 non-null    float64
 2   indus    506 non-null    float64
 3   chas     506 non-null    int64  
 4   nox      506 non-null    float64
 5   rm       506 non-null    float64
 6   age      506 non-null    float64
 7   dis      506 non-null    float64
 8   rad      506 non-null    int64  
 9   tax      506 non-null    int64  
 10  ptratio  506 non-null    float64
 11  b        506 non-null    float64
 12  lstat    506 non-null    float64
 13  medv     506 non-null    float64
dtypes: float64(11), int64(3)
memory usage: 55.5 KB


In [4]:
# Per-column summary statistics (count, mean, std, min, quartiles, max).
data.describe()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.613524,11.363636,11.136779,0.06917,0.554695,6.284634,68.574901,3.795043,9.549407,408.237154,18.455534,356.674032,12.653063,22.532806
std,8.601545,23.322453,6.860353,0.253994,0.115878,0.702617,28.148861,2.10571,8.707259,168.537116,2.164946,91.294864,7.141062,9.197104
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73,5.0
25%,0.082045,0.0,5.19,0.0,0.449,5.8855,45.025,2.100175,4.0,279.0,17.4,375.3775,6.95,17.025
50%,0.25651,0.0,9.69,0.0,0.538,6.2085,77.5,3.20745,5.0,330.0,19.05,391.44,11.36,21.2
75%,3.677083,12.5,18.1,0.0,0.624,6.6235,94.075,5.188425,24.0,666.0,20.2,396.225,16.955,25.0
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97,50.0


### Handling Outliers

In [5]:
# Detecting outliers using IQR
def detect_outliers_iqr(data, factor=1.5):
    """Count IQR-rule outliers in every numeric column of a DataFrame.

    A value is flagged when it lies strictly below ``Q1 - factor * IQR`` or
    strictly above ``Q3 + factor * IQR``, where Q1 and Q3 are the column's
    25th and 75th percentiles and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect; non-numeric columns are ignored.
    factor : float, default 1.5
        Width of the fences, in multiples of the IQR.

    Returns
    -------
    dict
        Maps column name to the number of flagged values (plain ``int``).
        Columns without any flagged value are omitted.

    Notes
    -----
    Missing values are never flagged. When a column's IQR is 0 (Q1 == Q3),
    both fences collapse onto that quartile value, so every value that
    differs from it is counted.
    """
    outliers = {}
    numeric = data.select_dtypes(include=['number'])
    for col in numeric.columns:
        series = numeric[col]
        q1 = series.quantile(0.25)
        q3 = series.quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - factor * iqr
        upper_bound = q3 + factor * iqr
        # Comparisons against NaN evaluate to False, so missing values are
        # not counted; int() keeps the result a plain Python integer.
        outlier_count = int(((series < lower_bound) | (series > upper_bound)).sum())
        if outlier_count > 0:
            outliers[col] = outlier_count
    return outliers

# Detect outliers: run the IQR check over the loaded frame and print the
# resulting {column: count} mapping (only columns with at least one hit).
outliers_found = detect_outliers_iqr(data)
print("Outlier counts per column:", outliers_found)


Outlier counts per column: {'crim': 66, 'zn': 68, 'chas': 35, 'rm': 30, 'dis': 5, 'ptratio': 15, 'b': 77, 'lstat': 7, 'medv': 40}


In [6]:
# Removing Outliers using isolation forest

In [7]:
# Flag anomalous rows with an Isolation Forest fitted on all numeric columns.
# random_state pins the forest so the same rows are selected on every run.
iso = IsolationForest(contamination=0.05, random_state=42)  # Assuming 5% of data are outliers
outliers = iso.fit_predict(data.select_dtypes(include=['number']))

# fit_predict labels inliers as 1 and outliers as -1. Take an explicit copy so
# data_cleaned is an independent frame: assigning columns on it later will not
# raise SettingWithCopyWarning.
data_cleaned = data[outliers == 1].copy()  # Keep only normal points

print("Rows before removing outliers:", data.shape[0])
print("Rows after removing outliers:", data_cleaned.shape[0])


Rows before removing outliers: 506
Rows after removing outliers: 480


### Standardize numerical features

In [10]:
# Columns to standardize; `chas` and `rad` are passed through unscaled and
# `medv` is the target.
num_cols = ['crim', 'zn', 'indus', 'nox', 'rm', 'age', 'dis', 'tax', 'ptratio', 'b', 'lstat']

# Splitting the dataset
# Start from data_cleaned (not data) so the outlier removal above actually
# carries through to the splits. Neither frame is modified here, so the cell
# can be re-run safely.
X = data_cleaned.drop(columns=['medv'])
y = data_cleaned['medv']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training rows only and reuse those statistics for the
# test rows; fitting on the full dataset would leak test-set mean/variance
# into training.
scaler = StandardScaler()
X_train = X_train.copy()
X_test = X_test.copy()
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])


In [14]:
# Saving the preprocessed data
import os

# Creating directories: one folder per split under ./data
# (exist_ok=True makes re-running the cell harmless).
for split_dir in (
    "./data/X_train",
    "./data/X_test",
    "./data/y_train",
    "./data/y_test",
):
    os.makedirs(split_dir, exist_ok=True)

# Saving the preprocessed data: each split goes to its own CSV, without the
# row index.
# NOTE(review): no y_test write is visible in this span although its folder is
# created above - confirm it is saved right after this point.
for split, csv_path in (
    (X_train, "./data/X_train/X_train.csv"),
    (X_test, "./data/X_test/X_test.csv"),
    (y_train, "./data/y_train/y_train.csv"),
):
    split.to_csv(csv_path, index=False)