# Import essential libraries

In [21]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

Load data

In [22]:
# Read the raw housing table from the project's data folder into `df`.
# NOTE(review): the columns previewed by df.head() (price, area, mainroad,
# furnishingstatus, ...) do not look like the classic Boston housing schema --
# confirm that the file name describes the data it actually holds.
df = pd.read_csv(filepath_or_buffer='../data/boston_housing.csv')

# Quick check

In [23]:
# Preview the first five rows to sanity-check column names and value formats.
df.head(n=5)

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


Handling Missing Values

In [24]:
# Count the missing entries in every column; all zeros means nothing to impute.
missing_per_column = df.isna().sum()
print(missing_per_column)

price               0
area                0
bedrooms            0
bathrooms           0
stories             0
mainroad            0
guestroom           0
basement            0
hotwaterheating     0
airconditioning     0
parking             0
prefarea            0
furnishingstatus    0
dtype: int64


Handling Outliers

In [25]:
# Cap the target at its 99th percentile so a few very expensive houses do not
# dominate; anything above the cap is replaced by the cap itself.
# NOTE(review): the quantile is computed over every row (including rows that
# are later held out for testing), and re-running this cell recomputes it from
# the already-capped column -- run it once on freshly loaded data.
price = df['price']
upper_limit = price.quantile(0.99)
above_cap = price > upper_limit
df['price'] = np.where(above_cap, upper_limit, price)

# Check after clipping

In [26]:
# Summary statistics of the capped target; `max` shows the cap that was applied.
price_summary = df['price'].describe()
price_summary

count    5.450000e+02
mean     4.750173e+06
std      1.809772e+06
min      1.750000e+06
25%      3.430000e+06
50%      4.340000e+06
75%      5.740000e+06
max      1.054200e+07
Name: price, dtype: float64

Encoding categorical features
Check the unique values in the categorical columns

In [27]:
# Show the distinct labels of one yes/no column and of the multi-level column
# so the encoding below can be chosen accordingly.
for column in ('mainroad', 'furnishingstatus'):
    print(df[column].unique())

['yes' 'no']
['furnished' 'semi-furnished' 'unfurnished']


encoding binary categorical columns (yes/no)

In [28]:
# Columns that hold literal 'yes' / 'no' answers and are encoded as 1 / 0.
binary_vars = ['mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'prefarea']
yes_no_codes = {'yes': 1, 'no': 0}

for var in binary_vars:
    # Already encoded (for example the cell was executed twice): leave the
    # column alone. Mapping 0/1 through the yes/no table again would silently
    # turn every value into NaN.
    if df[var].isin(list(yes_no_codes.values())).all():
        continue

    encoded = df[var].map(yes_no_codes)

    # Series.map yields NaN for anything missing from the table. A non-null
    # value that is neither 'yes' nor 'no' must not disappear silently, so
    # fail loudly and let it be cleaned up explicitly.
    unmapped = encoded.isna() & df[var].notna()
    if unmapped.any():
        unexpected = sorted(df.loc[unmapped, var].astype(str).unique())
        raise ValueError(
            f"Column {var!r} contains values other than 'yes'/'no': {unexpected}"
        )

    df[var] = encoded

Encoding the multi-category column (furnishingstatus)

In [29]:
# One-hot encode `furnishingstatus`. With drop_first=True the first level is
# dropped and acts as the baseline, leaving one indicator column for each of
# the remaining levels.
df = pd.get_dummies(
    data=df,
    columns=['furnishingstatus'],
    drop_first=True,
)

In [30]:
# Preview the first five rows of the fully encoded table.
df.head(n=5)

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus_semi-furnished,furnishingstatus_unfurnished
0,10542000.0,7420,4,2,3,1,0,0,0,1,2,1,False,False
1,10542000.0,8960,4,4,4,1,0,0,0,1,3,0,False,False
2,10542000.0,9960,3,2,2,1,0,1,0,0,2,1,True,False
3,10542000.0,7500,4,2,2,1,0,1,0,1,3,1,False,False
4,10542000.0,7420,4,1,2,1,1,1,0,1,2,0,False,False


# Feature Scaling


In [31]:
# Target is the `price` column; the features are every remaining column.
y = df['price']
x = df.drop(columns=['price'])

In [32]:
# Standardise every feature column (zero mean, unit variance).
# NOTE(review): the scaler is fitted on all rows of `x`, so its mean/std also
# reflect rows that may later be held out for testing -- prefer fitting on the
# training rows only.
scaler = StandardScaler()
x_scaled = scaler.fit(x).transform(x)

# The scaler returns a bare array; wrap it so the column names stay attached.
x_scaled_df = pd.DataFrame(data=x_scaled, columns=x.columns)

# Preview the scaled features.
x_scaled_df.head()


Unnamed: 0,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus_semi-furnished,furnishingstatus_unfurnished
0,1.046726,1.403419,1.421812,1.378217,0.405623,-0.465315,-0.734539,-0.219265,1.472618,1.517692,1.804941,-0.844888,-0.696429
1,1.75701,1.403419,5.405809,2.532024,0.405623,-0.465315,-0.734539,-0.219265,1.472618,2.679409,-0.554035,-0.844888,-0.696429
2,2.218232,0.047278,1.421812,0.22441,0.405623,-0.465315,1.361397,-0.219265,-0.679063,1.517692,1.804941,1.183588,-0.696429
3,1.083624,1.403419,1.421812,0.22441,0.405623,-0.465315,1.361397,-0.219265,1.472618,2.679409,1.804941,-0.844888,-0.696429
4,1.046726,1.403419,-0.570187,0.22441,0.405623,2.149083,1.361397,-0.219265,1.472618,1.517692,-0.554035,-0.844888,-0.696429


splitting data into train/test

In [33]:
# Split dataset: 80% training, 20% testing.
# The split is made on the *unscaled* features so the scaler can be fitted on
# the training rows only. Fitting it on the whole table lets the test rows
# influence the mean/std used to transform the training data (data leakage),
# which can make the test score look better than it should.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

# Learn the scaling statistics from the training rows, then apply that same
# transformation to both parts. `scaler` is re-created here so that, from this
# cell on, it holds training-only statistics.
scaler = StandardScaler()
X_train = pd.DataFrame(
    scaler.fit_transform(X_train_raw),
    columns=x.columns,
    index=X_train_raw.index,
)
X_test = pd.DataFrame(
    scaler.transform(X_test_raw),
    columns=x.columns,
    index=X_test_raw.index,
)

# Check the shapes
print(f'Training set shape: {X_train.shape}, {y_train.shape}')
print(f'Test set shape: {X_test.shape}, {y_test.shape}')


Training set shape: (436, 13), (436,)
Test set shape: (109, 13), (109,)


Save the preprocessed data for use in training

In [34]:
# Persist the four parts of the split (optional but recommended) so a training
# notebook can load them directly. The row index is not written out.
split_outputs = [
    (X_train, '../data/X_train.csv'),
    (X_test, '../data/X_test.csv'),
    (y_train, '../data/y_train.csv'),
    (y_test, '../data/y_test.csv'),
]
for part, csv_path in split_outputs:
    part.to_csv(csv_path, index=False)
