## Data Preprocessing

### ✅ Handle Missing Values

In [7]:
import pandas as pd  

# Load dataset  
df = pd.read_csv("../data/boston_housing.csv")  

# Fill missing values for numeric columns  
df.fillna(df.select_dtypes(include=['number']).median(), inplace=True)

# Fill missing values for categorical columns  
for col in df.select_dtypes(include=['object']).columns:
    df[col].fillna(df[col].mode()[0], inplace=True)

# Confirm missing values are handled  
print(df.isnull().sum())  # Should print 0 for all columns  


price               0
area                0
bedrooms            0
bathrooms           0
stories             0
mainroad            0
guestroom           0
basement            0
hotwaterheating     0
airconditioning     0
parking             0
prefarea            0
furnishingstatus    0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)


### ✅ Encode Categorical Variables

In [8]:
# Convert categorical feature (if any)
df = pd.get_dummies(df, drop_first=True)


### ✅ Normalize/Standardize Features

In [9]:
from sklearn.preprocessing import StandardScaler

# Select numeric columns
num_cols = df.select_dtypes(include=["float64", "int64"]).columns

# Apply standardization
scaler = StandardScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])


In [11]:
print(df.columns)


Index(['price', 'area', 'bedrooms', 'bathrooms', 'stories', 'parking',
       'mainroad_yes', 'guestroom_yes', 'basement_yes', 'hotwaterheating_yes',
       'airconditioning_yes', 'prefarea_yes',
       'furnishingstatus_semi-furnished', 'furnishingstatus_unfurnished'],
      dtype='object')


### ✅ Split the Data

In [12]:
from sklearn.model_selection import train_test_split

X = df.drop(columns=["price"])  # Features
y = df["price"]  # Target variable

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


### ✅ Save Processed Data

In [13]:
X_train.to_csv("../data/X_train.csv", index=False)
X_test.to_csv("../data/X_test.csv", index=False)
y_train.to_csv("../data/y_train.csv", index=False)
y_test.to_csv("../data/y_test.csv", index=False)
