In [6]:

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error
import joblib



In [7]:

import pandas as pd
import numpy as np

# Read the raw housing table from disk.
train_data = pd.read_csv('../data/house_price_regression_dataset.csv')

# Separate the target column from the predictors.
# The target is copied so it is independent of `train_data`.
y_train = train_data['House_Price'].copy()
X_train = train_data.drop(columns=['House_Price'])

# Hold out a validation subset (test_size=0.2); the fixed random_state
# keeps the split reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

# Column labels of every numeric-dtype predictor in the training split.
num_features = X_train.select_dtypes(include=[np.number]).columns

print("Numerical features:", num_features.tolist())

Numerical features: ['Square_Footage', 'Num_Bedrooms', 'Num_Bathrooms', 'Year_Built', 'Lot_Size', 'Garage_Size', 'Neighborhood_Quality']


In [10]:
# Build the preprocessing stages.
# Numeric branch: fill missing values with the column mean, then standardize.
num_pipeline = Pipeline(
    steps=[
        ('num_imputer', SimpleImputer(strategy='mean')),
        ('std_scaler', StandardScaler()),
    ]
)

# Apply the numeric branch to the columns listed in `num_features`.
# No `remainder` is passed, so ColumnTransformer's default handling applies
# to any column not listed here.
pre_processing_pipeline = ColumnTransformer(
    transformers=[
        ('num_pipe', num_pipeline, num_features),
    ]
)