In [13]:
import os

import joblib
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [14]:

import pandas as pd
import numpy as np

# Read the raw housing dataset from the project's data directory
train_data = pd.read_csv('../data/house_price_regression_dataset.csv')

# Separate the predictors from the target column
target_col = 'House_Price'
features_all = train_data.drop(columns=[target_col])
target_all = train_data[target_col].copy()

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible across runs
X_train, X_val, y_train, y_val = train_test_split(
    features_all,
    target_all,
    test_size=0.2,
    random_state=42,
)

# Columns with a numeric dtype are the ones routed through the numeric pipeline
num_features = X_train.select_dtypes(include=np.number).columns

print("Numerical features:", num_features.tolist())

Numerical features: ['Square_Footage', 'Num_Bedrooms', 'Num_Bathrooms', 'Year_Built', 'Lot_Size', 'Garage_Size', 'Neighborhood_Quality']


In [15]:
# Numeric preprocessing: fill missing values with the column mean, then
# standardize each column
numeric_steps = [
    ('num_imputer', SimpleImputer(strategy='mean')),
    ('std_scaler', StandardScaler()),
]
num_pipeline = Pipeline(steps=numeric_steps)

# Display the pipeline as the cell output
num_pipeline


In [16]:
# Apply the numeric pipeline to the numeric columns; with ColumnTransformer's
# default `remainder`, any column not listed here is dropped
column_transformers = [('num_pipe', num_pipeline, num_features)]
pre_processing_pipeline = ColumnTransformer(transformers=column_transformers)
pre_processing_pipeline

In [17]:
# End-to-end linear-regression pipeline: preprocessing followed by the estimator
linear_steps = [
    ('pre_processing', pre_processing_pipeline),
    ('model', LinearRegression()),
]
model_pipeline = Pipeline(steps=linear_steps)
model_pipeline

In [18]:
# Train the linear-regression pipeline on the training split. Pipeline.fit
# returns the pipeline itself, so `model` is the same object as `model_pipeline`.
model_pipeline.fit(X_train, y_train)
model = model_pipeline

In [19]:
# Validation
# RMSE of the linear-regression pipeline on the held-out split. The square
# root is taken explicitly because the `squared=False` flag of
# mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6;
# this form works on every version.
y_pred = model.predict(X_val)
rmse = float(np.sqrt(mean_squared_error(y_val, y_pred)))
print(f"RMSE: {rmse}")

RMSE: 10071.484424137067


In [24]:
# Persist the fitted linear-regression pipeline under the top-level models
# directory (one level above the notebook's working directory)
import os
import joblib

models_dir = '../models'
model_path = '../models/model_with_pipeline.pkl'

# Create the directory first; exist_ok makes re-running this cell harmless
os.makedirs(models_dir, exist_ok=True)

joblib.dump(model, model_path)
print(f"Model saved successfully at {model_path}")

Model saved successfully at ../models/model_with_pipeline.pkl


In [21]:
# Try with Random Forest
# Pipeline does not copy the estimators it is given, so passing
# `pre_processing_pipeline` directly would make this pipeline and
# `model_pipeline` share one ColumnTransformer instance, and fitting either
# pipeline would refit the other's preprocessing step. clone() gives the
# forest its own unfitted copy with the same parameters.
rf_pipeline = Pipeline([
    ('pre_processing', clone(pre_processing_pipeline)),
    ('model', RandomForestRegressor(n_estimators=100, random_state=42))
])
rf_pipeline

In [22]:
# Fit the Random Forest pipeline on the training split and score it on the
# same validation split used for linear regression.
rf_model = rf_pipeline.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_val)
# Explicit square root instead of `squared=False`, which is deprecated in
# scikit-learn 1.4 and removed in 1.6.
rmse_rf = float(np.sqrt(mean_squared_error(y_val, y_pred_rf)))
print(f"Random Forest RMSE: {rmse_rf}")

Random Forest RMSE: 19853.324584091704


In [23]:

# Persist the Random Forest pipeline only when it beats linear regression on
# validation RMSE (lower is better).
if rmse_rf < rmse:
    # Write to the same top-level models directory the linear-regression
    # pipeline is saved to, and create it first: joblib.dump does not create
    # missing parent directories.
    os.makedirs('../models', exist_ok=True)
    joblib.dump(rf_model, '../models/rf_model_with_pipeline.pkl')
    print("Saved Random Forest model as it performed better")
else:
    # Nothing is written in this branch; the linear-regression pipeline is the
    # one stored at ../models/model_with_pipeline.pkl by the save cell.
    print("Saved Linear Regression model")

Saved Linear Regression model
