In [67]:
import numpy as np, pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import Ridge, Lasso


# Experiment-wide settings, kept together so they are easy to tune.
TARGET = "SalePrice"  # column the models are trained to predict
TEST_SIZE = 0.2  # fraction of rows held out as the test set (20%)
RANDOM_STATE = 42  # seed reused wherever randomness is involved

In [68]:
# Define project directories. PROJECT_ROOT is the parent of the current
# working directory, resolved to an absolute path.
PROJECT_ROOT = Path("..").resolve()
DATA_RAW = PROJECT_ROOT / "data" / "raw"
SPLITS_DIR = PROJECT_ROOT / "data" / "splits"
REPORTS_DIR = PROJECT_ROOT / "reports"

# Create directories if they don't exist.
# REPORTS_DIR is included because the baseline metrics CSV is written into it
# at the end of the notebook; without it that write fails on a fresh checkout.
for directory in (DATA_RAW, SPLITS_DIR, REPORTS_DIR):
    directory.mkdir(parents=True, exist_ok=True)

In [69]:
# Read the raw training CSV into a DataFrame.
raw_csv_path = DATA_RAW / "house_prices_train.csv"
df = pd.read_csv(raw_csv_path)

# Preview the first rows (rendered as the cell output).
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [72]:
# Split the frame into the feature matrix (every column except the target)
# and the target vector.
X = df.drop(columns=[TARGET])
y = df[TARGET]


In [73]:
# Hold out TEST_SIZE of the rows for testing; the fixed seed keeps the
# partition identical across runs.
split_options = {"test_size": TEST_SIZE, "random_state": RANDOM_STATE}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [75]:
# Identify numeric and categorical columns from the training split.
# "number" matches every numeric dtype (not only int64/float64), so a numeric
# column stored as e.g. int32 or float32 is not silently left out of the
# feature lists that feed the preprocessor.
numeric_cols = [
    col
    for col in X_train.select_dtypes(include="number").columns
    if col != "Id"  # presumably a row identifier rather than a feature
]
categorical_cols = X_train.select_dtypes(include=["object", "category"]).columns.to_list()

print(f"Categorical columns: {categorical_cols}")
print(f"Numeric columns: {numeric_cols}")

Categorical columns: ['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition']
Numeric columns: ['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea'

In [76]:
# Numeric features: fill missing values with the column median, then
# standardize.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical features: fill missing values with the most frequent category,
# then one-hot encode. Categories unseen during fit are ignored at transform
# time instead of raising.
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column group through its own transformer.
column_routes = [
    ("num", numeric_transformer, numeric_cols),
    ("cat", categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_routes)

# Full pipeline: preprocessing followed by an ordinary least-squares regressor.
# The step names are looked up by later cells, so they must stay as they are.
model_steps = [
    ("preprocessor", preprocessor),
    ("regressor", LinearRegression()),
]
model = Pipeline(steps=model_steps)

In [77]:
# Train the full pipeline (preprocessing + regressor) on the training split.
model.fit(X_train, y_train)

# Reuse the fitted preprocessing step to see what the regressor receives.
fitted_preprocessor = model.named_steps["preprocessor"]
X_train_transformed = fitted_preprocessor.transform(X_train)

# Report the (rows, features) shape after imputation, scaling and encoding.
print(f"Transformed training data shape: {X_train_transformed.shape}")

Transformed training data shape: (1168, 285)


In [78]:
# Ask the fitted preprocessor for the names of the columns it outputs.
fitted_preprocessor = model.named_steps["preprocessor"]
feature_names = fitted_preprocessor.get_feature_names_out()

print("Number of features:", len(feature_names))
print("First 10 features:", feature_names[:10])


Number of features: 285
First 10 features: ['num__MSSubClass' 'num__LotFrontage' 'num__LotArea' 'num__OverallQual'
 'num__OverallCond' 'num__YearBuilt' 'num__YearRemodAdd' 'num__MasVnrArea'
 'num__BsmtFinSF1' 'num__BsmtFinSF2']


In [79]:
# Shuffled k-fold splitter shared by every model comparison below; the fixed
# seed makes the fold assignment reproducible.
N_SPLITS = 5
cv = KFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)

In [80]:
# Candidate regressors keyed by the label used in the results table.
# Insertion order is the row order of that table.
models = dict(
    [
        # Predicts the training mean for every row: the floor to beat.
        ("Dummy_mean", DummyRegressor(strategy="mean")),
        ("Linear_Regression", LinearRegression()),
        ("Ridge_alpha_1.0", Ridge(alpha=1.0)),
        # NOTE(review): the cross-validation output shows coordinate-descent
        # warnings, presumably from this Lasso despite the raised max_iter -
        # confirm whether it converges.
        ("Lasso_alpha_0.1", Lasso(alpha=0.1, max_iter=50000)),
    ]
)

In [81]:
# Evaluate each model using cross-validation
scoring = {
    "rmse":"neg_root_mean_squared_error",
    "mae":"neg_mean_absolute_error",
}

In [82]:
def _summarize_fold_errors(label, fold_scores):
    """Collapse per-fold CV scores into one summary row.

    Parameters
    ----------
    label : str
        Name stored under the "model" key.
    fold_scores : mapping
        Output of ``cross_validate``; must hold "test_rmse" and "test_mae"
        arrays of negated errors.

    Returns
    -------
    dict
        Keys "model", "rmse_mean", "rmse_std", "mae_mean", "mae_std".
    """
    summary = {"model": label}
    for metric in ("rmse", "mae"):
        # Scores come back negated; flip the sign to get positive errors.
        fold_errors = -fold_scores[f"test_{metric}"]
        summary[f"{metric}_mean"] = fold_errors.mean()
        summary[f"{metric}_std"] = fold_errors.std()
    return summary


results = []

for model_name, model_reg in models.items():
    # Same preprocessing for every candidate; only the final estimator changes.
    pipeline = Pipeline(
        steps=[("preprocessor", preprocessor), ("regressor", model_reg)]
    )

    # Score on the training split only, so the test set stays untouched.
    cv_scores = cross_validate(
        pipeline,
        X_train,
        y_train,
        scoring=scoring,
        cv=cv,
        return_train_score=False,
        n_jobs=-1,
    )

    results.append(_summarize_fold_errors(model_name, cv_scores))

# One row per model, in the order the candidates were defined.
baseline_df = pd.DataFrame(results)
baseline_df

  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


Unnamed: 0,model,rmse_mean,rmse_std,mae_mean,mae_std
0,Dummy_mean,77077.655533,4965.417342,56319.67459,2108.946828
1,Linear_Regression,38539.272563,12091.468997,19757.665739,2121.812548
2,Ridge_alpha_1.0,35665.767715,12652.101503,19296.877995,1751.404687
3,Lasso_alpha_0.1,36470.492713,12226.014967,19316.627201,2181.644766


In [83]:
# Persist the baseline comparison table without the DataFrame index.
metrics_path = REPORTS_DIR / "metrics_baseline.csv"
baseline_df.to_csv(metrics_path, index=False)