In [7]:
# ================================================
# 1. Imports
# ================================================
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

In [8]:
# ================================================
# 2. Load datasets (train + eval)
# ================================================
train_df = pd.read_csv("C:\\Users\\HP\\Documents\\repos\\mlops-regression-mlflow\\data\\processed\\feature_engineered_train.csv")
eval_df  = pd.read_csv("C:\\Users\\HP\\Documents\\repos\\mlops-regression-mlflow\\data\\processed\\feature_engineered_eval.csv")

In [9]:
'''
# ================================================
# 3. Drop high VIF features (both train + eval)
# ================================================
high_vif_features = [
    "median_sale_price" #highest correlation to 'price' => data leakage
]
train_df.drop(columns=high_vif_features, inplace=True)
eval_df.drop(columns=high_vif_features, inplace=True)
'''



In [10]:
# ================================================
# 4. Define target & features
# ================================================
target = "price"
X_train = train_df.drop(columns=[target])
y_train = train_df[target]

X_eval = eval_df.drop(columns=[target])
y_eval = eval_df[target]

In [14]:
print(y_train.isna().sum())

0


In [17]:
print(X_train.isna().sum())  # Total number of NaNs

year                                 0
quarter                              0
month                                0
median_list_price                    0
median_ppsf                          0
median_list_ppsf                     0
homes_sold                           0
pending_sales                        0
new_listings                         0
inventory                            0
median_dom                           0
avg_sale_to_list                     0
sold_above_list                      0
off_market_in_two_weeks              0
bank                                 0
bus                                  0
hospital                             0
mall                                 0
park                                 0
restaurant                           0
school                               0
station                              0
supermarket                          0
Total Population                     0
Median Age                           0
Per Capita Income        

In [16]:
print(X_eval.isna().sum())

year                                 0
quarter                              0
month                                0
median_list_price                    0
median_ppsf                          0
median_list_ppsf                     0
homes_sold                           0
pending_sales                        0
new_listings                         0
inventory                            0
median_dom                           0
avg_sale_to_list                     0
sold_above_list                      0
off_market_in_two_weeks              0
bank                                 0
bus                                  0
hospital                             0
mall                                 0
park                                 0
restaurant                           0
school                               0
station                              0
supermarket                          0
Total Population                     0
Median Age                           0
Per Capita Income        

In [11]:
# ================================================
# 5. Standardization (fit on train, transform eval)
# ================================================
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_eval_scaled  = scaler.transform(X_eval)

  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


In [12]:
# ================================================
# 6. Train & Evaluate Models
# ================================================

# --- Linear Regression ---
lr = LinearRegression()
lr.fit(X_train_scaled, y_train)
y_pred_lr = lr.predict(X_eval_scaled)

print("Linear Regression:")
print(" MAE:", mean_absolute_error(y_eval, y_pred_lr))
print(" RMSE:", np.sqrt(mean_squared_error(y_eval, y_pred_lr)))
print(" R²:", r2_score(y_eval, y_pred_lr))

ValueError: Input X contains NaN.
LinearRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [16]:
# --- Ridge Regression ---
ridge = Ridge(alpha=1.0)
ridge.fit(X_train_scaled, y_train)
y_pred_ridge = ridge.predict(X_eval_scaled)

print("\nRidge Regression:")
print(" MAE:", mean_absolute_error(y_eval, y_pred_ridge))
print(" RMSE:", np.sqrt(mean_squared_error(y_eval, y_pred_ridge)))
print(" R²:", r2_score(y_eval, y_pred_ridge))


Ridge Regression:
 MAE: 54057.96293371481
 RMSE: 121373.00945970618
 R²: 0.8861575399105904


In [17]:
# --- Lasso Regression ---
lasso = Lasso(alpha=0.1)
lasso.fit(X_train_scaled, y_train)
y_pred_lasso = lasso.predict(X_eval_scaled)

print("\nLasso Regression:")
print(" MAE:", mean_absolute_error(y_eval, y_pred_lasso))
print(" RMSE:", np.sqrt(mean_squared_error(y_eval, y_pred_lasso)))
print(" R²:", r2_score(y_eval, y_pred_lasso))


Lasso Regression:
 MAE: 54442.8865267044
 RMSE: 121676.85999421314
 R²: 0.8855868299820233


  model = cd_fast.enet_coordinate_descent(


In [18]:
# --- ElasticNet ---
elastic = ElasticNet(alpha=0.1, l1_ratio=0.5)
elastic.fit(X_train_scaled, y_train)
y_pred_elastic = elastic.predict(X_eval_scaled)

print("\nElasticNet Regression:")
print(" MAE:", mean_absolute_error(y_eval, y_pred_elastic))
print(" RMSE:", np.sqrt(mean_squared_error(y_eval, y_pred_elastic)))
print(" R²:", r2_score(y_eval, y_pred_elastic))


ElasticNet Regression:
 MAE: 54198.64460317444
 RMSE: 122236.80913599546
 R²: 0.8845313627524546
