In [1]:
#
# ------------------- COMPLETE NOTEBOOK CODE -------------------
#
# This script will:
# 1. Load the Ames Housing dataset.
# 2. Perform basic cleaning and preprocessing.
# 3. Train an XGBoost machine learning model.
# 4. Save the trained model and column data to the 'backend/models' folder.
#

# --- 1. Import Libraries ---
import pandas as pd
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
import xgboost as xgb
import joblib
import os
import numpy as np

print("Libraries imported successfully.")

# --- 2. Load the Data ---
try:
    housing = fetch_openml(name="house_prices", as_frame=True)
    df = housing.frame
    print("Dataset loaded successfully.")
    # For simplicity, we will only use a subset of columns
    # These are some of the most impactful features
    features_to_use = [
        'MSSubClass', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt',
        'YearRemodAdd', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
        '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'BsmtFullBath', 'FullBath',
        'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd',
        'Fireplaces', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
        'EnclosedPorch', 'ScreenPorch', 'MoSold', 'YrSold'
    ]
    target_variable = 'SalePrice'
    df = df[features_to_use + [target_variable]]

except Exception as e:
    print(f"Failed to load dataset. Error: {e}")
    # Exit if data loading fails
    exit()

# --- 3. Preprocessing ---
# Handle missing values by filling with the median (a simple, robust strategy)
for col in features_to_use:
    if df[col].isnull().any():
        median_val = df[col].median()
        df[col].fillna(median_val, inplace=True)
        
# Ensure target variable has no missing values
df.dropna(subset=[target_variable], inplace=True)

print("Data preprocessing complete. Missing values handled.")

# --- 4. Define Features (X) and Target (y) ---
X = df[features_to_use]
y = df[target_variable]

print(f"Features (X) and target (y) are defined. Shape of X: {X.shape}")

# --- 5. Train the XGBoost Model ---
print("Training the XGBoost model... (This may take a minute)")
# Initialize and train the model
# We train on the entire dataset here to create the final model for the app
xgb_model = xgb.XGBRegressor(objective='reg:squarederror',
                           n_estimators=100,
                           learning_rate=0.1,
                           max_depth=5,
                           random_state=42)

xgb_model.fit(X, y)
print("Model training complete.")

# --- 6. Save the Model and Columns ---
print("Saving the model and column data...")
# Define the path to the models directory in the backend
# The '../' goes up one level from the 'notebooks' directory
models_dir = '../backend/models'

# Create the directory if it doesn't exist
os.makedirs(models_dir, exist_ok=True)

# Save the trained model
joblib.dump(xgb_model, os.path.join(models_dir, 'xgb_house_price_model.pkl'))

# Save the column names in the correct order
model_columns = list(X.columns)
joblib.dump(model_columns, os.path.join(models_dir, 'model_columns.pkl'))

print(f"✅ Models saved successfully to the '{models_dir}' directory!")
print("You can now start your backend server.")
#
# ------------------- END OF CODE -------------------
#

Libraries imported successfully.
Dataset loaded successfully.
Data preprocessing complete. Missing values handled.
Features (X) and target (y) are defined. Shape of X: (1460, 28)
Training the XGBoost model... (This may take a minute)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(subset=[target_variable], inplace=True)


Model training complete.
Saving the model and column data...
✅ Models saved successfully to the '../backend/models' directory!
You can now start your backend server.
