In [8]:
import os

import numpy as np
import pandas as pd

In [9]:
# Importing necessary functions from the pipeline folder
from data_loader import load_data,split_data
from preprocessing import preprocess_data
from model_training import train_model
from utils import evaluate_model, save_model, load_model

In [10]:
# Load the dataset.
# The CSV location is machine-specific, so it can be overridden without editing
# the notebook by setting the AMES_HOUSING_CSV environment variable; when the
# variable is unset the original location is used unchanged.
DEFAULT_DATA_PATH = 'C:\\Users\\DELL\\Downloads\\ModularProjects\\AmesHousing\\data\\AmesHousing.csv'
file_path = os.environ.get('AMES_HOUSING_CSV', DEFAULT_DATA_PATH)
data = load_data(file_path)

In [11]:
# Confirm the load worked by displaying the dimensions of the loaded data
data.shape

(2930, 82)

In [12]:
# Hold out a test set; 'SalePrice' is the column the model will predict
X_train, X_test, y_train, y_test = split_data(data, target_column='SalePrice')

# Sanity check: report the dimensions of the features and target in each split
print("Training data shape:", f"X_train={X_train.shape}, y_train={y_train.shape}")
print("Testing data shape:", f"X_test={X_test.shape}, y_test={y_test.shape}")


Training data shape: X_train=(2344, 81), y_train=(2344,)
Testing data shape: X_test=(586, 81), y_test=(586,)


In [14]:
# Partition the feature columns into numerical and categorical groups by dtype.
# Selecting on the generic 'number' kind and its complement puts every column
# in exactly one group; fixed allow-lists such as ['float64', 'int64'] and
# ['object'] silently leave out columns of any other dtype (int32, float32,
# bool, category, dedicated string dtypes), which would then never reach the
# preprocessing step.
numerical_features = X_train.select_dtypes(include='number').columns
categorical_features = X_train.select_dtypes(exclude='number').columns

# NOTE(review): 'Order' and 'PID' look like row identifiers rather than
# predictive features -- consider dropping them before this step; confirm.

# Preprocess both splits with the column groups chosen above.
# NOTE(review): presumably this imputes missing values, scales the numerical
# columns and encodes the categorical ones (the feature count grows well beyond
# the number of input columns) -- confirm against preprocessing.preprocess_data.
X_train_scaled, X_test_scaled, preprocessor = preprocess_data(X_train, X_test, numerical_features, categorical_features)

# The model can only score the test split if both splits end up with the same
# number of transformed columns, so fail loudly here rather than at predict time.
assert X_train_scaled.shape[1] == X_test_scaled.shape[1], (
    "train and test splits must have the same number of transformed features"
)

# Check the shapes after transformation
print(f"Transformed training data shape: {X_train_scaled.shape}")
print(f"Transformed testing data shape: {X_test_scaled.shape}")


Transformed training data shape: (2344, 302)
Transformed testing data shape: (586, 302)


In [15]:
# Fit the regressor on the preprocessed training split.
# XGBoost is selected here; 'linear_regression' or 'random_forest' can be
# substituted by changing the model type below.
MODEL_TYPE = 'xgboost'
model = train_model(X_train_scaled, y_train, model_type=MODEL_TYPE)

In [16]:
# Show which estimator was trained, together with its hyperparameters
print("Trained model:", model)

Trained model: XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=100, n_jobs=None,
             num_parallel_tree=None, random_state=42, ...)


In [17]:
# Evaluate the model on the held-out test data.
# The three returned values are unpacked positionally as MAE, MSE and R-squared
# -- presumably the order evaluate_model returns them in; confirm against
# utils.evaluate_model. The call also reports the metrics through logging
# (the INFO lines shown in this cell's output).
mae, mse, r2 = evaluate_model(model, X_test_scaled, y_test)

INFO:root:Mean Absolute Error: 15375.619700565274
INFO:root:Mean Squared Error: 693777079.4311035
INFO:root:R-Squared: 0.9134676456451416


In [18]:

# Display the test-set metrics in rounded form: two decimals for the
# error metrics, four for the R² score
print("Mean Absolute Error:", np.round(mae, 2))
print("Mean Squared Error:", np.round(mse, 2))
print("R² Score:", np.round(r2, 4))

Mean Absolute Error: 15375.62
Mean Squared Error: 693777079.43
R² Score: 0.9135


In [18]:
# Persist the fitted model to disk so it can be reloaded without retraining
model_output_path = 'final_model_xgboost.joblib'
save_model(model, model_output_path)

INFO:root:Model saved to final_model_xgboost.joblib


In [19]:
from run_pipeline import run_pipeline

### Running the entire pipeline:

In [20]:
# Run the whole workflow (load, split, preprocess, train, evaluate, save)
# through the single pipeline entry point.
# Reuse `file_path` from the data-loading cell instead of repeating the path
# literal, so the step-by-step run and the pipeline run always read the same file.
run_pipeline(file_path)

INFO:root:Loading data...
INFO:root:Splitting data...
INFO:root:Preprocessing data...
INFO:root:Training the model...
INFO:root:Evaluating the model...
INFO:root:Mean Absolute Error: 15375.619700565274
INFO:root:Mean Squared Error: 693777079.4311035
INFO:root:R-Squared: 0.9134676456451416
INFO:root:Model performance on test data:
MAE: 15375.6197
MSE: 693777079.4311
R²: 0.9135
INFO:root:Model saved to model.pkl
INFO:root:Pipeline completed successfully.


In [19]:
# Re-check the dimensions of the preprocessed training features
X_train_scaled.shape

(2344, 302)

In [24]:
# List the feature column names of the training split as they were before preprocessing
X_train.columns

Index(['Order', 'PID', 'MS SubClass', 'MS Zoning', 'Lot Frontage', 'Lot Area',
       'Street', 'Alley', 'Lot Shape', 'Land Contour', 'Utilities',
       'Lot Config', 'Land Slope', 'Neighborhood', 'Condition 1',
       'Condition 2', 'Bldg Type', 'House Style', 'Overall Qual',
       'Overall Cond', 'Year Built', 'Year Remod/Add', 'Roof Style',
       'Roof Matl', 'Exterior 1st', 'Exterior 2nd', 'Mas Vnr Type',
       'Mas Vnr Area', 'Exter Qual', 'Exter Cond', 'Foundation', 'Bsmt Qual',
       'Bsmt Cond', 'Bsmt Exposure', 'BsmtFin Type 1', 'BsmtFin SF 1',
       'BsmtFin Type 2', 'BsmtFin SF 2', 'Bsmt Unf SF', 'Total Bsmt SF',
       'Heating', 'Heating QC', 'Central Air', 'Electrical', '1st Flr SF',
       '2nd Flr SF', 'Low Qual Fin SF', 'Gr Liv Area', 'Bsmt Full Bath',
       'Bsmt Half Bath', 'Full Bath', 'Half Bath', 'Bedroom AbvGr',
       'Kitchen AbvGr', 'Kitchen Qual', 'TotRms AbvGrd', 'Functional',
       'Fireplaces', 'Fireplace Qu', 'Garage Type', 'Garage Yr Blt',
      