In [30]:
import pandas as pd
import numpy as np
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.linear_model import Lasso, LinearRegression, Ridge
from catboost import CatBoostRegressor
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBRegressor
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

In [14]:
df = pd.read_csv('data/train_input.csv', na_values = ['','NaN','NA'], keep_default_na = False)
df_test_file = pd.read_csv('data/test_input.csv',na_values = ['','NaN','NA'], keep_default_na = False)

In [21]:
target_train = df['Sale Price'].copy()
target_test = df_test_file['Sale Price'].copy()

train_features = df.drop("Sale Price", axis = 1)
test_features = df_test_file.drop("Sale Price", axis = 1)

num_features = train_features.select_dtypes(exclude = 'object').columns.tolist()
cat_features = train_features.select_dtypes(include = 'object').columns.tolist()

In [22]:
# StandardScaler = StandardScaler()
num_pipeline = Pipeline(
    steps = [
    ("imputing",SimpleImputer(strategy = "mean")),
    ]
)
cat_pipeline = Pipeline(
    steps = [
    ("Imputing",SimpleImputer(strategy = "most_frequent")),
    ("Encoder",OneHotEncoder(handle_unknown = 'ignore'))
    ]
)
full_pipeline = ColumnTransformer(
    [
        ("num",num_pipeline,num_features),
        ("cat",cat_pipeline,cat_features)
    ]
)

In [23]:
df_train = full_pipeline.fit_transform(train_features)
df_test = full_pipeline.transform(test_features)

In [34]:
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true,predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_Score = r2_score(true, predicted)
    return mae, rmse, r2_Score

In [37]:
models = {
    "lasso":Lasso(),
    "Linear Regression": LinearRegression(),
    "KNeighbors Regressor": KNeighborsRegressor(),
    "Ridge":Ridge(),
    "Random Forest":RandomForestRegressor(),
    "Decision Tree":DecisionTreeRegressor(),
    "Ada Boost Regressor":AdaBoostRegressor(),
    "Xgb boost":XGBRegressor()
}

In [38]:
model_list = []
r2_list = []

for i in range(len(models)):
    model = list(models.values())[i]
    model.fit(df_train, target_train)

    y_train_pred = model.predict(df_train)
    y_test_pred = model.predict(df_test)

    model_train_mae, model_train_rmse, model_train_r2 = evaluate_model(target_train, y_train_pred)
    model_test_mae, model_test_rmse, model_test_r2 = evaluate_model(target_test, y_test_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])
    
    print("- Model performance for Training set")
    print("- Root Mean Squared Error:{:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error:{:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))
    
    print('----------------------------------------')
    
    print('- Model performance for Test set')
    print("- Root Mean Squared Error :{:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error:{:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))
    print()
    r2_list.append(model_test_r2)
    

lasso
- Model performance for Training set
- Root Mean Squared Error:0.2384
- Mean Absolute Error:0.1683
- R2 Score: 0.6620
----------------------------------------
- Model performance for Test set
- Root Mean Squared Error :0.2308
- Mean Absolute Error:0.1676
- R2 Score: 0.6626

Linear Regression
- Model performance for Training set
- Root Mean Squared Error:0.2877
- Mean Absolute Error:0.2158
- R2 Score: 0.5078
----------------------------------------
- Model performance for Test set
- Root Mean Squared Error :0.2884
- Mean Absolute Error:0.2176
- R2 Score: 0.4732

KNeighbors Regressor
- Model performance for Training set
- Root Mean Squared Error:0.1914
- Mean Absolute Error:0.1320
- R2 Score: 0.7821
----------------------------------------
- Model performance for Test set
- Root Mean Squared Error :0.2192
- Mean Absolute Error:0.1567
- R2 Score: 0.6957

Ridge
- Model performance for Training set
- Root Mean Squared Error:0.3962
- Mean Absolute Error:0.3063
- R2 Score: 0.0661
------

In [None]:
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint, uniform as sp_uniform

# 1. Define the XGBoost Regressor (or XGBClassifier for classification)
# Set common parameters and choose a high n_estimators (trees) 
# as the search will find the optimal number of *effective* trees.
xgb_model = xgb.XGBRegressor(
    objective='reg:squarederror',  # Loss function for regression
    random_state=42, 
    n_estimators=1000,             # Start high, will be controlled by learning_rate
    n_jobs=-1                      # Use all cores
)

# 2. Define the Parameter Search Space (Distributions)
# Use distributions from scipy.stats for continuous variables in RandomizedSearchCV
param_dist = {
    # Tree Complexity (to combat the overfitting you saw)
    'max_depth': sp_randint(3, 8),          # Lower the depth (e.g., 3 to 7)
    'min_child_weight': sp_randint(1, 10),  # Increase the minimum samples required for a split
    'gamma': sp_uniform(0.0, 0.5),          # Increase min loss reduction required for a split
    
    # Randomness/Subsampling (further regularization)
    'subsample': sp_uniform(0.6, 0.4),      # Sample from 60% to 100% (0.6 + 0.4) of rows
    'colsample_bytree': sp_uniform(0.6, 0.4), # Sample from 60% to 100% of columns
    
    # Learning/Boosting
    'learning_rate': [0.01, 0.05, 0.1, 0.2], # Smaller rates are often better
    'reg_alpha': sp_uniform(0.0, 1.0),      # L1 regularization
    'reg_lambda': sp_uniform(0.0, 1.0)      # L2 regularization
}

# 3. Initialize and Run the Search
# X_train and y_train are your training data (e.g., NumPy arrays or Pandas DataFrames)
random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_dist,
    n_iter=50,                  # Number of parameter settings that are sampled (tune this!)
    scoring='r2',               # Use R-squared for evaluation
    cv=5,                       # Use 5-fold cross-validation
    verbose=3,
    random_state=42,
    return_train_score=True
)

# Fit the search to your data
random_search.fit(df_train, target_train)

# 5. Get the Best Model
best_xgb_model = random_search.best_estimator_


Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV 1/5] END colsample_bytree=0.749816047538945, gamma=0.4753571532049581, learning_rate=0.1, max_depth=7, min_child_weight=5, reg_alpha=0.15601864044243652, reg_lambda=0.15599452033620265, subsample=0.6232334448672797;, score=(train=0.925, test=0.903) total time=   1.2s
[CV 2/5] END colsample_bytree=0.749816047538945, gamma=0.4753571532049581, learning_rate=0.1, max_depth=7, min_child_weight=5, reg_alpha=0.15601864044243652, reg_lambda=0.15599452033620265, subsample=0.6232334448672797;, score=(train=0.928, test=0.875) total time=   1.4s
[CV 3/5] END colsample_bytree=0.749816047538945, gamma=0.4753571532049581, learning_rate=0.1, max_depth=7, min_child_weight=5, reg_alpha=0.15601864044243652, reg_lambda=0.15599452033620265, subsample=0.6232334448672797;, score=(train=0.923, test=0.907) total time=   1.3s
[CV 4/5] END colsample_bytree=0.749816047538945, gamma=0.4753571532049581, learning_rate=0.1, max_depth=7, min_child_weigh

In [40]:
print(best_xgb_model)

XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=np.float64(0.7644148053272926), device=None,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, feature_types=None, feature_weights=None,
             gamma=np.float64(0.016525366450274193), grow_policy=None,
             importance_type=None, interaction_constraints=None,
             learning_rate=0.2, max_bin=None, max_cat_threshold=None,
             max_cat_to_onehot=None, max_delta_step=None, max_depth=3,
             max_leaves=None, min_child_weight=1, missing=nan,
             monotone_constraints=None, multi_strategy=None, n_estimators=1000,
             n_jobs=-1, num_parallel_tree=None, ...)


In [45]:
# Assuming 'random_search' object from the previous fit is still in memory

# 1. Retrieve the optimal model object
best_xgb_model = random_search.best_estimator_

# 2. Evaluate the model on the unseen test set
# This provides the final, non-cross-validated R2 score for reporting.
test_r2_score = best_xgb_model.score(df_train, target_train)

print(f"Final Tuned Model R2 on Test Set: {test_r2_score}")

Final Tuned Model R2 on Test Set: 0.9768077696341226


In [46]:
import joblib

# Assuming 'best_xgb_model' is the object you retrieved from random_search.best_estimator_

# Define the filename
filename = 'best_xgb_model_tuned.joblib'

# Save the model to file
joblib.dump(best_xgb_model, filename)

print(f"Model successfully saved to {filename}")

# --- To Load the model later ---
# loaded_model = joblib.load(filename)

Model successfully saved to best_xgb_model_tuned.joblib
