In [1]:
import pandas as pd

In [2]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score

In [3]:
train=pd.read_csv('../data/train_engineered_part1.csv')

In [4]:
train

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,...,sqft_basement,yr_built,yr_renovated,zipcode,sqft_living15,sqft_lot15,year,month,time_since_built,time_since_renovated
0,268643,4,2.25,1810,9240,2.0,0,0,3,7,...,0,1961,1961,98055,1660,9240,2015,5,54,54
1,245000,3,2.50,1600,2788,2.0,0,0,4,7,...,0,1992,1992,98031,1720,3605,2014,7,22,22
2,200000,4,2.50,1720,8638,2.0,0,0,3,8,...,0,1994,1994,98003,1870,7455,2015,1,21,21
3,352499,2,2.25,1240,705,2.0,0,0,3,7,...,90,2009,2009,98027,1240,750,2015,4,6,6
4,232000,3,2.00,1280,13356,1.0,0,0,3,7,...,0,1994,1994,98042,1590,8071,2014,12,20,20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16204,378000,3,1.50,1000,6914,1.0,0,0,3,7,...,0,1947,1947,98125,1000,6947,2014,11,67,67
16205,399950,3,2.50,3087,5002,2.0,0,0,3,8,...,0,2014,2014,98023,2927,5183,2014,11,0,0
16206,575000,3,2.50,2120,4780,2.0,0,0,3,7,...,0,2004,2004,98053,1690,2650,2014,9,10,10
16207,245000,1,0.75,380,15000,1.0,0,0,3,5,...,0,1963,1963,98168,1170,15000,2014,6,51,51


In [5]:
# Define features and target
X = train.drop('price', axis=1)
y = train['price']

In [6]:

# Get index of zipcode column for categorical feature
cat_features = ['zipcode']
cat_feature_indices = [X.columns.get_loc(col) for col in cat_features if col in X.columns]
cat_feature_indices

[13]

In [7]:
# Initialize KFold
kf = KFold(n_splits=5, shuffle=True, random_state=42)

In [8]:
# Store metrics
train_mse_scores = []
train_r2_scores = []
val_mse_scores = []
val_r2_scores = []

In [9]:

print("=" * 60)
print("CatBoost Regressor - 5-Fold Cross Validation")
print("=" * 60)

for fold, (train_idx, val_idx) in enumerate(kf.split(X), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    
    # Initialize CatBoost model
    # model = CatBoostRegressor(
    #     iterations=1000,
    #     learning_rate=0.1,
    #     depth=6,
    #     loss_function='RMSE',
    #     eval_metric='RMSE',
    #     cat_features=cat_feature_indices,
    #     early_stopping_rounds=50,
    #     verbose=False,
    #     random_state=42
    # )
    model = CatBoostRegressor(
    iterations=2000,            # Increased iterations
    learning_rate=0.03,         # Lowered learning rate for smoother convergence
    depth=4,                    # Slightly increased depth but with more regularization
    l2_leaf_reg=10,             # High L2 penalty to combat overfitting
    subsample=0.8,              # Use 80% of data for each tree
    colsample_bylevel=0.8,      # Use 80% of features for each split
    loss_function='RMSE',
    eval_metric='RMSE',
    cat_features=cat_feature_indices,
    early_stopping_rounds=100,   # Give it more room to breathe
    verbose=False,
    random_state=42
)
    
    # Fit model with early stopping
    model.fit(
        X_train, y_train,
        eval_set=(X_val, y_val),
        use_best_model=True
    )
    
    # Predictions
    train_pred = model.predict(X_train)
    val_pred = model.predict(X_val)
    
    # Calculate metrics
    train_mse = mean_squared_error(y_train, train_pred)
    train_r2 = r2_score(y_train, train_pred)
    val_mse = mean_squared_error(y_val, val_pred)
    val_r2 = r2_score(y_val, val_pred)
    
    # Store scores
    train_mse_scores.append(train_mse)
    train_r2_scores.append(train_r2)
    val_mse_scores.append(val_mse)
    val_r2_scores.append(val_r2)
    
    print(f"\nFold {fold}:")
    print(f"  Best iteration: {model.best_iteration_}")
    print(f"  Train - MSE: {train_mse:,.2f} | R²: {train_r2:.4f}")
    print(f"  Val   - MSE: {val_mse:,.2f} | R²: {val_r2:.4f}")

CatBoost Regressor - 5-Fold Cross Validation

Fold 1:
  Best iteration: 1998
  Train - MSE: 9,471,854,154.79 | R²: 0.9276
  Val   - MSE: 15,452,444,936.89 | R²: 0.8769

Fold 2:
  Best iteration: 1999
  Train - MSE: 9,463,365,795.12 | R²: 0.9260
  Val   - MSE: 16,790,318,529.21 | R²: 0.8778

Fold 3:
  Best iteration: 1997
  Train - MSE: 9,463,060,799.90 | R²: 0.9240
  Val   - MSE: 16,392,217,573.37 | R²: 0.8912

Fold 4:
  Best iteration: 1999
  Train - MSE: 9,147,469,275.05 | R²: 0.9309
  Val   - MSE: 15,861,467,851.04 | R²: 0.8674

Fold 5:
  Best iteration: 1999
  Train - MSE: 9,261,993,758.73 | R²: 0.9305
  Val   - MSE: 12,281,274,423.93 | R²: 0.8939


In [10]:

# Print average scores
print("\n" + "=" * 60)
print("Average Scores Across 5 Folds:")
print("=" * 60)
print(f"  Train - MSE: {np.mean(train_mse_scores):,.2f} (±{np.std(train_mse_scores):,.2f})")
print(f"  Train - R²:  {np.mean(train_r2_scores):.4f} (±{np.std(train_r2_scores):.4f})")
print(f"  Val   - MSE: {np.mean(val_mse_scores):,.2f} (±{np.std(val_mse_scores):,.2f})")
print(f"  Val   - R²:  {np.mean(val_r2_scores):.4f} (±{np.std(val_r2_scores):.4f})")


Average Scores Across 5 Folds:
  Train - MSE: 9,361,548,756.72 (±133,101,395.30)
  Train - R²:  0.9278 (±0.0026)
  Val   - MSE: 15,355,544,662.89 (±1,603,106,497.30)
  Val   - R²:  0.8814 (±0.0098)


# Perfoming with no outliers dataset

In [11]:
train2=pd.read_csv('../data/train_outliers_removed.csv')

In [12]:
train2

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,...,sqft_basement,yr_built,yr_renovated,zipcode,sqft_living15,sqft_lot15,year,month,time_since_built,time_since_renovated
0,268643,4,2.25,1810,9240,2.0,0,0,3,7,...,0,1961,1961,98055,1660,9240,2015,5,54,54
1,245000,3,2.50,1600,2788,2.0,0,0,4,7,...,0,1992,1992,98031,1720,3605,2014,7,22,22
2,200000,4,2.50,1720,8638,2.0,0,0,3,8,...,0,1994,1994,98003,1870,7455,2015,1,21,21
3,352499,2,2.25,1240,705,2.0,0,0,3,7,...,90,2009,2009,98027,1240,750,2015,4,6,6
4,232000,3,2.00,1280,13356,1.0,0,0,3,7,...,0,1994,1994,98042,1590,8071,2014,12,20,20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16041,378000,3,1.50,1000,6914,1.0,0,0,3,7,...,0,1947,1947,98125,1000,6947,2014,11,67,67
16042,399950,3,2.50,3087,5002,2.0,0,0,3,8,...,0,2014,2014,98023,2927,5183,2014,11,0,0
16043,575000,3,2.50,2120,4780,2.0,0,0,3,7,...,0,2004,2004,98053,1690,2650,2014,9,10,10
16044,245000,1,0.75,380,15000,1.0,0,0,3,5,...,0,1963,1963,98168,1170,15000,2014,6,51,51


In [13]:
# Define features and target
X2 = train2.drop('price', axis=1)
y2 = train2['price']

In [14]:
# Store metrics
train_mse_scores_2 = []
train_r2_scores_2 = []
val_mse_scores_2 = []
val_r2_scores_2 = []

In [15]:

print("=" * 60)
print("CatBoost Regressor - 5-Fold Cross Validation")
print("=" * 60)

for fold, (train_idx, val_idx) in enumerate(kf.split(X2), 1):
    X_train, X_val = X2.iloc[train_idx], X2.iloc[val_idx]
    y_train, y_val = y2.iloc[train_idx], y2.iloc[val_idx]
    
    # Initialize CatBoost model
    # model = CatBoostRegressor(
    #     iterations=1000,
    #     learning_rate=0.1,
    #     depth=6,
    #     loss_function='RMSE',
    #     eval_metric='RMSE',
    #     cat_features=cat_feature_indices,
    #     early_stopping_rounds=50,
    #     verbose=False,
    #     random_state=42
    # )
    model = CatBoostRegressor(
    iterations=2000,            # Increased iterations
    learning_rate=0.03,         # Lowered learning rate for smoother convergence
    depth=4,                    # Slightly increased depth but with more regularization
    l2_leaf_reg=10,             # High L2 penalty to combat overfitting
    subsample=0.8,              # Use 80% of data for each tree
    colsample_bylevel=0.8,      # Use 80% of features for each split
    loss_function='RMSE',
    eval_metric='RMSE',
    cat_features=cat_feature_indices,
    early_stopping_rounds=100,   # Give it more room to breathe
    verbose=False,
    random_state=42
)
    
    # Fit model with early stopping
    model.fit(
        X_train, y_train,
        eval_set=(X_val, y_val),
        use_best_model=True
    )
    
    # Predictions
    train_pred = model.predict(X_train)
    val_pred = model.predict(X_val)
    
    # Calculate metrics
    train_mse = mean_squared_error(y_train, train_pred)
    train_r2 = r2_score(y_train, train_pred)
    val_mse = mean_squared_error(y_val, val_pred)
    val_r2 = r2_score(y_val, val_pred)
    
    # Store scores
    train_mse_scores_2.append(train_mse)
    train_r2_scores_2.append(train_r2)
    val_mse_scores_2.append(val_mse)
    val_r2_scores_2.append(val_r2)
    
    print(f"\nFold {fold}:")
    print(f"  Best iteration: {model.best_iteration_}")
    print(f"  Train - MSE: {train_mse:,.2f} | R²: {train_r2:.4f}")
    print(f"  Val   - MSE: {val_mse:,.2f} | R²: {val_r2:.4f}")

CatBoost Regressor - 5-Fold Cross Validation

Fold 1:
  Best iteration: 1988
  Train - MSE: 8,450,974,814.14 | R²: 0.9211
  Val   - MSE: 11,939,443,499.22 | R²: 0.8958

Fold 2:
  Best iteration: 1996
  Train - MSE: 8,696,138,222.50 | R²: 0.9210
  Val   - MSE: 10,966,084,876.26 | R²: 0.8932

Fold 3:
  Best iteration: 1985
  Train - MSE: 8,239,543,022.14 | R²: 0.9261
  Val   - MSE: 12,416,635,014.37 | R²: 0.8722

Fold 4:
  Best iteration: 1999
  Train - MSE: 8,386,445,427.08 | R²: 0.9217
  Val   - MSE: 13,260,522,839.05 | R²: 0.8847

Fold 5:
  Best iteration: 1999
  Train - MSE: 8,401,863,998.49 | R²: 0.9218
  Val   - MSE: 14,144,654,612.01 | R²: 0.8756


In [16]:

# Print average scores
print("\n" + "=" * 60)
print("Average Scores Across 5 Folds:")
print("=" * 60)
print(f"  Train - MSE: {np.mean(train_mse_scores_2):,.2f} (±{np.std(train_mse_scores_2):,.2f})")
print(f"  Train - R²:  {np.mean(train_r2_scores_2):.4f} (±{np.std(train_r2_scores_2):.4f})")
print(f"  Val   - MSE: {np.mean(val_mse_scores_2):,.2f} (±{np.std(val_mse_scores_2):,.2f})")
print(f"  Val   - R²:  {np.mean(val_r2_scores_2):.4f} (±{np.std(val_r2_scores_2):.4f})")


Average Scores Across 5 Folds:
  Train - MSE: 8,434,993,096.87 (±148,396,318.71)
  Train - R²:  0.9223 (±0.0019)
  Val   - MSE: 12,545,468,168.18 (±1,090,597,627.74)
  Val   - R²:  0.8843 (±0.0093)


# Lightgbm

In [17]:
from lightgbm import LGBMRegressor

# Store metrics for LightGBM
lgbm_train_mse_scores = []
lgbm_train_r2_scores = []
lgbm_val_mse_scores = []
lgbm_val_r2_scores = []


In [18]:

print("=" * 60)
print("LightGBM Regressor - 5-Fold Cross Validation")
print("=" * 60)

for fold, (train_idx, val_idx) in enumerate(kf.split(X2), 1):
    X_train, X_val = X2.iloc[train_idx], X2.iloc[val_idx]
    y_train, y_val = y2.iloc[train_idx], y2.iloc[val_idx]
    
    # Initialize LightGBM model
    lgbm_model = LGBMRegressor(
        n_estimators=3000,
        learning_rate=0.01,
        max_depth=4,
        reg_lambda=15,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='regression',
        metric='rmse',
        random_state=42,
        verbose=-1
    )
    
    # Fit model with early stopping
    lgbm_model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        eval_metric='mse',
        categorical_feature=cat_features,
        callbacks=[
            __import__('lightgbm').early_stopping(stopping_rounds=100, verbose=False)
        ]
    )
    
    # Predictions
    train_pred = lgbm_model.predict(X_train)
    val_pred = lgbm_model.predict(X_val)
    
    # Calculate metrics
    train_mse = mean_squared_error(y_train, train_pred)
    train_r2 = r2_score(y_train, train_pred)
    val_mse = mean_squared_error(y_val, val_pred)
    val_r2 = r2_score(y_val, val_pred)
    
    # Store scores
    lgbm_train_mse_scores.append(train_mse)
    lgbm_train_r2_scores.append(train_r2)
    lgbm_val_mse_scores.append(val_mse)
    lgbm_val_r2_scores.append(val_r2)
    
    print(f"\nFold {fold}:")
    print(f"  Best iteration: {lgbm_model.best_iteration_}")
    print(f"  Train - MSE: {train_mse:,.2f} | R²: {train_r2:.4f}")
    print(f"  Val   - MSE: {val_mse:,.2f} | R²: {val_r2:.4f}")


LightGBM Regressor - 5-Fold Cross Validation

Fold 1:
  Best iteration: 3000
  Train - MSE: 7,030,762,502.28 | R²: 0.9344
  Val   - MSE: 11,733,661,597.64 | R²: 0.8976

Fold 2:
  Best iteration: 2999
  Train - MSE: 7,020,827,525.47 | R²: 0.9363
  Val   - MSE: 11,432,630,267.06 | R²: 0.8886

Fold 3:
  Best iteration: 2993
  Train - MSE: 6,838,575,183.82 | R²: 0.9387
  Val   - MSE: 12,675,502,911.50 | R²: 0.8695

Fold 4:
  Best iteration: 2988
  Train - MSE: 6,682,539,061.57 | R²: 0.9376
  Val   - MSE: 13,009,464,248.88 | R²: 0.8869

Fold 5:
  Best iteration: 2996
  Train - MSE: 6,952,499,626.54 | R²: 0.9353
  Val   - MSE: 13,571,783,486.58 | R²: 0.8806


In [20]:

# Print average scores
print("\n" + "=" * 60)
print("Average Scores Across 5 Folds:")
print("=" * 60)
print(f"  Train - R²:  {np.mean(lgbm_train_r2_scores):.4f} (±{np.std(lgbm_train_r2_scores):.4f})")
print(f"  Val   - R²:  {np.mean(lgbm_val_r2_scores):.4f} (±{np.std(lgbm_val_r2_scores):.4f})")


Average Scores Across 5 Folds:
  Train - R²:  0.9364 (±0.0015)
  Val   - R²:  0.8847 (±0.0093)
