## Step 1: Install and Import XGBoost


In [3]:
# Install XGBoost if you haven't already
# Run this in a cell: !pip install xgboost

import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

print("XGBoost version:", xgb.__version__)

XGBoost version: 2.1.4


## Step 2: Load and Prepare Data


In [5]:
# Load your engineered features
comps_df = pd.read_csv('data/model_ready/comps_pairs_model_ready.csv')
candidates_df = pd.read_csv('data/model_ready/candidates_pair_model_ready.csv')

# Add labels
comps_df['is_good_comp'] = 1
candidates_df['is_good_comp'] = 0

print(f"Positive examples (comps): {len(comps_df)}")
print(f"Negative examples (candidates): {len(candidates_df)}")
print(f"Imbalance ratio: 1:{len(candidates_df) / len(comps_df):.1f}")

Positive examples (comps): 264
Negative examples (candidates): 9820
Imbalance ratio: 1:37.2


## Step 3: XGBoost Strategy 1 - Using scale_pos_weight


In [6]:
# Combine all data (yes, ALL of it!)
all_data = pd.concat([comps_df, candidates_df], ignore_index=True)

# Define features
feature_cols = [
    'gla_diff', 'lot_size_diff', 'bedroom_diff', 'bathroom_diff',
    'room_count_diff', 'same_property_type', 'same_storey_type',
    'sold_recently_90', 'sold_recently_180'
]

# Prepare X and y
X = all_data[feature_cols].copy()
y = all_data['is_good_comp']

# Handle missing values
print("Handling missing values...")
for col in ['gla_diff', 'lot_size_diff', 'bedroom_diff', 'bathroom_diff', 'room_count_diff']:
    X[col] = X[col].fillna(X[col].median())

for col in ['same_property_type', 'same_storey_type', 'sold_recently_90', 'sold_recently_180']:
    X[col] = X[col].fillna(0)

# Calculate scale_pos_weight (this is the magic for imbalanced data!)
scale_pos_weight = len(y[y == 0]) / len(y[y == 1])
print(f"\nscale_pos_weight = {scale_pos_weight:.1f}")
print("This tells XGBoost to give positive examples more importance")

# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"\nTraining set: {len(X_train)} examples")
print(f"Test set: {len(X_test)} examples")

Handling missing values...

scale_pos_weight = 37.2
This tells XGBoost to give positive examples more importance

Training set: 8067 examples
Test set: 2017 examples


## Step 4: Train XGBoost Model


In [11]:
# Create XGBoost classifier with parameters tuned for imbalanced data
xgb_model = xgb.XGBClassifier(
    # Basic parameters
    n_estimators=100,           # Number of trees
    max_depth=6,                # Maximum tree depth (prevent overfitting)
    learning_rate=0.3,          # Step size shrinkage (default is good)
    
    # Imbalanced data parameter
    scale_pos_weight=scale_pos_weight,  # Handle imbalance!
    
    # Regularization parameters (prevent overfitting)
    reg_alpha=0.1,              # L1 regularization
    reg_lambda=1.0,             # L2 regularization
    
    # Other useful parameters
    random_state=42,            # Reproducibility
    n_jobs=-1,                  # Use all CPU cores
    eval_metric='logloss'       # Evaluation metric
)

# Train the model
print("Training XGBoost model...")
xgb_model.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],  # Monitor performance on test set
    early_stopping_rounds=10,       # Stop if no improvement
    verbose=False                   # Set to True to see training progress
)

print("Training complete!")
print(f"Best iteration: {xgb_model.best_iteration}")

Training XGBoost model...


TypeError: fit() got an unexpected keyword argument 'early_stopping_rounds'

In [12]:
import xgboost
print(xgboost.__version__)


2.1.4
