In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split

# Load data
subjects_df = pd.read_csv('data/cleaned/subjects_cleaned.csv')
candidates_df = pd.read_csv('data/engineered/candidates_pairs_features.csv')
comps_df = pd.read_csv('data/engineered/comps_pairs_features.csv')


In [2]:
# Features to use
feature_cols = [
    'gla_diff', 'lot_size_diff', 'bedroom_diff', 'bathroom_diff', 'room_count_diff',
    'same_property_type', 'same_storey_type', 'sold_recently_90'
]

# Label columns for supervised training
comps_df['label'] = 1
candidates_df['label'] = 0

# Concatenate comps and candidates for training
train_data = pd.concat([comps_df, candidates_df], ignore_index=True)
X = train_data[feature_cols]
y = train_data['label']
groups = train_data.groupby('orderID').size().values


In [3]:
dtrain = xgb.DMatrix(X, label=y)
dtrain.set_group(groups)

params = {
    'objective': 'rank:pairwise',
    'eval_metric': 'ndcg@3',
    'eta': 0.05,
    'max_depth': 6,
    'subsample': 0.9,
    'colsample_bytree': 0.9,
    'lambda': 1,
    'alpha': 0.5,
    'seed': 42
}

model = xgb.train(params, dtrain, num_boost_round=200)
model.save_model('xgboost_model.json')
print("✅ Model trained and saved as xgboost_model.json")


✅ Model trained and saved as xgboost_model.json


In [4]:
# Score every candidate (so every subject can get their top 3)
dmatrix = xgb.DMatrix(candidates_df[feature_cols])
candidates_df['score'] = model.predict(dmatrix)


In [5]:
all_top3 = []

for oid in subjects_df['orderID']:
    group = candidates_df[candidates_df['orderID'] == oid].copy()
    if not group.empty:
        top_n = group.nlargest(3, 'score')
        if len(top_n) < 3:
            pad = pd.DataFrame([{'orderID': oid}] * (3 - len(top_n)))
            top_n = pd.concat([top_n, pad], ignore_index=True)
        all_top3.append(top_n)
    else:
        pad = pd.DataFrame([{'orderID': oid}] * 3)
        all_top3.append(pad)

top3_full = pd.concat(all_top3, ignore_index=True)


In [11]:
# Use suffixes to clarify which columns are for subject vs candidate

# Subject columns
subject_cols = [
    'orderID', 'address', 'gla_clean', 'bedrooms_clean', 'bathrooms_clean',
    'room_count_clean', 'lot_size_clean', 'structure_type', 'stories_clean', 'effective_date_clean'
]

# Candidate columns
candidate_cols = [
    'id', 'address', 'gla_clean', 'bedrooms_clean', 'bathrooms_clean',
    'room_count_clean', 'lot_size_clean', 'structure_type', 'stories_clean', 'close_date_clean'
]

# Make copies to avoid in-place changes
subjects_info = subjects_df[subject_cols].drop_duplicates(subset=['orderID']).copy()
subjects_info = subjects_info.rename(columns={col: 'subject_' + col if col != 'orderID' else col for col in subject_cols})

candidates_info = candidates_df[['orderID', 'id'] + candidate_cols[1:]].copy()
candidates_info = candidates_info.rename(columns={col: 'candidate_' + col if col not in ['orderID', 'id'] else col for col in candidate_cols})

output = top3_full.merge(subjects_info, on='orderID', how='left')
output = output.merge(candidates_info, on=['orderID', 'id'], how='left')

# Final column list: orderID, id, subject columns, candidate columns, score
final_cols = ['orderID', 'id'] + [c for c in subjects_info.columns if c != 'orderID'] + [c for c in candidates_info.columns if c not in ['orderID', 'id']] + ['score']
output = output[final_cols]

# Rank
output['rank'] = output.groupby('orderID')['score'].rank(method='first', ascending=False)

output.to_csv('final_subjects_top3_candidates.csv', index=False)
print("✅ Clean, labeled columns! Saved as final_subjects_top3_candidates.csv")


✅ Clean, labeled columns! Saved as final_subjects_top3_candidates.csv


In [7]:
output['rank'] = output.groupby('orderID')['score'].rank(method='first', ascending=False)

output.to_csv('final_subjects_top3_candidates.csv', index=False)
print("✅ Saved as final_subjects_top3_candidates.csv")


✅ Saved as final_subjects_top3_candidates.csv


In [13]:
# Prepare a set of (orderID, id) pairs that are true expert comps
comp_pairs = set(zip(candidates_df['orderID'], candidates_df['id']))

# Mark for each row in output if it's a true comp
output['is_true_comp'] = output.apply(lambda row: (row['orderID'], row['id']) in comp_pairs, axis=1)

# For each subject, did any of their top 3 candidates match a comp?
precision_at_3 = output.groupby('orderID')['is_true_comp'].max().mean()

print(f"Precision@3: {precision_at_3:.2%}")


Precision@3: 100.00%
