In [1]:
import pandas as pd 

# Read the pre-split training and hold-out tables from the working directory
# (train.csv first, then test.csv).
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))


In [10]:
# Build the feature matrix / target vector pair for each split in one pass.
# The same three columns are removed from both frames: the prediction target
# itself plus AREA_NAME and YEAR, which are treated here as identifiers rather
# than predictors. Applying a single drop list to both frames keeps the train
# and test feature sets in step.
(X_train, y_train), (X_test, y_test) = (
    (
        frame.drop(columns=['TARGET_RENT_5YR', 'AREA_NAME', 'YEAR']),
        frame['TARGET_RENT_5YR'],
    )
    for frame in (train, test)
)

# Echo the surviving column names so the reader can verify what the model sees.
print(f"Features used for training: {list(X_train.columns)}")


Features used for training: ['1_bedrooms_leased', '2_bedrooms_avg_lease_rate', 'area_sq_meters', 'perimeter_meters', 'park_count', 'ASSAULT_RATE', 'AUTOTHEFT_RATE', 'ROBBERY_RATE', 'THEFTOVER_RATE', 'POPULATION', 'total_stop_count', 'avg_stop_frequency', 'max_stop_frequency', 'total_line_length_meters', 'transit_line_density', 'distinct_route_count', 'avg_rent_1br', 'rent_lag_1', 'rent_lag_2', 'rent_growth_rate', 'is_missing_THEFTOVER_RATE', 'is_missing_ROBBERY_RATE', 'is_missing_AUTOTHEFT_RATE', 'years_since_baseline']


In [None]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, r2_score

# Configure the XGBoost regressor and fit it on the training split in a single
# expression (fit() hands back the fitted estimator). random_state is pinned
# to 42 so the seed is the same on every run.
model = XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    random_state=42,
).fit(X_train, y_train)

# Score the hold-out rows with the fitted model.
predictions = model.predict(X_test)

In [None]:
# Hold-out metrics. On the recorded run the MAE is about $274 and R² is about
# 0.25, i.e. the model accounts for roughly a quarter of the variance in the
# hold-out target.
r2 = r2_score(y_test, predictions)
mae = mean_absolute_error(y_test, predictions)

print(f"MAE: ${mae:.2f}", f"R² Score: {r2:.5f}", sep="\n")

# Side-by-side table of true vs. predicted values, indexed like y_test;
# only the first five rows are printed as a quick sanity check.
comparison = y_test.to_frame(name='Actual').assign(Predicted=predictions)
print("\nFirst 5 Predictions:", comparison.head(), sep="\n")

MAE: $273.79
R² Score: 0.24958

First 5 Predictions:
        Actual    Predicted
0  2464.500000  2276.994873
1  2220.000000  2454.975098
2  2464.500000  2285.605225
3  2220.000000  2463.430908
4  3092.166667  2951.546143
