In [1]:
import numpy as np
import pandas as pd

from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV
from sklearn.metrics import mean_squared_error

pd.options.display.float_format = '{:,.0f}'.format

In [2]:
df = pd.read_csv('../data/pc/train.csv',index_col=None)
df.drop(['assessed_2019','building_value_2019','land_value_2019'],axis=1,inplace=True)

In [3]:
X = df.drop(columns=['acct','TARGET'])
y = df['TARGET']

# Model Stacking 2

In [4]:
from sklearn.model_selection import cross_val_predict
from sklearn.linear_model import LinearRegression, RidgeCV, Lasso
from xgboost import XGBRegressor

In [5]:
# Step 1: Define base models
base_models = [
    LinearRegression(),
    RidgeCV(alphas=[0.1, 1.0, 10.0]),
    XGBRegressor(n_estimators=100, max_depth=3, learning_rate=0.1, n_jobs=1)
]

In [6]:
# Step 2: Get out-of-fold predictions from each model
oof_preds = []
for model in base_models:
    preds = cross_val_predict(model, X, y, cv=5, method='predict')
    oof_preds.append(preds)

In [7]:
# Stack predictions as features for meta-model
stacked_X = np.column_stack(oof_preds)

In [8]:
# Step 3: Fit meta-model with non-negative constraint
meta_model = Lasso(positive=True, alpha=0.1)  # alpha can be tuned
meta_model.fit(stacked_X, y)

Lasso(alpha=0.1, positive=True)

In [9]:
# Step 4: Fit base models on full data
for model in base_models:
    model.fit(X, y)

In [10]:
from sklearn.metrics import mean_squared_error
import numpy as np

# Meta-model predictions on training data (via out-of-fold base predictions)
train_preds = meta_model.predict(stacked_X)
train_rmse = mean_squared_error(y, train_preds, squared=False)

print("Stacked Meta-Model Train RMSE:", train_rmse)

Stacked Meta-Model Train RMSE: 43313.99052782645


In [63]:
len(final_preds)

418858

In [64]:
# Step 6: Clip to ensure no negatives
final_preds = np.clip(final_preds, 0, None)

In [65]:
test = pd.read_csv('../data/pc/test.csv',index_col=None)

In [66]:
test = test.drop_duplicates(subset='acct')

In [67]:
X_test = test.drop(columns=['acct'])  # or drop other non-feature cols
acct_ids = test['acct']               # store acct ID

In [68]:
# Step 5: Predict using stacked full-data base model predictions
full_preds = np.column_stack([model.predict(X_test) for model in base_models])
final_preds = meta_model.predict(full_preds)

In [70]:
output_df = pd.DataFrame({
    'acct': acct_ids,
    'TARGET': final_preds
})

In [71]:
output_df.rename({'acct':'ACCOUNT'},axis=1,inplace=True)

In [72]:
output_df.reset_index(inplace=True,drop=True)

In [75]:
output_df

Unnamed: 0,ACCOUNT,TARGET
0,bb75f25168addc1117840b10c0fd6cd0c2a7b7c6,363887
1,8def0ccceda200b673872a8a9367644767989f3b,152785
2,ca33e57b3b13e843909f4b6cbd9a3410387bd45a,261626
3,3e0f6f6090a8226ce67ccf2f8630b8ad630b8d55,166258
4,63facf82adbae10b23f7fabc93188c95bd832f51,289735
...,...,...
418853,24847d36c333ab3376848ee1bda74916286a2a4b,291283
418854,62ba19f1655097ac9cc8682aec435d22a653bfa0,368415
418855,552b5300d14369d8fd792ea6228dfa683014b2f5,387651
418856,290d351d16d00e79e89dc5404412f07769f40038,293639


In [76]:
output_df.to_csv('../data/pc/predictions/seth_kaufman_prediction_file_7_stacked_model.csv',index=None)

# Model Stacking

In [21]:
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import RidgeCV, LinearRegression, LassoCV
from xgboost import XGBRegressor

In [22]:
# Define base models
base_models = [
    ('ols', LinearRegression()),
    ('ridge', RidgeCV(alphas=[0.1, 1.0, 10.0])),
    ('lasso', LassoCV(cv=5)),
    ('xgb', XGBRegressor(n_estimators=100, max_depth=3, learning_rate=0.1, n_jobs=1))
]

In [23]:
# Meta-model (can also try LassoCV or GradientBoostingRegressor)
meta_model = RidgeCV()

In [24]:
# Stacking regressor
stacked_model = StackingRegressor(
    estimators=base_models,
    final_estimator=meta_model,
    cv=5,
    n_jobs=2
)

In [25]:
# Fit to training data
stacked_model.fit(X, y)

StackingRegressor(cv=5,
                  estimators=[('ols', LinearRegression()),
                              ('ridge',
                               RidgeCV(alphas=array([ 0.1,  1. , 10. ]))),
                              ('lasso', LassoCV(cv=5)),
                              ('xgb',
                               XGBRegressor(base_score=None, booster=None,
                                            colsample_bylevel=None,
                                            colsample_bynode=None,
                                            colsample_bytree=None,
                                            enable_categorical=False,
                                            gamma=None, gpu_id=None,
                                            importance_type=None,
                                            interaction_constraints=None,
                                            lear...
                                            max_delta_step=None, max_depth=3,
                      

In [26]:
# Predict on test set
y_pred_stacked = stacked_model.predict(X_test)

In [31]:
test = pd.read_csv('../data/pc/test.csv',index_col=None)

In [32]:
test = test.drop_duplicates(subset='acct')

In [33]:
X_test = test.drop(columns=['acct'])  # or drop other non-feature cols
acct_ids = test['acct']               # store acct ID

In [41]:
y_pred = stacked_model.predict(X_test)

In [42]:
output_df = pd.DataFrame({
    'acct': acct_ids,
    'TARGET': y_pred
})

In [43]:
output_df.rename({'acct':'ACCOUNT'},axis=1,inplace=True)

In [44]:
output_df.reset_index(inplace=True,drop=True)

In [53]:
y.describe()

count      628,287
mean       255,304
std        316,367
min            100
25%        134,144
50%        183,657
75%        268,128
max     21,732,602
Name: TARGET, dtype: float64

In [45]:
output_df

Unnamed: 0,ACCOUNT,TARGET
0,bb75f25168addc1117840b10c0fd6cd0c2a7b7c6,-1.070982e+17
1,8def0ccceda200b673872a8a9367644767989f3b,5.233540e+16
2,ca33e57b3b13e843909f4b6cbd9a3410387bd45a,-4.090655e+16
3,3e0f6f6090a8226ce67ccf2f8630b8ad630b8d55,4.105826e+16
4,63facf82adbae10b23f7fabc93188c95bd832f51,-6.794584e+16
...,...,...
418853,24847d36c333ab3376848ee1bda74916286a2a4b,-1.011508e+17
418854,62ba19f1655097ac9cc8682aec435d22a653bfa0,-9.966911e+16
418855,552b5300d14369d8fd792ea6228dfa683014b2f5,-1.245082e+17
418856,290d351d16d00e79e89dc5404412f07769f40038,-1.005233e+17


In [20]:
output_df.to_csv('../data/pc/predictions/seth_kaufman_prediction_file_6_xgb_gridsearch.csv',index=None)

In [27]:
from sklearn.metrics import mean_squared_error
import numpy as np

In [30]:
len(y_pred_stacked)

418858

In [28]:
rmse_stacked = np.sqrt(mean_squared_error(y_test, y_pred_stacked))
print(f"Stacked Model RMSE: {rmse_stacked:.2f}")

NameError: name 'y_test' is not defined

# XGBoost

In [4]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor

In [5]:
param_grid = {
    'n_estimators': [100],
    'max_depth': [3, 5],
    'learning_rate': [0.05, 0.1],
}

In [6]:
# Base model
xgb = XGBRegressor(objective='reg:squarederror', n_jobs=-1)

In [7]:
grid = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=5,
    verbose=1,
    n_jobs=2  # or even 1 to go sequential
)

In [8]:
# Fit to data
grid.fit(X, y)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  20 out of  20 | elapsed:  6.6min finished


GridSearchCV(cv=5,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None,
                                    enable_categorical=False, gamma=None,
                                    gpu_id=None, importance_type=None,
                                    interaction_constraints=None,
                                    learning_rate=None, max_delta_step=None,
                                    max_depth=None, min_child_weight=None,
                                    missing=nan, monotone_constraints=None,
                                    n_estimators=100, n_jobs=-1,
                                    num_parallel_tree=None, predictor=None,
                                    random_state=None, reg_alpha=None,
                                    reg_lambda=None, scale_pos_weight=None,
         

In [9]:
# Results
print("Best RMSE: ", -grid.best_score_)
print("Best Parameters:", grid.best_params_)

Best RMSE:  40700.98454131627
Best Parameters: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100}


In [10]:
# Best model
best_model = grid.best_estimator_

In [11]:
best_model

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.1, max_delta_step=0,
             max_depth=5, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=100, n_jobs=-1,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [12]:
test = pd.read_csv('../data/pc/test.csv',index_col=None)

In [13]:
test = test.drop_duplicates(subset='acct')

In [14]:
X_test = test.drop(columns=['acct'])  # or drop other non-feature cols
acct_ids = test['acct']               # store acct ID

In [15]:
y_pred = best_model.predict(X_test)

In [16]:
output_df = pd.DataFrame({
    'acct': acct_ids,
    'TARGET': y_pred
})

In [17]:
output_df.rename({'acct':'ACCOUNT'},axis=1,inplace=True)

In [18]:
output_df.reset_index(inplace=True,drop=True)

In [19]:
output_df

Unnamed: 0,ACCOUNT,TARGET
0,bb75f25168addc1117840b10c0fd6cd0c2a7b7c6,362302.12500
1,8def0ccceda200b673872a8a9367644767989f3b,149407.25000
2,ca33e57b3b13e843909f4b6cbd9a3410387bd45a,266044.62500
3,3e0f6f6090a8226ce67ccf2f8630b8ad630b8d55,170704.93750
4,63facf82adbae10b23f7fabc93188c95bd832f51,299428.40625
...,...,...
418853,24847d36c333ab3376848ee1bda74916286a2a4b,329829.03125
418854,62ba19f1655097ac9cc8682aec435d22a653bfa0,392524.43750
418855,552b5300d14369d8fd792ea6228dfa683014b2f5,402045.46875
418856,290d351d16d00e79e89dc5404412f07769f40038,324762.09375


In [20]:
output_df.to_csv('../data/pc/predictions/seth_kaufman_prediction_file_6_xgb_gridsearch.csv',index=None)

In [None]:
from sklearn.model_selection import cross_val_score
-np.mean(cross_val_score(xgb, X, y, scoring='neg_root_mean_squared_error', cv=5))


# Multiple Models

In [8]:
import numpy as np
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor

# ------------------------
# OLS
ols = LinearRegression()
ols.fit(X, y)
pred_ols = ols.predict(X)
rmse_ols = np.sqrt(mean_squared_error(y, pred_ols))

# ------------------------
# Ridge (with cross-validated alpha)
ridge = RidgeCV(alphas=[0.1, 1.0, 10.0])
ridge.fit(X, y)
pred_ridge = ridge.predict(X)
rmse_ridge = np.sqrt(mean_squared_error(y, pred_ridge))

# ------------------------
# Lasso (with cross-validated alpha)
lasso = LassoCV(cv=5)
lasso.fit(X, y)
pred_lasso = lasso.predict(X)
rmse_lasso = np.sqrt(mean_squared_error(y, pred_lasso))

# ------------------------
# XGBoost
xgb = XGBRegressor()
xgb.fit(X, y)
pred_xgb = xgb.predict(X)
rmse_xgb = np.sqrt(mean_squared_error(y, pred_xgb))

# ------------------------
# Summary
print(f"OLS RMSE:     {rmse_ols:.2f}")
print(f"Ridge RMSE:   {rmse_ridge:.2f} (alpha={ridge.alpha_})")
print(f"Lasso RMSE:   {rmse_lasso:.2f} (alpha={lasso.alpha_:.4f})")
print(f"XGBoost RMSE: {rmse_xgb:.2f}")

OLS RMSE:     42095.57
Ridge RMSE:   150604462083.57 (alpha=0.1)
Lasso RMSE:   48757.61 (alpha=96174497.3703)
XGBoost RMSE: 24121.34


In [9]:
test = pd.read_csv('../data/pc/test.csv',index_col=None)

In [10]:
test = test.drop_duplicates(subset='acct')

In [11]:
X_test = test.drop(columns=['acct'])  # or drop other non-feature cols
acct_ids = test['acct']               # store acct ID

In [12]:
y_pred = xgb.predict(X_test)

In [13]:
output_df = pd.DataFrame({
    'acct': acct_ids,
    'TARGET': y_pred
})

In [14]:
output_df.rename({'acct':'ACCOUNT'},axis=1,inplace=True)

In [15]:
output_df.reset_index(inplace=True,drop=True)

In [16]:
output_df

Unnamed: 0,ACCOUNT,TARGET
0,bb75f25168addc1117840b10c0fd6cd0c2a7b7c6,367201.250000
1,8def0ccceda200b673872a8a9367644767989f3b,145278.703125
2,ca33e57b3b13e843909f4b6cbd9a3410387bd45a,270280.750000
3,3e0f6f6090a8226ce67ccf2f8630b8ad630b8d55,166346.718750
4,63facf82adbae10b23f7fabc93188c95bd832f51,295452.031250
...,...,...
418853,24847d36c333ab3376848ee1bda74916286a2a4b,346767.031250
418854,62ba19f1655097ac9cc8682aec435d22a653bfa0,405935.187500
418855,552b5300d14369d8fd792ea6228dfa683014b2f5,412413.281250
418856,290d351d16d00e79e89dc5404412f07769f40038,309453.906250


In [20]:
output_df.to_csv('../data/pc/predictions/seth_kaufman_prediction_file_5_xgb_simple.csv',index=None)

In [4]:
from sklearn.linear_model import RidgeCV, LassoCV, ElasticNetCV

ridge = RidgeCV(alphas=[0.1, 1.0, 10.0]).fit(X, y)
lasso = LassoCV(cv=5).fit(X, y)
enet = ElasticNetCV(cv=5).fit(X, y)

In [5]:
from xgboost import XGBRegressor
xgb = XGBRegressor()
xgb.fit(X, y)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.300000012,
             max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=100, n_jobs=16,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [7]:
from sklearn.metrics import mean_squared_error
import numpy as np

preds = model.predict(X)
rmse = np.sqrt(mean_squared_error(y, preds))
print(f"RMSE: {rmse:.2f}")

NameError: name 'model' is not defined

In [None]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1],
    'subsample': [0.8, 1.0]
}

xgb = XGBRegressor()
grid = GridSearchCV(xgb, param_grid, cv=5, scoring='neg_root_mean_squared_error')
grid.fit(X, y)

print("Best RMSE:", -grid.best_score_)
print("Best params:", grid.best_params_)

# Ridge ($\ell_2$)

In [13]:
alphas = [0.01, 0.1, 1.0, 10.0, 100.0]

rmse_scores = {}

for a in alphas:
    ridge = Ridge(alpha=a)
    ridge.fit(X,y)

    y_pred = ridge.predict(X)
    rmse = np.sqrt(mean_squared_error(y, y_pred))

    rmse_scores[a] = rmse

  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


In [14]:
rmse_scores

{0.01: np.float64(42107.305615080055),
 0.1: np.float64(42101.37720001765),
 1.0: np.float64(42102.34097079095),
 10.0: np.float64(42103.37436020412),
 100.0: np.float64(42106.19035137568)}

## $\lambda = 0.1$

In [15]:
ridge = Ridge(alpha=0.1)
ridge.fit(X,y)

  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


In [16]:
y_pred = ridge.predict(X)
rmse = np.sqrt(mean_squared_error(y, y_pred))
print(f"Train RMSE (Ridge): {rmse:.4f}")

Train RMSE (Ridge): 42101.3772


In [17]:
test = pd.read_csv('../data/pc/test.csv',index_col=None)

In [18]:
test = test.drop_duplicates(subset='acct')

In [20]:
X_test = test.drop(columns=['acct'])  # or drop other non-feature cols
acct_ids = test['acct']               # store acct ID

In [21]:
y_pred = ridge.predict(X_test)

In [22]:
output_df = pd.DataFrame({
    'acct': acct_ids,
    'TARGET': y_pred
})

In [23]:
output_df.rename({'acct':'ACCOUNT'},axis=1,inplace=True)

In [25]:
output_df.reset_index(inplace=True,drop=True)

In [26]:
output_df

Unnamed: 0,ACCOUNT,TARGET
0,bb75f25168addc1117840b10c0fd6cd0c2a7b7c6,372597.540283
1,8def0ccceda200b673872a8a9367644767989f3b,140216.999359
2,ca33e57b3b13e843909f4b6cbd9a3410387bd45a,274875.499726
3,3e0f6f6090a8226ce67ccf2f8630b8ad630b8d55,161327.027440
4,63facf82adbae10b23f7fabc93188c95bd832f51,303217.456451
...,...,...
418853,24847d36c333ab3376848ee1bda74916286a2a4b,346633.054826
418854,62ba19f1655097ac9cc8682aec435d22a653bfa0,349625.233483
418855,552b5300d14369d8fd792ea6228dfa683014b2f5,447081.888710
418856,290d351d16d00e79e89dc5404412f07769f40038,297169.981041


In [27]:
output_df.to_csv('../data/pc/predictions_2.csv', index=False)

## Ridge CV

In [11]:
# Try a range of alpha values
alphas = [0.01, 0.1, 1.0, 10.0, 100.0]

In [12]:
# Ridge with cross-validation (default 5-fold)
ridge_cv = RidgeCV(alphas=alphas, scoring='neg_root_mean_squared_error')

In [13]:
ridge_cv.fit(X, y)

RidgeCV(alphas=array([1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02]),
        scoring='neg_root_mean_squared_error')

In [14]:
# Best alpha
print(f"Best alpha (Ridge): {ridge_cv.alpha_}")

# RMSE on training data
y_pred = ridge_cv.predict(X)
rmse = np.sqrt(mean_squared_error(y, y_pred))
print(f"Train RMSE (Ridge): {rmse:.4f}")

Best alpha (Ridge): 0.01
Train RMSE (Ridge): 1476625477319.8162


In [None]:
test = pd.read_csv('../data/pc/test.csv',index_col=None)

In [None]:
test = test.drop_duplicates(subset='acct')

In [None]:
X_test = test.drop(columns=['acct'])  # or drop other non-feature cols
acct_ids = test['acct']               # store acct ID

In [None]:
y_pred = ridge.predict(X_test)

In [None]:
output_df = pd.DataFrame({
    'acct': acct_ids,
    'TARGET': y_pred
})

In [None]:
output_df.rename({'acct':'ACCOUNT'},axis=1,inplace=True)

In [None]:
output_df.reset_index(inplace=True,drop=True)

In [None]:
output_df

Unnamed: 0,ACCOUNT,TARGET
0,bb75f25168addc1117840b10c0fd6cd0c2a7b7c6,372597.540283
1,8def0ccceda200b673872a8a9367644767989f3b,140216.999359
2,ca33e57b3b13e843909f4b6cbd9a3410387bd45a,274875.499726
3,3e0f6f6090a8226ce67ccf2f8630b8ad630b8d55,161327.027440
4,63facf82adbae10b23f7fabc93188c95bd832f51,303217.456451
...,...,...
418853,24847d36c333ab3376848ee1bda74916286a2a4b,346633.054826
418854,62ba19f1655097ac9cc8682aec435d22a653bfa0,349625.233483
418855,552b5300d14369d8fd792ea6228dfa683014b2f5,447081.888710
418856,290d351d16d00e79e89dc5404412f07769f40038,297169.981041


# LASSO ($\ell_1$)

In [4]:
from sklearn.linear_model import LassoCV

lasso_cv = LassoCV(alphas=[0.001, 0.01, 0.1, 1.0, 10], cv=5)
lasso_cv.fit(X, y)

print(f"Best alpha (Lasso): {lasso_cv.alpha_}")
print(f"Non-zero features: {(lasso_cv.coef_ != 0).sum()} / {len(lasso_cv.coef_)}")

y_pred = lasso_cv.predict(X)
rmse = np.sqrt(mean_squared_error(y, y_pred))
print(f"Train RMSE (Lasso): {rmse:.4f}")

: 