# Boston Housing Predictor - Consolidated Evaluation

This notebook compares three evaluation paths under a strict constraint:
1) Chronological Baseline
2) Chronological Robust
3) Random Split Baseline


In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split, ParameterGrid
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation warnings raised by
# pandas / scikit-learn calls below; consider narrowing it.
warnings.filterwarnings('ignore')
# Seed NumPy's global random state.
np.random.seed(42)
print('Libraries imported')


Libraries imported


## 1. Data & Split Functions


In [2]:
# Column names supplied explicitly via `names=` when reading the whitespace-delimited file.
cols=['CRIM','ZN','INDUS','CHAS','NOX','RM','AGE','DIS','RAD','TAX','PTRATIO','B','LSTAT','MEDV']
# `sep=r'\s+'` is the documented replacement for `delim_whitespace=True`,
# which pandas deprecated in 2.2.
data=pd.read_csv('data/housing.csv', names=cols, sep=r'\s+')
# Keep only rows whose MEDV is strictly below 50.0; .copy() detaches the
# filtered frame from the one that was read.
data=data[data['MEDV']<50.0].copy()
# Feature matrix (every column except the target) and target vector.
X=data.drop(columns='MEDV')
y=data['MEDV']
def split_chrono(X,y,train_size=0.7):
    """Split X and y by row position, without shuffling.

    The first int(len(X)*train_size) rows form the training part and the
    remaining rows form the test part.

    Returns (X_train, X_test, y_train, y_test).
    """
    cut=int(len(X)*train_size)
    head=slice(None,cut)
    tail=slice(cut,None)
    return X.iloc[head],X.iloc[tail],y.iloc[head],y.iloc[tail]
def split_random(X,y,train_size=0.7):
    """Shuffle the rows with a fixed seed (42) and split X and y.

    The held-out fraction handed to `train_test_split` is `1-train_size`.

    Returns whatever `train_test_split` returns for two inputs:
    (X_train, X_test, y_train, y_test).
    """
    holdout=1-train_size
    return train_test_split(X,y,shuffle=True,random_state=42,test_size=holdout)
# Report the feature-matrix shape (rows, columns) after the MEDV filter.
print('Data ready:',X.shape)


Data ready: (490, 13)


## 2. Stability Analysis (select 8 stable features)


In [3]:
def stability_scores(X,n_splits=10):
    """Score how much each column drifts across consecutive row segments.

    The rows are cut, in order, into `n_splits` contiguous segments of
    len(X)//n_splits rows each; the last segment also absorbs the remainder.
    For every column except 'CHAS' the score is

        (variance of segment means + variance of segment stds) / overall variance

    and a column whose overall variance is not positive scores 0.

    Returns a Series indexed by column name, sorted ascending.
    """
    total=len(X)
    chunk=total//n_splits
    # (start, stop) row bounds per segment; the final one runs to the end.
    bounds=[(k*chunk,(k+1)*chunk if k<n_splits-1 else total) for k in range(n_splits)]
    scores={}
    for name in X.columns:
        if name=='CHAS':
            continue
        parts=[X.iloc[lo:hi][name] for lo,hi in bounds]
        mean_var=np.var([part.mean() for part in parts])
        std_var=np.var([part.std() for part in parts])
        overall=X[name].var()
        if overall>0:
            scores[name]=mean_var/overall+std_var/overall
        else:
            scores[name]=0
    return pd.Series(scores).sort_values()
# Score every feature and keep the 8 with the lowest scores
# (stability_scores returns its result sorted ascending).
# NOTE(review): X here is the full, unsplit feature matrix, so rows that later
# become test data also influence which features are selected - confirm this
# is acceptable for the comparison.
stab=stability_scores(X)
stable=stab.head(8).index.tolist()
print('Stable features:',stable)


Stable features: ['RM', 'AGE', 'LSTAT', 'ZN', 'DIS', 'NOX', 'INDUS', 'PTRATIO']


## 3. Phase 2 Transforms (helpers)


In [4]:
def winsorize_train(Xtr,Xte,low=0.01,high=0.99):
    """Clip every column of both frames to quantile bounds taken from the training frame.

    The `low` and `high` quantiles are computed per column on `Xtr` only and
    then used as clip limits for `Xtr` and `Xte` alike. The inputs are not
    modified; clipped copies are returned as (train, test).
    """
    train_out=Xtr.copy()
    test_out=Xte.copy()
    bounds=train_out.quantile([low,high])
    for col in train_out.columns:
        lo=bounds.loc[low,col]
        hi=bounds.loc[high,col]
        train_out[col]=train_out[col].clip(lo,hi)
        test_out[col]=test_out[col].clip(lo,hi)
    return train_out,test_out
def add_log1p(Xtr,Xte):
    """Append `<col>_log1p` columns for columns that are strictly positive in both frames.

    A column qualifies only when all of its values are > 0 in `Xtr` and in
    `Xte`; for those, np.log1p of the column is added to both frames. The
    inputs are left untouched and the augmented copies are returned as
    (train, test).
    """
    train_out=Xtr.copy()
    test_out=Xte.copy()
    # Snapshot the column list first: new columns are appended while looping.
    for col in train_out.columns.tolist():
        if not ((train_out[col]>0).all() and (test_out[col]>0).all()):
            continue
        new_name=col+'_log1p'
        train_out[new_name]=np.log1p(train_out[col])
        test_out[new_name]=np.log1p(test_out[col])
    return train_out,test_out
def add_ratio_diff(Xtr,Xte,pairs):
    """Append `<a>_over_<b>` and `<a>_minus_<b>` columns for each (a, b) pair.

    A pair is skipped unless both names are columns of the training frame.
    In the ratio, denominator values equal to 0 are replaced by 1e-6.
    The inputs are not modified; augmented copies are returned as (train, test).
    """
    train_out,test_out=Xtr.copy(),Xte.copy()
    for num,den in pairs:
        if not (num in train_out.columns and den in train_out.columns):
            continue
        ratio_name=f'{num}_over_{den}'
        diff_name=f'{num}_minus_{den}'
        for frame in (train_out,test_out):
            # Swap exact zeros for a tiny constant so the division stays finite.
            safe_den=np.where(frame[den]==0,1e-6,frame[den])
            frame[ratio_name]=frame[num]/safe_den
            frame[diff_name]=frame[num]-frame[den]
    return train_out,test_out
def forward_cv_indices(n,k=5):
    """Build expanding-window (train, validation) index pairs over range(n).

    `k` cut fractions are spread evenly from 0.6 to 0.95. For each fraction the
    cut point is int(n*fraction); the rows before it are the training indices
    and the rows from it onward are the validation indices. Cuts that do not
    satisfy cut < n-1 are dropped.

    Returns a list of (train_idx, val_idx) tuples of NumPy index arrays.
    """
    positions=np.arange(n)
    cuts=(int(n*frac) for frac in np.linspace(0.6,0.95,k))
    return [(positions[:cut],positions[cut:]) for cut in cuts if cut<n-1]
def tune(model_name,Xtr,ytr):
    """Grid-search hyperparameters using forward (expanding-window) validation.

    Parameters
    ----------
    model_name : 'SVR' selects an RBF SVR with a C/gamma grid; any other value
        selects a GradientBoostingRegressor grid.
    Xtr, ytr : training features and targets, in row order. Both are converted
        with np.asarray, so ndarrays and pandas objects are accepted alike.

    Returns
    -------
    (mean_rmse, params) for the grid point with the lowest mean validation
    RMSE, or None when `forward_cv_indices` yields no fold.
    """
    folds=forward_cv_indices(len(Xtr),k=5)
    # With no folds every candidate would score np.mean([]) == NaN and the
    # "winner" would be arbitrary; return None so callers can fall back.
    if not folds:
        return None
    # Positional arrays: fold indices are integer positions, not labels.
    X_arr=np.asarray(Xtr)
    y_arr=np.asarray(ytr)
    if model_name=='SVR':
        base=SVR(kernel='rbf')
        grid=ParameterGrid({'C':[0.5,1,3,10],'gamma':['scale',0.05,0.1,0.2]})
    else:
        base=GradientBoostingRegressor(random_state=42)
        grid=ParameterGrid({'n_estimators':[200,300,500],'learning_rate':[0.03,0.05],'max_depth':[2,3]})
    res=[]
    for p in grid:
        # set_params returns the same estimator; it is refitted on every fold.
        m=base.set_params(**p)
        rmses=[]
        for tr,va in folds:
            m.fit(X_arr[tr],y_arr[tr])
            pr=m.predict(X_arr[va])
            rmses.append(np.sqrt(mean_squared_error(y_arr[va],pr)))
        res.append((np.mean(rmses),p))
    # First grid point with the smallest mean RMSE.
    return min(res,key=lambda item:item[0])
def calibrate(y_true,y_pred):
    """Least-squares fit of y_true ~ a*y_pred + b.

    Builds a two-column design matrix [y_pred, 1] and solves it with
    np.linalg.lstsq. Returns the slope and intercept as (a, b).
    """
    design=np.column_stack([y_pred,np.ones_like(y_pred)])
    solution=np.linalg.lstsq(design,y_true,rcond=None)[0]
    slope,intercept=solution
    return slope,intercept
def apply_cal(y_pred,a,b):
    """Apply the linear calibration a*y_pred + b to the predictions."""
    scaled=a*y_pred
    return scaled+b


## 4. Chronological Baseline


In [5]:
# Chronological split (default 70% train), restricted to the selected features.
Xtr0,Xte0,ytr0,yte0=split_chrono(X,y)
Xtr0=Xtr0[stable].copy()
Xte0=Xte0[stable].copy()
# Fit the scaler on the training rows only, then reuse it for the test rows.
sc0=RobustScaler()
Xtr0s=sc0.fit_transform(Xtr0)
Xte0s=sc0.transform(Xte0)
# Untuned models.
svr0=SVR(kernel='rbf')
gb0=GradientBoostingRegressor(random_state=42)
def eval_model(m,Xtr,Xte,ytr,yte):
    """Fit `m` on the training data and score its test-set predictions.

    The estimator is fitted in place. Returns a dict with float entries
    "rmse", "mae" and "r2" computed on (yte, predictions).
    """
    m.fit(Xtr,ytr)
    preds=m.predict(Xte)
    rmse=np.sqrt(mean_squared_error(yte,preds))
    mae=np.mean(np.abs(yte-preds))
    r2=r2_score(yte,preds)
    return {"rmse":float(rmse),"mae":float(mae),"r2":float(r2)}
# Score both untuned models on the chronological split.
res_chrono_base={
    'SVR':eval_model(svr0,Xtr0s,Xte0s,ytr0,yte0),
    'GB':eval_model(gb0,Xtr0s,Xte0s,ytr0,yte0),
}
print('Chronological Baseline:',res_chrono_base)


Chronological Baseline: {'SVR': {'rmse': 5.665488167160758, 'mae': 4.596780248244526, 'r2': -0.08802542941982483}, 'GB': {'rmse': 6.280568363925484, 'mae': 4.785931919114244, 'r2': -0.33709500597237896}}


## 5. Chronological Robust (Phase 2)


In [6]:
# Chronological split (default 70% train), restricted to the selected features.
Xtr1,Xte1,ytr1,yte1=split_chrono(X,y)
Xtr1=Xtr1[stable].copy()
Xte1=Xte1[stable].copy()
# Feature pipeline: winsorize -> log1p columns -> ratio/difference columns.
Xtr1w,Xte1w=winsorize_train(Xtr1,Xte1)
Xtr1m,Xte1m=add_log1p(Xtr1w,Xte1w)
Xtr1f,Xte1f=add_ratio_diff(Xtr1m,Xte1m,[('LSTAT','RM'),('NOX','DIS')])
# Scale with statistics from the training rows only.
sc1=RobustScaler()
Xtr1s=sc1.fit_transform(Xtr1f)
Xte1s=sc1.transform(Xte1f)
# Hyperparameter search on the training rows; a falsy result means "use defaults".
best_svr=tune('SVR',Xtr1s,ytr1)
best_gb=tune('GB',Xtr1s,ytr1)
svr1=SVR(kernel='rbf',**(best_svr[1] if best_svr else {}))
gb1=GradientBoostingRegressor(random_state=42,**(best_gb[1] if best_gb else {}))
# Train and calibrate
def eval_with_cal(m,Xtr,Xte,ytr,yte):
    """Fit `m`, then score its test predictions both raw and linearly calibrated.

    The calibration coefficients (a, b) are fitted on the model's predictions
    for the training rows against `ytr`, and then applied to the test
    predictions.

    Returns {'raw': metrics, 'cal': metrics}, where each metrics dict holds
    float 'rmse', 'mae' and 'r2' computed against `yte`.
    """
    def _score(pred):
        # Metric bundle shared by the raw and calibrated predictions.
        return {
            'rmse':float(np.sqrt(mean_squared_error(yte,pred))),
            'mae':float(np.mean(np.abs(yte-pred))),
            'r2':float(r2_score(yte,pred)),
        }

    m.fit(Xtr,ytr)
    train_pred=m.predict(Xtr)
    a,b=calibrate(ytr.values,train_pred)
    test_pred=m.predict(Xte)
    test_pred_cal=apply_cal(test_pred,a,b)
    return {'raw':_score(test_pred),'cal':_score(test_pred_cal)}
# Score both tuned models, raw and calibrated, on the chronological split.
res_chrono_robust={
    'SVR':eval_with_cal(svr1,Xtr1s,Xte1s,ytr1,yte1),
    'GB':eval_with_cal(gb1,Xtr1s,Xte1s,ytr1,yte1),
}
print('Chronological Robust:',res_chrono_robust)


Chronological Robust: {'SVR': {'raw': {'rmse': 3.8148215998506605, 'mae': 2.946445633352555, 'r2': 0.5066980432398386}, 'cal': {'rmse': 3.754014910433982, 'mae': 2.8768503456755274, 'r2': 0.5222987709097938}}, 'GB': {'raw': {'rmse': 6.000420708725305, 'mae': 4.645814648425812, 'r2': -0.2204718717088967}, 'cal': {'rmse': 5.954715288615823, 'mae': 4.58256401254189, 'r2': -0.20194992636785525}}}


## 6. Random Split Baseline


In [7]:
# Shuffled split (default 70% train), restricted to the selected features.
Xtr2,Xte2,ytr2,yte2=split_random(X,y)
Xtr2=Xtr2[stable].copy()
Xte2=Xte2[stable].copy()
# Fit the scaler on the training rows only, then reuse it for the test rows.
sc2=RobustScaler()
Xtr2s=sc2.fit_transform(Xtr2)
Xte2s=sc2.transform(Xte2)
# Untuned models, scored with the same helper as the chronological baseline.
svr2=SVR(kernel='rbf')
gb2=GradientBoostingRegressor(random_state=42)
res_random_base={
    'SVR':eval_model(svr2,Xtr2s,Xte2s,ytr2,yte2),
    'GB':eval_model(gb2,Xtr2s,Xte2s,ytr2,yte2),
}
print('Random Baseline:',res_random_base)


Random Baseline: {'SVR': {'rmse': 3.667860625289897, 'mae': 2.6133031484287117, 'r2': 0.7474962664873894}, 'GB': {'rmse': 2.5779021167343608, 'mae': 1.875790307720292, 'r2': 0.8752688285961052}}


## 7. Comparison Summary


In [8]:
# Build a compact table
def pick_best(res):
    """Return (label, metrics) for the lowest-RMSE entry of a results dict.

    `res` maps a model name to its metrics. Two shapes are accepted per model:
      * a flat metrics dict containing 'rmse' - the label is the model name;
      * a dict of variants that has a 'raw' key, e.g. {'raw': {...}, 'cal': {...}} -
        each variant becomes a candidate labelled '<model>-<variant>'.

    Every model present in `res` is considered, not only 'SVR' and 'GB'.
    On ties the earliest candidate (dict order) wins.

    Raises ValueError when `res` contains no candidates.
    """
    candidates=[]
    for model,metrics in res.items():
        if 'raw' in metrics:
            # Robust path: one candidate per stored variant.
            candidates.extend((f'{model}-{variant}',vals) for variant,vals in metrics.items())
        else:
            candidates.append((model,metrics))
    if not candidates:
        raise ValueError('pick_best: no results to compare')
    label,metrics=min(candidates,key=lambda item:item[1]['rmse'])
    return label,metrics
# Best candidate per evaluation path, gathered into one summary frame.
b1,b1m=pick_best(res_chrono_base)
b2,b2m=pick_best(res_chrono_robust)
b3,b3m=pick_best(res_random_base)
df=pd.DataFrame([
    {'path':'Chrono-Base','best':b1,**b1m},
    {'path':'Chrono-Robust','best':b2,**b2m},
    {'path':'Random-Base','best':b3,**b3m},
])
print(df.to_string(index=False))


         path    best     rmse     mae        r2
  Chrono-Base     SVR 5.665488 4.59678 -0.088025
Chrono-Robust SVR-cal 3.754015 2.87685  0.522299
  Random-Base      GB 2.577902 1.87579  0.875269
