# DDS‑8555 Assignment 4 — Abalone Nonlinear Models

**Model 1:** PolynomialFeatures + Ridge  
**Model 2:** SplineTransformer + Ridge  

This notebook trains two nonlinear regression models, evaluates each with 3‑fold out‑of‑fold (OOF) RMSE, saves a Kaggle‑ready submission file for each model, and embeds a screenshot of the Kaggle submission results.

## Imports, Paths, and Data Load

In [None]:

import pandas as pd, numpy as np
from pathlib import Path
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures, SplineTransformer
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold, cross_val_predict
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

# Project directories, all resolved relative to the current working directory.
BASE = Path('.')
DATA = BASE / 'data'      # input CSVs
FIGS = BASE / 'figs'      # saved diagnostic plots
OUT = BASE / 'outputs'    # submissions and CV summary

# Create each directory (with any missing parents); an existing one is left alone.
for d in (DATA, FIGS, OUT):
    d.mkdir(parents=True, exist_ok=True)

def load_csv(name):
    """Read the CSV called *name* from the first location where it exists.

    The project-local ``DATA`` directory is tried first, then ``/mnt/data``.

    Args:
        name: File name of the CSV, e.g. ``'train.csv'``.

    Returns:
        A ``pandas.DataFrame`` holding the file's contents.

    Raises:
        FileNotFoundError: If the file exists in neither location. The
            message lists every path that was tried, so a missing file in
            ``DATA`` is not reported only as a missing ``/mnt/data`` path.
    """
    candidates = [DATA / name, Path('/mnt/data') / name]
    for path in candidates:
        if path.exists():
            return pd.read_csv(path)
    tried = ', '.join(str(p) for p in candidates)
    raise FileNotFoundError(f"{name!r} not found; looked in: {tried}")

# Load the training data, the test data, and the sample submission
# (the latter supplies the id and prediction column names used later).
train_df, test_df, sample_sub = (
    load_csv(fname)
    for fname in ('train.csv', 'test.csv', 'sample_submission.csv')
)

# Quick sanity check of what was loaded.
print('Train shape:', train_df.shape, 'Test shape:', test_df.shape)
print('Sample submission columns:', sample_sub.columns.tolist())


## Target, Features, and Preprocessing

In [None]:

# Infer the target as the single column present in train but absent from test.
# If that comparison is ambiguous, fall back to 'Rings', then to the first
# train-only column; fail loudly when no candidate exists at all.
diff_cols = [c for c in train_df.columns if c not in test_df.columns]
if len(diff_cols) == 1:
    target_col = diff_cols[0]
elif 'Rings' in train_df.columns:
    target_col = 'Rings'
elif diff_cols:
    target_col = diff_cols[0]
else:
    raise ValueError(
        "Cannot infer the target column: train and test share all columns "
        "and train has no 'Rings' column."
    )

# The sample submission's first column names the row id, its last the prediction.
id_col, pred_col = sample_sub.columns[0], sample_sub.columns[-1]

# Exclude the row identifier from the features: it carries no signal, and
# leaving it in would have it imputed, scaled, and expanded like a real
# measurement. errors='ignore' covers data that has no such column.
X = train_df.drop(columns=[target_col, id_col], errors='ignore')
y = train_df[target_col].values
X_test = test_df.drop(columns=[id_col], errors='ignore')

# Split the remaining features by dtype so each group gets its own preprocessing.
num_features = X.select_dtypes(include=[np.number]).columns.tolist()
cat_features = X.select_dtypes(exclude=[np.number]).columns.tolist()

# Imputers used by the model pipelines defined below.
num_imputer = SimpleImputer(strategy='median')
cat_imputer = SimpleImputer(strategy='most_frequent')


## Model 1 — PolynomialFeatures + Ridge

In [None]:

# Numeric branch: impute -> standardize -> degree-2 polynomial expansion
# (squares and pairwise interactions, no constant column).
num_pipe_poly = Pipeline([('imputer', num_imputer),
                          ('scaler', StandardScaler()),
                          ('poly', PolynomialFeatures(degree=2, include_bias=False))])
# Categorical branch: impute -> one-hot; categories unseen at fit time are ignored.
cat_pipe = Pipeline([('imputer', cat_imputer),
                     ('ohe', OneHotEncoder(handle_unknown='ignore'))])
preprocess_poly = ColumnTransformer([('num', num_pipe_poly, num_features),
                                     ('cat', cat_pipe, cat_features)])
model1 = Pipeline([('preprocess', preprocess_poly),
                   ('ridge', Ridge(alpha=1.0, random_state=42))])

# 3-fold shuffled CV; each training row is predicted by a model that did not see it.
cv = KFold(n_splits=3, shuffle=True, random_state=42)
oof_pred1 = cross_val_predict(model1, X, y, cv=cv)
rmse1 = float(np.sqrt(mean_squared_error(y, oof_pred1)))
print('Model 1 OOF RMSE:', rmse1)

# Refit on all training data and predict the test set, clipping negatives to zero.
model1.fit(X, y)
pred1 = np.maximum(model1.predict(X_test), 0.0)

# Predictions follow test_df's row order, so take the ids from test_df when it
# has the id column; fall back to the sample submission's ids otherwise.
if id_col in test_df.columns:
    test_ids1 = test_df[id_col].values
else:
    test_ids1 = sample_sub[id_col].values
sub1 = pd.DataFrame({id_col: test_ids1, pred_col: pred1})
sub1.to_csv(OUT/'submission_model1_polyridge.csv', index=False)

# Diagnostic plot: OOF predictions against the true target.
plt.figure()
plt.scatter(y, oof_pred1, s=12)
plt.xlabel('True')
plt.ylabel('OOF Pred (M1)')
plt.title('M1: OOF Pred vs True')
plt.savefig(FIGS/'model1_pred_vs_true.png', dpi=160, bbox_inches='tight')
plt.close()

# Diagnostic plot: residuals (true minus predicted) against OOF predictions.
res1 = y - oof_pred1
plt.figure()
plt.scatter(oof_pred1, res1, s=12)
plt.axhline(0, lw=1)
plt.xlabel('OOF Pred (M1)')
plt.ylabel('Residuals')
plt.title('M1: Residuals vs Pred')
plt.savefig(FIGS/'model1_residuals.png', dpi=160, bbox_inches='tight')
plt.close()


## Model 2 — SplineTransformer + Ridge

In [None]:

# Numeric branch: impute -> cubic B-spline basis with 6 knots per feature
# -> standardize the resulting basis columns.
num_pipe_spline = Pipeline([('imputer', num_imputer),
                            ('spline', SplineTransformer(degree=3, n_knots=6, include_bias=False)),
                            ('scaler', StandardScaler())])
# The categorical branch (cat_pipe) is the one defined for Model 1.
preprocess_spline = ColumnTransformer([('num', num_pipe_spline, num_features),
                                       ('cat', cat_pipe, cat_features)])
model2 = Pipeline([('preprocess', preprocess_spline),
                   ('ridge', Ridge(alpha=1.0, random_state=42))])

# Same CV splitter as Model 1, so both OOF RMSE values use identical folds.
oof_pred2 = cross_val_predict(model2, X, y, cv=cv)
rmse2 = float(np.sqrt(mean_squared_error(y, oof_pred2)))
print('Model 2 OOF RMSE:', rmse2)

# Refit on all training data and predict the test set, clipping negatives to zero.
model2.fit(X, y)
pred2 = np.maximum(model2.predict(X_test), 0.0)

# Predictions follow test_df's row order, so take the ids from test_df when it
# has the id column; fall back to the sample submission's ids otherwise.
if id_col in test_df.columns:
    test_ids2 = test_df[id_col].values
else:
    test_ids2 = sample_sub[id_col].values
sub2 = pd.DataFrame({id_col: test_ids2, pred_col: pred2})
sub2.to_csv(OUT/'submission_model2_splineridge.csv', index=False)

# Diagnostic plot: OOF predictions against the true target.
plt.figure()
plt.scatter(y, oof_pred2, s=12)
plt.xlabel('True')
plt.ylabel('OOF Pred (M2)')
plt.title('M2: OOF Pred vs True')
plt.savefig(FIGS/'model2_pred_vs_true.png', dpi=160, bbox_inches='tight')
plt.close()

# Diagnostic plot: residuals (true minus predicted) against OOF predictions.
res2 = y - oof_pred2
plt.figure()
plt.scatter(oof_pred2, res2, s=12)
plt.axhline(0, lw=1)
plt.xlabel('OOF Pred (M2)')
plt.ylabel('Residuals')
plt.title('M2: Residuals vs Pred')
plt.savefig(FIGS/'model2_residuals.png', dpi=160, bbox_inches='tight')
plt.close()

# Side-by-side OOF RMSE for both models, saved and displayed as the cell output.
cv_summary = pd.DataFrame({'Model':['Polynomial+Ridge','Spline+Ridge'],
                           'OOF_RMSE':[rmse1, rmse2]})
cv_summary.to_csv(OUT/'cv_summary.csv', index=False)
cv_summary


## Kaggle Submission Results

![Kaggle Results](figs/kaggle_results.png)

Record your public and private scores here for the submission you selected.