In [None]:
import pandas as pd
from sklearn.preprocessing import PolynomialFeatures

# --- Load the training set -------------------------------------------------
train = pd.read_csv("analysis_data.csv")

# --- Target and raw predictors ---------------------------------------------
y_train = train["monthly_spend"]

# Predictors are everything except the ID, the target itself, and
# annual_income (deliberately left out of this model).
X_train = train.drop(
    columns=["customer_id", "monthly_spend", "annual_income"]
)

# --- Column roles ----------------------------------------------------------
# Categorical columns are listed by hand; every remaining column is treated
# as numeric, in the order it appears in X_train.
cat_cols = [
    "gender", "marital_status", "education_level",
    "region", "employment_status", "card_type",
]
num_cols = [col for col in X_train.columns if col not in cat_cols]

# --- Imputation ------------------------------------------------------------
# Numeric gaps get the per-column training median; categorical gaps become
# their own "missing" level. num_medians is kept so the same values can be
# applied to the scoring data later.
num_medians = X_train[num_cols].median(numeric_only=True)
X_train[num_cols] = X_train[num_cols].fillna(value=num_medians)
X_train[cat_cols] = X_train[cat_cols].fillna(value="missing")

# --- One-hot encoding ------------------------------------------------------
# The first level of each categorical is dropped, then the whole frame is
# cast to float.
X_train = pd.get_dummies(
    X_train, columns=cat_cols, drop_first=True
).astype(float)

# --- Degree-2 polynomial expansion of the numeric columns ------------------
poly = PolynomialFeatures(degree=2, include_bias=False)
X_train_num_poly = pd.DataFrame(
    poly.fit_transform(X_train[num_cols]),
    index=X_train.index,
    columns=poly.get_feature_names_out(num_cols),
)

# --- Final design matrix: polynomial numerics + one-hot categoricals -------
X_train_final = pd.concat(
    [X_train_num_poly, X_train.drop(columns=num_cols)],
    axis="columns",
)

print("X_train_final shape:", X_train_final.shape)

In [None]:
from sklearn.linear_model import Ridge

# Regularization strength chosen by cross-validation
best_alpha = 100.0

# Fit the final ridge regression on the engineered training matrix.
reg_final = Ridge(random_state=5200, alpha=best_alpha)
reg_final.fit(X=X_train_final, y=y_train)

In [None]:
# Prepare features for scoring_data, mirroring the training pipeline step by
# step and reusing everything that was learned on the training set
# (num_cols, cat_cols, num_medians, poly, X_train_final.columns).
scoring = pd.read_csv("scoring_data.csv")

# Use the same base features as training
X_test = scoring.drop(columns=["customer_id", "annual_income"])

# Impute with the training-set medians (not medians recomputed on the scoring
# data) and the same "missing" level for categoricals.
X_test[num_cols] = X_test[num_cols].fillna(num_medians)
X_test[cat_cols] = X_test[cat_cols].fillna("missing")

# One-hot encode WITHOUT drop_first.
# drop_first=True drops the first level *present in this frame*, which is the
# training baseline only if the scoring data contains that level. If it does
# not, a different level is dropped here, the reindex below re-creates that
# column filled with zeros, and every row of that level is silently scored as
# the training baseline. Keeping all levels and letting the reindex select the
# training columns makes the encoding independent of which levels happen to
# appear in the scoring data.
X_test = pd.get_dummies(X_test, columns=cat_cols, drop_first=False)
X_test = X_test.astype(float)

# Polynomial features for numeric columns using the transformer fitted on the
# training data (transform only; never re-fit on scoring data).
X_test_num_poly = poly.transform(X_test[num_cols])
X_test_num_poly = pd.DataFrame(
    X_test_num_poly,
    index=X_test.index,
    columns=poly.get_feature_names_out(num_cols),
)

X_test_final = pd.concat(
    [X_test_num_poly, X_test.drop(columns=num_cols)],
    axis=1
)

# Align columns with the training matrix. This
#   * drops the baseline dummies and any levels not seen in training, and
#   * adds all-zero columns for training levels absent from the scoring data,
# so the column set and order match what the model was fitted on.
X_test_final = X_test_final.reindex(columns=X_train_final.columns, fill_value=0.0)

print("X_test_final shape:", X_test_final.shape)

In [None]:
# Score the aligned scoring matrix with the fitted ridge model.
pred_test = reg_final.predict(X_test_final)

# Submission table: customer IDs from the scoring file next to their
# predicted monthly spend.
submission = scoring[["customer_id"]].assign(monthly_spend=pred_test)

# Write the predictions to disk without the row index.
submission_filename = "submission_ridge_poly_final.csv"
submission.to_csv(submission_filename, index=False)

print("Submission file saved as:", submission_filename)
print(submission.head())