# MLOps exercises — Solutions for Tasks 1–6 (Ames + Drift)


**Workflow overview**  
- Use `AmesHousing.csv` for training.  
- Mirror Step 5 from `MLOps.ipynb`: one-hot dummies for **Bldg Type** → `BType_*` and **Neighborhood** → `Nbh_*` with `drop_first=True`, `dtype=int`.  
- Save/Load a **best model** + **feature levels**.  
- Evaluate new datasets (MAE) and run simple drift checks.


## Task 0 — Load training data & define feature set

In [None]:

import pandas as pd
# Location of the Ames training table.
TRAIN_CSV = '/mnt/data/AmesHousing.csv'

# Column roles shared by the cells below: the prediction target, the two
# categorical inputs that get one-hot encoded, and the numeric inputs.
target_col = "SalePrice"
cat_features = ["Bldg Type", "Neighborhood"]
num_features = [
    "Lot Area",
    "Overall Cond",
    "Year Built",
    "Gr Liv Area",
    "Mo Sold",
    "Yr Sold",
]

# Read the full table, then keep an independent copy restricted to the
# modelling columns (numeric, categorical, target - in that order).
ames_raw = pd.read_csv(TRAIN_CSV)
cols_needed = [*num_features, *cat_features, target_col]
ames = ames_raw.loc[:, cols_needed].copy()
ames.head()


## Task 1 — Preprocess new Ames data like Step 5 (`get_dummies` + drop originals)

In [None]:

import pandas as pd

def preprocess_like_step5(df: pd.DataFrame, levels: list | None = None):
    df = df.copy()
    out = df.join(pd.get_dummies(df["Bldg Type"], drop_first=True, dtype="int", prefix="BType"))
    out = out.join(pd.get_dummies(out["Neighborhood"], drop_first=True, dtype="int", prefix="Nbh"))
    out = out.drop(columns=["Bldg Type", "Neighborhood"])
    # Separate features (drop target if present)
    X = out.drop(columns=[c for c in ["SalePrice"] if c in out.columns])
    # Align to training feature set
    if levels is None:
        levels = list(X.columns)
    else:
        for col in levels:
            if col not in X.columns:
                X[col] = 0
        X = X[levels]
    return X, levels

# Run the preprocessing on the training features (target removed) without
# reference levels, so the returned column list describes the encoded training
# layout. NOTE(review): the evaluation cells below align to `levels_saved`
# rather than `feature_levels` - confirm which one is meant to be canonical.
X_levels_fit, feature_levels = preprocess_like_step5(ames.drop(columns=[target_col]))
# Cell output: number of feature columns after one-hot encoding.
len(feature_levels)


## Model artifacts — Load the saved model and feature levels if present (e.g., produced by `MLOps.ipynb`), else train & save a baseline

In [None]:

import os, json, joblib
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

# Directory that holds the persisted model and its feature-column list.
ART_DIR = r"/mnt/data/ames_model_artifacts"
# Create the directory if needed; exist_ok avoids an error on re-runs.
os.makedirs(ART_DIR, exist_ok=True)

# File locations of the two artifacts: the serialized estimator and the JSON
# list of feature columns it was trained on.
model_path = os.path.join(ART_DIR, "best_model.joblib")
levels_path = os.path.join(ART_DIR, "feature_levels.json")

def train_and_save_baseline(ames_df):
    """Fit a baseline random forest on the Ames frame and persist it.

    The frame is encoded with ``preprocess_like_step5`` in fit mode, 80% of
    the rows are used to train a 300-tree ``RandomForestRegressor``, and the
    estimator plus the list of feature columns are written to ``model_path``
    and ``levels_path``.

    Args:
        ames_df: Training frame including the ``SalePrice`` target column.

    Returns:
        Tuple ``(model, feature_columns)`` of the fitted estimator and the
        list of column names it was trained on.
    """
    features, _ = preprocess_like_step5(ames_df.drop(columns=['SalePrice']))
    feature_names = list(features.columns)
    target = ames_df['SalePrice'].values

    # The held-out 20% is split off but not used here; only the training part
    # of the split is fitted.
    X_fit, _, y_fit, _ = train_test_split(
        features, target, test_size=0.2, random_state=1742
    )
    forest = RandomForestRegressor(n_estimators=300, random_state=1742, n_jobs=-1)
    forest.fit(X_fit, y_fit)

    # Persist the estimator together with the column layout it expects.
    joblib.dump(forest, model_path)
    with open(levels_path, "w") as fh:
        json.dump(feature_names, fh)
    return forest, list(feature_names)

# Reuse persisted artifacts when both files exist; otherwise train a baseline
# (which also writes both files).
model = None
levels_saved = None
if not os.path.exists(model_path) or not os.path.exists(levels_path):
    model, levels_saved = train_and_save_baseline(ames)
else:
    model = joblib.load(model_path)
    with open(levels_path) as f:
        levels_saved = json.load(f)

# Cell output: estimator class, number of feature columns, artifact location.
type(model).__name__, len(levels_saved), model_path


## Task 2 — Function to evaluate a model on NEW Ames data (MAE)

In [None]:

from sklearn.metrics import mean_absolute_error

def evaluate_new_data(csv_path: str, model, levels: list):
    """Score ``model`` on a CSV of new Ames data and report the MAE.

    Args:
        csv_path: Path of a CSV file that includes a ``SalePrice`` column.
        model: Fitted estimator exposing ``predict``.
        levels: Feature columns the model was trained on; the new data is
            aligned to these by ``preprocess_like_step5``.

    Returns:
        Tuple ``(mae, predictions)`` where ``predictions`` is a DataFrame
        with the columns ``y_true`` and ``y_pred``.

    Raises:
        AssertionError: If the file has no ``SalePrice`` column.
    """
    frame = pd.read_csv(csv_path)
    assert "SalePrice" in frame.columns, "New data must include SalePrice to compute MAE."

    # Encode and align the features, then compare predictions with the truth.
    features, _ = preprocess_like_step5(frame, levels=levels)
    actual = frame["SalePrice"].values
    predicted = model.predict(features)
    error = mean_absolute_error(actual, predicted)

    comparison = pd.DataFrame({"y_true": actual, "y_pred": predicted})
    return error, comparison

# Cell output confirming the definition above ran.
"Function ready ✅"


## Task 3 — Test on `NewAmesData1.csv` with the best model

In [None]:

# Score the model on the first new batch, aligning its columns to the saved
# feature levels.
mae1, preds1 = evaluate_new_data("/mnt/data/NewAmesData1.csv", model, levels_saved)
print("MAE on NewAmesData1.csv:", round(mae1, 2))
# Cell output: first rows of actual vs. predicted sale prices.
preds1.head()


## Task 4 — Test on `NewAmesData2.csv` (signs of drift are examined in Tasks 5–6)

In [None]:

# Score the same model on the second new batch, aligning its columns to the
# saved feature levels.
mae2, preds2 = evaluate_new_data("/mnt/data/NewAmesData2.csv", model, levels_saved)
print("MAE on NewAmesData2.csv:", round(mae2, 2))
# Cell output: first rows of actual vs. predicted sale prices.
preds2.head()


## Tasks 5–6 — Data drift checks (numeric: KS; categorical: Chi-squared)

In [None]:

import numpy as np
from scipy.stats import ks_2samp, chi2_contingency

def drift_report(train_df: pd.DataFrame, new_df: pd.DataFrame, num_feats: list, cat_feats: list):
    """Compare feature distributions between training data and a new batch.

    Numeric features are compared with a two-sample Kolmogorov-Smirnov test;
    categorical features with a chi-squared test on the table of per-level
    counts from both frames. A feature is flagged as drifting when its
    p-value is below 0.05.

    Args:
        train_df: Reference data containing every listed feature.
        new_df: New data to compare against ``train_df``.
        num_feats: Names of numeric columns to test.
        cat_feats: Names of categorical columns to test.

    Returns:
        DataFrame with one row per requested feature and the columns
        ``feature``, ``type``, ``test``, ``stat``, ``p_value`` and ``drift``,
        sorted with drifting features first and then by ascending p-value.
        A feature that cannot be tested (a numeric column with no non-missing
        values on either side, or a count table the chi-squared test rejects)
        gets NaN for ``stat`` and ``p_value`` and ``drift`` set to False.
        With no features requested the frame is empty but keeps its columns.
    """
    report_columns = ["feature", "type", "test", "stat", "p_value", "drift"]
    alpha = 0.05

    def _row(feature, kind, test, stat=np.nan, p_value=np.nan):
        # NaN < alpha is False, so untestable features are never flagged.
        return {
            "feature": feature,
            "type": kind,
            "test": test,
            "stat": stat,
            "p_value": p_value,
            "drift": bool(p_value < alpha),
        }

    rows = []

    # Numeric KS
    for col in num_feats:
        a = train_df[col].dropna()
        b = new_df[col].dropna()
        if a.empty or b.empty:
            # Keep the feature visible in the report instead of dropping it.
            rows.append(_row(col, "numeric", "KS"))
            continue
        stat, p = ks_2samp(a, b)
        rows.append(_row(col, "numeric", "KS", stat, p))

    # Categorical Chi2
    for col in cat_feats:
        ct_train = pd.crosstab(train_df[col], "train")
        ct_new = pd.crosstab(new_df[col], "new")
        # Outer join so levels seen on only one side count as 0 on the other.
        both = ct_train.join(ct_new, how="outer").fillna(0)
        try:
            chi2, p, _dof, _expected = chi2_contingency(both.values)
        except ValueError:
            # Raised for tables the test cannot handle (e.g. no observations).
            rows.append(_row(col, "categorical", "Chi2"))
        else:
            rows.append(_row(col, "categorical", "Chi2", chi2, p))

    # Explicit columns keep the sort valid when no rows were produced.
    rep = pd.DataFrame(rows, columns=report_columns)
    return rep.sort_values(["drift", "p_value"], ascending=[False, True])

# Reference data for the drift tests: the modelling columns of the training
# frame (numeric, categorical, target), as an independent copy.
train_slice = ames.loc[:, [*num_features, *cat_features, target_col]].copy()

# New batches to compare against the training data.
new2 = pd.read_csv("/mnt/data/NewAmesData2.csv")
new4 = pd.read_csv("/mnt/data/NewAmesData4.csv")

# One drift report per batch over the same numeric and categorical features.
report2 = drift_report(train_slice, new2, num_features, cat_features)
report4 = drift_report(train_slice, new4, num_features, cat_features)

# Cell output: up to the first 20 rows of the NewAmesData2 report.
report2.head(20)


### Task 5 — Which variables drift in `NewAmesData2.csv`?

In [None]:

# Cell output: the NewAmesData2.csv drift report with drifting features first,
# then by ascending p-value.
report2.sort_values(["drift","p_value"], ascending=[False, True])


### Task 6 — Which variables drift in `NewAmesData4.csv`?

In [None]:

# Cell output: the NewAmesData4.csv drift report with drifting features first,
# then by ascending p-value.
report4.sort_values(["drift","p_value"], ascending=[False, True])
