# Solutions: Data Science & AI Engineer Interview Drills
Reference implementations and discussion. Try the exercises yourself first.

## Easy Solutions

### 1) Z-Score Normalization

In [None]:
import numpy as np

def zscore(x: np.ndarray) -> np.ndarray:
    """Standardize ``x`` to zero mean and unit standard deviation.

    The mean and standard deviation are taken over every element of the
    input (no axis argument), using NumPy's default ``std``.

    Args:
        x: Array-like of numbers; converted with ``np.asarray``.

    Returns:
        An array shaped like ``x`` holding ``(x - mean) / std``. When the
        standard deviation is exactly zero (constant input) a float array of
        zeros is returned instead of dividing by zero.
    """
    values = np.asarray(x)
    centre, spread = values.mean(), values.std()
    if spread != 0:
        return (values - centre) / spread
    # Constant input: every element equals the mean, so the score is 0.
    return np.zeros_like(values, dtype=float)

# Sanity check: after normalization the sample mean should be ~0 and the
# sample standard deviation ~1 (both compared with an absolute tolerance).
x = np.random.randn(1000)  # drawn from the global NumPy RNG; no seed is set here
out = zscore(x)
assert np.allclose(out.mean(), 0, atol=1e-2)
assert np.allclose(out.std(), 1, atol=1e-2)

### 2) Missing-Value Imputer

In [None]:
import pandas as pd

def impute_with_median(df: pd.DataFrame, cols: list[str]) -> pd.DataFrame:
    """Fill missing values in the given columns with each column's median.

    Args:
        df: Input frame. It is copied; the caller's frame is not modified.
        cols: Names of the columns to impute.

    Returns:
        A copy of ``df`` in which missing entries of every column in ``cols``
        are replaced by that column's median. A column that is entirely
        missing has a NaN median and is therefore left unchanged.

    Raises:
        ValueError: If a requested column is not numeric.
        KeyError: If a requested column does not exist in ``df``.
    """
    result = df.copy()
    for col in cols:
        dtype = result[col].dtype
        if isinstance(dtype, np.dtype):
            # Plain NumPy dtypes: keep the original numeric test unchanged.
            is_numeric = np.issubdtype(dtype, np.number)
        else:
            # pandas extension dtypes (e.g. nullable Int64/Float64) make
            # np.issubdtype raise TypeError, so ask pandas instead. Booleans
            # are excluded to match the NumPy branch, which rejects bool.
            is_numeric = pd.api.types.is_numeric_dtype(
                dtype
            ) and not pd.api.types.is_bool_dtype(dtype)
        if not is_numeric:
            raise ValueError(f"Column {col} is not numeric")
        result[col] = result[col].fillna(result[col].median())
    return result

# Example: both columns contain one missing value; after imputation the
# frame must contain no missing entries at all.
data = pd.DataFrame({"a": [1, 2, np.nan], "b": [3.0, np.nan, 5.0]})
assert impute_with_median(data, ["a", "b"]).isna().sum().sum() == 0

## Medium Solutions

### 3) Sliding-Window Feature Extraction

In [None]:
def sliding_windows(x: np.ndarray, k: int) -> np.ndarray:
    """Return all length-``k`` windows over the first axis of ``x`` as a view.

    Window ``i`` holds ``x[i], x[i + 1], ..., x[i + k - 1]``. For 1-D input
    the result has shape ``(n - k + 1, k)``; for N-D input the trailing axes
    are carried along, giving ``(n - k + 1, k, *x.shape[1:])``.

    Args:
        x: Array-like with at least one dimension.
        k: Window length, ``1 <= k <= x.shape[0]``.

    Returns:
        A read-only strided view (no data is copied). Copy it with
        ``.copy()`` if a writable array is needed.

    Raises:
        ValueError: If ``x`` is 0-dimensional or ``k`` is out of range.
    """
    x = np.asarray(x)
    if x.ndim == 0:
        raise ValueError("Input must have at least one dimension")
    n = x.shape[0]
    if k <= 0 or k > n:
        raise ValueError("Invalid window size")
    step = x.strides[0]
    # Consecutive windows and consecutive elements inside a window both
    # advance by one item along axis 0, hence the repeated stride. The view
    # aliases overlapping memory, so it is made read-only: writing through
    # it would silently modify several windows at once.
    return np.lib.stride_tricks.as_strided(
        x,
        shape=(n - k + 1, k) + x.shape[1:],
        strides=(step, step) + x.strides[1:],
        writeable=False,
    )

# Check: windows of length 3 over 0..5 advance by one element per row.
x = np.arange(6)
expected = np.array([[0,1,2],[1,2,3],[2,3,4],[3,4,5]])
assert np.array_equal(sliding_windows(x,3), expected)

### 4) Custom AUC

In [None]:
from sklearn.metrics import roc_curve

def binary_auc(y_true: np.ndarray, y_score: np.ndarray) -> float:
    """Compute the area under the ROC curve for binary labels.

    The ROC points come from ``sklearn.metrics.roc_curve`` and the area is
    obtained by trapezoidal integration of TPR over FPR.

    Args:
        y_true: Ground-truth labels; every value must be 0 or 1.
        y_score: Scores for the positive class, same shape as ``y_true``.

    Returns:
        The ROC AUC.

    Raises:
        ValueError: If the shapes differ or ``y_true`` contains values other
            than 0 and 1.
    """
    y_true = np.asarray(y_true)
    y_score = np.asarray(y_score)
    if y_true.shape != y_score.shape:
        raise ValueError("Shapes must match")
    if not set(np.unique(y_true)).issubset({0, 1}):
        raise ValueError("y_true must be binary")
    fpr, tpr, _ = roc_curve(y_true, y_score)
    # NumPy 2.0 renamed ``trapz`` to ``trapezoid`` and deprecated the old
    # name; prefer the new one and fall back on older NumPy releases.
    integrate = getattr(np, "trapezoid", None) or np.trapz
    return integrate(tpr, fpr)

# Check on a small hand-verifiable example: the result is compared with the
# hard-coded expected AUC of 0.75 (no scikit-learn scorer is called here).
y_true = np.array([0, 0, 1, 1])
y_score = np.array([0.1, 0.4, 0.35, 0.8])
assert np.isclose(binary_auc(y_true, y_score), 0.75)

### 5) Logistic Regression from Scratch

In [None]:
def sigmoid(z: np.ndarray) -> np.ndarray:
    """Numerically stable logistic function ``1 / (1 + exp(-z))``.

    Only ``exp`` of non-positive numbers is evaluated, so large-magnitude
    inputs cannot overflow (the naive formula overflows in ``exp(-z)`` for
    very negative ``z``).

    Args:
        z: Scalar or array-like of logits.

    Returns:
        Element-wise sigmoid with the same shape as ``z`` (a NumPy scalar
        for scalar input).
    """
    z = np.asarray(z)
    decay = np.exp(-np.abs(z))  # always in (0, 1]
    # z >= 0: 1 / (1 + e^-z);  z < 0: e^z / (1 + e^z) -- the same value,
    # written so that the exponent is never positive.
    out = np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    return out[()]  # unwrap 0-d results to a scalar; arrays pass through

def _add_bias(X: np.ndarray) -> np.ndarray:
    """Return ``X`` with a column of ones prepended (the intercept feature)."""
    intercept = np.ones((X.shape[0], 1))
    return np.hstack((intercept, X))

def predict_proba(X: np.ndarray, w: np.ndarray) -> np.ndarray:
    """Return the predicted positive-class probability for each row of ``X``.

    ``w`` holds the intercept in ``w[0]`` followed by one weight per feature;
    the intercept column is added to ``X`` internally.
    """
    logits = _add_bias(X) @ w
    return sigmoid(logits)

def loss(X: np.ndarray, y: np.ndarray, w: np.ndarray, lambda_: float = 0.0) -> float:
    """Mean binary cross-entropy with an optional L2 penalty.

    The per-sample negative log-likelihood is evaluated directly from the
    logits as ``log(1 + exp(z)) - y * z``. This equals
    ``-[y*log(p) + (1-y)*log(1-p)]`` with ``p = sigmoid(z)``, but it neither
    overflows nor needs an epsilon clip that would cap the loss for confident
    wrong predictions.

    Args:
        X: Feature matrix without an intercept column.
        y: Targets in ``[0, 1]``, one per row of ``X``.
        w: Weights with the intercept in ``w[0]``.
        lambda_: L2 strength; the intercept ``w[0]`` is not penalized.

    Returns:
        ``mean(nll) + 0.5 * lambda_ * ||w[1:]||^2``.
    """
    logits = _add_bias(X) @ w
    # logaddexp(0, z) is a stable log(1 + exp(z)).
    nll = np.logaddexp(0.0, logits) - y * logits
    penalty = 0.5 * lambda_ * np.sum(w[1:] ** 2)
    return nll.mean() + penalty

def fit_logreg(
    X: np.ndarray,
    y: np.ndarray,
    lr: float = 0.1,
    epochs: int = 1000,
    lambda_: float = 0.0,
    tol: float = 1e-6,
) -> np.ndarray:
    """Fit logistic regression with full-batch gradient descent.

    Args:
        X: Feature matrix without an intercept column.
        y: Binary targets, one per row of ``X``.
        lr: Gradient-descent step size.
        epochs: Maximum number of full-batch updates.
        lambda_: L2 strength applied to every weight except the intercept.
        tol: Training stops early once the loss changes by less than this
            between two consecutive updates.

    Returns:
        The weight vector, intercept first.
    """
    design = _add_bias(X)
    n_samples = len(y)
    w = np.zeros(design.shape[1])
    last_loss = np.inf
    for _epoch in range(epochs):
        residual = sigmoid(design @ w) - y
        grad = design.T @ residual / n_samples
        # Regularize the feature weights only; the intercept stays free.
        grad[1:] += lambda_ * w[1:]
        w -= lr * grad

        current_loss = loss(X, y, w, lambda_)
        converged = abs(last_loss - current_loss) < tol
        last_loss = current_loss
        if converged:
            break
    return w

# Quick sanity check
from sklearn.datasets import make_classification
# Synthetic 4-feature binary problem with a fixed random_state.
X, y = make_classification(n_samples=200, n_features=4, random_state=42)
w = fit_logreg(X, y, lr=0.1, epochs=5000, lambda_=0.1)
# Threshold the probabilities at 0.5; accuracy is measured on the same data
# the model was fitted on.
preds = predict_proba(X, w) >= 0.5
assert (preds == y).mean() > 0.8

## Expert Solutions

### 6) Minimal MLP in PyTorch

In [None]:
import torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader

class MLP(nn.Module):
    """One-hidden-layer perceptron that emits a single logit per sample.

    Layout: Linear(in_dim -> hidden_dim) -> ReLU -> Dropout(p_drop)
    -> Linear(hidden_dim -> 1).
    """

    def __init__(self, in_dim: int, hidden_dim: int = 32, p_drop: float = 0.2):
        super().__init__()
        layers = [
            nn.Linear(in_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(p_drop),
            nn.Linear(hidden_dim, 1),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return the logits for ``x`` with the trailing size-1 axis removed."""
        logits = self.net(x)
        return logits.squeeze(-1)

def set_seed(seed: int = 0):
    """Seed NumPy's global RNG and PyTorch's RNG with the same ``seed``."""
    np.random.seed(seed)
    torch.manual_seed(seed)

def train_epoch(model, loader, optimizer, criterion, device: str = "cpu") -> float:
    """Run one optimization pass over ``loader`` and return the mean loss.

    The model is put in training mode. Each batch loss is weighted by its
    batch size, so the returned value is the per-sample average even when
    batches differ in size.
    """
    model.train()
    running_loss = 0.0
    seen = 0
    for inputs, targets in loader:
        inputs = inputs.to(device)
        targets = targets.to(device)
        batch_size = inputs.size(0)

        optimizer.zero_grad()
        batch_loss = criterion(model(inputs), targets)
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item() * batch_size
        seen += batch_size
    return running_loss / seen

def evaluate(model, loader, criterion, device: str = "cpu") -> tuple[float, float]:
    """Return ``(mean loss, accuracy)`` of ``model`` over ``loader``.

    The model is switched to eval mode and gradients are disabled. A sample
    is predicted positive when ``sigmoid(logit) >= 0.5``; both metrics are
    per-sample averages over everything the loader yields.
    """
    model.eval()
    loss_sum = 0.0
    n_correct = 0
    n_seen = 0
    with torch.no_grad():
        for inputs, targets in loader:
            inputs, targets = inputs.to(device), targets.to(device)
            batch_size = inputs.size(0)
            logits = model(inputs)
            batch_loss = criterion(logits, targets)
            hard_preds = (torch.sigmoid(logits) >= 0.5).float()

            loss_sum += batch_loss.item() * batch_size
            n_correct += (hard_preds == targets).sum().item()
            n_seen += batch_size
    return loss_sum / n_seen, n_correct / n_seen

# Smoke test: a few epochs on a synthetic, nearly linearly separable task.
set_seed(42)
X = torch.randn(200, 10)
# Label is 1 when the (noisy) feature sum is positive.
y = (X.sum(dim=1) + 0.2 * torch.randn(200) > 0).float()
loader = DataLoader(TensorDataset(X, y), batch_size=32, shuffle=True)
model = MLP(in_dim=10)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.BCEWithLogitsLoss()  # the model outputs raw logits
for epoch in range(3):
    train_loss = train_epoch(model, loader, optimizer, criterion)
# NOTE: evaluation reuses the training loader, so this is training accuracy,
# not a held-out estimate.
val_loss, acc = evaluate(model, loader, criterion)
assert acc > 0.7

### 7) Transformer Block Forward Pass

In [None]:
class TransformerBlock(nn.Module):
    """Pre-norm Transformer encoder block (self-attention + feed-forward).

    Each sub-layer is applied as ``x + dropout(sublayer(layer_norm(x)))``.
    Inputs and outputs are batch-first: ``(batch, seq_len, d_model)``.

    Args:
        d_model: Embedding width.
        nhead: Number of attention heads.
        dim_ff: Hidden width of the feed-forward network.
        p_drop: Dropout probability used inside attention, inside the
            feed-forward network and on both residual branches.
    """

    def __init__(self, d_model: int, nhead: int, dim_ff: int, p_drop: float = 0.1):
        super().__init__()
        self.attn = nn.MultiheadAttention(d_model, nhead, dropout=p_drop, batch_first=True)
        self.ffn = nn.Sequential(
            nn.Linear(d_model, dim_ff),
            nn.GELU(),
            nn.Dropout(p_drop),
            nn.Linear(dim_ff, d_model),
        )
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(p_drop)

    def forward(
        self,
        x: torch.Tensor,
        attn_mask: "torch.Tensor | None" = None,
        key_padding_mask: "torch.Tensor | None" = None,
    ) -> torch.Tensor:
        """Apply the block to ``x`` and return a tensor of the same shape.

        Args:
            x: Input of shape ``(batch, seq_len, d_model)``.
            attn_mask: Optional attention mask forwarded unchanged to
                ``nn.MultiheadAttention`` (e.g. a boolean causal mask where
                ``True`` marks positions that may not be attended to).
            key_padding_mask: Optional ``(batch, seq_len)`` mask forwarded
                unchanged to ``nn.MultiheadAttention``; ``True`` marks
                padding keys to ignore.

        With both masks left at ``None`` the block attends over the full
        sequence.
        """
        # Pre-norm + self-attention, added back on the residual stream.
        normed = self.norm1(x)
        attn_out, _ = self.attn(
            normed,
            normed,
            normed,
            key_padding_mask=key_padding_mask,
            need_weights=False,  # the attention weights are never used
            attn_mask=attn_mask,
        )
        x = x + self.dropout(attn_out)
        # Pre-norm + position-wise feed-forward, second residual.
        x = x + self.dropout(self.ffn(self.norm2(x)))
        return x

# Check: the block must preserve the (batch, seq_len, d_model) input shape.
blk = TransformerBlock(d_model=32, nhead=4, dim_ff=64)
dummy = torch.randn(2, 5, 32)
out = blk(dummy)
assert out.shape == dummy.shape

### 8) Offline Metrics vs. Online Metrics (Design)
- **Offline:**
  - `AUC/ROC` for ranking quality; insensitive to threshold.
  - `NDCG@k` to emphasize top-k relevance (aligns with recommender objectives).
  - `Calibration error` (ECE/Brier) to ensure probabilities are trustworthy for downstream decisions.
- **Online (A/B):**
  - `CTR`/`conversion` lift and `add-to-cart`/`play` rate as primary success metrics.
  - `Latency`/`p95 response` and `error rate` as guardrails; also watch `bounce rate` or `session length` for negative UX shifts.
- **Canary rollout:**
  - Start with a small traffic slice (e.g., 1–5%), isolated by region/segment; compare to control in real time.
  - Automatic rollback if guardrails breached; gradually ramp traffic with monitoring windows.
  - Keep feature flags to disable rapidly; log inputs/outputs for postmortem.

### 9) Feature Store Consistency (Design)
- **Point-in-time correctness:** enforce event timestamps, use as-of joins in offline pipelines to avoid future leakage; freeze lookup times for training sets.
- **Backfills/versioning:** immutably store feature values with data/version stamps; backfill via new versions rather than overwriting; track lineage in metadata.
- **Validation/monitoring:** schema checks (types, ranges), null/volume drift alerts, training/serving distribution comparisons for key features; canary new feature versions before full rollout.
- **Operational playbooks:** feature flagging to disable problematic features, rollback to prior version, alerting channels, and runbooks for re-materialization; keep golden datasets for quick verification.