In [2]:
import numpy as np
import pandas as pd
import math

from numpy.linalg import eig
from scipy import optimize
from scipy.special import gammaln  # log(y!) safely


In [3]:
def as_numpy(x):
    """Return ``x`` as a NumPy array via ``np.asarray``."""
    arr = np.asarray(x)
    return arr

def clip01(a):
    """Clamp every entry of ``a`` to the closed interval [0, 1]."""
    lower, upper = 0.0, 1.0
    return np.clip(a, lower, upper)

# 1) Markov Chain

## 1.1 Estimate Transition Matrix from a Path

In [None]:
def estimate_transition_matrix(states, n_states=None):
    """
    Estimate a transition matrix from one observed path.

    states: 1D array/list like [x0,x1,...,xT] of non-negative integer labels
    n_states: number of states; inferred as max(states)+1 when None
              (0 for an empty path)
    returns P_hat shape (n_states,n_states); rows of states that were never
            left are all zeros

    Raises ValueError if a negative label is present (it would otherwise
    index from the end of the count matrix and corrupt the estimate).
    """
    s = np.asarray(states, dtype=int)
    if s.size and s.min() < 0:
        raise ValueError("states must contain non-negative integer labels")
    if n_states is None:
        # An empty path carries no information about the state space.
        n_states = int(s.max()) + 1 if s.size else 0

    counts = np.zeros((n_states, n_states), dtype=float)
    # Unbuffered add: repeated (from, to) pairs each contribute one count.
    np.add.at(counts, (s[:-1], s[1:]), 1.0)

    row_sums = counts.sum(axis=1, keepdims=True)
    # Only divide rows that were actually visited; others stay at zero.
    P_hat = np.divide(counts, row_sums, out=np.zeros_like(counts), where=row_sums > 0)
    return P_hat


### Check Transition Matrix

In [None]:
def is_transition_matrix(P, tol=1e-10):
    """
    Return True if P is a valid (row-stochastic) transition matrix.

    P must be 2-D and square, have no entry below -tol, and every row must
    sum to 1 (checked with an absolute tolerance of 1e-8).
    """
    P = np.asarray(P, dtype=float)
    # A transition matrix maps states to the same set of states.
    if P.ndim != 2 or P.shape[0] != P.shape[1]:
        return False
    if (P < -tol).any():
        return False
    rs = P.sum(axis=1)
    return bool(np.allclose(rs, 1.0, atol=1e-8))

## 1.2 Stationary distribution π

### Method A (robust): Eigenvector of $P^T$ at eigenvalue 1

In [None]:
def stationary_distribution(P):
    """
    Stationary distribution from the eigenvector of P^T whose eigenvalue is
    closest to 1. The vector is normalized to sum to 1, negative entries are
    zeroed, and it is normalized again.
    """
    mat = np.asarray(P, dtype=float)
    eigvals, eigvecs = eig(mat.T)
    closest = np.argmin(np.abs(eigvals - 1))
    vec = np.real(eigvecs[:, closest])
    # First normalization also fixes the arbitrary sign of the eigenvector.
    vec = vec / vec.sum()
    nonneg = np.maximum(vec, 0)
    return nonneg / nonneg.sum()

### Method B (Linear System): solve $(P^T - I)\pi = 0$ together with the constraint $\sum_i \pi_i = 1$

In [4]:
def stationary_distribution_lstsq(P):
    """
    Stationary distribution as the least-squares solution of the stacked
    system [(P^T - I); 1^T] pi = [0; 1], with negatives zeroed and the
    result renormalized to sum to 1.
    """
    mat = np.asarray(P, dtype=float)
    size = mat.shape[0]
    # Balance equations on top, the normalization row underneath.
    system = np.vstack([mat.T - np.eye(size), np.ones(size)])
    rhs = np.zeros(size + 1)
    rhs[-1] = 1
    solution = np.linalg.lstsq(system, rhs, rcond=None)[0]
    nonneg = np.maximum(solution, 0)
    return nonneg / nonneg.sum()


## 1.3 n-step transition probability

In [5]:
def n_step_prob(P, i, j, k):
    """Return entry (i, j) of P raised to the k-th power, as a float."""
    mat = np.asarray(P, float)
    powered = np.linalg.matrix_power(mat, k)
    return float(powered[i, j])


## 1.4 Simulate the chain

In [6]:
def simulate_chain(P, x0, T, rng=None):
    """
    Simulate T transitions starting from x0.

    Each step draws the next state with ``rng.choice`` using the current
    state's row of P. Returns an integer array of length T+1 (x0 first).
    A fresh ``np.random.default_rng()`` is used when rng is None.
    """
    if rng is None:
        rng = np.random.default_rng()
    mat = np.asarray(P, float)
    n_states = mat.shape[0]
    current = int(x0)
    visited = [current]
    for _ in range(T):
        current = int(rng.choice(n_states, p=mat[current]))
        visited.append(current)
    return np.array(visited, dtype=int)


## 1.5 Irreducible? (graph reachability)

A chain is irreducible if every state can reach every other state with positive probability.

In [7]:
def is_irreducible(P):
    """
    Return True when every state can reach every state along edges with
    P[u, v] > 0 (depth-first search from each state in turn).
    """
    mat = np.asarray(P, float)
    n_states = mat.shape[0]
    has_edge = mat > 0

    for source in range(n_states):
        visited = np.zeros(n_states, dtype=bool)
        visited[source] = True
        frontier = [source]
        while frontier:
            node = frontier.pop()
            for nxt in np.flatnonzero(has_edge[node]):
                if not visited[nxt]:
                    visited[nxt] = True
                    frontier.append(int(nxt))
        # One unreachable state from any source is enough to fail.
        if not visited.all():
            return False
    return True


## 1.6 Period / aperiodic

The period of state $i$ is the gcd of its possible return times $\{n \ge 1 : (P^n)_{ii} > 0\}$. If the gcd is 1, the state is aperiodic.

In [8]:
def period_of_state(P, i, max_power=200):
    """
    Period of state i: gcd of all k in 1..max_power for which a return
    i -> i in exactly k steps is possible.

    Possibility is decided on the graph of positive entries (P > 0), not by
    thresholding the numeric value of (P^k)[i, i]; a return whose probability
    is tiny (for example 1e-15) is therefore still detected.

    Returns an int, or None when no return occurs within max_power steps.
    """
    P = np.asarray(P, float)
    n = P.shape[0]
    adj = (P > 0).astype(np.int64)

    # reach[v] == 1 iff v is reachable from i in exactly k steps.
    reach = np.zeros(n, dtype=np.int64)
    reach[i] = 1

    g = 0
    for k in range(1, max_power + 1):
        reach = ((reach @ adj) > 0).astype(np.int64)
        if reach[i]:
            g = math.gcd(g, k)  # gcd(0, k) == k handles the first return
            if g == 1:
                break  # the gcd cannot drop below 1
    return g if g != 0 else None  # None means never returns within max_power

def is_aperiodic(P, max_power=200):
    """
    Return True unless some state has a period other than 1.

    States for which ``period_of_state`` returns None (no return within
    max_power steps) are ignored.
    """
    mat = np.asarray(P, float)
    periods = (
        period_of_state(mat, state, max_power=max_power)
        for state in range(mat.shape[0])
    )
    # Lazy generator + all(): stops at the first periodic state.
    return all(d is None or d == 1 for d in periods)


## 1.7 Expected hitting time to target state/set (the “linear equations” part)

Let target set be A. For $i \in A: m_i = 0$. Else:

\begin{equation*}
m_i = 1 + \sum_{j} P_{ij} m_j
\end{equation*}

In [10]:
def expected_hitting_times(P, target_states):
    """
    Expected number of steps to first reach the target set from each state.

    Solves m_i = 0 for i in the target set and
    m_i = 1 + sum_j P[i, j] m_j otherwise.

    P: square transition matrix
    target_states: a single state or an iterable of states
    returns: array m of length n

    The linear system is singular when some state cannot reach the target
    set; ``np.linalg.solve`` then raises ``LinAlgError``.
    """
    P = np.asarray(P, float)
    n = P.shape[0]
    # Accept a bare state as well as a collection, like hitting_time_sim.
    if np.isscalar(target_states):
        target = {target_states}
    else:
        target = set(target_states)
    is_target = np.array([i in target for i in range(n)], dtype=bool)

    identity = np.eye(n)
    A = identity - P                    # rows for non-target states: (I - P)
    A[is_target] = identity[is_target]  # rows for target states: m_i = 0
    b = (~is_target).astype(float)

    m = np.linalg.solve(A, b)
    return m


### Simulation Estimate (Backup Method)

In [11]:


def hitting_time_sim(P, start, target, n_sims=2000, max_steps=10000, rng=None):
    """
    Simulate n_sims runs from ``start`` and record the step at which each
    first lands in ``target`` (a state or an iterable of states).

    A run that has not reached the target after max_steps steps is recorded
    as ``np.inf``. Returns a float array of length n_sims.
    """
    if rng is None:
        rng = np.random.default_rng()
    mat = np.asarray(P, float)
    n_states = mat.shape[0]
    goal = set(target if hasattr(target, "__iter__") else [target])

    def _single_run():
        state, steps = int(start), 0
        while steps < max_steps and state not in goal:
            state = rng.choice(n_states, p=mat[state])
            steps += 1
        return steps if state in goal else np.inf

    return np.array([_single_run() for _ in range(n_sims)], float)


## 1.8 “First time hit D at time t” (exact distribution)

If target is state d. Let Q be transitions among non-target states, and r be probs into d. Then:

\begin{equation*}
\mathbb{P}(T = t) = e_s^T Q^{t-1}r
\end{equation*}


\begin{equation*}
\mathbb{P}(T = \infty) = 1- \sum_{t \ge 1} \mathbb{P}(T = t)
\end{equation*}

In [12]:
def first_hit_probabilities(P, start, target_state, t_list):
    """
    Exact distribution of the first time T the chain hits ``target_state``.

    For each t in t_list returns P(T = t) = e_s^T Q^(t-1) r, where Q holds
    the transitions among non-target states and r the one-step
    probabilities into the target. The key "inf" holds P(T = infinity).

    If start == target_state, T is 0 with probability 1.

    P(T = infinity) is computed only over the non-target states that can
    reach the target at all. On that subset I - Q is invertible, so chains
    with other absorbing states (where the chance of never hitting is
    positive) are handled instead of producing a singular system.
    """
    P = np.asarray(P, float)
    n = P.shape[0]
    d = int(target_state)
    s = int(start)

    if s == d:
        out = {t: (1.0 if t == 0 else 0.0) for t in t_list}
        out["inf"] = 0.0
        return out

    non = [i for i in range(n) if i != d]
    idx = {state: pos for pos, state in enumerate(non)}

    Q = P[np.ix_(non, non)]
    r = P[non, d]

    e = np.zeros(len(non))
    e[idx[s]] = 1.0

    out = {}
    for t in t_list:
        if t <= 0:
            out[t] = 0.0
        else:
            Qtm1 = np.linalg.matrix_power(Q, t - 1)
            out[t] = float(e @ Qtm1 @ r)

    # Backward search from d over positive entries: which states can ever
    # reach the target?
    can_reach = np.zeros(n, dtype=bool)
    can_reach[d] = True
    stack = [d]
    while stack:
        v = stack.pop()
        for u in np.flatnonzero(P[:, v] > 0):
            if not can_reach[u]:
                can_reach[u] = True
                stack.append(int(u))

    if not can_reach[s]:
        hit_ever = 0.0
    else:
        # Hitting probabilities h solve h = Q h + r on the reaching states
        # (h is 0 on states that cannot reach d).
        keep = [idx[i] for i in non if can_reach[i]]
        Q_keep = Q[np.ix_(keep, keep)]
        h = np.linalg.solve(np.eye(len(keep)) - Q_keep, r[keep])
        hit_ever = float(h[keep.index(idx[s])])

    out["inf"] = 1.0 - hit_ever
    return out


# 2) Random Variables & Sampling 

### Inversion, Rejection, and LCG

## 2.1 Inversion sampling (when CDF is invertible)

Example Problem: 

\begin{equation*}
F(x) = \frac{e^{x^2} - 1}{e-1}, 0 \lt x \lt 1
\end{equation*}

Solve $u = {(e^{x^2} - 1)}/{(e-1)}$:

\begin{equation*}
x = \sqrt{\ln(1 + u(e-1))}
\end{equation*}

In [13]:
def sample_inversion_e_x2(n, rng=None):
    """
    Draw n samples by inversion: x = sqrt(log(1 + u*(e - 1))) with
    u ~ Uniform(0, 1). Uses a fresh default_rng() when rng is None.
    """
    if rng is None:
        rng = np.random.default_rng()
    uniforms = rng.uniform(0, 1, size=n)
    inside_log = 1 + uniforms * (math.e - 1)
    return np.sqrt(np.log(inside_log))


## 2.2 Rejection sampling (general recipe)

1. Identify target density $f(x)$ on support
2. Choose Proposal $g(x)$ that's easy to sample
3. Find $M \ge \sup_x f(x)/g(x)$
4. Accept with Probability $f(x)/(M g(x))$

### For the same distribution, using Uniform(0,1) proposal

Density:

\begin{equation*}
f(x) = \frac{2xe^{x^2}}{e-1}, 0 \lt x \lt 1
\end{equation*}

Maximum at $x = 1  \Longrightarrow M = f(1) = 2e/(e-1)$

In [None]:
def sample_rejection_e_x2(n, rng=None):
    """
    Draw n samples from f(x) = 2x e^{x^2} / (e - 1) on (0, 1) by rejection
    from a Uniform(0, 1) proposal with envelope constant M = 2e / (e - 1).

    Each attempt draws the candidate first and the acceptance uniform
    second. Uses a fresh default_rng() when rng is None.
    """
    if rng is None:
        rng = np.random.default_rng()
    envelope = 2*math.e/(math.e - 1)

    accepted = []
    while len(accepted) < n:
        candidate = rng.uniform(0, 1)
        threshold = rng.uniform(0, 1)
        density = (2*candidate*math.exp(candidate*candidate))/(math.e - 1)
        if threshold <= density / envelope:
            accepted.append(candidate)
    return np.array(accepted)


## 2.3 Monte Carlo integration (+ Hoeffding CI)

If you sample $X \sim f$, then:

\begin{equation*}
\int h(x) f(x) \, dx = \mathbb{E}[h(X)] \approx \frac{1}{n} \sum_{i=1}^{n} h(X_i)
\end{equation*}

#### Hoeffding 95% CI for bounded $h(X) \in [a,b]$

\begin{equation*}
\epsilon = (b-a) \sqrt{\frac{\ln(2/\delta)}{2n}}, \quad \delta = 0.05
\end{equation*}




In [None]:
def hoeffding_ci(samples_of_h, a, b, delta=0.05):
    """
    Hoeffding interval for the mean of values bounded in [a, b].

    Returns ((lower, upper), mean, eps) with
    eps = (b - a) * sqrt(log(2 / delta) / (2 n)).
    """
    values = np.asarray(samples_of_h, float)
    count = len(values)
    center = float(values.mean())
    half_width = (b - a) * math.sqrt(math.log(2/delta) / (2*count))
    interval = (center - half_width, center + half_width)
    return interval, center, half_width


Example: $\int_0^1 \sin(x) f(x) \, dx$

Since $\sin(x) \in [0,\sin(1)] \subset [0,1]$ for $x \in (0,1)$, a safe bound is $[0,1]$.

In [16]:
# Monte Carlo estimate of E[sin(X)] with a Hoeffding 95% interval.
# A fixed seed keeps the numbers reproducible under Restart & Run All.
x = sample_inversion_e_x2(100000, rng=np.random.default_rng(0))
h = np.sin(x)
ci, est, eps = hoeffding_ci(h, a=0.0, b=1.0, delta=0.05)

## 2.4 LCG → Uniform(0,1) → Accept/Reject target density

### LCG Template

In [17]:
def lcg(size, seed=1, a=1103515245, c=12345, m=2**31):
    """
    Linear congruential generator: state <- (a*state + c) mod m.

    Returns ``size`` successive values state/m as a float array; the seed
    itself is not emitted.
    """
    def _draws():
        state = seed
        for _ in range(size):
            state = (a*state + c) % m
            yield state / m

    return np.array(list(_draws()), float)


### Accept-reject for target $f(x) = \frac{\pi}{2}|\sin(2 \pi x)|$ on $[0,1]$

The maximum of $f$ is $\pi/2$; with a Uniform(0,1) proposal and $M = \pi/2$ the acceptance probability is $f(x)/M = |\sin(2 \pi x)|$

In [18]:
def sample_sin_target(n, seed=1):
    """
    Accept-reject sampler for f(x) = (pi/2) |sin(2 pi x)| on [0, 1], driven
    by the ``lcg`` uniform generator.

    Candidates x and acceptance uniforms u are consecutive outputs of ONE
    LCG stream started at ``seed``. Re-seeding the generator for every draw
    with seeds a fixed distance apart would make u equal to x plus a
    constant (mod 1), so the accept decision would not be independent of
    the candidate.

    Returns a float array with the first n accepted candidates.
    """
    if n <= 0:
        return np.array([], float)

    # About 2/pi of the pairs are accepted; start with 2n pairs and double
    # if that is not enough. The stream is deterministic, so a longer
    # stream only extends the earlier one.
    n_pairs = max(16, 2 * n)
    while True:
        stream = lcg(2 * n_pairs, seed=seed)
        x = stream[0::2]
        u = stream[1::2]
        accepted = x[u <= np.abs(np.sin(2 * np.pi * x))]
        if len(accepted) >= n:
            return accepted[:n]
        n_pairs *= 2


## 2.5 The “tricky CDF” hack

CDF on $0 < x < 1/20$:

\begin{equation*}
F(x) = 20xe^{20- \frac{1}{x}}
\end{equation*}

Let $Z = 1/X$ (so $Z \ge 20$). Then propose:
- Sample Z from shifted exponential: $Z = 20 + E, E \sim Exp(1)$
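- Accept with probability (target/proposal ratio)$/M$, where the ratio is $20(1/z + 1/z^2)$ and $M = 1.05$ is its maximum, attained at $z = 20$. The ratio stays close to $M$ near $z = 20$, so most proposals are accepted (very fast)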

In [19]:
def sample_tricky_cdf_fast(n, rng=None):
    """
    Rejection sampler working on Z = 1/X.

    Proposes z = 20 + Exp(1), accepts with probability
    20*(1/z + 1/z^2) / 1.05, and returns the accepted values as x = 1/z.
    Uses a fresh default_rng() when rng is None.
    """
    if rng is None:
        rng = np.random.default_rng()
    bound = 1.05  # safe bound: the ratio equals 1.05 at z = 20 and decreases
    kept = []

    while len(kept) < n:
        z = 20 + rng.exponential(1.0)   # proposal h(z) = e^{-(z-20)}
        u = rng.uniform(0, 1)

        # target/proposal ratio simplifies to 20*(1/z + 1/z^2)
        accept_prob = 20.0*(1.0/z + 1.0/(z*z)) / bound
        if u <= accept_prob:
            kept.append(1.0/z)

    return np.array(kept)


# 3) Classification Metrics + Confidence Interval

## 3.1 Confusion Matrix

In [20]:
def confusion_matrix(y_true, y_pred):
    """
    Count TP / TN / FP / FN for 0/1 labels (1 is the positive class).

    Returns a dict with keys "TP", "TN", "FP", "FN" holding Python ints.
    """
    actual = np.asarray(y_true, int)
    predicted = np.asarray(y_pred, int)

    actual_pos, actual_neg = actual == 1, actual == 0
    pred_pos, pred_neg = predicted == 1, predicted == 0

    def _count(mask_a, mask_b):
        return int(np.sum(mask_a & mask_b))

    return {
        "TP": _count(actual_pos, pred_pos),
        "TN": _count(actual_neg, pred_neg),
        "FP": _count(actual_neg, pred_pos),
        "FN": _count(actual_pos, pred_neg),
    }


## 3.2 Precision / recall / accuracy

![title](img/confusion_matrix.png)

In [21]:
def precision_recall_accuracy(counts):
    """
    Compute (precision, recall, accuracy) from a dict with keys
    "TP", "TN", "FP", "FN". A metric whose denominator is 0 is reported
    as 0.0.
    """
    TP, TN, FP, FN = (counts[key] for key in ("TP", "TN", "FP", "FN"))

    def _ratio(numerator, denominator):
        if denominator > 0:
            return numerator / denominator
        return 0.0

    precision = _ratio(TP, TP + FP)
    recall = _ratio(TP, TP + FN)
    accuracy = _ratio(TP + TN, TP + TN + FP + FN)
    return precision, recall, accuracy


## 3.3 Hoeffding CI for a proportion
Works for accuracy, recall, precision if defined as mean of Bernoulli

If metric is average of values in [0,1], Hoeffding gives:

\begin{equation*}
\hat{p} \pm \sqrt{\frac{\ln(2/\delta)}{2n}}
\end{equation*}

In [22]:
def hoeffding_ci_01(mean_hat, n, delta=0.05):
    """
    Hoeffding interval mean_hat +/- sqrt(log(2/delta) / (2 n)) for an
    average of n values in [0, 1]. Returns (lower, upper); the bounds are
    not clipped to [0, 1].
    """
    half_width = math.sqrt(math.log(2/delta)/(2*n))
    lower = mean_hat - half_width
    upper = mean_hat + half_width
    return (lower, upper)


#### How to apply to precision/recall properly

- Recall is mean of indicators over the true positives set (all y=1):
    - data size $= n_1 = \# \{i:y_i = 1\}$
- Precision is mean of indicators over the predicted positives set:
    - data size $= n_{\hat{1}} = \# \{i: \hat{y_i} = 1\}$

In [23]:
def recall_ci(y_true, y_pred, delta=0.05):
    """
    Hoeffding interval for recall, using the number of true positives
    (y_true == 1) as the sample size. Returns (0.0, 1.0) when there are none.
    """
    actual = np.asarray(y_true, int)
    predicted = np.asarray(y_pred, int)
    positives = actual == 1
    n_pos = int(positives.sum())
    if n_pos == 0:
        return (0.0, 1.0)
    # Recall = share of actual positives that were predicted positive.
    recall_hat = float(np.mean(predicted[positives] == 1))
    return hoeffding_ci_01(recall_hat, n_pos, delta=delta)

def precision_ci(y_true, y_pred, delta=0.05):
    """
    Hoeffding interval for precision, using the number of predicted
    positives (y_pred == 1) as the sample size. Returns (0.0, 1.0) when
    there are none.
    """
    actual = np.asarray(y_true, int)
    predicted = np.asarray(y_pred, int)
    flagged = predicted == 1
    n_flagged = int(flagged.sum())
    if n_flagged == 0:
        return (0.0, 1.0)
    # Precision = share of predicted positives that are truly positive.
    precision_hat = float(np.mean(actual[flagged] == 1))
    return hoeffding_ci_01(precision_hat, n_flagged, delta=delta)


# 4) Cost-Sensitive Classification (Fraud-style)

### Total Cost


In [24]:
def total_cost(counts, c_fp=100, c_fn=500, c_tp=0, c_tn=0):
    """
    Total cost of a confusion-count dict: each of counts["FP"], ["FN"],
    ["TP"], ["TN"] multiplied by its unit cost and summed.
    """
    unit_costs = (("FP", c_fp), ("FN", c_fn), ("TP", c_tp), ("TN", c_tn))
    return sum(counts[label] * cost for label, cost in unit_costs)


### Sweep thresholds over a score (SVM decision function)

In [None]:
def sweep_thresholds(y_true, scores, thresholds, c_fp=100, c_fn=500):
    """
    Evaluate the total cost of the rule ``score >= t`` for every t in
    ``thresholds``.

    Returns a DataFrame with columns threshold, TP, TN, FP, FN, total_cost,
    sorted by ascending total_cost. The sort is stable, so thresholds with
    equal cost keep the order in which they were supplied. An empty
    ``thresholds`` yields an empty frame with the same columns.
    """
    y_true = np.asarray(y_true, int)
    scores = np.asarray(scores, float)
    columns = ["threshold", "TP", "TN", "FP", "FN", "total_cost"]

    rows = []
    for t in thresholds:
        y_pred = (scores >= t).astype(int)
        counts = confusion_matrix(y_true, y_pred)
        rows.append({
            "threshold": float(t),
            **counts,
            "total_cost": float(total_cost(counts, c_fp=c_fp, c_fn=c_fn))
        })

    if not rows:
        # sort_values on a column-less frame would raise KeyError.
        return pd.DataFrame(columns=columns)
    # mergesort is stable, which makes the top row deterministic under ties.
    return pd.DataFrame(rows, columns=columns).sort_values("total_cost", kind="mergesort")


Typical usage with LinearSVC:

In [None]:
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split

# NOTE(review): X and y are not defined in any cell visible here; load a
# feature matrix and a 0/1 label vector before running this example.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
clf = LinearSVC(C=1.0, max_iter=10000, random_state=0)
clf.fit(X_train, y_train)

# Score the held-out rows, then try 200 evenly spaced cut-offs spanning the
# observed score range.
scores = clf.decision_function(X_test)
thresholds = np.linspace(scores.min(), scores.max(), 200)
# sweep_thresholds returns rows sorted by ascending total_cost, so the first
# row is the cheapest threshold.
df = sweep_thresholds(y_test, scores, thresholds)
best = df.iloc[0]
best_threshold = best["threshold"]


NameError: name 'X_train' is not defined

### Hoeffding CI for average cost per observation
Cost per obs is bounded in $[0,500]$

Same Hoeffding formula:

In [26]:
def per_observation_cost(y_true, y_pred, c_fp=100, c_fn=500, c_tp=0, c_tn=0):
    """
    Cost of each individual prediction for 0/1 labels.

    Returns a float array shaped like y_true holding c_fp / c_fn / c_tp /
    c_tn according to the (true, predicted) pair; any other label
    combination stays at 0.
    """
    actual = np.asarray(y_true, int)
    predicted = np.asarray(y_pred, int)
    costs = np.zeros_like(actual, dtype=float)

    # (true label, predicted label) -> unit cost
    cost_table = (
        ((0, 1), c_fp),
        ((1, 0), c_fn),
        ((1, 1), c_tp),
        ((0, 0), c_tn),
    )
    for (true_label, pred_label), value in cost_table:
        costs[(actual == true_label) & (predicted == pred_label)] = value
    return costs

def hoeffding_cost_ci(costs, a=0.0, b=500.0, delta=0.05):
    """
    Hoeffding interval for the mean of per-observation costs bounded in
    [a, b]. Returns ((lower, upper), mean).
    """
    values = np.asarray(costs, float)
    count = len(values)
    center = float(values.mean())
    half_width = (b-a)*math.sqrt(math.log(2/delta)/(2*count))
    interval = (center - half_width, center + half_width)
    return interval, center


# 5) “Spam words” conditional probability (quick counting)

Estimate:

\begin{equation*}
\mathbb{P}(Y=1 | \text{word present}) = \frac{\#(\text{spam and word present})}{\#(\text{word present})}
\end{equation*}

In [27]:
def contains_word(text, words):
    """
    Return True if any entry of ``words`` occurs in ``text`` as a
    case-insensitive substring.

    ``words`` may be a single string or an iterable of strings; a single
    string is treated as one word, not as a sequence of characters.
    """
    if isinstance(words, str):
        words = (words,)
    haystack = text.lower()
    # Lower-case both sides; otherwise "Free" could never match.
    return any(w.lower() in haystack for w in words)

def estimate_spam_given_words(data, words=("free","prize")):
    """
    Estimate P(spam | text contains one of ``words``).

    data: iterable of (text, label) pairs with label 0/1
    returns: (fraction of matching texts labelled 1, number of matching
             texts); (0.0, 0) when nothing matches
    """
    labels = [int(y) for text, y in data if contains_word(text, words)]
    matched = len(labels)
    if matched == 0:
        return 0.0, 0
    return sum(labels)/matched, matched


CI with Hoeffding (mean of Bernoulli):

In [28]:
# NOTE(review): spam_no_spam is not defined in any cell visible here; it is
# presumably a list of [text, label] pairs. Load it before running this cell.
p_hat, n = estimate_spam_given_words(spam_no_spam, ("free","prize"))
ci = hoeffding_ci_01(p_hat, n, delta=0.10)  # for 90% interval


NameError: name 'spam_no_spam' is not defined

“free appears twice” ⇒ count occurrences:

In [29]:
def count_word(text, word):
    """
    Count whole-word, case-insensitive occurrences of ``word`` in ``text``.

    The text is split on whitespace and leading/trailing punctuation is
    stripped from each token, so "free!" and "Free," both count as "free".
    """
    import string

    target = word.lower().strip(string.punctuation)
    tokens = (tok.strip(string.punctuation) for tok in text.lower().split())
    return sum(1 for tok in tokens if tok == target)

def estimate_spam_free_twice(data):
    """
    Estimate P(spam | "free" occurs at least twice), counting
    case-insensitive substring occurrences in each text.

    data: iterable of (text, label) pairs
    returns: (p_hat, n) where n is the number of qualifying texts;
             p_hat is 0.0 when n is 0
    """
    labels = [int(y) for text, y in data if text.lower().count("free") >= 2]
    n = len(labels)
    if n > 0:
        p_hat = sum(labels)/n
    else:
        p_hat = 0.0
    return p_hat, n


# 6) Logistic model + calibration (ProportionalSpam-style)

## 6.1 Logistic loss (negative log-likelihood)

For $p_i = \sigma ({\beta}_0 + {x_i}^T \beta)$:

\begin{equation*}
\mathbb{L} = - \sum_i \left( y_i \log p_i + (1-y_i) \log(1 - p_i) \right)
\end{equation*}

In [30]:
def logistic_loss(X, y, coeffs):
    """
    Negative log-likelihood of a logistic model.

    X: (n, d) feature matrix
    y: length-n vector of 0/1 labels
    coeffs: length d+1 vector, intercept first, then the d slopes

    With z = b0 + X @ b, the per-sample loss
    -[y log sigma(z) + (1 - y) log(1 - sigma(z))] equals log(1 + e^z) - y z.
    It is evaluated with ``np.logaddexp``, which neither overflows for large
    |z| nor needs the probabilities to be clipped.
    """
    X = np.asarray(X, float)
    y = np.asarray(y, float)
    coeffs = np.asarray(coeffs, float)
    z = coeffs[0] + X @ coeffs[1:]
    # logaddexp(0, z) = log(e^0 + e^z) = log(1 + e^z), computed stably.
    return np.sum(np.logaddexp(0.0, z) - y * z)


Fit with scipy:

In [31]:
def fit_logistic(X, y):
    """
    Fit logistic coefficients by minimizing ``logistic_loss`` with
    scipy's conjugate-gradient method, starting from all zeros.

    Returns (coefficients, full scipy result); the coefficient vector has
    X.shape[1] + 1 entries.
    """
    features = np.asarray(X, float)
    labels = np.asarray(y, float)
    start = np.zeros(features.shape[1] + 1)

    def objective(candidate):
        return logistic_loss(features, labels, candidate)

    result = optimize.minimize(objective, start, method="CG")
    return result.x, result


Predict probabilities:

In [32]:
def logistic_predict_proba(X, coeffs):
    """
    Predicted probabilities sigma(coeffs[0] + X @ coeffs[1:]).

    Uses ``scipy.special.expit``, which evaluates the sigmoid without the
    overflow that ``np.exp(-z)`` produces for large negative z.
    """
    from scipy.special import expit

    X = np.asarray(X, float)
    coeffs = np.asarray(coeffs, float)
    z = coeffs[0] + X @ coeffs[1:]
    return expit(z)


## 6.2 Calibration with isotonic regression

In [33]:
from sklearn.isotonic import IsotonicRegression

def calibrate_isotonic(probs_calib, y_calib):
    """
    Fit an isotonic regression of y_calib on probs_calib and return the
    fitted model. It is built with out_of_bounds="clip".
    """
    calibrator = IsotonicRegression(out_of_bounds="clip")
    return calibrator.fit(probs_calib, y_calib)


Pipeline:

In [None]:
# NOTE(review): X_train/y_train, X_calib/y_calib and X_test/y_test are not
# defined in any cell visible here; create the three splits before running.

# Fit base model on train
coeffs, _ = fit_logistic(X_train, y_train)

# Calibrate on calib set
p_cal = logistic_predict_proba(X_calib, coeffs)
iso = calibrate_isotonic(p_cal, y_calib)

# Final calibrated probs on test
p_test = logistic_predict_proba(X_test, coeffs)
p_final = iso.predict(p_test)

# 0-1 loss using Bayes classifier threshold 0.5
y_pred = (p_final >= 0.5).astype(int)
loss01 = np.mean(y_pred != y_test)

# Hoeffding CI for 0-1 loss (bounded [0,1])
ci01 = hoeffding_ci_01(loss01, n=len(y_test), delta=0.05)


# 7) Regression questions

## 7.1 Train/test split + model

In [34]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor

def build_regression_pipeline(cat_cols, num_cols):
    """
    Build a two-step pipeline: a ColumnTransformer that one-hot encodes
    cat_cols (unknown categories ignored) and passes num_cols through,
    followed by a RandomForestRegressor with random_state=0.
    """
    encoder = OneHotEncoder(handle_unknown="ignore")
    column_steps = [
        ("cat", encoder, cat_cols),
        ("num", "passthrough", num_cols),
    ]
    preprocessor = ColumnTransformer(transformers=column_steps)
    forest = RandomForestRegressor(random_state=0)
    return Pipeline([("pre", preprocessor), ("model", forest)])


## 7.2 Metrics: MAE + Absolute Relative Error

In [35]:
def mae(y_true, y_pred):
    """Mean absolute error between y_true and y_pred, as a float."""
    residual = np.asarray(y_true, float) - np.asarray(y_pred, float)
    return float(np.mean(np.abs(residual)))

def abs_relative_error(y_true, y_pred, eps=1e-12):
    """
    Mean of |y_true - y_pred| / max(|y_true|, eps).

    The denominator uses the magnitude of y_true floored at eps. It can
    therefore never be zero, and negative targets are scaled by their size
    rather than by y_true + eps.
    """
    y_true = np.asarray(y_true, float)
    y_pred = np.asarray(y_pred, float)
    denom = np.maximum(np.abs(y_true), eps)
    return float(np.mean(np.abs(y_true - y_pred) / denom))


## 7.3 EDF of residuals + 95% confidence bands (DKW inequality)

In [36]:
import matplotlib.pyplot as plt

def edf_with_dkw(residuals, alpha=0.05):
    """
    Empirical distribution function of the residuals with a DKW band.

    Returns (sorted residuals, EDF values k/n, lower band, upper band, eps)
    with eps = sqrt(log(2/alpha) / (2 n)); both bands are clipped to [0, 1].
    """
    ordered = np.sort(np.asarray(residuals, float))
    count = len(ordered)
    edf = np.arange(1, count+1)/count
    half_width = math.sqrt(math.log(2/alpha)/(2*count))
    band_low = clip01(edf - half_width)
    band_high = clip01(edf + half_width)
    return ordered, edf, band_low, band_high, half_width

def plot_edf_with_bands(residuals, alpha=0.05):
    """
    Draw the EDF of the residuals and its DKW lower/upper bands as step
    plots in a new figure, then show it.
    """
    xs, edf, band_low, band_high, eps = edf_with_dkw(residuals, alpha=alpha)
    plt.figure()
    # Same drawing order as before: EDF first, then lower and upper band.
    for curve in (edf, band_low, band_high):
        plt.step(xs, curve, where="post")
    plt.xlabel("residual")
    plt.ylabel("EDF")
    plt.title(f"EDF with DKW {int((1-alpha)*100)}% bands (eps={eps:.4f})")
    plt.show()


## 7.4 Scatter plot predicted vs true

In [37]:
def plot_pred_vs_true(y_true, y_pred):
    """Scatter predicted values (x-axis) against true values (y-axis) in a new figure and show it."""
    plt.figure()
    plt.scatter(y_pred, y_true)
    annotations = (
        (plt.xlabel, "predicted"),
        (plt.ylabel, "true"),
        (plt.title, "Predicted vs True"),
    )
    for apply_label, text in annotations:
        apply_label(text)
    plt.show()


# 8) Poisson Regression (counts visits)

In [38]:
def poisson_loss(coeffs, X, y, include_factorial=False):
    """
    Poisson negative log-likelihood with log link.

    coeffs: slopes first, intercept LAST
    Computes sum(lambda - y * eta) with eta = X @ slopes + intercept and
    lambda = exp(eta). With include_factorial=True the term
    sum(log(y!)) is added; it does not depend on coeffs.
    """
    design = np.asarray(X, float)
    counts = np.asarray(y, float)
    slopes, intercept = coeffs[:-1], coeffs[-1]
    log_rate = design @ slopes + intercept
    rate = np.exp(log_rate)

    nll = np.sum(rate - counts*log_rate)
    if not include_factorial:
        return nll
    return nll + np.sum(gammaln(counts + 1))  # constant w.r.t coeffs, but safe

def fit_poisson(X, y):
    """
    Fit Poisson-regression coefficients by minimizing ``poisson_loss`` with
    scipy's conjugate-gradient method, starting from all zeros.

    Returns (coefficients, full scipy result); the coefficient vector has
    X.shape[1] + 1 entries.
    """
    design = np.asarray(X, float)
    counts = np.asarray(y, float)
    start = np.zeros(design.shape[1] + 1)

    def objective(candidate):
        return poisson_loss(candidate, design, counts)

    result = optimize.minimize(objective, start, method="CG")
    return result.x, result

def poisson_predict(X, coeffs):
    """Predicted Poisson means exp(X @ coeffs[:-1] + coeffs[-1]) (intercept last)."""
    design = np.asarray(X, float)
    slopes, intercept = coeffs[:-1], coeffs[-1]
    log_rate = design @ slopes + intercept
    return np.exp(log_rate)


Naive baseline (always predict mean of train):

In [None]:
def naive_mean_predict(y_train, n_test):
    """Baseline prediction: an array of length n_test filled with the mean of y_train."""
    baseline = float(np.mean(y_train))
    return np.full(n_test, baseline)


# 9) SVD / PCA anomaly detection

In [39]:
def svd_explained_variance(X):
    """
    Thin SVD of X plus the share of the total squared singular values
    carried by each component.

    X is used as given (no centering is applied here).
    Returns (U, s, Vt, evr, cev), where evr = s^2 / sum(s^2) and cev is its
    cumulative sum.
    """
    data = np.asarray(X, float)
    U, s, Vt = np.linalg.svd(data, full_matrices=False)
    squared = s**2
    evr = squared / squared.sum()
    return U, s, Vt, evr, np.cumsum(evr)


In [None]:
def choose_k_for_variance(cum_evr, target=0.90):
    """
    Smallest number of components whose cumulative explained-variance ratio
    reaches ``target``.

    The result is capped at len(cum_evr): if the target is never reached
    (for example the last cumulative value is 0.9999999 because of
    round-off and target is 1.0), all components are used rather than
    returning an index one past the end.
    """
    cum_evr = np.asarray(cum_evr, float)
    k = int(np.searchsorted(cum_evr, target) + 1)
    return min(k, len(cum_evr))


In [None]:
def svd_reconstruct(U, s, Vt, k):
    """Rank-k reconstruction U[:, :k] @ diag(s[:k]) @ Vt[:k, :]."""
    left = U[:, :k]
    # Scale each retained right-singular row by its singular value.
    scaled_right = s[:k][:, None] * Vt[:k, :]
    return left @ scaled_right


In [None]:
def row_reconstruction_error(X, Xhat):
    """Euclidean norm of each row of X - Xhat."""
    difference = np.asarray(X, float) - np.asarray(Xhat, float)
    return np.linalg.norm(difference, axis=1)


In [None]:
def top_k_anomalies(errors, k=10):
    """
    Indices and values of the k largest entries of ``errors``, largest
    first.

    ``errors`` may be any 1-D sequence; it is converted to an array so that
    plain lists can be indexed with the selected positions.
    """
    errors = np.asarray(errors)
    idx = np.argsort(errors)[::-1][:k]
    return idx, errors[idx]
