## Here is an example of how to check reversibility of any transition matrix!

In [None]:
import numpy as np

# P is the transition matrix of the chain: row i holds the probabilities of
# moving from state i to each state, so every row sums to 1.
# stationary_dist is the vector pi with pi P = pi and sum(pi) = 1.

# Transition matrix; rows/columns are ordered
# Downtown, Suburbs, Countryside, Workshop.
P = np.array(
    [
        [0.3, 0.7, 0.0, 0.0],
        [0.2, 0.5, 0.3, 0.0],
        [0.0, 0.0, 0.5, 0.5],
        [0.0, 0.0, 0.0, 1.0],
    ]
)


# stationary_distribution is defined in a later cell of this notebook,
# so that cell has to be run before this one.
stationary_dist = stationary_distribution(P)


# Code for checking reversibility for ANY matrix!:

def is_reversible(P, stationary_dist, tol=1e-12):
    """
    Check whether a Markov chain satisfies detailed balance.

    The chain is reversible with respect to ``stationary_dist`` (pi) when

        pi[i] * P[i, j] == pi[j] * P[j, i]   for all states i, j.

    Parameters
    ----------
    P : array-like, shape (n, n)
        Transition matrix.
    stationary_dist : array-like with n entries
        Stationary distribution pi of the chain.
    tol : float, optional
        Absolute tolerance (``atol``) of the comparison. NumPy's default
        relative tolerance (``rtol=1e-5``) is applied in addition to it.

    Returns
    -------
    bool
        True if every pair of states satisfies detailed balance within
        tolerance, False otherwise.

    Raises
    ------
    ValueError
        If P is not square or stationary_dist does not have n entries.
    """
    P = np.asarray(P, dtype=float)
    if P.ndim != 2 or P.shape[0] != P.shape[1]:
        raise ValueError("P must be a square matrix.")

    pi = np.asarray(stationary_dist, dtype=float).ravel()
    if pi.size != P.shape[0]:
        raise ValueError("stationary_dist must have one entry per state of P.")

    # flow[i, j] = pi[i] * P[i, j] is the probability flow from i to j;
    # detailed balance means this matrix is symmetric.
    flow = pi[:, None] * P
    return bool(np.allclose(flow, flow.T, atol=tol))

# Evaluate the detailed-balance check first, then report it.
reversible = is_reversible(P, stationary_dist)
print("Reversible:", reversible)




Reversible: True


## This is how you can always find the Stationary distribution of any matrix!

In [11]:
import numpy as np

def stationary_distribution(P, tol=1e-12, verify=True):
    """
    Compute the stationary distribution of a finite-state Markov chain.

    Parameters
    ----------
    P : array-like, shape (n, n)
        Transition matrix. Rows should sum to 1 (row-stochastic).
    tol : float, optional
        Entries with absolute value below ``tol`` are set to 0; entries
        below ``-tol`` are treated as an error.
    verify : bool, optional
        If True, checks that the result is approximately stationary.

    Returns
    -------
    pi : np.ndarray, shape (n,)
        Stationary distribution vector (non-negative, sums to 1).

    Raises
    ------
    ValueError
        If P is not a square 2-D matrix.
    numpy.linalg.LinAlgError
        If the linear system is singular, which happens when the
        stationary distribution is not unique.
    RuntimeError
        If the solution has significantly negative entries, cannot be
        normalized, or fails the ``pi P ≈ pi`` check.

    Notes
    -----
    This solves the linear system:

        (P^T - I) * pi = 0,  with  sum(pi) = 1

    by replacing one of the equations with the normalization condition.
    It assumes that a (unique) stationary distribution exists.
    """
    P = np.asarray(P, dtype=float)

    # Check the dimensionality first: reading P.shape[1] on a 1-D input
    # would raise an IndexError instead of the intended ValueError.
    if P.ndim != 2 or P.shape[0] != P.shape[1]:
        raise ValueError("P must be a square matrix.")
    n = P.shape[0]

    # Build A * pi = b (the subtraction creates a new array, so P is not modified)
    A = P.T - np.eye(n)
    b = np.zeros(n)

    # Replace last row with normalization condition: sum_i pi_i = 1
    A[-1, :] = 1.0
    b[-1] = 1.0

    # Solve linear system
    pi = np.linalg.solve(A, b)

    # Clean tiny numerical noise
    pi[np.abs(pi) < tol] = 0.0

    # Anything still below -tol is not rounding noise: report it instead of hiding it
    if np.any(pi < -tol):
        raise RuntimeError(
            "Computed stationary distribution has significantly negative entries. "
            "Check that P is a valid transition matrix with a unique stationary distribution."
        )

    # Clamp any remaining boundary-case negatives and renormalize
    pi = np.maximum(pi, 0.0)
    s = pi.sum()
    if not np.isfinite(s) or s <= 0:
        raise RuntimeError("Failed to compute a valid stationary distribution (sum <= 0).")
    pi /= s

    if verify:
        # Check stationarity: pi P ≈ pi
        if not np.allclose(pi @ P, pi, atol=1e-8):
            raise RuntimeError("Result does not satisfy pi P ≈ pi. Check the input matrix P.")

    return pi

P = np.array([[0.3,0.7,0,0], [0.2,0.5,0.3,0], [0,0,0.5,0.5], [0,0,0,1]])
print("This is stationary distribution: ", stationary_distribution(P))


This is stationary distribution:  [0. 0. 0. 1.]


In [10]:
# This stationary distribution function can also work, but it is less robust
# than the linear-system version: it picks whichever eigenvector numpy returns
# for the eigenvalue closest to 1 and does not validate the result.
def stationary_distribution(P):
    """
    Compute the stationary distribution of a Markov chain by finding the
    left eigenvector of P that belongs to the eigenvalue closest to 1.

    Parameters
    ----------
    P : array-like, shape (n, n)
        Transition matrix.

    Returns
    -------
    np.ndarray, shape (n,)
        The real part of that eigenvector, rescaled so its entries sum to 1.
        No non-negativity or stationarity check is performed.
    """
    # Accept lists as well as arrays (a plain list has no .T attribute).
    P = np.asarray(P, dtype=float)

    # Left eigenvectors of P are the (right) eigenvectors of P^T.
    eigenvalues, eigenvectors = np.linalg.eig(P.T)

    # Find the eigenvector associated with eigenvalue 1
    idx = np.argmin(np.abs(eigenvalues - 1))
    vec = np.real(eigenvectors[:, idx])

    # Normalize to sum to 1 (this also removes the arbitrary sign of the eigenvector)
    stationary = vec / np.sum(vec)
    return stationary

P = np.array([[0.3,0.7,0,0], [0.2,0.5,0.3,0], [0,0,0.5,0.5], [0,0,0,1]])
print("This is stationary distribution: ", stationary_distribution(P))

This is stationary distribution:  [0. 0. 0. 1.]


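### Use the stationary distribution function below as a cross-check. It works whenever the chain has a unique stationary distribution; if there are several (e.g. two absorbing states), the linear system is singular and `np.linalg.solve` raises an error.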

In [None]:
def stationary_distribution_always_works(P):
    """
    Compute the stationary distribution by solving
    (P^T - I) * pi = 0  with  sum(pi) = 1.

    Despite the name, this only works when the stationary distribution is
    unique. If several exist (for example two absorbing states), the system
    is singular and ``np.linalg.solve`` raises.

    Parameters
    ----------
    P : array-like, shape (n, n)
        Transition matrix.

    Returns
    -------
    np.ndarray, shape (n,)
        Solution of the linear system, returned without any cleanup
        (tiny negative values or ``-0.`` entries may appear).

    Raises
    ------
    ValueError
        If P is not a square 2-D matrix.
    numpy.linalg.LinAlgError
        If the linear system is singular.
    """
    # Accept lists as well as arrays (a plain list has no .shape / .T).
    P = np.asarray(P, dtype=float)
    if P.ndim != 2 or P.shape[0] != P.shape[1]:
        raise ValueError("P must be a square matrix.")
    n = P.shape[0]

    # Build system: (P^T - I) * pi = 0
    A = P.T - np.eye(n)

    # Replace last equation with the normalization condition sum(pi)=1
    A[-1, :] = 1.0

    b = np.zeros(n)
    b[-1] = 1.0

    # Solve the linear system; keep the exception type but explain the likely cause
    try:
        pi = np.linalg.solve(A, b)
    except np.linalg.LinAlgError as err:
        raise np.linalg.LinAlgError(
            "Linear system is singular: the stationary distribution of P is probably not unique."
        ) from err

    return pi



This is stationary distribution:  [ 0. -0. -0.  1.]


### This is another stationary distribution function, which the bot said will actually always work for any chain:



In [None]:
import numpy as np

def stationary_distribution_any_markov(P, tol=1e-12, cleanup_tol=1e-15, verify=True):
    """
    Compute a stationary distribution via least squares, so that singular
    systems (e.g. reducible chains) do not raise.

    Parameters
    ----------
    P : array-like, shape (n, n)
        Row-stochastic transition matrix.
    tol : float, optional
        Absolute tolerance (``atol``) for the "rows sum to 1" validation.
        NumPy's default relative tolerance (``rtol=1e-5``) applies on top of it.
    cleanup_tol : float, optional
        Entries of P below ``-cleanup_tol`` are rejected as negative; entries
        of the solution with absolute value below it are set to 0.
    verify : bool, optional
        If True, check that the result satisfies ``pi P ≈ pi``.

    Returns
    -------
    pi : np.ndarray, shape (n,)
        Non-negative vector summing to 1. When several stationary
        distributions exist, this is derived from the minimum-norm
        least-squares solution, i.e. one particular choice among them.

    Raises
    ------
    ValueError
        If P is not square, has negative entries, or rows do not sum to 1.
    RuntimeError
        If no valid distribution can be recovered or verification fails.
    """
    P = np.asarray(P, dtype=float)
    # Check ndim before reading any shape entry (a 0-d input has an empty shape).
    if P.ndim != 2 or P.shape[0] != P.shape[1]:
        raise ValueError("P must be square.")
    n = P.shape[0]

    # Validate Markov matrix
    if np.any(P < -cleanup_tol):
        raise ValueError("P has negative entries.")
    if not np.allclose(P.sum(axis=1), 1.0, atol=tol):
        raise ValueError("Rows of P must sum to 1.")

    A = P.T - np.eye(n)

    # Add normalization as an extra equation (least squares)
    A_aug = np.vstack([A, np.ones((1, n))])
    b_aug = np.zeros(n + 1)
    b_aug[-1] = 1.0

    # Least-squares solution (works even if A is singular)
    pi, *_ = np.linalg.lstsq(A_aug, b_aug, rcond=None)

    # Cleanup / project to simplex
    pi[np.abs(pi) < cleanup_tol] = 0.0
    pi = np.maximum(pi, 0.0)
    s = pi.sum()
    if s <= 0 or not np.isfinite(s):
        raise RuntimeError("Could not recover a valid distribution.")
    pi /= s

    if verify and not np.allclose(pi @ P, pi, atol=1e-8):
        # Expected to pass for periodic/reducible chains as well; a failure
        # most likely indicates an invalid or ill-conditioned input.
        raise RuntimeError("Result does not satisfy pi P ≈ pi (within tolerance).")

    return pi


-----
-----

### Expected hitting time function

This function computes expected hitting times to a given **target set of states** in a finite Markov chain with transition matrix \( P \).

We consider a Markov chain with state space \( \{0, 1, \dots, n-1\} \) and transition matrix

$$
P = (P_{ij})_{i,j=0}^{n-1},
$$

where \( P_{ij} = \mathbb{P}(X_{t+1} = j \mid X_t = i) \).

Given a set of **target states** \( T \subset \{0, \dots, n-1\} \), the *hitting time* of \( T \) is

$$
T_{\text{hit}} = \min\{ t \ge 0 : X_t \in T \}.
$$

For each state \( i \), we define the expected hitting time

$$
h(i) = \mathbb{E}[T_{\text{hit}} \mid X_0 = i].
$$

These satisfy

$$
h(i) = 0 \quad \text{for } i \in T,
$$

and for \( i \notin T \),

$$
h(i) = 1 + \sum_{j=0}^{n-1} P_{ij} h(j).
$$

If we collect the non-target states into a set \( S = \{0, \dots, n-1\} \setminus T \), and form the submatrix \( Q \) of \( P \) with rows and columns indexed by \( S \), then the vector \( h_S = (h(i))_{i \in S} \) solves

$$
(I - Q) h_S = \mathbf{1},
$$

where \( \mathbf{1} \) is a vector of ones.

The function `expected_hitting_time` implements this:

- **Parameters**
  - `P`: `np.ndarray` of shape `(n, n)`  
    Transition matrix of the Markov chain.
  - `target_states`: iterable of integers  
    Indices of the target states \( T \).
  - `start_state` (optional): integer  
    If provided, the function returns \( h(\text{start\_state}) \).
  - `start_dist` (optional): 1D array-like of length `n`  
    Initial distribution \( \alpha \). If provided, the function returns
    $$
    \mathbb{E}[T_{\text{hit}}] = \sum_{i=0}^{n-1} \alpha_i h(i).
    $$

- **Return value**
  - If `start_state` is given: a single float, \( h(\text{start\_state}) \).
  - If `start_dist` is given: a single float, the expected hitting time under that initial distribution.
  - If neither is given: a length-`n` NumPy array, containing \( h(i) \) for all states `i` (targets get value `0`).

- **Usage example (this exam problem)**

For the three-region chain

$$
P = \begin{pmatrix}
0.3 & 0.4 & 0.3 \\
0.2 & 0.5 & 0.3 \\
0.4 & 0.3 & 0.3
\end{pmatrix},
$$

with downtown = state 0 and suburbs = state 1:

```python
P = np.array([
    [0.3, 0.4, 0.3],  # Downtown
    [0.2, 0.5, 0.3],  # Suburbs
    [0.4, 0.3, 0.3],  # Countryside
])

ET_suburbs_to_downtown = expected_hitting_time(P, target_states=[0], start_state=1)
print(ET_suburbs_to_downtown)  # should be 50/13 ≈ 3.8461538
```


In [6]:

import numpy as np

def expected_hitting_time(P, target_states, start_state=None, start_dist=None):
    """
    Compute expected hitting times to a given set of target states in a finite Markov chain.

    Parameters
    ----------
    P : array-like, shape (n, n)
        Transition matrix of the Markov chain.
    target_states : iterable of int
        Indices of the target states. Negative indices count from the end,
        as in normal Python indexing.
    start_state : int, optional
        If provided, return the expected hitting time starting from this state.
    start_dist : array-like, shape (n,), optional
        If provided, return the expected hitting time under this initial distribution.

    Returns
    -------
    float or np.ndarray
        - If start_state is given: expected hitting time from that state.
        - If start_dist is given: expected hitting time under that distribution.
        - If neither is given: array h of length n with expected hitting times
          from all states (targets have value 0).

    Raises
    ------
    ValueError
        If both start_state and start_dist are given, if P is not square,
        or if start_dist does not have length n.
    IndexError
        If a target index lies outside ``[-n, n)``.

    Notes
    -----
    This solves the linear system

        (I - Q) h_S = 1

    where Q is the submatrix of P restricted to non-target states,
    and 1 is a vector of ones. Assumes that the target set is non-empty and
    hit with probability 1 from the relevant starting states; otherwise
    I - Q is singular (or nearly so) and the result is not meaningful.
    """
    # Reject contradictory requests up front, before any work is done and
    # regardless of which branch computes h.
    if (start_state is not None) and (start_dist is not None):
        raise ValueError("Provide either start_state or start_dist, not both.")

    P = np.asarray(P, dtype=float)
    if P.ndim != 2 or P.shape[0] != P.shape[1]:
        raise ValueError("P must be a square matrix.")
    n = P.shape[0]

    targets = np.asarray(list(target_states), dtype=int).ravel()
    if targets.size and (targets.min() < -n or targets.max() >= n):
        raise IndexError("target_states contains an index outside the range of P.")

    # Boolean mask of target states; fancy indexing resolves negative indices
    # and duplicates, and gives an O(n) split into targets / non-targets.
    is_target = np.zeros(n, dtype=bool)
    is_target[targets] = True
    non_target_states = np.flatnonzero(~is_target)

    # h stays 0 on target states; if every state is a target nothing is solved.
    h = np.zeros(n, dtype=float)
    if non_target_states.size > 0:
        # Build Q and solve (I - Q) h_S = 1
        Q = P[np.ix_(non_target_states, non_target_states)]
        identity = np.eye(Q.shape[0])
        ones = np.ones(Q.shape[0])
        h[non_target_states] = np.linalg.solve(identity - Q, ones)

    # Return according to user request
    if start_state is not None:
        return float(h[start_state])

    if start_dist is not None:
        start_dist = np.asarray(start_dist, dtype=float)
        if start_dist.shape[0] != n:
            raise ValueError("start_dist must have length equal to number of states.")
        return float(start_dist @ h)

    return h


In [7]:
# Example from above code:

# Three-region chain; rows/columns are ordered Downtown, Suburbs, Countryside.
P = np.array(
    [
        [0.3, 0.4, 0.3],
        [0.2, 0.5, 0.3],
        [0.4, 0.3, 0.3],
    ]
)

# Expected number of steps until the chain is in Downtown (state 0) for the
# first time, when it starts in Suburbs (state 1).
ET_suburbs_to_downtown = expected_hitting_time(
    P,
    target_states=[0],
    start_state=1,
)
print(ET_suburbs_to_downtown)  # ~3.846153846153846 (50/13)


3.846153846153846


-----
# This is how you can numerically find the constant M on a finite interval for the accept–reject (rejection) sampling algorithm:



In [None]:
import numpy as np

def f_x(x):
    """Target pdf f(x) = exp(x)."""
    density = np.exp(x)
    return density

def g_x(x):
    """Proposal pdf: uniform on (0, ln 2), so the value does not depend on x."""
    interval_length = np.log(2)
    return 1/interval_length

# Grid over the support of the densities.
# Remember to change the interval to the interval of your own problem.
xs = np.linspace(0, np.log(2), 1000)

# Pointwise ratio f / g on the grid.
ratio = f_x(xs) / g_x(xs)

# M is the maximum of f / g; on a grid this is a numerical approximation of it.
M_num = np.max(ratio)

print("Answer: ", 2*np.log(2))

print("Numeric M ≈", M_num)   # should be close to 2*np.log(2)


-----
# These are the different Hoeffding-type confidence intervals from the lecture notes:

## Summary of Concentration Inequalities from the Lecture Notes

This cell summarizes the main inequalities used in Monte Carlo estimation and empirical distribution analysis:  
- Hoeffding’s inequality for Monte Carlo means  
- The Dvoretzky–Kiefer–Wolfowitz (DKW) inequality  
- Alternative concentration inequalities listed in the notes  
For each formula we also show how to solve for $\varepsilon$ when forming a confidence interval.

---

## 1. Hoeffding’s Inequality for Monte Carlo Estimation

Assume we estimate a mean using  
$$
\overline{Y} = \frac{1}{n}\sum_{i=1}^n Y_i,
$$  
where the samples satisfy $Y_i \in [a,b]$.

The **two-sided Hoeffding bound** is:
$$
\mathbb{P}\left( \left| \overline{Y} - \mathbb{E}[Y] \right| \ge \varepsilon \right)
\;\le\; 2 \exp\left( \frac{-2 n \varepsilon^2}{(b-a)^2} \right).
$$

### Solving for $\varepsilon$

Set the right-hand side equal to $\delta$:
$$
2 \exp\left( \frac{-2 n \varepsilon^2}{(b-a)^2} \right) = \delta.
$$

Solving gives:
$$
\varepsilon = (b-a)\sqrt{\frac{\ln(2/\delta)}{2n}}.
$$

Thus a $(1-\delta)100\%$ confidence interval is:
$$
\left[\, \overline{Y} - \varepsilon,\; \overline{Y} + \varepsilon \,\right].
$$

---

## 2. Dvoretzky–Kiefer–Wolfowitz (DKW) Inequality

For the empirical CDF $F_n(x)$ based on i.i.d. samples with true CDF $F(x)$:

$$
\mathbb{P}\!\left( \sup_x |F_n(x) - F(x)| \ge \varepsilon \right)
\;\le\; 2 \exp(-2n\varepsilon^2).
$$

### Solving for $\varepsilon$

Set the right-hand side equal to $\delta$:

$$
2 \exp(-2n\varepsilon^2) = \delta.
$$

Solving gives:
$$
\varepsilon = \sqrt{\frac{\ln(2/\delta)}{2n}}.
$$

Useful for constructing confidence bands:
$$
F_n(x) - \varepsilon \le F(x) \le F_n(x) + \varepsilon.
$$

---

## 3. Other Alternatives Mentioned in the Lecture Notes

### (a) Chebyshev’s Inequality
Assuming finite variance $\sigma^2$:

$$
\mathbb{P}\left( |\overline{Y} - \mathbb{E}[Y]| \ge \varepsilon \right)
\le \frac{\sigma^2}{n\varepsilon^2}.
$$

Solving for $\varepsilon$ by setting RHS = $\delta$:
$$
\varepsilon = \sigma \sqrt{\frac{1}{n\delta}}.
$$

---

### (b) Central Limit Theorem (CLT) Approximation
For large $n$:

$$
\overline{Y} \approx \mathcal{N}\!\left(\mathbb{E}[Y],\, \frac{\sigma^2}{n}\right).
$$

A $(1-\delta)$ interval is:
$$
\overline{Y} \;\pm\; z_{1-\delta/2}\,\frac{\sigma}{\sqrt{n}},
$$
where $z_{1-\delta/2}$ is the standard normal quantile.

---

### (c) Bernstein (or Chernoff–Hoeffding) Inequality  
Sometimes given in extended form when variance is known. In bounded case (same assumptions as Hoeffding):

$$
\mathbb{P}\!\left( |\overline{Y} - \mathbb{E}[Y]| \ge \varepsilon \right)
\le 2 \exp\!\left( 
\frac{-n\varepsilon^2}{2\sigma^2 + \frac{2}{3}(b-a)\varepsilon}
\right).
$$

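Setting the right-hand side equal to $\delta$ gives a quadratic equation in $\varepsilon$, namely $n\varepsilon^2 - \tfrac{2}{3}(b-a)L\,\varepsilon - 2\sigma^2 L = 0$ with $L = \ln(2/\delta)$. Its positive root is
$$
\varepsilon = \frac{\tfrac{1}{3}(b-a)L + \sqrt{\tfrac{1}{9}(b-a)^2 L^2 + 2 n \sigma^2 L}}{n}.
$$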

---

## Summary Table of $\varepsilon$ Solutions

| Inequality | Bound | Solution for $\varepsilon$ |
|-----------|-------|-------------------------------|
| Hoeffding | $2\exp\!\left(-\frac{2n\varepsilon^2}{(b-a)^2}\right) \le \delta$ | $\varepsilon = (b-a)\sqrt{\frac{\ln(2/\delta)}{2n}}$ |
| DKW | $2\exp(-2n\varepsilon^2) \le \delta$ | $\varepsilon = \sqrt{\frac{\ln(2/\delta)}{2n}}$ |
| Chebyshev | $\frac{\sigma^2}{n\varepsilon^2} \le \delta$ | $\varepsilon = \sigma\sqrt{\frac{1}{n\delta}}$ |
| CLT | approx | $\varepsilon = z_{1-\delta/2}\,\sigma/\sqrt{n}$ |
| Bernstein | $2\exp\!\left(-\frac{n\varepsilon^2}{2\sigma^2 + \frac{2}{3}(b-a)\varepsilon}\right) \le \delta$ | $\varepsilon = \frac{1}{n}\left(\frac{(b-a)L}{3} + \sqrt{\frac{(b-a)^2 L^2}{9} + 2n\sigma^2 L}\right)$ with $L = \ln(2/\delta)$ |



-----

### General information that could be important:

* .values and .to_numpy() both convert pandas DataFrames or Series into NumPy arrays; 
* .to_numpy() is the recommended modern approach.


-----
### Permutation Importance

Permutation importance is a model-agnostic method for measuring feature importance. It works by randomly permuting the values of a single feature in the test set and then measuring how much the model’s predictive performance decreases. If permuting a feature leads to a large drop in performance, the model relied heavily on that feature, and it is considered important.

This method measures the **impact of each feature on the model’s predictive performance**, rather than relying on model-specific parameters. Because it only requires the ability to make predictions and evaluate them with a chosen metric, permutation importance is applicable to **any type of predictive model**, including linear models, tree-based models, and neural networks.


In [None]:
# ============================================================
# GENERAL TEMPLATE: Permutation Importance (works for ANY model)
# ============================================================
# What you MUST change depending on your setup:
#   1) estimator      -> set this to your TRAINED model (or Pipeline)
#   2) X_test, y_test -> set these to your TEST split
#   3) feature_names  -> set these to your column names (list of strings)
#   4) scoring        -> choose a metric appropriate for your task
#
# Notes:
# - This works for any model as long as it has predict() (or predict_proba() for some scorers)
# - The estimator must already be fitted: this cell never calls fit().
# - If you used preprocessing (scaling, one-hot encoding, etc.), it's best to wrap it in a Pipeline
#   and pass the Pipeline as the estimator to avoid mismatches.
# - problem3_model, problem3_X_test, problem3_y_test and problem3_features are NOT defined in
#   this cell; they must already exist from earlier cells (or be replaced by your own names).
# ============================================================

import numpy as np
import pandas as pd
from sklearn.inspection import permutation_importance

# --------------------------
# 1) CHOOSE YOUR TRAINED MODEL
# --------------------------
# CHANGE THIS:
# - If you trained a Pipeline (recommended): estimator = my_pipeline
# - If you trained a plain model:           estimator = my_model
#
# Examples:
# estimator = problem3_model                   # Pipeline: scaler + logistic regression
# estimator = trained_random_forest_model      # e.g., RandomForestClassifier already fit
# estimator = trained_svm_model                # e.g., SVC already fit
estimator = problem3_model  # <-- CHANGE to your trained model / pipeline


# --------------------------
# 2) PROVIDE TEST DATA
# --------------------------
# CHANGE THESE:
# - X_test should be the test features (NumPy array or pandas DataFrame)
# - y_test should be the test labels
# - X_test must have the same columns, in the same order, that the estimator was fitted on
#
# Examples:
# X_test = problem3_X_test
# y_test = problem3_y_test
X_test = problem3_X_test   # <-- CHANGE if your variables are named differently
y_test = problem3_y_test   # <-- CHANGE if your variables are named differently


# --------------------------
# 3) PROVIDE FEATURE NAMES
# --------------------------
# CHANGE THIS:
# - If X_test is a pandas DataFrame, you can do: feature_names = X_test.columns
# - If X_test is a NumPy array, you must supply a list yourself (same order as columns in X_test)
# - feature_names needs exactly one entry per column of X_test: the results table in step 6
#   pairs each name with one importance value, so a length mismatch makes that step fail.
#
# Examples:
# feature_names = problem3_features
# feature_names = list(X_test.columns)
feature_names = problem3_features  # <-- CHANGE to your feature name list (correct order!)


# --------------------------
# 4) CHOOSE A SCORING METRIC
# --------------------------
# CHANGE THIS depending on your task:
# Classification examples:
#   scoring = "accuracy"            (simple, common)
#   scoring = "balanced_accuracy"   (good if classes are imbalanced)
#   scoring = "f1"                  (if you care about positive class quality)
#   scoring = "roc_auc"             (needs probability or decision scores; many models support it)
#
# Regression examples:
#   scoring = "r2"
#   scoring = "neg_mean_squared_error"
#   scoring = "neg_mean_absolute_error"
#
# Tip:
# - If "roc_auc" fails, your estimator may not provide predict_proba/decision_function.
# - Importance is reported as the DROP in this score after shuffling a feature, so the
#   ranking can change when you change the metric.
scoring = "accuracy"  # <-- CHANGE if needed


# --------------------------
# 5) RUN PERMUTATION IMPORTANCE
# --------------------------
# Each feature column of X_test is shuffled n_repeats times; the importance of a
# feature is the resulting drop in the chosen score.
perm = permutation_importance(
    estimator=estimator,
    X=X_test,
    y=y_test,
    n_repeats=30,        # increase for more stable estimates (slower)
    random_state=42,     # fixed seed so the shuffles are reproducible
    n_jobs=-1,           # use all available CPU cores
    scoring=scoring
)

# --------------------------
# 6) FORMAT RESULTS
# --------------------------
# Fail early with a readable message if the names do not line up with the
# importances (pandas would otherwise raise a generic length error).
feature_list = list(feature_names)
n_importances = len(perm.importances_mean)
if len(feature_list) != n_importances:
    raise ValueError(
        f"feature_names has {len(feature_list)} entries but permutation importance "
        f"returned {n_importances} values; supply one name per column of X_test."
    )

perm_df = pd.DataFrame({
    "Feature": feature_list,
    "Importance Mean": perm.importances_mean,
    "Importance Std": perm.importances_std
}).sort_values(by="Importance Mean", ascending=False)

print("Top features by permutation importance:")
print(perm_df.head(15))

# --------------------------
# 7) OPTIONAL: GET MOST IMPORTANT FEATURE
# --------------------------
# perm_df is sorted in descending order, so the first row is the top feature.
most_important_feature = perm_df.iloc[0]["Feature"]
print("\nMost important feature (by permutation importance):", most_important_feature)


# --------------------------
# 8) OPTIONAL: RESTRICT TO A SUBSET OF FEATURES
# --------------------------
# Example: if you want "most important one-hot encoded feature"
# CHANGE the selection rule to match your one-hot naming scheme.
# For your diabetes case (features starting with smoking_ or sex_):
one_hot_prefixes = ("smoking_", "sex_")
subset = [f for f in feature_list if str(f).startswith(one_hot_prefixes)]

if subset:
    perm_subset = perm_df[perm_df["Feature"].isin(subset)].sort_values(by="Importance Mean", ascending=False)
    print("\nPermutation importance for subset features:")
    print(perm_subset)

    most_important_in_subset = perm_subset.iloc[0]["Feature"]
    print("\nMost important feature in subset:", most_important_in_subset)
else:
    print("\nSubset list is empty. Adjust the subset selection rule to match your feature names.")


### Remember:
If the dataset does not have a header row, pass `header=None`, e.g. `pd.read_csv("data", header=None)`. Otherwise pandas treats the first data row as the column names and we lose one row of data.

-----
### Using Utils.py file

If the exam question requires the file Utils.py, then simply copy paste the file into the current folder I am in and then the code will be able to find the file Utils.py. 

-----

-----
# This is general information about when to use which dataset in the exam:

## When to use each dataset and variable

The data in this assignment is split into **training**, **validation**, and **test** sets. Each set has a specific role, and using them correctly is essential to avoid data leakage and to obtain an unbiased evaluation.

---

## Training set: fit the model

**Variables**
- `PROBLEM3_X_train`
- `PROBLEM3_y_train`

**When to use**
Use the training set **only to learn the model parameters**.

**Typical operations**
- Fit a model:

```python
    model.fit(PROBLEM3_X_train, PROBLEM3_y_train)
```


- Do **not** compute performance metrics or choose thresholds using training data.

**Purpose**
The training set teaches the model the relationship between features and labels.

---

## Validation set: model selection and threshold choice

**Variables**
- `PROBLEM3_X_val`
- `PROBLEM3_y_val` (or `PROBLEM3_y_true_val`)
- `PROBLEM3_y_pred_proba_val`

**When to use**
Use the validation set to **make decisions about the model**, such as:
- choosing a classification threshold,
- comparing different loss functions,
- computing cost, precision, recall, and 0–1 loss.

**Typical operations**
- Predict probabilities:

```python
    y_pred_proba_val = model.predict_proba(PROBLEM3_X_val)[:,1]
```

- Convert probabilities to predictions:

```python
    y_pred_val = (y_pred_proba_val >= threshold).astype(int)
```

- Compute metrics:
  - cost
  - precision
  - recall
  - 0–1 loss

**Purpose**
The validation set is used to **tune decisions** without biasing the final evaluation.

---

## Test set: final evaluation only

**Variables**
- `PROBLEM3_X_test`
- `PROBLEM3_y_test` (or `PROBLEM3_y_true_test`)
- `PROBLEM3_y_pred_proba_test`

**When to use**
Use the test set **only after**:
- the model has been trained,
- the threshold has been chosen using validation data.

**Typical operations**
- Predict probabilities:

```python
    y_pred_proba_test = model.predict_proba(PROBLEM3_X_test)[:,1]
```

- Evaluate final performance:
  - compute final cost,
  - build a confidence interval,
  - report final metrics.

**Purpose**
The test set provides an **unbiased estimate of real-world performance**.

---

## Summary table (conceptual)

- Training set → **fit the model**
- Validation set → **choose thresholds and compare decision rules**
- Test set → **final evaluation and confidence intervals**

---

## Important rules to remember

- Never choose thresholds using the test set.
- Never report final performance using the validation set.
- The test set must only be used **once**, at the very end.
- Predicted probabilities (`predict_proba`) are used for **threshold-based decisions**.
- Binary predictions (`>= threshold`) are used for **cost, precision, recall, and loss**.

Following these rules ensures a correct and exam-safe machine learning workflow.


# General Guide: Which Dataset to Use, When, and Why (Logistic Regression & Classification)

This section is a self-contained guide that can be pasted as-is into a single Jupyter Notebook **Markdown cell**.

---

## 1. Defining Features and Target

### Features (`X`)
- Features are the **input variables** used by the model to make predictions.
- These typically include:
  - Numerical variables (e.g. age, BMI, blood glucose)
  - One-Hot encoded categorical variables (e.g. `sex_Male`, `smoking_former`)
- **Rule**: Features must represent information that is available **before** a prediction is made.
- **Rule**: Never include the target variable inside the feature set.

### Target (`y`)
- The target is what the model is trying to predict.
- For classification:
  - Binary variable (e.g. diabetes = 0 or 1)
- **Rule**: The target must NEVER be included among the features.

```python
problem3_X = problem3_df[feature_columns].values
problem3_y = problem3_df[target_column].values
```

## 2. Train–Test Split (Why and How)

Training Dataset
- Used to train (fit) the model.
- The model learns patterns from this data.

Test Dataset
- Used to evaluate final performance.
- Simulates unseen, real-world data.
- Must NEVER be used during training.

Standard Split
- 80% training
- 20% testing

```python
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    problem3_X,
    problem3_y,
    test_size=0.2,
    random_state=42
)
```


## 3. Training the Model (.fit())
Which Dataset Goes Into .fit()?
- ONLY the training dataset.

```python
  from sklearn.linear_model import LogisticRegression

  model = LogisticRegression(C=1.0, max_iter=1000)
  model.fit(X_train, y_train)
```

Notes
- C controls regularization:
  - Smaller C → stronger penalization
  - Important when many One-Hot encoded features exist
- max_iter is increased to avoid convergence warnings


## 4. Making Predictions (.predict())
Which Dataset Goes Into .predict()?

| Purpose | Dataset |
|--------|---------|
| Train the model | `X_train`, `y_train` |
| Final model evaluation | `X_test`, `y_test` |
| Generate predicted class labels | `X_test` |
| Generate predicted probabilities | `X_test` |
| Predict on completely new / unseen data | New data with the **same feature structure** as `X_train` |
| Debugging or sanity checks only | `X_train` (NOT for evaluation) |
| Compute precision / recall | Compare `y_test` with predictions from `X_test` |
| Compute confidence intervals | Predictions made on `X_test` |
| Extract feature importance | Trained model (`model.coef_`) |

```python
  y_pred = model.predict(X_test)
```
Rule: Never evaluate performance using predictions from X_train.


## 5. Probability Predictions (.predict_proba())
Used when:
- You want class probabilities instead of labels
- You want threshold-based decisions
- You want confidence-aware analysis

```python
  y_prob = model.predict_proba(X_test)
```
* Output shape: (n_samples, 2)
  * Column 0 → probability of class 0
  * Column 1 → probability of class 1


## 6. Evaluation Metrics: Precision & Recall
Precision
* Of all predicted positives, how many are correct?
* Interpretation:
  * “When the model predicts diabetes, how often is it right?”

Recall
* Of all actual positives, how many were found?
* Interpretation:
  * “How many diabetes cases did the model detect?”

```python
  from sklearn.metrics import precision_score, recall_score

  precision_1 = precision_score(y_test, y_pred, pos_label=1)
  recall_1 = recall_score(y_test, y_pred, pos_label=1)

  precision_0 = precision_score(y_test, y_pred, pos_label=0)
  recall_0 = recall_score(y_test, y_pred, pos_label=0)
```

## 7. Feature Importance (Logistic Regression)

How Feature Importance Is Defined
* Logistic Regression uses coefficients
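* Larger absolute coefficient ⇒ stronger influence (coefficients are only comparable between features on the same scale, e.g. after standardization or for 0/1 One-Hot columns)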

```python
  coefficients = model.coef_[0]
```

One-Hot Encoded Features
* Compare absolute values of coefficients
* The most important One-Hot encoded feature is the one with the largest absolute coefficient

```python
  important_idx = np.argmax(np.abs(coefficients))
  important_feature = feature_columns[important_idx]
```

## 8. What Dataset to Use for Each Task (Summary Table)
| Purpose | Dataset |
|--------|---------|
| Train the model | `X_train`, `y_train` |
| Final model evaluation | `X_test`, `y_test` |
| Generate predicted class labels | `X_test` |
| Generate predicted probabilities | `X_test` |
| Predict on completely new / unseen data | New data with the **same feature structure** as `X_train` |
| Debugging or sanity checks only | `X_train` (NOT for evaluation) |
| Compute precision / recall | Compare `y_test` with predictions from `X_test` |
| Compute confidence intervals | Predictions made on `X_test` |
| Extract feature importance | Trained model (`model.coef_`) |


## 9. Exam-Safe Golden Rules (MEMORIZE)
* Never train on test data
* Never evaluate on training data
* .fit() → training data ONLY
* .predict() → test or unseen data ONLY
* Metrics → always computed using y_test
* One-Hot feature importance → coefficient magnitude
* If unsure: ask which dataset is allowed before using it




# Relevant metrics for evaluating a LinearRegression model:

Below are four common regression metrics. Each compares the **true target values** to the model’s **predicted target values**.

### Notation
- $y_i$: true target value for sample $i$
- $\hat{y}_i$: predicted target value for sample $i$
- $\bar{y}$: mean of true targets in the evaluated set (typically the test set)
- $n$: number of samples in the evaluated set

---

## 1) Mean Squared Error (MSE)
**What it does (short):** Measures the average squared prediction error. Large errors are penalized more because of squaring.

**Formula:**
$ \mathrm{MSE} = \frac{1}{n}\sum_{i=1}^{n}(y_i - \hat{y}_i)^2 $

**Datasets used:**
- $y$ = `y_test` (true values for the test set)
- $\hat{y}$ = `y_pred` (predictions for the test set)

---

## 2) Root Mean Squared Error (RMSE)
**What it does (short):** Square root of MSE, giving an error measure in the **same unit** as the target variable.

**Formula:**
$ \mathrm{RMSE} = \sqrt{\mathrm{MSE}} = \sqrt{\frac{1}{n}\sum_{i=1}^{n}(y_i - \hat{y}_i)^2} $

**Datasets used:**
- $y$ = `y_test`
- $\hat{y}$ = `y_pred`

---

## 3) Mean Absolute Error (MAE)
**What it does (short):** Measures the average absolute prediction error. Less sensitive to outliers than MSE/RMSE.

**Formula:**
$ \mathrm{MAE} = \frac{1}{n}\sum_{i=1}^{n}|y_i - \hat{y}_i| $

**Datasets used:**
- $y$ = `y_test`
- $\hat{y}$ = `y_pred`

---

## 4) $R^2$ Score (Coefficient of Determination)
**What it does (short):** Measures how much of the variance in $y$ is explained by the model. 
- $R^2 = 1$ is perfect fit
- $R^2 = 0$ is no better than predicting the mean
- $R^2 < 0$ is worse than predicting the mean

**Formula:**
$ R^2 = 1 - \frac{\sum_{i=1}^{n}(y_i - \hat{y}_i)^2}{\sum_{i=1}^{n}(y_i - \bar{y})^2} $

**Datasets used:**
- $y$ = `y_test`
- $\hat{y}$ = `y_pred`
- $\bar{y}$ is the mean of `y_test`

---

# Python code to compute all metrics (scikit-learn)

```python
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression

# Prerequisite: a train/test split already exists as
# X_train, X_test, y_train, y_test

# 1) Fit the linear model on the training portion only
#    (fit() returns the fitted estimator itself, so the call can be chained)
model = LinearRegression().fit(X_train, y_train)

# 2) Predict the targets of the held-out test features.
#    y_pred plays the role of "y_hat" in the formulas above.
y_pred = model.predict(X_test)

# 3) Score the predictions against the true test targets.
#    Keyword arguments make explicit which array is the truth and which
#    is the prediction.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)  # RMSE is defined as the square root of MSE

print(f"MSE  = {mse:.6f}")
print(f"RMSE = {rmse:.6f}")
print(f"MAE  = {mae:.6f}")
print(f"R^2  = {r2:.6f}")
```

Summary of which variables are used
* **y_test**: the true target values for the test set (ground truth).
* **y_pred**: the predicted target values for the test set, computed as `y_pred = model.predict(X_test)`.
* All four metrics above should typically be reported on the test set:
    * Compare y_test vs y_pred.


## Interpreting regression metric values and how to compute them in practice

### What does “closer to the true value” mean?
In regression, each data point has:
- a **true value** $y_i$ (the actual observed target from the dataset), and
- a **predicted value** $\hat{y}_i$ (the value predicted by the model).

The difference between them is called the **residual**:
$ \text{residual}_i = y_i - \hat{y}_i $

A model is considered better when these residuals are small, meaning the predictions are numerically close to the true values.

---

## How do you get the actual (true) values?
The **true values** come directly from your dataset.

After splitting the data:
- `y_train`: true target values used to train the model
- `y_test`: true target values used to evaluate the model

All regression metrics should be computed using **`y_test`**, because these values were not seen during training.

---

## How do you get the predicted values?
Predicted values are produced by the trained model:

    y_pred = model.predict(X_test)

- `X_test`: input features for the test set
- `y_pred`: predicted target values ($\hat{y}$)

---

## Mean Squared Error (MSE)
- **Goal:** Lower is better
- **What it measures:** Average squared difference between true and predicted values
- **Interpretation:** Strongly penalizes large errors

$ \mathrm{MSE} = \frac{1}{n}\sum_{i=1}^{n}(y_i - \hat{y}_i)^2 $

Python code:

    from sklearn.metrics import mean_squared_error
    mse = mean_squared_error(y_test, y_pred)

**Comparison used:**  
`y_test` (true values) vs `y_pred` (predicted values)

---

## Root Mean Squared Error (RMSE)
- **Goal:** Lower is better
- **What it measures:** Square root of MSE, in the same unit as the target
- **Interpretation:** Typical size of prediction error

$ \mathrm{RMSE} = \sqrt{\mathrm{MSE}} $

Python code:

    import numpy as np
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

**Comparison used:**  
`y_test` vs `y_pred`

---

## Mean Absolute Error (MAE)
- **Goal:** Lower is better
- **What it measures:** Average absolute difference between true and predicted values
- **Interpretation:** Less sensitive to outliers than MSE/RMSE

$ \mathrm{MAE} = \frac{1}{n}\sum_{i=1}^{n}|y_i - \hat{y}_i| $

Python code:

    from sklearn.metrics import mean_absolute_error
    mae = mean_absolute_error(y_test, y_pred)

**Comparison used:**  
`y_test` vs `y_pred`

---

## $R^2$ Score (Coefficient of Determination)
- **Goal:** Higher is better
- **What it measures:** Fraction of variance in the true values explained by the model
- **Interpretation:**
  - $R^2 = 1$: perfect predictions
  - $R^2 = 0$: no better than predicting the mean
  - $R^2 < 0$: worse than predicting the mean

$ R^2 = 1 - \frac{\sum (y_i - \hat{y}_i)^2}{\sum (y_i - \bar{y})^2} $

Python code:

    from sklearn.metrics import r2_score
    r2 = r2_score(y_test, y_pred)

**Comparison used:**  
`y_test` vs `y_pred`  
$\bar{y}$ is the mean of `y_test`

---

## Complete minimal example (context)

    from sklearn.model_selection import train_test_split
    from sklearn.linear_model import LinearRegression

    # Split the dataset into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    # Train the model on true training values
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Predict target values for unseen data
    y_pred = model.predict(X_test)

---

## Summary
- **True (actual) values:** `y_test` (ground truth from the dataset)
- **Predicted values:** `y_pred`, obtained using `model.predict(X_test)`
- **Each metric compares:** `y_test` vs `y_pred`
- **Lower is better:** MSE, RMSE, MAE
- **Higher is better:** $R^2$
- **Key idea:** Better models produce predictions $\hat{y}$ that are numerically closer to the true values $y$


-----



-----

-----
## How to think when choosing a proposal distribution $g(x)$ in rejection sampling

When facing a difficult rejection sampling problem, do **not** start by guessing formulas. Instead, follow this reasoning process.

---

### 1) Locate where the probability mass is
Ask: *Where does the distribution actually concentrate its mass?*

Look for terms like:
- $( e^{-1/x} )$, $( e^{-x^2} )$, $( x^\alpha )$
- behavior near boundaries (0, infinity, endpoints)

**Rule:**  
Your proposal must put mass where the target puts mass.

---

### 2) Identify the dominant term
Ignore constants and lower-order factors at first.

Ask: *Which part of the density controls the shape?*

Examples:
- $( e^{-x} )$ → exponential
- $( e^{-x^2} )$ → Gaussian-like
- $( e^{-1/x} )$ → strong boundary concentration

**Rule:**  
Match the dominant term first; fix the rest using rejection.

---

### 3) Consider a change of variables
If the density contains:
- $( 1/x )$, $( \log x )$, or sharp boundary behavior

Ask: *Would this look simpler in another variable?*

Common transformations:
- $( Y = 1/X )$ for $( e^{-1/x} )$
- $( Y = \log X )$ for multiplicative scales

**Rule:**  
If the density is ugly in $( x )$, change coordinates.

---

### 4) Choose a proposal that is easy to sample from
Good proposals:
- Uniform (only if the target is fairly flat)
- Exponential or shifted exponential
- Gaussian

Bad proposals:
- hard-to-invert CDFs
- complicated expressions

**Rule:**  
If sampling from $( g(x) )$ is hard, you chose the wrong proposal.

---

### 5) Immediately check the ratio $( f(x)/g(x) )$
Before coding, compute:
$$
\frac{f(x)}{g(x)}
$$

Ask:
- Is it bounded?
- Does it simplify?
- Do exponentials cancel?

**Rule:**  
If exponentials cancel and the ratio is simple, the proposal is good.

---

### 6) Estimate the rejection constant $( M )$
Check where the maximum of $( f(x)/g(x) )$ occurs:
- often at boundaries
- sometimes at symmetry points

Rules of thumb:
- $( M \approx 1 )$: very efficient
- $( M \gg 10 )$: rethink your proposal

---

### 7) Key mindset
Rejection sampling is not mechanical algebra — it is **distribution engineering**:
- understand the shape
- match it intelligently
- use rejection only to correct small differences

---

### One-line checklist (exam-ready)
1. Where is the mass?  
2. What term dominates?  
3. Should I change variables?  
4. Can I sample from $( g )$ easily?  
5. Is $( f/g )$ bounded?  
6. Is $( M )$ small?

If all answers are yes, your choice of $( g(x) )$ is good.

-----

-----
# Below are general functions that can be used in this course, with descriptions of what they do and when to use them:

-----

## This is the General Info from the 2025 January Exam: 

# SVD & Anomaly Detection — Exam-Oriented Summary (Part 1)

This section summarizes the **key functions and concepts** used in Part 1 of the SVD exercise.  
The goal is to understand **what each concept does, when it is used, and why it is used**, without focusing on mathematical derivations.

---

## Loading the data

### `pd.read_csv("data/SVD.csv", header=None)`

**What it does**  
Loads the CSV file and treats **every row as data**, not as column names.

**When it is used**  
Used when the dataset contains **only numerical values**, which is common in linear algebra and machine learning problems.

**Why it is used**  
SVD requires a **pure numeric matrix**. If a header were assumed, the first data row could be dropped or interpreted incorrectly.

**Exam takeaway**  
`header=None` means *“this CSV has no column names; everything is data.”*

---

## Singular Value Decomposition (SVD)

### `np.linalg.svd(X, full_matrices=False)`

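**What it does**  
Decomposes the data matrix $X$ into three components:
$$
X \approx U D V^T
$$
NumPy returns these as `U, s, Vt`: `s` is the 1D vector of singular values (the diagonal of $D$) and `Vt` is $V^T$, so $V$ itself is `Vt.T`. With all components kept, the product reproduces $X$ up to rounding error; it becomes a genuine approximation only when just the first $k$ components are kept.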

**When it is used**  
- Dimensionality reduction  
- Anomaly detection  
- Noise reduction  
- Data compression  
- Feature extraction  

**Why it is used**  
It separates the most important patterns in the data from less important ones, making it easier to approximate and analyze the dataset.

**Exam takeaway**  
SVD breaks a matrix into **patterns** and **how important those patterns are**.

---

## Meaning of $U$, $D$, and $V$

### $U$ — Left singular vectors  
- A matrix whose **columns** describe patterns across **samples (rows)**.  
- Shape (with `full_matrices=False`): `(n_samples, r)`, where `r = min(n_samples, n_dimensions)`.
- Each column shows how strongly each sample is associated with a component.

**Think of $U$ as:**  
“How each data point uses the components.”

---

### $D$ — Singular values (diagonal matrix)  
- A **diagonal matrix** containing the singular values.  
- Shape: `(r, r)`.
- Larger values indicate more important components.

**Why diagonal**  
Each component scales independently, without mixing with others.

---

### $V$ — Right singular vectors  
- A matrix whose **columns** describe patterns across **features (dimensions)**.  
- Shape: `(n_dimensions, r)`.

**Think of $V$ as:**  
“What the components look like in feature space.”

---

## Why `np.diag(s)` is used

### `np.diag(s)`

**What it does**  
Converts the 1D vector of singular values `s` into a **diagonal matrix** $D$.

**When it is used**  
When explicitly reconstructing the matrix using:
$$
X \approx U D V^T
$$

**Why it is used**  
NumPy returns singular values as a vector for efficiency, but the SVD definition requires $D$ to be a matrix.

**Exam takeaway**  
`s` contains the values, `np.diag(s)` builds the proper $D$ matrix.

---

## `full_matrices=False` (important)

**What it controls**  
The **size** of the matrices returned by SVD.

**With `full_matrices=True`**  
- Produces large square matrices.
- Often unnecessary and inefficient.

**With `full_matrices=False`**  
- Produces the **compact (economy) SVD**.
- Keeps only the components needed to reconstruct and approximate the data.
- More memory-efficient and faster.

**Why it is used here**  
This is the standard choice in data science and machine learning: the extra columns that `full_matrices=True` adds are not needed to reconstruct $X$, so dropping them loses no information.

**Exam takeaway**  
`full_matrices=False` means *“give me the smallest useful SVD.”*

---

## NumPy slicing for singular vectors

### `problem1_first_right_singular_vector = problem1_V[:, 0]`

**What it does**  
Extracts the **first column** of the matrix $V$ and returns it as a 1D array.

**How slicing works**  
In NumPy:
* array[rows, columns]

- `:` means all rows  
- `0` means column index 0  

So `[:, 0]` means *all rows from column 0*.

**Why it is used**  
- Columns of $V$ are **right singular vectors**.
- Column 0 corresponds to the **most important component**.

**Common mistake**  
This does **not** select a row — it selects a column.

**Exam takeaway**  
`[:, 0]` extracts the **first (most important) singular vector**.

---

## One-line exam summary

- `header=None` → CSV has no column names  
- `svd(X)` → decomposes data into patterns and their importance  
- $U$ → how samples relate to components  
- $D$ → importance of each component (diagonal matrix)  
- $V$ → what components look like in feature space  
- `full_matrices=False` → compact, efficient SVD  
- `np.diag(s)` → build the diagonal $D$ matrix  
- `[:, 0]` → extract the first singular vector  


# SVD — Explained Variance (Part 2) Exam-Oriented Summary

This section explains **explained variance**, how it is calculated in the context of SVD, and the roles of `cumsum()` and `argmax()`. The focus is on **what these concepts mean and why they are used**, not on mathematical derivations.

---

## What is explained variance?

**Explained variance** measures **how much of the total information (variation)** in the original dataset is captured when keeping only the first $k$ singular components from the SVD.

In plain terms:
- It tells you **how well a reduced version of the data represents the original data**
- Higher explained variance means **less information loss**
- It helps decide **how many components are enough**

For example:
- 60% explained variance → much structure is lost  
- 95% explained variance → almost all important structure is retained  

**Exam intuition**  
Explained variance answers the question:  
> “If I keep only the first $k$ components, how much of the original data do I still explain?”

---

## How is explained variance calculated?

From the SVD, each singular value $\sigma_j$ represents how important a component is.

The calculation follows this logic:

1. **Square each singular value**  
   - $\sigma_j^2$ represents the variance contribution of component $j$

2. **Sum all squared singular values**  
   - This gives the **total variance** in the data

3. **Compute the cumulative fraction**
$$
\text{ExplainedVariance}(k) =
\frac{\sum_{j=1}^{k} \sigma_j^2}{\sum_{j=1}^{r} \sigma_j^2}
$$

This produces values between $0$ and $1$ (or $0\%$ to $100\%$).

**Why this works conceptually**  
- Larger singular values correspond to more important directions
- By summing them in order, we see how information accumulates as we keep more components

---

## What does `np.cumsum()` do?

### `np.cumsum(array)`

**What it does**  
Computes the **cumulative sum** of an array.

Example:
- Input: `[a, b, c, d]`
- Output: `[a, a+b, a+b+c, a+b+c+d]`


**Why it is used here**
- Singular values are ordered from **most important to least important**
- `cumsum()` allows us to track:
  - variance explained by 1 component
  - variance explained by 2 components
  - variance explained by 3 components
  - and so on

**In this exercise**  
`np.cumsum(singular_values_squared) / total_variance`

produces a vector where each entry tells:  
“How much of the total variance is explained if we keep components up to this point”

**Exam takeaway**  
`cumsum()` builds the running total so we can see how variance accumulates as components are added.

---

## What is `argmax()` and what does it do?

### `np.argmax(array)`

**What it does**
- Returns the **index of the first maximum value** in an array.

When used on a boolean array:
- `False` is treated as 0  
- `True` is treated as 1  
- The first `True` is the maximum

---

## Why `argmax()` is used here

This line:
```python
np.argmax(problem1_explained_variance >= 0.95)
```

Works conceptually as follows:
- `problem1_explained_variance >= 0.95` → creates a boolean array like `[False, False, False, True, True, ...]`
- `argmax()` → returns the index of the first `True` value
- adding `+ 1` to that index → converts from zero-based indexing to a component count starting at 1

What this gives
* `np.argmax(problem1_explained_variance >= 0.95) + 1` is the smallest number of components needed to explain at least 95% of the variance

Exam intuition
argmax() is used to answer:
* “When does this condition become true for the first time?”

One-line exam summary
* Explained variance → how much information is retained using $k$ components
* Squared singular values → variance contributions
* Total variance → sum of all squared singular values
* cumsum() → builds cumulative explained variance
* argmax() → finds the first index where a condition is satisfied
* Combined → select the smallest number of components that reaches a target variance level (e.g. 95%)


# This is an example from the Exam 230815 of how the plotting of the empirical distribution function of the residual with confidence bands (i.e. using the DKW inequality and 95% confidence) works:

---

## 6. [3p] Empirical CDF of residuals with DKW 95% confidence band

We want to study the **distribution of the residuals on the test set** and add a **uniform confidence band** using the Dvoretzky–Kiefer–Wolfowitz (DKW) inequality.

---

### Step 1: Define residuals on the test set

For each test point $(i = 1,\dots,n)$, we have:

- True salary $(y_i)$ (from $(\text{problem2\_y\_test})$)
- Predicted salary $(\hat{y}_i)$ (from $(\text{problem2\_model.predict})$)

The **residual** is

$$
e_i = y_i - \hat{y}_i.
$$

Collect all test residuals in a vector

$$
e_1, e_2, \dots, e_n.
$$

---

### Step 2: Empirical distribution function (EDF) of the residuals

The **empirical distribution function** (EDF) of the residuals is defined as

$$
\hat{F}_n(t)
= \frac{1}{n} \sum_{i=1}^{n} \mathbf{1}\{ e_i \le t \},
$$

where $(\mathbf{1}\{\cdot\})$ is the indicator function.

In practice, we:

1. Sort the residuals:

   $$
   e_{(1)} \le e_{(2)} \le \dots \le e_{(n)},
   $$

2. At each sorted residual $(e_{(k)})$, the EDF jumps to

   $$ 
   \hat{F}_n\bigl(e_{(k)}\bigr) = \frac{k}{n}.
   $$

---

### Step 3: DKW inequality and 95% confidence band

Let $(F(t))$ be the **true CDF** of the residuals (unknown).  
The DKW inequality states that for any $(\varepsilon > 0)$,

$$
\mathbb{P}\!\left(
\sup_{t} \left| \hat{F}_n(t) - F(t) \right| > \varepsilon
\right)
\le
2 e^{-2 n \varepsilon^2}.
$$

To get a **95%** (i.e. $(1-\alpha = 0.95)$) **uniform confidence band**, we set $(\alpha = 0.05)$ and solve

$$
2 e^{- 2 n \varepsilon^2 } = \alpha.
$$

Taking logarithms:

$$
e^{- 2 n \varepsilon^2 } = \frac{\alpha}{2}
\quad\Rightarrow\quad
-2 n \varepsilon^2 = \log \frac{\alpha}{2}
\quad\Rightarrow\quad
\varepsilon^2 = -\frac{1}{2n} \log \frac{\alpha}{2}.
$$

So

$$
\varepsilon_n
=
\sqrt{
-\frac{1}{2n} \log \frac{\alpha}{2}
}
=
\sqrt{
\frac{1}{2n} \log\!\left( \frac{2}{\alpha} \right)
}.
$$

For $(\alpha = 0.05)$,

$$
\varepsilon_n
=
\sqrt{
\frac{1}{2n} \log\!\left( \frac{2}{0.05} \right)
}
=
\sqrt{
\frac{1}{2n} \log(40)
}.
$$

---

### Step 4: Constructing the confidence band

For each $(t)$, the **95% confidence band** is

$$
\hat{F}_n(t) - \varepsilon_n
\le
F(t)
\le
\hat{F}_n(t) + \varepsilon_n,
$$

or in terms of lower and upper band functions:

$$
F^{-}(t) = \max\bigl( \hat{F}_n(t) - \varepsilon_n,\ 0 \bigr),
\qquad
F^{+}(t) = \min\bigl( \hat{F}_n(t) + \varepsilon_n,\ 1 \bigr).
$$

In the plot, we:

- Plot the EDF $(\hat{F}_n(t))$ as a step curve,
- Plot the lower band $(F^{-}(t))$,
- Plot the upper band $(F^{+}(t))$.

---

### Step 5: Interpretation — what does the band tell us?

The DKW band has the property that, with probability at least $(95\%)$,

$$
F(t) \in [F^{-}(t), F^{+}(t)] \quad \text{for all } t.
$$

In words:

- The band tells us how much **uncertainty** there is in the empirical CDF as an estimate of the true residual distribution.
- It is **uniform in $t$**: the guarantee holds simultaneously for all thresholds $(t)$.

**What can we use it for?**

- To assess how precisely we have estimated the distribution of residuals from a finite test sample.
- To check whether a **candidate theoretical distribution** for residuals (e.g. normal distribution) is plausible: its CDF should lie entirely within this band; if the theoretical CDF goes outside the band anywhere, this suggests a poor fit.
- More generally, to quantify uncertainty in distributional features of the residuals (e.g. quantiles, tail behavior) in a nonparametric way.

---


In [None]:
# Example code from the above:

# Part 6
# Put the code for part 6 below this line

import numpy as np
import matplotlib.pyplot as plt

# 1. Residuals on the test set: true target minus model prediction
y_test_true = problem2_y_test
y_test_pred = problem2_model.predict(problem2_X_test)
residuals_test = y_test_true - y_test_pred

# 2. Order the residuals; at the k-th smallest residual the empirical CDF
#    takes the value k/n (k = 1, ..., n)
residuals_sorted = np.sort(residuals_test)
n = len(residuals_test)  # number of test samples
ecdf_values = np.arange(1, n + 1) / n

# 3. Half-width of the 95% DKW band: epsilon = sqrt(log(2/alpha) / (2n))
alpha = 0.05
epsilon = np.sqrt((1.0 / (2.0 * n)) * np.log(2.0 / alpha))

# 4. Shift the EDF down/up by epsilon; a CDF cannot leave [0, 1], so clip
lower_band = (ecdf_values - epsilon).clip(0.0, 1.0)
upper_band = (ecdf_values + epsilon).clip(0.0, 1.0)

print(f"Number of test samples n = {n}")
print(f"DKW epsilon (95% band)  = {epsilon:.4f}")

# 5. Draw the EDF and both band edges as right-continuous step curves.
#    The order (EDF, lower, upper) fixes the colour cycle and legend order.
plt.figure(figsize=(7, 5))

curves = (
    (ecdf_values, {"label": "Empirical CDF of residuals"}),
    (lower_band, {"linestyle": "--", "label": "Lower 95% band"}),
    (upper_band, {"linestyle": "--", "label": "Upper 95% band"}),
)
for heights, line_kwargs in curves:
    plt.step(residuals_sorted, heights, where="post", **line_kwargs)

plt.xlabel("Residual (true - predicted salary)")
plt.ylabel("Empirical CDF")
plt.title("Empirical CDF of Test Residuals with DKW 95% Confidence Band")
plt.legend()
plt.grid(True, linestyle=":", alpha=0.6)
plt.tight_layout()
plt.show()


# This Section describes the difference between Inversion sampling and using simple Reject Accept:


## PROBLEM 3 — Random variable generation from a given CDF

We are given the CDF
$$
F(x)=
\begin{cases}
0, & x\le 0\\
e^x-1, & 0<x<\ln(2)\\
1, & x\ge \ln(2)
\end{cases}
$$
The support is the interval $(x\in(0,\ln 2))$.

---

# A) Step-by-step: Inversion sampling (construct 1000 samples)

### Step 1: Identify the part of the CDF we can invert
For $(0<x<\ln 2)$,
$$
F(x)=e^x-1.
$$
Also note $(F(0)=0)$ and $(F(\ln 2)=e^{\ln 2}-1=2-1=1)$, so this maps exactly to $((0,1))$.

### Step 2: Set $(U \sim \text{Unif}(0,1))$ and solve $(U = F(X))$
Let $(U\in(0,1))$. Set:
$$
U = e^X - 1.
$$
Solve for $(X)$:
$$
U+1 = e^X \quad\Rightarrow\quad X=\ln(1+U).
$$

### Step 3: Sampling rule (the inverse CDF)
$$
X = F^{-1}(U) = \ln(1+U), \quad U\sim \text{Unif}(0,1).
$$

### Step 4: Generate 1000 samples
1. Draw $(U_1,\dots,U_{1000}\stackrel{iid}{\sim}\text{Unif}(0,1))$.
2. Compute $(X_i=\ln(1+U_i))$.

**Python (minimal):**
```python
import numpy as np

n = 1000
U = np.random.rand(n)         # U_i ~ Uniform(0,1)
X_inv = np.log(1 + U)         # X_i = ln(1+U_i)
```

### Step 5: Estimate mean and variance from the 1000 samples

The sample mean is estimated as

$\hat{\mu} = \frac{1}{n} \sum_{i=1}^{n} X_i,$

and the sample variance is estimated as

$\hat{\sigma}^2 = \frac{1}{n - 1} \sum_{i=1}^{n} (X_i - \hat{\mu})^2.$

```python
    mean_hat = X_inv.mean()
    var_hat  = X_inv.var(ddof=1)  # sample variance (divide by n-1)
```

## B) Step-by-step: Accept–Reject sampling (construct 1000 samples)

### Step 1: Compute the target density $f(x)$

Differentiate the CDF on the continuous part:

$$
f(x) = F'(x) = e^x, \qquad 0 < x < \ln 2,
$$

and

$$
f(x) = 0 \quad \text{otherwise}.
$$

Check normalization:

$$
\int_{0}^{\ln 2} e^x \, dx
= e^{\ln 2} - 1
= 2 - 1
= 1,
$$

so $f$ is already a valid PDF.

### Step 2: Choose an easy proposal density $g(x)$

A simple choice is uniform on the same support:

$$
g(x) = \text{Unif}(0, \ln 2)
\;\;\Rightarrow\;\;
g(x) = \frac{1}{\ln 2}, \quad 0 < x < \ln 2.
$$

**Why this proposal?**

- Very easy to sample from.
- Same support as $f$.
- Makes $f(x) / g(x)$ simple, giving a good acceptance rate.

### Step 3: Find a constant $M$ so that $f(x) \le M g(x)$

Compute the ratio on $[0, \ln 2]$:

$$
\frac{f(x)}{g(x)} = \frac{e^x}{1/\ln 2} = e^x \ln 2.
$$

This is maximized at $x = \ln 2$, where $e^{\ln 2} = 2$. Hence:

$$
\max_{x \in [0, \ln 2]} \frac{f(x)}{g(x)} = 2 \ln 2.
$$

So we can choose

$$
M = 2 \ln 2.
$$

### Step 4: Write the acceptance probability

Accept–Reject accepts a proposal $Y \sim g$ with probability

$$
\frac{f(Y)}{M g(Y)}.
$$

Here:

$$
\frac{f(y)}{M g(y)}
= \frac{e^y}{(2 \ln 2)\cdot (1/\ln 2)}
= \frac{e^y}{2}.
$$

So the acceptance test is:

* Draw $U \sim \text{Unif}(0,1)$. Accept $Y$ if $U \le e^Y/2$.

### Step 5: Generate 1000 accepted samples and compute acceptance proportion

**Algorithm:**

1. Propose $Y \sim \text{Unif}(0, \ln 2)$.
2. Draw $U \sim \text{Unif}(0, 1)$.
3. If $U \le e^Y / 2$, accept $Y$ as a sample; otherwise reject and repeat.
4. Keep going until you have 1000 accepted samples.
5. Acceptance proportion = (number accepted) / (number proposed).

**Expected acceptance proportion:**

$$
\text{AccRate} = \frac{1}{M} = \frac{1}{2 \ln 2} \approx 0.721.
$$





## This is the final and complete code for both of the different approaches:

```python
import numpy as np

# ============================================================
# PROBLEM 3 — Random variable generation from a given CDF
# Target CDF:
#   F(x)=0                  for x<=0
#   F(x)=e^x - 1            for 0<x<ln(2)
#   F(x)=1                  for x>=ln(2)
#
# Therefore the PDF on (0, ln(2)) is:
#   f(x) = d/dx (e^x - 1) = e^x,   for 0<x<ln(2)
# ============================================================

# -----------------------------
# Settings
# -----------------------------
n = 1000  # number of samples to generate with each method
# The legacy global generator (np.random.*) is used on purpose; no
# default_rng object is created. For reproducible runs, enable:
# np.random.seed(0)

ln2 = np.log(2.0)  # right end point of the support (0, ln 2)

# ============================================================
# A) Inversion sampling
#   Draw U ~ Uniform(0,1) and map it through the inverse CDF:
#   X = F^{-1}(U) = ln(1+U)
#
# One uniform draw per sample, generated one at a time.
# ============================================================

samples_inv = [np.log(np.random.uniform(0.0, 1.0) + 1.0) for _ in range(n)]

# NumPy array copy of the list, used for the statistics below
X_inv = np.asarray(samples_inv, dtype=float)

# ---- Sample mean and unbiased sample variance (ddof=1 -> divide by n-1) ----
mean_inv = np.mean(X_inv)
var_inv = np.var(X_inv, ddof=1)

print("=== Inversion sampling ===")
print("n =", n)
print("Length of samples:", len(samples_inv))
print("sample mean =", mean_inv)
print("sample var  =", var_inv)
print()

# ============================================================
# B) Accept–Reject sampling (written in the same style as your example)
#
# Target: f(x) = e^x on (0, ln2)
#
# Proposal: g(x) = Uniform(0, ln2)
#   g(x) = 1/ln2 on (0, ln2)
#
# Bound:
#   f(x)/g(x) = e^x ln2, maximized at x=ln2 -> 2 ln2
#   so we can choose M = 2 ln2
#
# Acceptance probability:
#   f(y)/(M g(y)) = e^y / 2
# ============================================================

def problem3_rejection(n_samples=1, *, return_acceptance=False):
    """
    Sample from f(x) = e^x on (0, ln 2) with accept-reject sampling.

    A proposal Y ~ Uniform(0, ln 2) has density g(y) = 1/ln 2. With the
    envelope constant M = 2 ln 2 the acceptance probability is
    f(y) / (M g(y)) = e^y / 2, so Y is kept when an independent
    U ~ Uniform(0, 1) satisfies U <= e^Y / 2.

    Parameters
    ----------
    n_samples : int, optional
        Number of accepted samples to return (default 1). A value <= 0
        yields an empty array.
    return_acceptance : bool, optional (keyword-only)
        If True, also return the empirical acceptance proportion
        (accepted / proposed). The theoretical value is
        1 / (2 ln 2), roughly 0.721.

    Returns
    -------
    samples : np.ndarray, shape (n_samples,)
        The accepted samples.
    acceptance : float
        Only returned when ``return_acceptance`` is True. ``nan`` if no
        proposal was made (n_samples <= 0).
    """
    # Computed locally so the function does not rely on a module-level
    # ``ln2`` having been defined by an earlier cell.
    upper = np.log(2.0)

    samples = []
    n_proposed = 0

    while len(samples) < n_samples:
        n_proposed += 1

        # 1) Proposal draw from Uniform(0, ln 2)
        y = np.random.uniform(0.0, upper)

        # 2) Acceptance probability f(y) / (M g(y)) = e^y / 2
        w = np.exp(y) / 2.0

        # 3) Independent uniform draw deciding acceptance
        u = np.random.uniform(0.0, 1.0)

        # 4) Keep the proposal when u <= w, otherwise try again
        if u <= w:
            samples.append(y)

    result = np.array(samples, dtype=float)
    if not return_acceptance:
        return result

    acceptance = len(samples) / n_proposed if n_proposed else float("nan")
    return result, acceptance

# Draw n accepted samples with the accept-reject sampler
X_ar = problem3_rejection(n)

# ---- Sample mean and unbiased sample variance (ddof=1 -> divide by n-1) ----
mean_ar = np.mean(X_ar)
var_ar = np.var(X_ar, ddof=1)

print("=== Accept–Reject sampling ===")
print("n =", n)
print("Length of samples:", len(X_ar))
print("sample mean =", mean_ar)
print("sample var  =", var_ar)
print()

# ============================================================
# (Optional) Quick sanity checks
# Both sample sets must stay inside the support (0, ln 2).
# ============================================================

print("Sanity checks:")
for tag, draws in (
    ("Inversion: min/max =", X_inv),
    ("A-R      : min/max =", X_ar),
):
    print(tag, draws.min(), draws.max(), " (should be within (0, ln2) )")



```


# This is information about the Accept Reject problems:



# This explains how Recall and Precision work for both class 0 and 1:


### Precision and Recall (per class)

Let the confusion matrix be:

- **tn**: true class 0 predicted as 0  
- **fp**: true class 0 predicted as 1  
- **fn**: true class 1 predicted as 0  
- **tp**: true class 1 predicted as 1  

---

### Class 1 (treat class `1` as the positive class)

**Precision (class 1)**  
Fraction of samples predicted as class 1 that are truly class 1:
$$
\text{Precision}_1 = \frac{tp}{tp + fp}
$$

**Recall (class 1)**  
Fraction of true class 1 samples that are correctly predicted:
$$
\text{Recall}_1 = \frac{tp}{tp + fn}
$$

---

### Class 0 (treat class `0` as the positive class, one-vs-rest)

When evaluating class 0, we treat class `0` as the “positive” class.

- True positives for class 0: **tn**
- Predicted positives for class 0: **tn + fn**
- Actual positives for class 0: **tn + fp**

**Precision (class 0)**  
Fraction of samples predicted as class 0 that are truly class 0:
$$
\text{Precision}_0 = \frac{tn}{tn + fn}
$$

**Recall (class 0)**  
Fraction of true class 0 samples that are correctly predicted:
$$
\text{Recall}_0 = \frac{tn}{tn + fp}
$$

---

### Summary Table

| Class | Precision | Recall |
|------|-----------|--------|
| 0 | $\frac{tn}{tn + fn}$ | $\frac{tn}{tn + fp}$ |
| 1 | $\frac{tp}{tp + fp}$ | $\frac{tp}{tp + fn}$ |

---

**Key intuition:**  
- **Precision** conditions on what the model *predicted*  
- **Recall** conditions on what the class *actually was*


### Choice of $( n )$ when using Hoeffding’s inequality

Hoeffding’s inequality bounds the deviation of an empirical mean from its true
expectation. Therefore, $( n )$ must be the **number of independent Bernoulli trials**
used to compute that empirical mean.

For precision and recall, each metric is an average of indicator variables
(correct or incorrect classification) over a specific subset of samples.

---

### Precision

Precision is defined as:
$$
\text{Precision} = \frac{\text{number of correct positive predictions}}{\text{number of predicted positives}}
$$

Each predicted positive contributes one Bernoulli trial (correct or not). Therefore:
$$
n_{\text{precision}} = \text{number of predicted positives}
$$

- Class 1: $( n = tp + fp )$  
- Class 0: $( n = tn + fn )$

---

### Recall

Recall is defined as:
$$
\text{Recall} = \frac{\text{number of correctly classified positives}}{\text{number of actual positives}}
$$

Each actual positive contributes one Bernoulli trial. Therefore:
$$
n_{\text{recall}} = \text{number of actual positives}
$$

- Class 1: $( n = tp + fn )$  
- Class 0: $( n = tn + fp )$

---

### Key idea

The denominator of precision or recall is exactly the number of samples over which
the empirical average is computed. This is why it is the correct choice of $( n )$
in Hoeffding’s inequality.


### This is how you get the precision and recall using the confusion matrix:

```python
# === 2. Build the confusion matrix ===
# We specify labels=[0, 1] to ensure the order is TN, FP, FN, TP when we ravel()
tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()
print("tn:", tn)
print("fp:", fp)
print("fn:", fn)
print("tp:", tp)

```