## Here is an example of how to check the reversibility of any transition matrix!

In [None]:
import numpy as np

# P  = row-stochastic transition matrix (row i holds the probabilities of leaving state i)
# pi = stationary distribution, i.e. the solution of the linear system pi P = pi with sum(pi) = 1

# Transition matrix
P = np.array([
    [0.3, 0.7, 0.0, 0.0],  # Downtown
    [0.2, 0.5, 0.3, 0.0],  # Suburbs
    [0.0, 0.0, 0.5, 0.5],  # Countryside
    [0.0, 0.0, 0.0, 1.0]   # Workshop (absorbing: it only transitions to itself)
])


# NOTE: stationary_distribution() is defined in a later cell of this notebook;
# run that cell first, otherwise this line raises NameError.
stationary_dist = stationary_distribution(P)


# Code for checking reversibility for ANY matrix!:

def is_reversible(P, stationary_dist, tol=1e-12):
    """
    Check the detailed-balance equations pi_i * P[i, j] == pi_j * P[j, i].

    Every ordered pair (i, j) of states is compared with np.isclose using
    absolute tolerance `tol` (NumPy's default relative tolerance applies too).
    Returns True when every pair balances, False as soon as one pair does not.
    """
    size = P.shape[0]
    return all(
        np.isclose(
            stationary_dist[src] * P[src, dst],
            stationary_dist[dst] * P[dst, src],
            atol=tol,
        )
        for src in range(size)
        for dst in range(size)
    )

# For this chain the check passes trivially: the stationary distribution puts all
# of its mass on the absorbing Workshop state, so both sides of every
# detailed-balance equation are 0.
print("Reversible:", is_reversible(P, stationary_dist))




Reversible: True


## This is how you can find the stationary distribution of a transition matrix (assuming it is unique)!

In [11]:
import numpy as np

def stationary_distribution(P, tol=1e-12, verify=True):
    """
    Compute the stationary distribution of a finite-state Markov chain.

    Parameters
    ----------
    P : array-like, shape (n, n)
        Transition matrix. Rows should sum to 1 (row-stochastic); this is
        not checked here.
    tol : float, optional
        Entries of the solution with absolute value below `tol` are set to 0;
        entries below `-tol` are treated as an error.
    verify : bool, optional
        If True, checks that the result satisfies pi P ≈ pi (atol=1e-8).

    Returns
    -------
    pi : np.ndarray, shape (n,)
        Stationary distribution vector (non-negative, sums to 1).

    Raises
    ------
    ValueError
        If P is not a square 2-D matrix with at least one state.
    numpy.linalg.LinAlgError
        Propagated from np.linalg.solve when the system matrix is singular,
        e.g. when the stationary distribution is not unique.
    RuntimeError
        If the solution has entries below `-tol`, cannot be normalized, or
        (with verify=True) does not satisfy pi P ≈ pi.

    Notes
    -----
    This solves the linear system:

        (P^T - I) * pi = 0,  with  sum(pi) = 1

    by replacing one of the equations with the normalization condition.
    It assumes that a (unique) stationary distribution exists.
    """
    P = np.asarray(P, dtype=float)

    # Check dimensionality before reading P.shape[1]; otherwise a 1-D input
    # fails with an IndexError instead of the intended ValueError.
    if P.ndim != 2 or P.shape[0] != P.shape[1]:
        raise ValueError("P must be a square matrix.")
    n = P.shape[0]
    if n == 0:
        raise ValueError("P must be a square matrix with at least one state.")

    # Build A * pi = b
    A = P.T - np.eye(n)
    b = np.zeros(n)

    # Replace last row with normalization condition: sum_i pi_i = 1
    A[-1, :] = 1.0
    b[-1] = 1.0

    # Solve linear system
    pi = np.linalg.solve(A, b)

    # Clean tiny numerical noise
    pi[np.abs(pi) < tol] = 0.0

    # Anything still below -tol is not rounding noise -> indicate a real problem
    if np.any(pi < -tol):
        raise RuntimeError(
            "Computed stationary distribution has significantly negative entries. "
            "Check that P is a valid transition matrix with a unique stationary distribution."
        )

    # Clamp any remaining borderline negatives to 0 and renormalize
    pi = np.maximum(pi, 0.0)
    s = pi.sum()
    if not np.isfinite(s) or s <= 0:
        raise RuntimeError("Failed to compute a valid stationary distribution (sum <= 0).")
    pi /= s

    if verify:
        # Check stationarity: pi P ≈ pi
        if not np.allclose(pi @ P, pi, atol=1e-8):
            raise RuntimeError("Result does not satisfy pi P ≈ pi. Check the input matrix P.")

    return pi

P = np.array([[0.3,0.7,0,0], [0.2,0.5,0.3,0], [0,0,0.5,0.5], [0,0,0,1]])
print("This is stationary distribution: ", stationary_distribution(P))


This is stationary distribution:  [0. 0. 0. 1.]


In [10]:
# This stationary distribution function can also work, but it is less reliable than the linear-solve version:
def stationary_distribution(P):
    """
    Compute the stationary distribution of a Markov chain from the
    eigenvector of P^T whose eigenvalue is closest to 1.

    Parameters
    ----------
    P : array-like, shape (n, n)
        Row-stochastic transition matrix (not validated here).

    Returns
    -------
    np.ndarray, shape (n,)
        The real part of the selected eigenvector, rescaled to sum to 1.

    Notes
    -----
    No cleanup or validation is performed: entries may carry small
    numerical noise, and if eigenvalue 1 is repeated the selected
    eigenvector is only one of several possible stationary vectors.
    """
    # Accept nested lists as well as arrays (a plain list has no .T attribute).
    P = np.asarray(P, dtype=float)

    # Left eigenvectors of P are the (right) eigenvectors of P^T.
    eigenvalues, eigenvectors = np.linalg.eig(P.T)

    # Find the eigenvector associated with eigenvalue 1
    idx = np.argmin(np.abs(eigenvalues - 1))
    vec = np.real(eigenvectors[:, idx])

    # Normalize to sum to 1 (this also fixes the arbitrary sign of the eigenvector)
    stationary = vec / np.sum(vec)
    return stationary

P = np.array([[0.3,0.7,0,0], [0.2,0.5,0.3,0], [0,0,0.5,0.5], [0,0,0,1]])
print("This is stationary distribution: ", stationary_distribution(P))

This is stationary distribution:  [0. 0. 0. 1.]


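### Use the stationary distribution function below to double-check your answer; it works whenever the stationary distribution is unique (otherwise `np.linalg.solve` fails because the system is singular).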

In [None]:
def stationary_distribution_always_works(P):
    """
    Compute the stationary distribution by solving
    (P^T - I) * pi = 0  with  sum(pi) = 1.

    Parameters
    ----------
    P : array-like, shape (n, n)
        Row-stochastic transition matrix (row sums are not validated).

    Returns
    -------
    np.ndarray, shape (n,)
        The raw solution of the linear system. No cleanup is applied, so
        entries may show rounding noise such as -0.0.

    Raises
    ------
    ValueError
        If P is not a square 2-D matrix.
    numpy.linalg.LinAlgError
        Propagated from np.linalg.solve when the system is singular, e.g.
        when the chain has more than one stationary distribution.

    Notes
    -----
    Despite its name, this method relies on the stationary distribution
    being unique; it is not guaranteed to work for every Markov chain.
    """
    # Accept nested lists as well as arrays.
    P = np.asarray(P, dtype=float)
    if P.ndim != 2 or P.shape[0] != P.shape[1]:
        raise ValueError("P must be a square matrix.")

    n = P.shape[0]

    # Build system: (P^T - I) * pi = 0
    A = P.T - np.eye(n)

    # Replace last equation with the normalization condition sum(pi)=1
    A[-1] = np.ones(n)

    b = np.zeros(n)
    b[-1] = 1.0

    # Solve the linear system
    pi = np.linalg.solve(A, b)

    return pi



This is stationary distribution:  [ 0. -0. -0.  1.]


### This is another stationary distribution function which the bot said will actually always work for any chain:



In [None]:
import numpy as np

def stationary_distribution_any_markov(P, tol=1e-12, cleanup_tol=1e-15, verify=True):
    """
    Compute a stationary distribution via least squares.

    The balance equations (P^T - I) pi = 0 are stacked with the extra
    equation sum(pi) = 1 and solved with np.linalg.lstsq, so a singular
    system (several stationary distributions) does not raise; in that case
    one particular stationary vector is returned.

    Parameters
    ----------
    P : array-like, shape (n, n)
        Transition matrix; must be non-negative with rows summing to 1.
    tol : float, optional
        Absolute tolerance of the row-sum check (np.allclose's default
        relative tolerance applies as well).
    cleanup_tol : float, optional
        Entries of P below `-cleanup_tol` are rejected as negative; solution
        entries with absolute value below `cleanup_tol` are set to 0.
    verify : bool, optional
        If True, checks that the result satisfies pi P ≈ pi (atol=1e-8).

    Returns
    -------
    np.ndarray, shape (n,)
        Non-negative vector that sums to 1.

    Raises
    ------
    ValueError
        If P is not square, has negative entries, or its rows do not sum to 1.
    RuntimeError
        If no valid distribution can be recovered or verification fails.
    """
    P = np.asarray(P, dtype=float)
    # Check dimensionality before indexing into P.shape.
    if P.ndim != 2 or P.shape[0] != P.shape[1]:
        raise ValueError("P must be square.")
    n = P.shape[0]

    # Validate Markov matrix
    if np.any(P < -cleanup_tol):
        raise ValueError("P has negative entries.")
    # Use the caller-supplied tolerance (previously `tol` was accepted but ignored).
    if not np.allclose(P.sum(axis=1), 1.0, atol=tol):
        raise ValueError("Rows of P must sum to 1.")

    A = P.T - np.eye(n)

    # Add normalization as an extra equation (least squares)
    A_aug = np.vstack([A, np.ones((1, n))])
    b_aug = np.zeros(n + 1)
    b_aug[-1] = 1.0

    # Least-squares solution (works even if A is singular)
    pi, *_ = np.linalg.lstsq(A_aug, b_aug, rcond=None)

    # Cleanup / project to simplex
    pi[np.abs(pi) < cleanup_tol] = 0.0
    pi = np.maximum(pi, 0.0)
    s = pi.sum()
    if s <= 0 or not np.isfinite(s):
        raise RuntimeError("Could not recover a valid distribution.")
    pi /= s

    if verify and not np.allclose(pi @ P, pi, atol=1e-8):
        # If periodic/reducible, this should still pass; if not, input likely invalid/ill-conditioned
        raise RuntimeError("Result does not satisfy pi P ≈ pi (within tolerance).")

    return pi


-----
-----

### Expected hitting time function

This function computes expected hitting times to a given **target set of states** in a finite Markov chain with transition matrix \( P \).

We consider a Markov chain with state space \( \{0, 1, \dots, n-1\} \) and transition matrix

$$
P = (P_{ij})_{i,j=0}^{n-1},
$$

where \( P_{ij} = \mathbb{P}(X_{t+1} = j \mid X_t = i) \).

Given a set of **target states** \( T \subset \{0, \dots, n-1\} \), the *hitting time* of \( T \) is

$$
T_{\text{hit}} = \min\{ t \ge 0 : X_t \in T \}.
$$

For each state \( i \), we define the expected hitting time

$$
h(i) = \mathbb{E}[T_{\text{hit}} \mid X_0 = i].
$$

These satisfy

$$
h(i) = 0 \quad \text{for } i \in T,
$$

and for \( i \notin T \),

$$
h(i) = 1 + \sum_{j=0}^{n-1} P_{ij} h(j).
$$

If we collect the non-target states into a set \( S = \{0, \dots, n-1\} \setminus T \), and form the submatrix \( Q \) of \( P \) with rows and columns indexed by \( S \), then the vector \( h_S = (h(i))_{i \in S} \) solves

$$
(I - Q) h_S = \mathbf{1},
$$

where \( \mathbf{1} \) is a vector of ones.

The function `expected_hitting_time` implements this:

- **Parameters**
  - `P`: `np.ndarray` of shape `(n, n)`  
    Transition matrix of the Markov chain.
  - `target_states`: iterable of integers  
    Indices of the target states \( T \).
  - `start_state` (optional): integer  
    If provided, the function returns \( h(\text{start\_state}) \).
  - `start_dist` (optional): 1D array-like of length `n`  
    Initial distribution \( \alpha \). If provided, the function returns
    $$
    \mathbb{E}[T_{\text{hit}}] = \sum_{i=0}^{n-1} \alpha_i h(i).
    $$

- **Return value**
  - If `start_state` is given: a single float, \( h(\text{start\_state}) \).
  - If `start_dist` is given: a single float, the expected hitting time under that initial distribution.
  - If neither is given: a length-`n` NumPy array, containing \( h(i) \) for all states `i` (targets get value `0`).

- **Usage example (this exam problem)**

For the three-region chain

$$
P = \begin{pmatrix}
0.3 & 0.4 & 0.3 \\
0.2 & 0.5 & 0.3 \\
0.4 & 0.3 & 0.3
\end{pmatrix},
$$

with downtown = state 0 and suburbs = state 1:

```python
P = np.array([
    [0.3, 0.4, 0.3],  # Downtown
    [0.2, 0.5, 0.3],  # Suburbs
    [0.4, 0.3, 0.3],  # Countryside
])

ET_suburbs_to_downtown = expected_hitting_time(P, target_states=[0], start_state=1)
print(ET_suburbs_to_downtown)  # should be 50/13 ≈ 3.8461538
```


In [6]:

import numpy as np

def expected_hitting_time(P, target_states, start_state=None, start_dist=None):
    """
    Compute expected hitting times to a given set of target states in a finite Markov chain.

    Parameters
    ----------
    P : np.ndarray, shape (n, n)
        Transition matrix of the Markov chain.
    target_states : iterable of int
        Indices of the target states (each in 0, ..., n-1).
    start_state : int, optional
        If provided, return the expected hitting time starting from this state.
    start_dist : array-like, shape (n,), optional
        If provided, return the expected hitting time under this initial distribution.

    Returns
    -------
    float or np.ndarray
        - If start_state is given: expected hitting time from that state.
        - If start_dist is given: expected hitting time under that distribution.
        - If neither is given: array h of length n with expected hitting times
          from all states (targets have value 0).

    Raises
    ------
    ValueError
        If both start_state and start_dist are given, if start_dist does not
        have length n, or if target_states is empty while the chain has states
        (the hitting time would be infinite).
    IndexError
        If a target index lies outside 0, ..., n-1.

    Notes
    -----
    This solves the linear system

        (I - Q) h_S = 1

    where Q is the submatrix of P restricted to non-target states,
    and 1 is a vector of ones. Assumes that the target set is hit
    with probability 1 from the relevant starting states.
    """
    # Validate the request up front so that every code path (including the
    # "all states are targets" shortcut) rejects inconsistent arguments.
    if (start_state is not None) and (start_dist is not None):
        raise ValueError("Provide either start_state or start_dist, not both.")

    P = np.asarray(P, dtype=float)
    n = P.shape[0]

    target_states = np.array(sorted(set(target_states)), dtype=int)
    # The array is sorted, so checking its two ends covers every index.
    if target_states.size and (target_states[0] < 0 or target_states[-1] >= n):
        raise IndexError("target_states must contain state indices between 0 and n-1.")

    if start_dist is not None:
        start_dist = np.asarray(start_dist, dtype=float)
        if start_dist.shape[0] != n:
            raise ValueError("start_dist must have length equal to number of states.")

    # Non-target states S, obtained from a boolean mask instead of a
    # per-state membership scan.
    is_target = np.zeros(n, dtype=bool)
    is_target[target_states] = True
    non_target_states = np.arange(n, dtype=int)[~is_target]

    # Targets keep h = 0; if every state is a target there is nothing to solve.
    h = np.zeros(n, dtype=float)
    if non_target_states.size > 0:
        if target_states.size == 0:
            raise ValueError(
                "target_states must contain at least one state; "
                "with no targets the hitting time is infinite."
            )

        # Build Q and solve (I - Q) h_S = 1
        Q = P[np.ix_(non_target_states, non_target_states)]
        I = np.eye(Q.shape[0])
        ones = np.ones(Q.shape[0])
        h[non_target_states] = np.linalg.solve(I - Q, ones)

    # Return according to user request
    if start_state is not None:
        return float(h[start_state])

    if start_dist is not None:
        return float(start_dist @ h)

    return h


In [7]:
# Example from above code: 

# Three-region chain: state 0 = Downtown, 1 = Suburbs, 2 = Countryside.
P = np.array(
    [
        [0.3, 0.4, 0.3],  # from Downtown
        [0.2, 0.5, 0.3],  # from Suburbs
        [0.4, 0.3, 0.3],  # from Countryside
    ]
)

# Expected number of steps to first reach Downtown (state 0) when starting in Suburbs (state 1).
ET_suburbs_to_downtown = expected_hitting_time(P, start_state=1, target_states=[0])
print(ET_suburbs_to_downtown)  # 50/13 ≈ 3.846153846153846


3.846153846153846


-----
# This is how you can numerically find M on a finite interval for the accept–reject (rejection) sampling algorithm:



In [None]:
import numpy as np

def f_x(x):
    """Target pdf f(x) = e^x (it integrates to 1 on the interval (0, ln 2))."""
    return np.exp(x)


def g_x(x):
    """Proposal pdf: uniform on (0, ln 2), i.e. the constant 1/ln 2 (x is ignored)."""
    return 1/np.log(2)


# Evaluation grid -- replace the end points with the interval of your own problem.
xs = np.linspace(0, np.log(2), num=1000)

# Pointwise ratio f/g on the grid.
ratio = f_x(xs) / g_x(xs)

# M must satisfy f(x) <= M * g(x) everywhere, so the smallest valid M is sup f/g.
# The grid maximum only approximates that supremum (it can slightly underestimate
# it if the true maximum lies between grid points); here f/g is increasing, so the
# maximum is attained at the right end point, which is on the grid.
M_num = np.max(ratio)

print("Answer: ", 2*np.log(2))

print("Numeric M ≈", M_num)   # should be close to 2*np.log(2)


-----
# These are the different concentration inequalities (Hoeffding and alternatives) from the lecture notes:

## Summary of Concentration Inequalities from the Lecture Notes

This cell summarizes the main inequalities used in Monte Carlo estimation and empirical distribution analysis:  
- Hoeffding’s inequality for Monte Carlo means  
- The Dvoretzky–Kiefer–Wolfowitz (DKW) inequality  
- Alternative concentration inequalities listed in the notes  
For each formula we also show how to solve for $\varepsilon$ when forming a confidence interval.

---

## 1. Hoeffding’s Inequality for Monte Carlo Estimation

Assume we estimate a mean using  
$$
\overline{Y} = \frac{1}{n}\sum_{i=1}^n Y_i,
$$  
where the samples satisfy $Y_i \in [a,b]$.

The **two-sided Hoeffding bound** is:
$$
\mathbb{P}\left( \left| \overline{Y} - \mathbb{E}[Y] \right| \ge \varepsilon \right)
\;\le\; 2 \exp\left( \frac{-2 n \varepsilon^2}{(b-a)^2} \right).
$$

### Solving for $\varepsilon$

Set the right-hand side equal to $\delta$:
$$
2 \exp\left( \frac{-2 n \varepsilon^2}{(b-a)^2} \right) = \delta.
$$

Solving gives:
$$
\varepsilon = (b-a)\sqrt{\frac{\ln(2/\delta)}{2n}}.
$$

Thus a $(1-\delta)100\%$ confidence interval is:
$$
\left[\, \overline{Y} - \varepsilon,\; \overline{Y} + \varepsilon \,\right].
$$

---

## 2. Dvoretzky–Kiefer–Wolfowitz (DKW) Inequality

For the empirical CDF $F_n(x)$ based on i.i.d. samples with true CDF $F(x)$:

$$
\mathbb{P}\!\left( \sup_x |F_n(x) - F(x)| \ge \varepsilon \right)
\;\le\; 2 \exp(-2n\varepsilon^2).
$$

### Solving for $\varepsilon$

Set the right-hand side equal to $\delta$:

$$
2 \exp(-2n\varepsilon^2) = \delta.
$$

Solving gives:
$$
\varepsilon = \sqrt{\frac{\ln(2/\delta)}{2n}}.
$$

Useful for constructing confidence bands:
$$
F_n(x) - \varepsilon \le F(x) \le F_n(x) + \varepsilon.
$$

---

## 3. Other Alternatives Mentioned in the Lecture Notes

### (a) Chebyshev’s Inequality
Assuming finite variance $\sigma^2$:

$$
\mathbb{P}\left( |\overline{Y} - \mathbb{E}[Y]| \ge \varepsilon \right)
\le \frac{\sigma^2}{n\varepsilon^2}.
$$

Solving for $\varepsilon$ by setting RHS = $\delta$:
$$
\varepsilon = \sigma \sqrt{\frac{1}{n\delta}}.
$$

---

### (b) Central Limit Theorem (CLT) Approximation
For large $n$:

$$
\overline{Y} \approx \mathcal{N}\!\left(\mathbb{E}[Y],\, \frac{\sigma^2}{n}\right).
$$

A $(1-\delta)$ interval is:
$$
\overline{Y} \;\pm\; z_{1-\delta/2}\,\frac{\sigma}{\sqrt{n}},
$$
where $z_{1-\delta/2}$ is the standard normal quantile.

---

### (c) Bernstein (or Chernoff–Hoeffding) Inequality  
Sometimes given in extended form when variance is known. In bounded case (same assumptions as Hoeffding):

$$
\mathbb{P}\!\left( |\overline{Y} - \mathbb{E}[Y]| \ge \varepsilon \right)
\le 2 \exp\!\left( 
\frac{-n\varepsilon^2}{2\sigma^2 + \frac{2}{3}(b-a)\varepsilon}
\right).
$$

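Solving for $\varepsilon$: setting the right-hand side equal to $\delta$ and writing $L = \ln(2/\delta)$ gives the quadratic $n\varepsilon^2 - \tfrac{2}{3}(b-a)L\,\varepsilon - 2\sigma^2 L = 0$, whose positive root is
$$
\varepsilon = \frac{1}{n}\left( \frac{(b-a)L}{3} + \sqrt{\frac{(b-a)^2 L^2}{9} + 2 n \sigma^2 L} \right).
$$
So a closed form exists when $\sigma^2$ is known; a numerical solver is not required.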

---

## Summary Table of $\varepsilon$ Solutions

| Inequality | Bound | Solution for $\varepsilon$ |
|-----------|-------|-------------------------------|
| Hoeffding | $2\exp\!\left(-\frac{2n\varepsilon^2}{(b-a)^2}\right) \le \delta$ | $\varepsilon = (b-a)\sqrt{\frac{\ln(2/\delta)}{2n}}$ |
| DKW | $2\exp(-2n\varepsilon^2) \le \delta$ | $\varepsilon = \sqrt{\frac{\ln(2/\delta)}{2n}}$ |
| Chebyshev | $\frac{\sigma^2}{n\varepsilon^2} \le \delta$ | $\varepsilon = \sigma\sqrt{\frac{1}{n\delta}}$ |
| CLT | approx | $\varepsilon = z_{1-\delta/2}\,\sigma/\sqrt{n}$ |
| Bernstein | $2\exp\!\left(-\frac{n\varepsilon^2}{2\sigma^2 + \frac{2}{3}(b-a)\varepsilon}\right) \le \delta$ | positive root of a quadratic: $\varepsilon = \frac{1}{n}\left(\frac{(b-a)L}{3} + \sqrt{\frac{(b-a)^2 L^2}{9} + 2n\sigma^2 L}\right)$ with $L = \ln(2/\delta)$ |



-----

### General information that could be important:

* .values and .to_numpy() both convert pandas DataFrames or Series into NumPy arrays; 
* .to_numpy() is the recommended modern approach.


-----
### Permutation Importance

Permutation importance is a model-agnostic method for measuring feature importance. It works by randomly permuting the values of a single feature in the test set and then measuring how much the model’s predictive performance decreases. If permuting a feature leads to a large drop in performance, the model relied heavily on that feature, and it is considered important.

This method measures the **impact of each feature on the model’s predictive performance**, rather than relying on model-specific parameters. Because it only requires the ability to make predictions and evaluate them with a chosen metric, permutation importance is applicable to **any type of predictive model**, including linear models, tree-based models, and neural networks.


In [None]:
# ============================================================
# GENERAL TEMPLATE: Permutation Importance (works for ANY model)
# ============================================================
# What you MUST change depending on your setup:
#   1) estimator      -> set this to your TRAINED model (or Pipeline)
#   2) X_test, y_test -> set these to your TEST split
#   3) feature_names  -> set these to your column names (list of strings)
#   4) scoring        -> choose a metric appropriate for your task
#
# Notes:
# - This works for any model as long as it has predict() (or predict_proba() for some scorers)
# - If you used preprocessing (scaling, one-hot encoding, etc.), it's best to wrap it in a Pipeline
#   and pass the Pipeline as the estimator to avoid mismatches.
# ============================================================

import numpy as np
import pandas as pd
from sklearn.inspection import permutation_importance

# --------------------------
# 1) CHOOSE YOUR TRAINED MODEL
# --------------------------
# CHANGE THIS:
# - If you trained a Pipeline (recommended): estimator = my_pipeline
# - If you trained a plain model:           estimator = my_model
# - The estimator must already be fitted; this template never calls fit().
#
# Examples:
# estimator = problem3_model                   # Pipeline: scaler + logistic regression
# estimator = trained_random_forest_model      # e.g., RandomForestClassifier already fit
# estimator = trained_svm_model                # e.g., SVC already fit
estimator = problem3_model  # <-- CHANGE to your trained model / pipeline


# --------------------------
# 2) PROVIDE TEST DATA
# --------------------------
# CHANGE THESE:
# - X_test should be the test features (NumPy array or pandas DataFrame)
# - y_test should be the test labels
# - Use held-out data here, so the importances reflect performance on unseen samples.
#
# Examples:
# X_test = problem3_X_test
# y_test = problem3_y_test
X_test = problem3_X_test   # <-- CHANGE if your variables are named differently
y_test = problem3_y_test   # <-- CHANGE if your variables are named differently


# --------------------------
# 3) PROVIDE FEATURE NAMES
# --------------------------
# CHANGE THIS:
# - If X_test is a pandas DataFrame, you can do: feature_names = list(X_test.columns)
# - If X_test is a NumPy array, you must supply a list yourself (same order as columns in X_test)
# - The list needs exactly one name per column of X_test; it is paired with the
#   per-feature importances when the results table is built below.
#
# Examples:
# feature_names = problem3_features
# feature_names = list(X_test.columns)
feature_names = problem3_features  # <-- CHANGE to your feature name list (correct order!)


# --------------------------
# 4) CHOOSE A SCORING METRIC
# --------------------------
# CHANGE THIS depending on your task:
# Classification examples:
#   scoring = "accuracy"            (simple, common)
#   scoring = "balanced_accuracy"   (good if classes are imbalanced)
#   scoring = "f1"                  (if you care about positive class quality; binary targets)
#   scoring = "roc_auc"             (needs probability or decision scores; many models support it)
#
# Regression examples:
#   scoring = "r2"
#   scoring = "neg_mean_squared_error"
#   scoring = "neg_mean_absolute_error"
#
# Tip:
# - If "roc_auc" fails, your estimator may not provide predict_proba/decision_function.
# - The "neg_*" scorers are negated errors, so larger (closer to 0) is still better.
scoring = "accuracy"  # <-- CHANGE if needed


# --------------------------
# 5) RUN PERMUTATION IMPORTANCE
# --------------------------
# Every feature column of X_test is shuffled n_repeats times and the resulting
# change in the chosen score is recorded.
perm = permutation_importance(
    estimator,
    X_test,
    y_test,
    scoring=scoring,
    n_repeats=30,        # more repeats -> more stable estimates (but slower)
    n_jobs=-1,           # use all available CPU cores
    random_state=42,     # fixed seed so the shuffles are reproducible
)

# --------------------------
# 6) FORMAT RESULTS
# --------------------------
# One row per feature, ordered from most to least important.
perm_df = pd.DataFrame(
    {
        "Feature": feature_names,
        "Importance Mean": perm.importances_mean,
        "Importance Std": perm.importances_std,
    }
)
perm_df = perm_df.sort_values(by="Importance Mean", ascending=False)

print("Top features by permutation importance:")
print(perm_df.head(15))

# --------------------------
# 7) OPTIONAL: GET MOST IMPORTANT FEATURE
# --------------------------
# perm_df is sorted in descending order, so the first row is the top feature.
most_important_feature = perm_df.iloc[0]["Feature"]
print("\nMost important feature (by permutation importance):", most_important_feature)


# --------------------------
# 8) OPTIONAL: RESTRICT TO A SUBSET OF FEATURES
# --------------------------
# Example: finding the "most important one-hot encoded feature".
# CHANGE the prefixes to match your own one-hot naming scheme.
# Diabetes case: one-hot columns start with smoking_ or sex_.
subset = [f for f in feature_names if str(f).startswith(("smoking_", "sex_"))]

if not subset:
    print("\nSubset list is empty. Adjust the subset selection rule to match your feature names.")
else:
    perm_subset = perm_df.loc[perm_df["Feature"].isin(subset)].sort_values(
        by="Importance Mean", ascending=False
    )
    print("\nPermutation importance for subset features:")
    print(perm_subset)

    most_important_in_subset = perm_subset.iloc[0]["Feature"]
    print("\nMost important feature in subset:", most_important_in_subset)


### Remember:
If the dataset does not have a header row, pass `header=None`, i.e. `pd.read_csv("data", header=None)`. Otherwise pandas uses the first data row as the column names and we lose one row of data.

-----
### Using Utils.py file

If the exam question requires the file Utils.py, then simply copy paste the file into the current folder I am in and then the code will be able to find the file Utils.py. 

-----

-----
# This is general information when to use which data set in the exam:

## When to use each dataset and variable

The data in this assignment is split into **training**, **validation**, and **test** sets. Each set has a specific role, and using them correctly is essential to avoid data leakage and to obtain an unbiased evaluation.

---

## Training set: fit the model

**Variables**
- `PROBLEM3_X_train`
- `PROBLEM3_y_train`

**When to use**
Use the training set **only to learn the model parameters**.

**Typical operations**
- Fit a model:

```python
    model.fit(PROBLEM3_X_train, PROBLEM3_y_train)
```


- Do **not** compute performance metrics or choose thresholds using training data.

**Purpose**
The training set teaches the model the relationship between features and labels.

---

## Validation set: model selection and threshold choice

**Variables**
- `PROBLEM3_X_val`
- `PROBLEM3_y_val` (or `PROBLEM3_y_true_val`)
- `PROBLEM3_y_pred_proba_val`

**When to use**
Use the validation set to **make decisions about the model**, such as:
- choosing a classification threshold,
- comparing different loss functions,
- computing cost, precision, recall, and 0–1 loss.

**Typical operations**
- Predict probabilities:

```python
    y_pred_proba_val = model.predict_proba(PROBLEM3_X_val)[:,1]
```

- Convert probabilities to predictions:

```python
    y_pred_val = (y_pred_proba_val >= threshold).astype(int)
```

- Compute metrics:
- cost
- precision
- recall
- 0–1 loss

**Purpose**
The validation set is used to **tune decisions** without biasing the final evaluation.

---

## Test set: final evaluation only

**Variables**
- `PROBLEM3_X_test`
- `PROBLEM3_y_test` (or `PROBLEM3_y_true_test`)
- `PROBLEM3_y_pred_proba_test`

**When to use**
Use the test set **only after**:
- the model has been trained,
- the threshold has been chosen using validation data.

**Typical operations**
- Predict probabilities:

```python
    y_pred_proba_test = model.predict_proba(PROBLEM3_X_test)[:,1]
```

- Evaluate final performance:
- compute final cost,
- build a confidence interval,
- report final metrics.

**Purpose**
The test set provides an **unbiased estimate of real-world performance**.

---

## Summary table (conceptual)

- Training set → **fit the model**
- Validation set → **choose thresholds and compare decision rules**
- Test set → **final evaluation and confidence intervals**

---

## Important rules to remember

- Never choose thresholds using the test set.
- Never report final performance using the validation set.
- The test set must only be used **once**, at the very end.
- Predicted probabilities (`predict_proba`) are used for **threshold-based decisions**.
- Binary predictions (`>= threshold`) are used for **cost, precision, recall, and loss**.

Following these rules ensures a correct and exam-safe machine learning workflow.


# General Guide: Which Dataset to Use, When, and Why (Logistic Regression & Classification)

THIS RESPONSE IS **INTENTIONALLY AND EXCLUSIVELY** A SINGLE MARKDOWN TEXT CELL.  
There is **NO TEXT OUTSIDE THIS BLOCK**.  
You can copy **once** and paste directly into a Jupyter Notebook **Markdown cell**.

---

## 1. Defining Features and Target

### Features (`X`)
- Features are the **input variables** used by the model to make predictions.
- These typically include:
  - Numerical variables (e.g. age, BMI, blood glucose)
  - One-Hot encoded categorical variables (e.g. `sex_Male`, `smoking_former`)
- **Rule**: Features must represent information that is available **before** a prediction is made.
- **Rule**: Never include the target variable inside the feature set.

### Target (`y`)
- The target is what the model is trying to predict.
- For classification:
  - Binary variable (e.g. diabetes = 0 or 1)
- **Rule**: The target must NEVER be included among the features.

```python
problem3_X = problem3_df[feature_columns].values
problem3_y = problem3_df[target_column].values
```

## 2. Train–Test Split (Why and How)

Training Dataset
- Used to train (fit) the model.
- The model learns patterns from this data.

Test Dataset
- Used to evaluate final performance.
- Simulates unseen, real-world data.
- Must NEVER be used during training.

Standard Split
- 80% training
- 20% testing

```python
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    problem3_X,
    problem3_y,
    test_size=0.2,
    random_state=42
)
```


## 3. Training the Model (.fit())
Which Dataset Goes Into .fit()?
- ONLY the training dataset.

```python
  from sklearn.linear_model import LogisticRegression

  model = LogisticRegression(C=1.0, max_iter=1000)
  model.fit(X_train, y_train)
```

Notes
- C controls regularization:
  - Smaller C → stronger penalization
  - Important when many One-Hot encoded features exist
- max_iter is increased to avoid convergence warnings


## 4. Making Predictions (.predict())
Which Dataset Goes Into .predict()?

| Purpose | Dataset |
|--------|---------|
| Train the model | `X_train`, `y_train` |
| Final model evaluation | `X_test`, `y_test` |
| Generate predicted class labels | `X_test` |
| Generate predicted probabilities | `X_test` |
| Predict on completely new / unseen data | New data with the **same feature structure** as `X_train` |
| Debugging or sanity checks only | `X_train` (NOT for evaluation) |
| Compute precision / recall | Compare `y_test` with predictions from `X_test` |
| Compute confidence intervals | Predictions made on `X_test` |
| Extract feature importance | Trained model (`model.coef_`) |

```python
  y_pred = model.predict(X_test)
```
Rule: Never evaluate performance using predictions from X_train.


## 5. Probability Predictions (.predict_proba())
Used when:
- You want class probabilities instead of labels
- You want threshold-based decisions
- You want confidence-aware analysis

```python
  y_prob = model.predict_proba(X_test)
```
* Output shape: (n_samples, 2)
  * Column 0 → probability of class 0
  * Column 1 → probability of class 1


## 6. Evaluation Metrics: Precision & Recall
Precision
* Of all predicted positives, how many are correct?
* Interpretation:
  * “When the model predicts diabetes, how often is it right?”

Recall
* Of all actual positives, how many were found?
* Interpretation:
  * “How many diabetes cases did the model detect?”

```python
  from sklearn.metrics import precision_score, recall_score

  precision_1 = precision_score(y_test, y_pred, pos_label=1)
  recall_1 = recall_score(y_test, y_pred, pos_label=1)

  precision_0 = precision_score(y_test, y_pred, pos_label=0)
  recall_0 = recall_score(y_test, y_pred, pos_label=0)
```

## 7. Feature Importance (Logistic Regression)

How Feature Importance Is Defined
* Logistic Regression uses coefficients
* Larger absolute coefficient ⇒ stronger influence

```python
  coefficients = model.coef_[0]
```

One-Hot Encoded Features
* Compare absolute values of coefficients
* The most important One-Hot encoded feature is the one with the largest absolute coefficient

```python
  important_idx = np.argmax(np.abs(coefficients))
  important_feature = feature_columns[important_idx]
```

## 8. What Dataset to Use for Each Task (Summary Table)
| Purpose | Dataset |
|--------|---------|
| Train the model | `X_train`, `y_train` |
| Final model evaluation | `X_test`, `y_test` |
| Generate predicted class labels | `X_test` |
| Generate predicted probabilities | `X_test` |
| Predict on completely new / unseen data | New data with the **same feature structure** as `X_train` |
| Debugging or sanity checks only | `X_train` (NOT for evaluation) |
| Compute precision / recall | Compare `y_test` with predictions from `X_test` |
| Compute confidence intervals | Predictions made on `X_test` |
| Extract feature importance | Trained model (`model.coef_`) |


## 9. Exam-Safe Golden Rules (MEMORIZE)
* Never train on test data
* Never evaluate on training data
* .fit() → training data ONLY
* .predict() → test or unseen data ONLY
* Metrics → always computed using y_test
* One-Hot feature importance → coefficient magnitude
* If unsure: ask which dataset is allowed before using it




-----



-----

-----
## How to think when choosing a proposal distribution \( g(x) \) in rejection sampling

When facing a difficult rejection sampling problem, do **not** start by guessing formulas. Instead, follow this reasoning process.

---

### 1) Locate where the probability mass is
Ask: *Where does the distribution actually concentrate its mass?*

Look for terms like:
- $e^{-1/x}$, $e^{x^2}$, $x^\alpha$
- behavior near boundaries (0, infinity, endpoints)

**Rule:**  
Your proposal must put mass where the target puts mass.

---

### 2) Identify the dominant term
Ignore constants and lower-order factors at first.

Ask: *Which part of the density controls the shape?*

Examples:
- $( e^{-x} )$ → exponential
- $( e^{-x^2} )$ → Gaussian-like
- $( e^{-1/x} )$ → strong boundary concentration

**Rule:**  
Match the dominant term first; fix the rest using rejection.

---

### 3) Consider a change of variables
If the density contains:
- $1/x$, $\log x$, or sharp boundary behavior

Ask: *Would this look simpler in another variable?*

Common transformations:
- $( Y = 1/X )$ for $( e^{-1/x} )$
- $( Y = \log X )$ for multiplicative scales

**Rule:**  
If the density is ugly in $( x )$, change coordinates.

---

### 4) Choose a proposal that is easy to sample from
Good proposals:
- Uniform (only if the target is fairly flat)
- Exponential or shifted exponential
- Gaussian

Bad proposals:
- hard-to-invert CDFs
- complicated expressions

**Rule:**  
If sampling from $( g(x) )$ is hard, you chose the wrong proposal.

---

### 5) Immediately check the ratio $( f(x)/g(x) )$
Before coding, compute:
$$
\frac{f(x)}{g(x)}
$$

Ask:
- Is it bounded?
- Does it simplify?
- Do exponentials cancel?

**Rule:**  
If exponentials cancel and the ratio is simple, the proposal is good.

---

### 6) Estimate the rejection constant $( M )$
Check where the maximum of $( f(x)/g(x) )$ occurs:
- often at boundaries
- sometimes at symmetry points

Good signs:
- $( M \approx 1 )$: very efficient
- $( M \gg 10 )$: rethink your proposal

---

### 7) Key mindset
Rejection sampling is not mechanical algebra — it is **distribution engineering**:
- understand the shape
- match it intelligently
- use rejection only to correct small differences

---

### One-line checklist (exam-ready)
1. Where is the mass?  
2. What term dominates?  
3. Should I change variables?  
4. Can I sample from $( g )$ easily?  
5. Is $( f/g )$ bounded?  
6. Is $( M )$ small?

If all answers are yes, your choice of $( g(x) )$ is good.

-----