In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression

PROBLEM 1: Data analysis using Markov chains

In this problem, you will empirically analyze a Markov chain 
with a finite state space. Transition probabilities are unknown.

The state space is:
    S = {0, 1, 2, 3}

You are given 20 observed transitions (X_t, X_{t+1}) for t = 0..19, i.e. one trajectory X_0, ..., X_20.

Tasks:
1. Estimate the transition matrix P from the observed transitions.
2. Verify that the estimated matrix is a probability transition matrix.
3. Compute the stationary distribution pi of the chain.
4. Simulate the chain using the estimated transition matrix
5. Compute the expected hitting times via

   (a) Simulation

   (b) Solving linear equations (analytical hitting times). 

Compare the estimates and interpret the results


In [None]:
import numpy as np

# State space of the chain: the integer labels 0, 1, 2, 3.
S = list(range(4))
N_states = len(S)

# Observed one-step transitions; every row is a (current_state, next_state)
# pair.  Each row starts in the state where the previous row ended, so the 20
# rows together trace one trajectory through 21 visited states.
X_t = np.array(
    [
        (0, 1), (1, 2), (2, 3), (3, 0), (0, 1),
        (1, 1), (1, 2), (2, 2), (2, 3), (3, 3),
        (3, 0), (0, 2), (2, 1), (1, 3), (3, 1),
        (1, 0), (0, 0), (0, 1), (1, 2), (2, 0),
    ],
    dtype=int,
)




Below are methods that you need to complete

In [None]:
# 1.1
def comp_transition_matrix(transitions, n_states):
    """
    Estimate the transition matrix P from observed transitions.

    The estimate is the row-normalised count matrix (the maximum-likelihood
    estimate): P_hat[i, j] = (# observed i -> j) / (# observed i -> anything).

    Args:
        transitions: array-like of shape (n_samples, 2); each row is a
            (current_state, next_state) pair of integer labels in
            0 .. n_states - 1.
        n_states: number of states

    Returns:
        P_hat: estimated transition matrix of shape (n_states, n_states).
            A state that never occurs as a current state has no data to
            estimate from; its row is left as all zeros (and therefore does
            not sum to 1).

    Raises:
        ValueError: if a state label lies outside 0 .. n_states - 1.
    """
    pairs = np.asarray(transitions, dtype=int).reshape(-1, 2)
    if pairs.size and (pairs.min() < 0 or pairs.max() >= n_states):
        raise ValueError(
            f"state labels must lie in 0..{n_states - 1}, "
            f"got values in {pairs.min()}..{pairs.max()}"
        )

    counts = np.zeros((n_states, n_states))
    # np.add.at accumulates repeated (i, j) pairs; a plain fancy-indexed "+="
    # would count every distinct pair only once.
    np.add.at(counts, (pairs[:, 0], pairs[:, 1]), 1.0)

    # Normalise each row by its number of departures; rows with no departures
    # are skipped so that they stay zero instead of becoming NaN.
    row_totals = counts.sum(axis=1, keepdims=True)
    P_hat = np.divide(
        counts,
        row_totals,
        out=np.zeros_like(counts),
        where=row_totals > 0,
    )

    return P_hat


#  1.2
def is_transition_matrix(P, tol=1e-8):
    """
    Check if P is a transition matrix.

    P qualifies when it is a non-empty square 2-D array of finite numbers,
    every entry is non-negative and every row sums to 1.

    Args:
        P: array-like candidate matrix.
        tol: absolute tolerance used for the non-negativity and row-sum
            checks, to absorb floating-point round-off.

    Returns:
        True if all conditions hold, False otherwise (including when P
        cannot be converted to a numeric array).
    """
    try:
        P = np.asarray(P, dtype=float)
    except (TypeError, ValueError):
        return False

    if P.ndim != 2 or P.shape[0] != P.shape[1] or P.shape[0] == 0:
        return False
    if not np.all(np.isfinite(P)):
        return False
    if np.any(P < -tol):
        return False

    row_sums = P.sum(axis=1)
    return bool(np.all(np.abs(row_sums - 1.0) <= tol))


# 1.3
def stationary_distribution(P):
    """
    Compute stationary distribution

    Solves the linear system  pi P = pi,  sum(pi) = 1  in the least-squares
    sense (the other common route is the left eigenvector of P for
    eigenvalue 1).

    Args:
        P: square transition matrix (rows sum to 1).

    Returns:
        pi: 1D array with one entry per state, non-negative and summing to 1.
            When the chain has several stationary distributions, the one
            returned is whichever least-squares solution numpy selects.

    Raises:
        ValueError: if P is not a square 2-D matrix.
    """
    P = np.asarray(P, dtype=float)
    if P.ndim != 2 or P.shape[0] != P.shape[1]:
        raise ValueError(f"P must be a square matrix, got shape {P.shape}")
    n = P.shape[0]

    # pi P = pi  <=>  (P^T - I) pi^T = 0.  These n equations are linearly
    # dependent, so the normalisation sum(pi) = 1 is appended as an extra row.
    A = np.vstack([P.T - np.eye(n), np.ones((1, n))])
    b = np.zeros(n + 1)
    b[-1] = 1.0

    pi, *_ = np.linalg.lstsq(A, b, rcond=None)

    # Remove round-off (tiny negative entries) and renormalise.
    pi = np.clip(pi, 0.0, None)
    pi = pi / pi.sum()

    return pi



def simulate_chain(P, start_state, n_steps):
    """
    Simulate a Markov chain trajectory with a fixed random seed.

    Args:
        P: transition matrix; row i holds the distribution of the next state
            given that the current state is i.
        start_state: state at time 0.
        n_steps: number of transitions to simulate.

    Returns: array of visited states of length n_steps + 1

    Note: the next state is drawn with rng.choice, which rejects a row of P
    that is not a valid probability vector.
    """
    seed = 1234 # don't change that

    rng = np.random.default_rng(seed)

    P = np.asarray(P, dtype=float)
    n_states = P.shape[0]

    path = np.zeros(n_steps + 1, dtype=int)
    path[0] = start_state

    # Draw each next state from the row of P that belongs to the current state.
    for t in range(1, n_steps + 1):
        path[t] = rng.choice(n_states, p=P[path[t - 1]])

    return path



def hitting_times_sim(P, start_state, n_sim=10_000, max_steps=10_000):
    """
    Estimate expected hitting times E[T_{start -> j}] for ALL states j.

    The hitting time is T_j = min{t >= 0 : X_t = j}, so the start state is
    hit at time 0.  Each simulated trajectory is followed until every state
    has been visited (or max_steps is reached), and the time of the first
    visit to each state is recorded from that single trajectory.

    Args:
        P: transition matrix; the number of states is taken from its shape.
        start_state: state the chain starts in.
        n_sim: number of independent trajectories to average over.
        max_steps: cap on the length of one trajectory, so that a state which
            cannot be reached does not cause an endless loop.

    Returns:
        est: 1D array, where est[j] the estimated expected steps to hit state j from start_state.
            est[j] is inf if some trajectory did not reach j within
            max_steps; all entries are NaN if n_sim <= 0.
    """
    P = np.asarray(P, dtype=float)
    n_states = P.shape[0]

    if n_sim <= 0:
        return np.full(n_states, np.nan, dtype=float)

    seed = 1234
    rng = np.random.default_rng(seed)

    totals = np.zeros(n_states)
    for _ in range(n_sim):
        # inf marks "not visited yet"; the start state is visited at time 0.
        first_hit = np.full(n_states, np.inf)
        first_hit[start_state] = 0.0

        state = start_state
        n_unseen = n_states - 1
        step = 0
        while n_unseen > 0 and step < max_steps:
            step += 1
            state = rng.choice(n_states, p=P[state])
            if first_hit[state] == np.inf:
                first_hit[state] = step
                n_unseen -= 1

        totals += first_hit

    est = totals / n_sim

    return est



def theoretical_hitting_times(P, start_state):
    """
    Compute the expected hitting times E[T_{start -> j}] for all states j by
    solving linear equations.

    With T_j = min{t >= 0 : X_t = j}, the expected hitting times h_i = E_i[T_j]
    satisfy  h_j = 0  and  h_i = 1 + sum_{k != j} P[i, k] * h_k  for i != j,
    i.e. (I - Q) h = 1 where Q is P with row and column j removed.

    Args:
        P: transition matrix; the number of states is taken from its shape.
        start_state: state the chain starts in.

    Returns:
        hit_theor: 1D array, hit_theor[j] is the expected number of steps to
            reach j from start_state.  The entry for start_state itself is 0
            (the chain is already there).  An entry is inf when the linear
            system for that target is singular, which happens for instance
            when some state can never leave for the target.
    """
    P = np.asarray(P, dtype=float)
    n_states = P.shape[0]

    hit_theor = np.full(n_states, np.nan, dtype=float)

    for target in range(n_states):
        if target == start_state:
            # Already at the target at time 0.
            hit_theor[target] = 0.0
            continue

        # Restrict the chain to the non-target states and solve (I - Q) h = 1.
        others = [s for s in range(n_states) if s != target]
        Q = P[np.ix_(others, others)]
        try:
            h = np.linalg.solve(np.eye(len(others)) - Q, np.ones(len(others)))
        except np.linalg.LinAlgError:
            hit_theor[target] = np.inf
            continue

        hit_theor[target] = h[others.index(start_state)]

    return hit_theor

When you are done, run the following cell and then call `problem1_main()` (no need to implement anything else)

In [None]:
def problem1_main():
    """
    Run the Problem 1 pipeline and print every intermediate result.

    Uses the module-level X_t and N_states: estimates the transition matrix,
    prints whether it is a valid transition matrix, then prints a table that
    compares simulated and analytical hitting times from state 0.
    """
    print("\n=== Problem 1: Markov chain estimation + hitting times ===")

    # Step 1: transition matrix estimated from the observed transitions.
    transition_estimate = comp_transition_matrix(X_t, N_states)
    print("Estimated P_hat:\n", np.round(transition_estimate, 3))

    # Step 2: sanity check of the estimate.
    print("Is valid transition matrix?", is_transition_matrix(transition_estimate))

    # Step 3: expected number of steps from the origin state to every state,
    # once by Monte Carlo simulation and once from the linear system.
    origin = 0
    simulated = hitting_times_sim(transition_estimate, start_state=origin, n_sim=5000)
    analytical = theoretical_hitting_times(transition_estimate, start_state=origin)

    # Step 4: side-by-side comparison, one row per target state.
    comparison = pd.DataFrame(
        {
            "target_state": np.arange(N_states),
            "MC_estimate": simulated,
            "theoretical": analytical,
            "abs_diff": np.abs(simulated - analytical),
        }
    )
    print("\nComparison table:\n", comparison)

PROBLEM 2: Cost-Sensitive Classification

You are given a binary classification problem for fraud detection.

Class labels:

    y = 1 => fraud

    y = 0 => ok



The costs of classification outcomes are:
    TP = 0, TN = 0, FP = 100, FN = 500

Tasks:
1. Train an SVM classifier.
2. Compute classification costs at a fixed threshold (0.5).
3. Evaluate total cost for multiple probability thresholds.
4. Find the threshold that minimizes total cost.

In [None]:
import numpy as np
import pandas as pd

# Cost charged per classification outcome: correct decisions (TP, TN) are
# free, a false positive costs 100 and a false negative costs 500.
costs = dict(TP=0, TN=0, FP=100, FN=500)


def generate_fraud_table(seed=0, n=3000, fraud_rate=0.05):
    """
    Build a synthetic fraud dataset and return it as one table.

    Columns of the returned DataFrame:
        - x1, x2, x3: numerical features drawn from a standard normal
        - fraud: binary target (1 = fraud, 0 = legitimate)

    For fraud rows x1 is shifted by +2.0 and x2 by +1.0; x3 is left
    unshifted.  The same seed always yields the same table.
    """
    rng = np.random.default_rng(seed)

    # The draw order fixes the random stream: labels first, then x1, x2, x3.
    is_fraud = rng.random(n) < fraud_rate
    x1, x2, x3 = (rng.normal(0, 1, size=n) for _ in range(3))

    return pd.DataFrame(
        {
            # Only the fraud rows receive the shift.
            "x1": np.where(is_fraud, x1 + 2.0, x1),
            "x2": np.where(is_fraud, x2 + 1.0, x2),
            "x3": x3,
            "fraud": is_fraud.astype(int),
        }
    )


# Build the dataset once with the generator's default settings, spelled out
# here for readability.
fraud_data = generate_fraud_table(seed=0, n=3000, fraud_rate=0.05)

# Preview of the first five rows.
fraud_data.head(5)

Unnamed: 0,x1,x2,x3,fraud
0,-0.250243,-0.863902,-0.307019,0
1,-0.380736,0.018756,-0.559577,0
2,1.126431,2.055912,0.973126,1
3,0.806991,2.10416,-0.211368,1
4,0.059649,0.652374,-0.437259,0


Fill in the methods in the cell below:

In [None]:
#from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC


def train_test_split_table(df, test_size=0.3, seed=0, target="fraud"):
    """
    Split a data table into training and test sets.

    The split is stratified (each class of the target contributes the same
    fraction of rows to the test set) and seeded, so repeated calls with the
    same arguments return exactly the same split.

    Args:
        df: data table holding the feature columns and the target column.
        test_size: fraction of each class assigned to the test set,
            strictly between 0 and 1.
        seed: seed of the random generator that picks the test rows.
        target: name of the target column; every other column is a feature.

    Returns:
        X_train, X_test, y_train, y_test
        (rows keep their original index and relative order)

    Raises:
        KeyError: if the target column is missing.
        ValueError: if test_size is not strictly between 0 and 1.
    """
    if target not in df.columns:
        raise KeyError(f"target column {target!r} not found in the table")
    if not 0.0 < test_size < 1.0:
        raise ValueError(f"test_size must be in (0, 1), got {test_size}")

    # Features are all columns except the target.
    X = df.drop(columns=[target])
    y = df[target]

    rng = np.random.default_rng(seed)
    labels = y.to_numpy()
    is_test = np.zeros(len(df), dtype=bool)

    # Pick the test rows class by class so that a rare class is represented
    # in both parts in the same proportion.
    for label in np.unique(labels):
        positions = np.flatnonzero(labels == label)
        n_test = int(round(test_size * len(positions)))
        is_test[rng.permutation(positions)[:n_test]] = True

    X_train = X[~is_test]
    X_test = X[is_test]
    y_train = y[~is_test]
    y_test = y[is_test]

    return X_train, X_test, y_train, y_test

def fit_linear_svm(fraud_data):
    """
    Fit a linear SVM classifier.

    The table is split with train_test_split_table and the model is fitted
    on the training part only.

    Args: data table

    Returns:
        the fitted LinearSVC classifier.  The caller in this notebook passes
        the result on as `clf` and uses its decision_function; test-set
        labels can be obtained from it with clf.predict(X_test).
    """
    # define our model
    clf = LinearSVC(
        C=1.0,
        max_iter=10_000,
        random_state=0
    )

    # split the data into train and test; only the training part is needed
    # for fitting
    X_train, _, y_train, _ = train_test_split_table(fraud_data)

    clf.fit(X_train, y_train)

    return clf


def confusion_counts(y_true, y_pred):
    """
    Computes TP, TN, FP, FN.

    Label 1 is the positive class (fraud); every other value counts as
    negative.

    Args:
        y_true: array-like of true labels.
        y_pred: array-like of predicted labels, same shape as y_true.

    Returns:
        dict with integer counts under the keys "TP", "TN", "FP", "FN".

    Raises:
        ValueError: if y_true and y_pred differ in shape.
    """
    actual = np.asarray(y_true) == 1
    predicted = np.asarray(y_pred) == 1
    if actual.shape != predicted.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, "
            f"got {actual.shape} and {predicted.shape}"
        )

    TP_est = int(np.sum(actual & predicted))
    TN_est = int(np.sum(~actual & ~predicted))
    FP_est = int(np.sum(~actual & predicted))
    FN_est = int(np.sum(actual & ~predicted))

    return {"TP": TP_est, "TN": TN_est, "FP": FP_est, "FN": FN_est}


def total_cost(counts, cost_table=None):
    """
    Compute total cost from confusion counts.

    Args:
        counts: dict with the keys "TP", "TN", "FP", "FN" holding the number
            of observations per outcome.
        cost_table: dict with the same keys holding the cost of one
            observation per outcome; defaults to the module-level `costs`.

    Returns:
        sum over the four outcomes of count * cost.
    """
    if cost_table is None:
        cost_table = costs

    # Multiply counts by costs and sum
    return sum(
        counts[outcome] * cost_table[outcome]
        for outcome in ("TP", "TN", "FP", "FN")
    )

# evaluate how the classification cost changes when you change the decision threshold.
def sweep_thresholds(y_true, thresholds, X, clf):
    """
    Tabulate confusion counts and total cost for each candidate threshold.

    Here, clf is your trained SVM classifier: its decision_function output
    on X is compared against every threshold, and an observation is
    predicted as class 1 when its score is at least the threshold.

    Returns a DataFrame with one row per threshold and the columns
    threshold, TP, TN, FP, FN, total_cost.
    """
    # The scores are the raw decision function, not probabilities.  Think:
    # does it need to be calibrated to be used in this problem?
    scores = clf.decision_function(X)

    def evaluate(cutoff):
        # Prediction at this cutoff, then its confusion counts and cost
        # (both helpers were implemented earlier in the notebook).
        labels = (scores >= cutoff).astype(int)
        counts = confusion_counts(y_true, labels)
        cost = total_cost(counts)

        row = {"threshold": cutoff}
        row.update((key, counts[key]) for key in ("TP", "TN", "FP", "FN"))
        row["total_cost"] = cost
        return row

    return pd.DataFrame([evaluate(t) for t in thresholds])



When you are done, run the following cell and then call `main()` (no need to implement anything else)

In [None]:
def main():
    """
    Run the Problem 2 pipeline on the module-level fraud_data table.

    Prints the head of the dataset, fits the linear SVM, sweeps 21 decision
    thresholds between -2 and 2 on the test set and prints the resulting
    table together with the row of minimal total cost.
    """
    df = fraud_data

    print("Dataset head:")
    print(df.head(), "\n")

    # split in train and test:
    _, X_test, _, y_test = train_test_split_table(df)
    # Fit linear SVM.  sweep_thresholds calls clf.decision_function, so
    # fit_linear_svm has to hand back the fitted classifier.
    clf = fit_linear_svm(df)

    # thresholds
    thresholds = np.linspace(-2.0, 2.0, 21)
    # Keyword arguments: the signature is (y_true, thresholds, X, clf), and
    # passing these positionally in another order silently swaps them.
    df_results = sweep_thresholds(
        y_true=y_test,
        thresholds=thresholds,
        X=X_test,
        clf=clf,
    )

    print("Threshold sweep results:")
    print(df_results)

    # Identify optimal threshold: the row with the smallest total cost
    best_row = df_results.loc[df_results["total_cost"].idxmin()]
    print("Optimal threshold:", best_row)

PROBLEM 3: Confidence estimation of the cost

In Problem 2, you trained a classifier, selected a decision threshold, evaluated its performance on a test set, and computed the cost.

In this problem, you will quantify the uncertainty of this estimated cost. Each observation in the test set produces a cost depending on the
classification outcome:

    TN: 0
   
    FP: 100

    TP: 0

    FN: 500

Thus, the cost per observation is a bounded random variable taking
values in the interval [0, 500].

Tasks:
1. Compute the average cost per observation on the test set.
2. Use Hoeffding's inequality to construct a 95% confidence interval
   for the true expected cost of the classifier.
3. Interpret the resulting interval:
   - What does it say about the reliability of your estimate?
   - Is the interval likely to be tight or conservative? Why?

You may assume that test observations are independent and identically
distributed.

In [None]:
def per_observation_cost(y_true, y_pred, cost_table=None):
    """
    Compute per-observation cost vector.

    Each observation is charged the cost of its classification outcome
    (TP, TN, FP or FN), with label 1 as the positive class.  The average
    cost per observation is the mean of the returned vector.

    Args:
        y_true: array-like of true labels.
        y_pred: array-like of predicted labels, same shape as y_true.
        cost_table: dict with the keys "TP", "TN", "FP", "FN" holding the
            cost of one observation per outcome; defaults to the
            module-level `costs`.

    Returns:
        float array with the same shape as y_true.

    Raises:
        ValueError: if y_true and y_pred differ in shape.
    """
    if cost_table is None:
        cost_table = costs

    actual = np.asarray(y_true) == 1
    predicted = np.asarray(y_pred) == 1
    if actual.shape != predicted.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, "
            f"got {actual.shape} and {predicted.shape}"
        )

    # The four outcomes are mutually exclusive and cover every observation.
    per_obs = np.select(
        [
            actual & predicted,
            ~actual & ~predicted,
            ~actual & predicted,
            actual & ~predicted,
        ],
        [
            cost_table["TP"],
            cost_table["TN"],
            cost_table["FP"],
            cost_table["FN"],
        ],
    )

    return per_obs.astype(float)


def hoeffding_ci(per_obs_costs, mean, n, a, b, delta=0.05):
    """
    Hoeffding confidence interval

    For n i.i.d. observations bounded in [a, b], Hoeffding's inequality gives
        P(|sample mean - true mean| >= eps) <= 2 * exp(-2 * n * eps**2 / (b - a)**2).
    Setting the right-hand side to delta yields the half-width
        eps = (b - a) * sqrt(ln(2 / delta) / (2 * n)),
    so [sample mean - eps, sample mean + eps] covers the true mean with
    probability at least 1 - delta.

    Args:
        per_obs_costs: array-like of per-observation costs, or None.  When
            given, the interval is centred on its mean.
        mean: sample mean, used only when per_obs_costs is None.
        n: number of observations; when None it is taken from per_obs_costs.
        a, b: lower and upper bound of a single observation.
        delta: allowed error probability (0.05 gives a 95% interval).

    Returns:
        (lower, upper) as floats, clipped to [a, b] because the true mean
        cannot lie outside the range of the observations.

    Raises:
        ValueError: on empty costs, missing or non-positive n, b < a, or
            delta outside (0, 1).
    """
    # Step 1: deterministic costs per observation
    # Step 2: average cost
    if per_obs_costs is not None:
        c = np.asarray(per_obs_costs, dtype=float)
        if c.size == 0:
            raise ValueError("per_obs_costs must not be empty")
        mean_cost = float(np.mean(c))
        if n is None:
            n = c.size
    else:
        mean_cost = float(mean)

    if n is None or n <= 0:
        raise ValueError(f"n must be a positive number of observations, got {n}")
    if b < a:
        raise ValueError(f"need a <= b, got a={a}, b={b}")
    if not 0.0 < delta < 1.0:
        raise ValueError(f"delta must be in (0, 1), got {delta}")

    # Step 3: construct a Hoeffding interval of the estimated cost
    eps = (b - a) * float(np.sqrt(np.log(2.0 / delta) / (2.0 * n)))
    ci = (float(max(a, mean_cost - eps)), float(min(b, mean_cost + eps)))

    return ci