# SMS Spam Detection - Evaluation Metrics with Confidence Intervals
This notebook computes precision, recall, and accuracy for a binary classification problem
(SMS spam detection), and calculates 95% confidence intervals using Hoeffding’s inequality.


In [None]:
# Assumed to be provided:
# y_true: true labels for the test set (0 = ham, 1 = spam)
# y_pred: predicted labels by the black-box model

from sklearn.metrics import precision_score, recall_score, accuracy_score
import numpy as np

# Example placeholders (replace with real values).
# Both lists must be aligned: element i of y_pred is the prediction for element i of y_true.
y_true = [...]  # Replace with actual values
y_pred = [...]  # Replace with actual values

# Fail early with a readable message instead of deep inside a metric call.
if len(y_true) != len(y_pred):
    raise ValueError("y_true and y_pred must have the same length")

# Failure probability of each interval (significance level).
# The confidence level is 1 - delta, i.e. 95% for delta = 0.05.
delta = 0.05


## 1. Precision and 95% Hoeffding Confidence Interval

In [None]:
# Precision is an average over the messages *predicted* as spam, so the
# Hoeffding sample size is the number of predicted positives, not the test-set size.
precision = precision_score(y_true, y_pred, pos_label=1)
n_pred_positives = np.sum(np.array(y_pred) == 1)

# Hoeffding half-width: sqrt(ln(2/delta) / (2n)).
# With no predicted positives there is nothing to average over; use an
# infinite half-width (vacuous interval) instead of dividing by zero.
if n_pred_positives > 0:
    epsilon_p = np.sqrt(np.log(2 / delta) / (2 * n_pred_positives))
else:
    epsilon_p = np.inf

# Precision is a proportion, so bounds outside [0, 1] carry no information: clip them.
precision_interval = (
    float(np.clip(precision - epsilon_p, 0.0, 1.0)),
    float(np.clip(precision + epsilon_p, 0.0, 1.0)),
)
precision, precision_interval

## 2. Recall and 95% Hoeffding Confidence Interval

In [None]:
# Recall is an average over the messages that are *actually* spam, so the
# Hoeffding sample size is the number of true positives in y_true, not the test-set size.
recall = recall_score(y_true, y_pred, pos_label=1)
n_actual_positives = np.sum(np.array(y_true) == 1)

# Hoeffding half-width: sqrt(ln(2/delta) / (2n)).
# With no actual positives there is nothing to average over; use an
# infinite half-width (vacuous interval) instead of dividing by zero.
if n_actual_positives > 0:
    epsilon_r = np.sqrt(np.log(2 / delta) / (2 * n_actual_positives))
else:
    epsilon_r = np.inf

# Recall is a proportion, so bounds outside [0, 1] carry no information: clip them.
recall_interval = (
    float(np.clip(recall - epsilon_r, 0.0, 1.0)),
    float(np.clip(recall + epsilon_r, 0.0, 1.0)),
)
recall, recall_interval

## 3. Accuracy and 95% Hoeffding Confidence Interval

In [None]:
# Accuracy is an average over every test example, so the Hoeffding sample
# size is the full test-set size.
accuracy = accuracy_score(y_true, y_pred)
n_total = len(y_true)

# Hoeffding half-width: sqrt(ln(2/delta) / (2n)).
# Guard the empty case so the half-width is an explicit infinity rather
# than the result of a division by zero.
if n_total > 0:
    epsilon_a = np.sqrt(np.log(2 / delta) / (2 * n_total))
else:
    epsilon_a = np.inf

# Accuracy is a proportion, so bounds outside [0, 1] carry no information: clip them.
accuracy_interval = (
    float(np.clip(accuracy - epsilon_a, 0.0, 1.0)),
    float(np.clip(accuracy + epsilon_a, 0.0, 1.0)),
)
accuracy, accuracy_interval

## 4. Would Using All Data Improve Accuracy Interval with a VC-dim 3 Classifier?
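**Answer:** Yes. Using all the data increases the sample size $n$, and the half-width of the Hoeffding interval shrinks as

$$
\epsilon = \sqrt{\frac{\ln(2/\delta)}{2n}} \propto \frac{1}{\sqrt{n}}
$$

so the confidence interval for accuracy becomes narrower. One caveat: Hoeffding's inequality applies directly only to data that was not used to fit the classifier. If the additional data was also used for training, a uniform (VC-type) bound is needed instead. With a VC dimension of only 3 that extra complexity term is small, so a classifier with VC-dim 3 also generalizes well if trained on more data.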

# Markov Chain for Travel Dataset
This section analyzes flight data as a stationary Markov chain. We estimate the transition matrix, compute the stationary distribution, and calculate the 3-step return probability to a specific city.

In [None]:
import pandas as pd
import numpy as np

# Read the flight records from disk
# (presumably one row per observed flight -- confirm against the data source).
df = pd.read_csv("data/flights.csv")

# Column names holding the origin and destination of each record
start_col, end_col = 'start', 'end'

# State space: every city that appears as an origin or a destination, in sorted order
cities = sorted(set(df[start_col]).union(df[end_col]))
n_cities = len(cities)

# Map each city name to its row/column position in the matrices built later
city_to_idx = dict(zip(cities, range(n_cities)))

## Estimate Transition Matrix

In [None]:
# Translate every flight into (origin index, destination index) in one vectorised pass
# instead of iterating over the DataFrame row by row.
origin_idx = df[start_col].map(city_to_idx).to_numpy()
dest_idx = df[end_col].map(city_to_idx).to_numpy()

# counts[i, j] = number of observed flights from city i to city j.
# np.add.at is an unbuffered add, so repeated (i, j) pairs accumulate correctly
# (plain fancy-index `+=` would count each distinct pair only once).
counts = np.zeros((n_cities, n_cities))
np.add.at(counts, (origin_idx, dest_idx), 1)

# Row-normalise to obtain the maximum-likelihood transition probabilities.
# Rows with no outgoing flights are skipped here to avoid 0/0 = NaN.
row_totals = counts.sum(axis=1, keepdims=True)
transition_matrix = np.divide(
    counts, row_totals, out=np.zeros_like(counts), where=row_totals > 0
)

# A city that only ever appears as a destination has no observed departures.
# Model it as an absorbing state (self-loop with probability 1) so that every
# row sums to 1 and the matrix stays a valid stochastic matrix.
dangling_idx = np.flatnonzero(row_totals[:, 0] == 0)
transition_matrix[dangling_idx, dangling_idx] = 1.0

transition_matrix

## Compute Stationary Distribution

In [None]:
# The stationary distribution pi satisfies pi @ P = pi, i.e. pi is a left
# eigenvector of P with eigenvalue 1 -- equivalently a right eigenvector of P.T,
# which is what np.linalg.eig returns (as columns of eigvecs).
eigvals, eigvecs = np.linalg.eig(transition_matrix.T)

# Pick the eigenvalue closest to 1 and verify that it really is (numerically) 1.
# If several eigenvalues equal 1 (reducible chain) the stationary distribution
# is not unique and this selects just one of them.
unit_idx = int(np.argmin(np.abs(eigvals - 1)))
if not np.isclose(eigvals[unit_idx], 1):
    raise ValueError(
        "transition_matrix has no eigenvalue equal to 1; "
        "check that every row sums to 1"
    )

# The eigenvector is only defined up to scale (and may carry a zero imaginary
# part), so keep the real part and rescale it to sum to 1.
stationary = np.real(eigvecs[:, unit_idx])
stationary_distribution = stationary / stationary.sum()
stationary_distribution

## 3-Step Return Probability to 'Aracaju (SE)'

In [None]:
# Entry (i, i) of the n-th power of the transition matrix is the probability
# of being back in state i exactly n steps after starting there.
N_STEPS = 3
TARGET_CITY = 'Aracaju (SE)'

start_idx = city_to_idx[TARGET_CITY]
P3 = np.linalg.matrix_power(transition_matrix, N_STEPS)

# Probability of starting in the target city and being there again after three steps
prob_return = P3[start_idx, start_idx]
prob_return