In [1]:
from __future__ import annotations
from typing import Protocol, runtime_checkable, Iterable, Optional, Union, List, Tuple
from copy import deepcopy
import numpy as np
import matplotlib.pyplot as plt


### Presenter notes — Inheritance & Polymorphism
- **Interface**: `BaseRegressor` communicates expected behavior (`fit`, `predict`).
- **Substitutability**: Adapters show how different implementations can share an API.
- **Why adapters?**: Our existing classes differ slightly from the uniform interface; adapters are a lightweight bridge.
- **Client code**: `evaluate_model` treats any `BaseRegressor` the same; implementation details hidden.
- **Takeaway**: OOP enables architecture where components can be swapped with minimal friction.
@runtime_checkable
class BaseRegressor(Protocol):
    """Structural interface shared by the regressors in this notebook.

    Any object that exposes ``fit(X, y)`` and ``predict(X)`` satisfies the
    protocol; no inheritance is required. Because the class is decorated with
    ``runtime_checkable``, ``isinstance(obj, BaseRegressor)`` works, but that
    check only looks for the two method names, not their signatures.
    """

    def fit(self, X: np.ndarray, y: np.ndarray) -> "BaseRegressor":
        """Learn parameters from ``X`` and ``y``; annotated to return a regressor."""

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Return predictions for the inputs ``X``."""

# Adapters: Provide a uniform interface for our existing classes
class UnivariateOLSAdapter:
    """Expose a batch ``UnivariateOLS`` model through ``fit(X, y)`` / ``predict(X)``.

    NOTE(review): the ``UnivariateOLS`` versions visible in this notebook take
    ``(x_data, y_data, fit_intercept)`` in the constructor and a no-argument
    ``fit()``. This adapter presumes a class constructed with only
    ``fit_intercept`` and fitted via ``fit(X, y)`` -- confirm which definition
    is in scope when this cell runs.
    """

    def __init__(self, fit_intercept: bool = True):
        # The wrapped model does the real work; the adapter only adapts the API.
        self._impl = UnivariateOLS(fit_intercept=fit_intercept)

    def fit(self, X: np.ndarray, y: np.ndarray) -> "UnivariateOLSAdapter":
        """Delegate fitting to the wrapped model and return the adapter itself."""
        wrapped = self._impl
        wrapped.fit(X, y)
        return self

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Return the wrapped model's predictions for ``X`` as a NumPy array."""
        raw_predictions = self._impl.predict(X)
        return np.asarray(raw_predictions)

class UnivariateOnlineOLSAdapter:
    """Expose a streaming ``UnivariateOnlineOLS`` model through ``fit`` / ``predict``.

    ``fit`` feeds the observations to the wrapped model one pair at a time
    using its ``+=`` operator.
    """

    def __init__(self, fit_intercept: bool = True):
        self._impl = UnivariateOnlineOLS(fit_intercept=fit_intercept)

    def fit(self, X: np.ndarray, y: np.ndarray) -> "UnivariateOnlineOLSAdapter":
        """Stream every ``(x, y)`` pair into the wrapped model and return the adapter.

        Both inputs are flattened to 1-D first. ``zip`` stops at the shorter
        input, so a length mismatch is not detected here.
        """
        flat_x = np.asarray(X).reshape(-1)
        flat_y = np.asarray(y).reshape(-1)
        for pair in zip(flat_x, flat_y):
            x_i, y_i = pair
            # `+=` rebinds self._impl to whatever the wrapped object's in-place add returns.
            self._impl += (float(x_i), float(y_i))
        return self

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Return the wrapped model's predictions for ``X`` as a NumPy array."""
        raw_predictions = self._impl.predict(X)
        return np.asarray(raw_predictions)

# Client code that is agnostic to the specific implementation
def evaluate_model(model: "BaseRegressor", X: np.ndarray, y: np.ndarray) -> float:
    """Fit ``model`` on ``(X, y)`` and return its in-sample R².

    R² is computed as ``1 - SS_res / SS_tot`` where ``SS_tot`` is the sum of
    squared deviations of ``y`` from its mean.

    Args:
        model: Any object with ``fit(X, y)`` and ``predict(X)``.
        X: Inputs; passed to the model unchanged.
        y: Targets; passed to the model unchanged. Any array-like is accepted
            (lists as well as NumPy arrays).

    Returns:
        The R² score, or ``0.0`` when ``y`` has zero variance (``SS_tot == 0``).

    Raises:
        ValueError: Raised by NumPy when predictions and targets cannot be
            broadcast against each other.
    """
    model.fit(X, y)

    # Flatten both sides so that column-vector predictions of shape (n, 1) are
    # compared element-wise with (n,) targets instead of broadcasting to (n, n),
    # and so that plain Python lists work (they have no ``.mean()``).
    y_true = np.asarray(y, dtype=float).reshape(-1)
    y_pred = np.asarray(model.predict(X), dtype=float).reshape(-1)

    residuals = y_true - y_pred
    centered = y_true - y_true.mean()
    ss_tot = float(np.dot(centered, centered))
    if ss_tot == 0.0:
        # Constant targets: R² is undefined, report 0.0 as the original code did.
        return 0.0
    return 1.0 - float(np.dot(residuals, residuals)) / ss_tot

# Demo: score the batch adapter through the shared evaluate_model helper.
# X and y are notebook globals; this cell must run after a cell that defines them.
X_demo = X  # from earlier synthetic data cell
y_demo = y

# Both adapters are built with fit_intercept=True (passed positionally).
batch_model = UnivariateOLSAdapter(True)
online_model = UnivariateOnlineOLSAdapter(True)

# evaluate_model fits the model on the data and returns its in-sample R^2.
print("R^2 (batch adapter):", round(evaluate_model(batch_model, X_demo, y_demo), 4))
print("R^2 (online adapter):", round(evaluate_model(online_model, X_demo, y_demo), 4))

## Stretch — Inheritance and Polymorphism (Extended)

- **Goal**: Show how a common interface enables interchangeable implementations.
- **Approach**: Define a minimal `BaseRegressor` with `fit` and `predict`.
- **Children**: `UnivariateOLS` (batch) and `UnivariateOnlineOLS` (streaming) implement the same API.
- **Polymorphism**: Write helper functions that work with any `BaseRegressor`.

Presenter note: Emphasize API shape over implementation details; show that client code doesn’t care whether it’s batch or streaming.

### Presenter notes — Optional: Online OLS and operator overloading
- **Motivation**: Process data streams or large datasets incrementally; avoid storing all points.
- **State**: Maintain running sums (n, Σx, Σy, Σx², Σxy); compute parameters lazily when needed.
- **Design choices**:
  - `add(x, y)` mutates internal state; invalidates cached parameters.
  - `__iadd__` enables `m += (x, y)`; `__add__` returns a new object (non-mutating).
  - Properties `slope`/`intercept` trigger computation only when requested.
- **Teaching tip**: Ingest a few points interactively; compare with batch OLS on the same data.
- **OOP angle**: Operator overloading demonstrates polymorphism with built-in syntax; shows careful state invalidation/caching.
### Presenter notes — Block 5: Dunder methods `__str__`, `__repr__`
- **`__repr__`**: Unambiguous representation for debugging; include config and learned state.
- **`__str__`**: Friendly printing; switch message depending on fitted vs not fitted.
- **Teaching tip**: Print before and after fitting to emphasize object lifecycle.
- **OOP angle**: Integrates with Python tooling (print, REPL, logging).

### Presenter notes — Block 4: `score`
- **Metric**: R² = 1 − SS_res / SS_tot (with intercept), else denominator uses ||y||².
- **Behavior**: Calls `predict`, computes residuals, then R²; handles zero denominator by returning 0.
- **Teaching tip**: Show clean vs noisy data R² to calibrate expectations.
- **OOP angle**: Demonstrates method composition and reuse (`predict` within `score`).

### Presenter notes — Block 3: `predict`
- **Purpose**: Use learned parameters to generate outputs for new inputs.
- **Precondition**: Require model to be fit; otherwise raise informative error.
- **Design**: Return scalars or arrays; rely on NumPy broadcasting for ergonomics.
- **Teaching tip**: Compare predictions for intercept vs no-intercept models.
- **OOP angle**: Reads `self` state; no mutation if purely predictive.

### Presenter notes — Block 2: `fit`
- **Purpose**: Compute parameters (a, b) from data; update object state.
- **Key lines**:
  - Input sanitation: ensure 1-D arrays, equal lengths; guard against shape mismatch.
  - Closed form: with intercept uses `n, Σx, Σy, Σx², Σxy`; without intercept uses `Sxy / Sxx`.
  - Edge cases: denominator zero when all x identical (with intercept) or all x zero (no intercept).
- **Teaching tip**: Print `(m.slope_, m.intercept_)` on a perfect line to build intuition.
- **OOP angle**: Method modifies `self` (learned state persists across method calls).

### Presenter notes — Block 1: Class and `__init__`
- **Encapsulation**: Group configuration (`fit_intercept`) and learned parameters (`intercept_`, `slope_`).
- **Constructor**: Convert to `bool` defensively; initialize learned params to `None`.
- **Teaching tip**: Show `model.fit_intercept` and discuss object state vs local variables.
- **Gotcha**: Distinguish configuration (unchanging after construction) vs learned state (mutates after `fit`).

### Presenter notes — Synthetic data + OLS plot
- **Purpose**: Make OLS tangible; show the goal visually before classes/methods.
- **Key callouts**:
  - Data: `X ~ Uniform[-3, 3]`, noise `ε ~ N(0, 0.8²)` (standard deviation 0.8); true line `y = 2.5 + 1.8 x + ε`.
  - Closed-form solution uses sums Σx, Σy, Σx², Σxy; ensures we connect math → code.
  - The red line is the fitted model `y ≈ â + b̂ x`.
- **Transitions**: “We’ll now encapsulate this logic in a class to illustrate OOP.”

# Synthetic data and initial OLS fit (closed-form)
%matplotlib inline



# Draw a reproducible synthetic dataset: y = true_intercept + true_slope * X + noise.
rng = np.random.default_rng(193)
num_points = 80
X = rng.uniform(-3.0, 3.0, size=num_points)
true_intercept = 2.5
true_slope = 1.8
# Gaussian noise with mean 0.0 and standard deviation 0.8.
noise = rng.normal(0.0, 0.8, size=num_points)
y = true_intercept + true_slope * X + noise

# Closed-form OLS with intercept
# Sufficient statistics: sum of x, sum of y, sum of x^2, sum of x*y.
Sx = X.sum()
Sy = y.sum()
Sxx = float(np.dot(X, X))
Sxy = float(np.dot(X, y))
# den is zero only when all X values are identical; not checked here.
den = num_points * Sxx - Sx * Sx
b_hat = (num_points * Sxy - Sx * Sy) / den  # estimated slope
a_hat = (Sy - b_hat * Sx) / num_points  # estimated intercept

# Evaluate the fitted line on a grid extending 0.5 beyond the data range on each side.
x_line = np.linspace(X.min() - 0.5, X.max() + 0.5, 100)
y_line = a_hat + b_hat * x_line

# Scatter the data and overlay the fitted line.
plt.figure(figsize=(6, 4))
plt.scatter(X, y, s=18, alpha=0.7, label="data")
plt.plot(x_line, y_line, color="crimson", linewidth=2.0,
         label=f"OLS fit: y ≈ {a_hat:.2f} + {b_hat:.2f} x")
plt.xlabel("x")
plt.ylabel("y")
plt.title("Synthetic data and OLS fit (closed-form)")
plt.legend()
plt.tight_layout()

## OLS vs Online OLS — Problem Outline

- Ordinary Least Squares (batch OLS):
  - **Goal**: Fit y ≈ a + b x by minimizing sum of squared residuals.
  - **Inputs**: Full dataset available in memory.
  - **Output**: Closed-form parameters a (intercept), b (slope).
- Online (Streaming) OLS:
  - **Goal**: Update estimates as (x, y) arrive sequentially, without storing all data.
  - **State**: Maintain running statistics (n, Σx, Σy, Σx², Σxy) and compute a, b on demand.
  - **Benefit**: Works with data streams and memory-constrained settings.

Presenter note: Contrast batch vs streaming. Frame Online OLS as the same objective, different computation/constraints.

# CME 193 — 50‑Minute Intro to OOP in Python (via Univariate OLS)

This lecture targets `https://web.stanford.edu/class/cme193/index.html` and is designed for slide-style delivery with concise presenter notes.

## Agenda (≈50 minutes)
- What is Univariate OLS? Primer and goals
- OLS vs Online (Streaming) OLS overview
- Block 1: Defining a class and `__init__` (encapsulation)
- Block 2: Instance methods (`fit`)
- Block 3: Using internal state (`predict`)
- Block 4: Quality metric (`score`), docstrings, type hints
- Block 5: Magic/dunder methods: `__str__` / `__repr__`
- Optional: Streaming OLS + operator overloading (`__iadd__`, `__add__`)
- Stretch: Inheritance and polymorphism

Presenter note: Emphasize OOP principles through a tangible example (OLS). Keep math light; focus on how object state and methods interact.

# CME 193: Introduction to Scientific Python
## Week 2: Object-Oriented Programming (Oct. 1, 2025)

## Agenda

In this mini‑session, you’ll learn core OOP ideas in Python by building a tiny linear regression model and progressively adding features.

- What is Univariate Ordinary Least Squares (OLS)? Quick statistical primer.

- Defining a class and `__init__` (encapsulation).

- Instance methods (`fit`).

- Using internal state (`predict`).

- Quality metric (`score`), docstrings, type hints.

- Magic/dunder methods: `__str__` / `__repr__`

- Optional: Streaming OLS + operator overloading (`__iadd__`, `__add__`)

- Stretch: Inheritance sketch (if time permits)

## Primer: What is Univariate Ordinary Least Squares (OLS)?

### Simulated Data Generating Process

We'll create synthetic data following a simple linear relationship with noise:

$$y = \beta_0 + \beta_1 x + \epsilon$$

Where:
- $x$ = predictor variable, uniformly spaced from 0 to 10 (50 points)
- $\beta_0$ = true intercept (10.0)
- $\beta_1$ = true slope (2.5)
- $\epsilon \sim \mathcal{N}(0, 1.5^2)$ = random noise (standard deviation 1.5)

Our goal is to estimate $\beta_0$ and $\beta_1$ from the observed data points using OLS.

In [2]:
# Fix the random seed for reproducibility (legacy global NumPy random state).
np.random.seed(42)
# Number of data points.
n_points = 50
# Generate n_points evenly spaced x values from 0 to 10 (both endpoints included).
x = np.linspace(0, 10, n_points)
# Define the true slope and intercept.
true_slope = 2.5
true_intercept = 10.0
# Generate Gaussian noise with mean 0 and standard deviation 1.5.
noise = np.random.normal(0, 1.5, n_points)
# Generate y values from the linear model plus noise.
y = true_intercept + true_slope * x + noise

In [None]:
# Professional visualization of synthetic data
# Two side-by-side panels: the raw scatter, and the scatter with the true line overlaid.
plt.style.use('seaborn-v0_8-whitegrid')
fig, axes = plt.subplots(1, 2, figsize=(14, 6), dpi=300)

# Plot 1: Raw data points with enhanced styling
axes[0].scatter(x, y, alpha=0.8, color='#2E86AB', s=60, edgecolors='white', linewidth=0.5)
axes[0].set_xlabel('Predictor Variable (x)', fontsize=12, fontweight='bold')
axes[0].set_ylabel('Response Variable (y)', fontsize=12, fontweight='bold')
axes[0].set_title('Synthetic Dataset\n(n=50 observations)', fontsize=14, fontweight='bold', pad=20)
axes[0].grid(True, alpha=0.4, linestyle='-', linewidth=0.5)
axes[0].tick_params(axis='both', which='major', labelsize=10)

# Plot 2: Data with true relationship line
axes[1].scatter(x, y, alpha=0.8, color='#2E86AB', s=60, edgecolors='white',
                linewidth=0.5, label='Observed Data', zorder=3)
# Noise-free response at each x, i.e. the line the data were generated from.
y_true = true_intercept + true_slope * x
axes[1].plot(x, y_true, color='#A23B72', linewidth=3, linestyle='--',
             label=f'True Model: y = {true_intercept:.1f} + {true_slope:.1f}x', zorder=2)
axes[1].set_xlabel('Predictor Variable (x)', fontsize=12, fontweight='bold')
axes[1].set_ylabel('Response Variable (y)', fontsize=12, fontweight='bold')
axes[1].set_title('Data with Underlying Linear Relationship', fontsize=14, fontweight='bold', pad=20)
axes[1].legend(loc='upper left', frameon=True, fancybox=True, shadow=True, fontsize=11)
axes[1].grid(True, alpha=0.4, linestyle='-', linewidth=0.5)
axes[1].tick_params(axis='both', which='major', labelsize=10)

# Enhance overall figure appearance
# Leave headroom at the top of the figure for the suptitle.
plt.tight_layout(pad=3.0)
plt.subplots_adjust(top=0.85)
fig.suptitle('Univariate Linear Regression: Data Generation Process',
             fontsize=16, fontweight='bold', y=0.98)

plt.show()

# Professional data summary table
# Three left-aligned columns: parameter name (25 wide), value (15 wide), description.
print("=" * 60)
print("DATA GENERATION SUMMARY".center(60))
print("=" * 60)
print(f"{'Parameter':<25} {'Value':<15} {'Description'}")
print("-" * 60)
print(f"{'Sample Size (n)':<25} {n_points:<15} {'Number of observations'}")
print(f"{'X Range':<25} {f'[{x.min():.1f}, {x.max():.1f}]':<15} {'Predictor variable range'}")
print(f"{'Y Range':<25} {f'[{y.min():.1f}, {y.max():.1f}]':<15} {'Response variable range'}")
print(f"{'True Intercept (β₀)':<25} {true_intercept:<15} {'Population parameter'}")
print(f"{'True Slope (β₁)':<25} {true_slope:<15} {'Population parameter'}")
# noise.std() is the standard deviation of the drawn noise sample, not the nominal value.
print(f"{'Noise Std Dev (σ)':<25} {f'{noise.std():.2f}':<15} {'Error term variability'}")
# Computed from the sample variances of the drawn noise and of y.
print(f"{'R² (theoretical)':<25} {f'{1 - (noise.var() / y.var()):.3f}':<15} {'Expected explained variance'}")
print("=" * 60)

## Ordinary Least Squares: Mathematical Foundation

### **Linear Model Assumption:**
- We assume a linear relationship between predictor $x$ and response $y$: 
  $$y = \beta_0 + \beta_1 x + \varepsilon$$
  where $\varepsilon \sim \mathcal{N}(0, \sigma^2)$ represents random error.

### **Optimization Objective:**
- Ordinary Least Squares (OLS) estimates parameters $(\beta_0, \beta_1)$ by minimizing the sum of squared residuals:
  $$\min_{\beta_0, \beta_1} \sum_{i=1}^{n} (y_i - \beta_0 - \beta_1 x_i)^2$$

### **Analytical Solution:**
- Define convenient notation for sums:
  - $S_x = \sum_{i=1}^{n} x_i$
  - $S_y = \sum_{i=1}^{n} y_i$
  - $S_{xy} = \sum_{i=1}^{n} x_i y_i$
  - $S_{xx} = \sum_{i=1}^{n} x_i^2$

- The closed-form OLS estimators are:
  $$\hat{\beta_1} = \frac{n S_{xy} - S_x S_y}{n S_{xx} - S_x^2}$$
  
  $$\hat{\beta_0} = \frac{S_y - \hat{\beta_1} S_x}{n}$$

- For regression without intercept (through origin, useful when we have a lot of variables):
  $$\hat{\beta_1}^{\text{(no-intercept)}} = \frac{S_{xy}}{S_{xx}}$$

## Code this up.

In [4]:
# Compute OLS coefficients manually (for illustration)
# These are the sums S_x, S_y, S_xy, S_xx from the closed-form solution above.
n = len(x)
Sxy = np.sum(x * y)
Sxx = np.sum(x**2)
Sx = np.sum(x)
Sy = np.sum(y)
# Slope and intercept of the model with intercept.
slope = (n * Sxy - Sx * Sy) / (n * Sxx - Sx**2)
intercept = (Sy - slope * Sx) / n

# Slope of the regression through the origin (no intercept term).
slope_without_intercept = Sxy / Sxx

In [None]:
# Generate fitted line for visualization
# Evaluate both fitted models on a 100-point grid spanning the observed x range.
x_line = np.linspace(x.min(), x.max(), 100)
y_line = intercept + slope * x_line
y_line_no_intercept = slope_without_intercept * x_line

# Create comprehensive visualization
plt.figure(figsize=(10, 6))
plt.scatter(x, y, alpha=0.7, color='steelblue', s=50, label='Observed data')
plt.plot(x_line, y_line, 'r-', linewidth=2.5,
         label=f'OLS fit: y = {intercept:.3f} + {slope:.3f}x')

# Add true regression line for comparison
y_true_line = true_intercept + true_slope * x_line
plt.plot(x_line, y_true_line, 'g--', linewidth=2, alpha=0.8,
         label=f'True model: y = {true_intercept:.3f} + {true_slope:.3f}x')

# Add slope without intercept line
plt.plot(x_line, y_line_no_intercept, 'm:', linewidth=2,
         label=f'No intercept: y = {slope_without_intercept:.3f}x')

plt.xlabel('Predictor Variable (x)', fontsize=12)
plt.ylabel('Response Variable (y)', fontsize=12)
plt.title('Univariate Ordinary Least Squares Regression', fontsize=14, fontweight='bold')
plt.legend(fontsize=11)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# Display parameter comparison
print("Parameter Estimation Results:")
print("=" * 40)
print(f"True parameters:      β₀ = {true_intercept:6.3f}, β₁ = {true_slope:6.3f}")
print(f"OLS estimates:        β̂₀ = {intercept:6.3f}, β̂₁ = {slope:6.3f}")
print(f"No-intercept slope:   β̂₁ = {slope_without_intercept:6.3f}")
# Estimation error = estimate minus true value, for the model with intercept.
print(f"Estimation errors:    Δβ₀ = {intercept - true_intercept:6.3f}, Δβ₁ = {slope - true_slope:6.3f}")

## Now you can use the OLS class to fit the model!

But it would be cumbersome to code this up every time.

Can we encapsulate this logic in a (univariate) OLS object that we can easily reuse?

## Block 1 — Defining a Class and `__init__` (Encapsulation)

- A class bundles data (attributes) and behavior (methods)

- `__init__` initializes object state when you create an instance

- Instance attributes live on `self` and belong to each object

- We’ll start a minimal `UnivariateOLS` model that stores configuration

Exercise: Run the code and inspect `model.fit_intercept`.


In [None]:
# Convert the NumPy arrays to plain Python lists of floats.
x_data = x.tolist()
y_data = y.tolist()
# The `=` in the f-string prints the expression text followed by its value.
print(f"{x_data=:}")
print(f"{y_data=:}")

In [7]:
class UnivariateOLS:  # we start with the simplest model, with class ModelName.
    """Container for a univariate OLS model: configuration, data, and parameters.

    This first version only stores state; nothing is fitted yet.
    """

    def __init__(self, x_data: List[float], y_data: List[float], fit_intercept: bool):
        """Build a new instance (the constructor, in C++ terms).

        `self` is the instance being initialized. The training data are
        deep-copied, so later changes to the caller's objects do not affect
        the model.
        """
        # Configuration: whether an intercept should be fitted.
        wants_intercept = bool(fit_intercept)
        # Private copies of the training data (X, y).
        x_copy, y_copy = deepcopy(x_data), deepcopy(y_data)

        self.fit_intercept = wants_intercept
        self.x_data = x_copy
        self.y_data = y_copy
        # Learned parameters: None until the model has been fitted.
        self.intercept = self.slope = None

### Calling the UnivariateOLS(...) constructor will (i) create an instance of the class, and (ii) call the `__init__` method inside the class with the provided arguments.

In [None]:
# x and y are the NumPy arrays created earlier; __init__ stores deep copies of them.
model = UnivariateOLS(x_data=x, y_data=y, fit_intercept=True)
# This isn't very informative, it shows the class name and the memory address of the object.
# we will fix this later by defining the `__repr__` method.
model

In [None]:
# We passed the training data and a boolean flag to the model;
# they are now stored as attributes on the instance.
print(f"{model.fit_intercept=:}")
print(f"{model.x_data=:}")
print(f"{model.y_data=:}")

In [None]:
# However, we haven't fitted the model yet, so the intercept and slope are None
# (the values assigned in __init__).
print(f"{model.intercept=:}")
print(f"{model.slope=:}")

## Block 2 — Instance Method `fit` (Behavior on State)

- Let's now define our first method (apart from the constructor) in this `UnivariateOLS` class.

- Instance methods always take `self` as the first parameter, so that it can refer to other information stored in the instance (e.g., `self.x_data` and `self.y_data`).

- Methods can read/modify attributes stored on `self`.

- We implement `fit(X, y)` using the closed form solution to OLS.

In [11]:
class UnivariateOLS:  # we start with the simplest model, with class ModelName.
    """Univariate least-squares line ``y ≈ intercept + slope * x``.

    The training data and the ``fit_intercept`` switch are supplied at
    construction time; ``fit()`` learns the parameters from the stored data.
    """

    def __init__(self, x_data: List[float], y_data: List[float], fit_intercept: bool):
        """Build a new instance (the constructor, in C++ terms).

        `self` is the instance being initialized. The training data are
        deep-copied, so later changes to the caller's objects do not affect
        the model.
        """
        # Configuration: whether an intercept should be fitted.
        wants_intercept = bool(fit_intercept)
        # Private copies of the training data (X, y).
        x_copy, y_copy = deepcopy(x_data), deepcopy(y_data)

        self.fit_intercept = wants_intercept
        self.x_data = x_copy
        self.y_data = y_copy
        # Learned parameters: None until the model has been fitted.
        self.intercept = self.slope = None

    # ==================================================================================================================
    # Everything above behaves the same as in the previous version.
    # ==================================================================================================================
    def fit(self):
        """Estimate the slope (and the intercept, when requested) in closed form.

        Prints the intermediate sums and a success message. Returns nothing;
        the results are stored on ``self.slope`` and ``self.intercept``.

        Raises:
            ValueError: ``x_data`` and ``y_data`` differ in length.
            ZeroDivisionError: the closed-form denominator is zero.
        """
        # A method can READ the attributes stored on this instance through `self`.
        xs, ys = self.x_data, self.y_data

        # Always validate the data before computing with it.
        n = len(xs)
        if n != len(ys):
            raise ValueError("X and y must have same length")

        # Plain sums of the training x and y.
        Sx = sum(xs)
        Sy = sum(ys)

        # Sum of squares of x: loop directly over the items of the list.
        Sxx = 0
        for x_i in xs:
            Sxx += x_i * x_i

        # Sum of the products x * y: loop over the index i = 0, 1, ..., n-1.
        Sxy = 0
        for i in range(n):
            Sxy += xs[i] * ys[i]

        # Reporting intermediate values can help with debugging.
        print(f"Training data: n={n}, Sx={Sx}, Sy={Sy}, Sxx={Sxx}, Sxy={Sxy}")

        if not self.fit_intercept:
            # Regression through the origin: only the slope is estimated.
            if Sxx == 0:
                raise ZeroDivisionError("Sxx is zero (all x are zero?).")
            self.slope = float(Sxy / Sxx)
            # The intercept is deliberately left untouched (it stays None).
            print("Model slope has been fitted successfully.")
            return

        # With an intercept: a zero denominator means all x values are identical.
        den = n * Sxx - Sx * Sx
        if den == 0:
            raise ZeroDivisionError("Denominator is zero (all x identical?).")
        self.slope = float((n * Sxy - Sx * Sy) / den)
        self.intercept = float((Sy - self.slope * Sx) / n)
        print("Model intercept and slope have been fitted successfully.")

In [None]:
# Let's now fit the first model (with intercept).
# fit() takes no arguments: it works on the data stored on the instance and prints its progress.
model_with_intercept = UnivariateOLS(x_data=x, y_data=y, fit_intercept=True)
model_with_intercept.fit()

In [None]:
# One can access the attributes of the instance via the dot notation.
# After fit(), slope and intercept hold the fitted float values instead of None.
print(f"{model_with_intercept.slope=:}")
print(f"{model_with_intercept.intercept=:}")

# These are pretty close to the ground truth!

In [None]:
# Let's now fit the second model (without intercept), i.e. a line through the origin.
model_without_intercept = UnivariateOLS(x_data=x, y_data=y, fit_intercept=False)
model_without_intercept.fit()
print(f"{model_without_intercept.slope=:}")
print(f"{model_without_intercept.intercept=:}")  # the intercept remains None because we did not fit the intercept.

In [None]:
# Let's visualize both models with the original data
# Requires model_with_intercept and model_without_intercept to have been fitted above.
plt.figure(figsize=(10, 6))

# Plot the original data points
plt.scatter(x, y, alpha=0.6, color='blue', label='Data points')

# Create x values for plotting the fitted lines
x_plot = np.linspace(min(x), max(x), 100)

# Plot the model with intercept
y_with_intercept = model_with_intercept.slope * x_plot + model_with_intercept.intercept
plt.plot(x_plot, y_with_intercept, 'red', linewidth=2,
         label=f'With intercept: y = {model_with_intercept.slope:.3f}x + {model_with_intercept.intercept:.3f}')

# Plot the model without intercept
# Its intercept attribute is None, so only the slope is used here.
y_without_intercept = model_without_intercept.slope * x_plot
plt.plot(x_plot, y_without_intercept, 'green', linewidth=2,
         label=f'Without intercept: y = {model_without_intercept.slope:.3f}x')

# Add labels and legend
plt.xlabel('x')
plt.ylabel('y')
plt.title('Univariate OLS: Fitted Lines Comparison')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()


## Block 3 — Using `self.intercept` and `self.slope` in Out-of-Sample `predict`

- After `fit`, parameters live on the object (`self.intercept`, `self.slope`)

- `predict(X)` uses learned parameters to generate outputs

- We can also use `self.intercept` and `self.slope` in the `fit` method.

In [16]:
class UnivariateOLS:  # we start with the simplest model, with class ModelName.
    """Univariate least-squares line ``y ≈ intercept + slope * x``.

    The training data and the ``fit_intercept`` switch are supplied at
    construction time; ``fit()`` learns the parameters and ``predict()``
    applies them to new inputs.
    """

    def __init__(self, x_data: List[float], y_data: List[float], fit_intercept: bool):
        """Build a new instance (the constructor, in C++ terms).

        `self` is the instance being initialized. The training data are
        deep-copied, so later changes to the caller's objects do not affect
        the model.
        """
        # Configuration: whether an intercept should be fitted.
        wants_intercept = bool(fit_intercept)
        # Private copies of the training data (X, y).
        x_copy, y_copy = deepcopy(x_data), deepcopy(y_data)

        self.fit_intercept = wants_intercept
        self.x_data = x_copy
        self.y_data = y_copy
        # Learned parameters: None until the model has been fitted.
        self.intercept = self.slope = None

    # the new fit method.
    def fit(self):
        """Estimate the slope (and the intercept, when requested) in closed form.

        Prints the intermediate sums and a success message. Returns nothing;
        the results are stored on ``self.slope`` and ``self.intercept``.

        Raises:
            ValueError: ``x_data`` and ``y_data`` differ in length.
            ZeroDivisionError: the closed-form denominator is zero.
        """
        # A method can READ the attributes stored on this instance through `self`.
        xs, ys = self.x_data, self.y_data

        # Always validate the data before computing with it.
        n = len(xs)
        if n != len(ys):
            raise ValueError("X and y must have same length")

        # Plain sums of the training x and y.
        Sx = sum(xs)
        Sy = sum(ys)

        # Sum of squares of x: loop directly over the items of the list.
        Sxx = 0
        for x_i in xs:
            Sxx += x_i * x_i

        # Sum of the products x * y: loop over the index i = 0, 1, ..., n-1.
        Sxy = 0
        for i in range(n):
            Sxy += xs[i] * ys[i]

        # Reporting intermediate values can help with debugging.
        print(f"Training data: n={n}, Sx={Sx}, Sy={Sy}, Sxx={Sxx}, Sxy={Sxy}")

        if not self.fit_intercept:
            # Regression through the origin: only the slope is estimated.
            if Sxx == 0:
                raise ZeroDivisionError("Sxx is zero (all x are zero?).")
            self.slope = float(Sxy / Sxx)
            # The intercept is deliberately left untouched (it stays None).
            print("Model slope has been fitted successfully.")
            return

        # With an intercept: a zero denominator means all x values are identical.
        den = n * Sxx - Sx * Sx
        if den == 0:
            raise ZeroDivisionError("Denominator is zero (all x identical?).")
        self.slope = float((n * Sxy - Sx * Sy) / den)
        self.intercept = float((Sy - self.slope * Sx) / n)
        print("Model intercept and slope have been fitted successfully.")

    # ==================================================================================================================
    # Everything above behaves the same as in the previous version.
    # ==================================================================================================================
    def predict(self, new_x_data: List[float]) -> List[float]:
        """Return the predicted y for every value in ``new_x_data``.

        Raises:
            ValueError: the parameters this configuration needs are still None
                (slope always; intercept too when ``fit_intercept`` is true).
        """
        uses_intercept = self.fit_intercept
        # Guard: refuse to predict before the required parameters exist.
        if self.slope is None or (uses_intercept and self.intercept is None):
            raise ValueError("Model not fit.")

        b = self.slope
        if uses_intercept:
            a = self.intercept
            return [a + b * x_new for x_new in new_x_data]
        # No intercept: the line passes through the origin.
        return [b * x_new for x_new in new_x_data]

In [None]:
# New inputs to predict for.
x_new = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
# Re-create and fit the model using the class version that has predict().
model = UnivariateOLS(x_data=x, y_data=y, fit_intercept=True)
model.fit()
# predict() returns one predicted y per entry of x_new, as a list.
y_new_pred = model.predict(x_new)
print(f"The predicted y values for the new x values are: {y_new_pred=:}")

In [None]:
# Visualize the prediction
# Training points are blue circles; predictions at x_new are red triangles.
plt.figure(figsize=(10, 6))
plt.scatter(x, y, alpha=0.7, label='Training data', color='blue', s=50)
plt.scatter(x_new, y_new_pred, alpha=0.8, label='Predictions', color='red', s=80, marker='^')
plt.xlabel('x')
plt.ylabel('y')
plt.title('Univariate OLS: Training Data and Predictions')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

## We have completed our first OOP object!

- It loads the training data
- Fits the model using the training data
- Can make predictions on new data
- The `UnivariateOLS` class already handles the complete workflow: storing data, fitting, and predicting!

## Block 4 — One Additional Metric: RMSE
- Add a method that uses object state and inputs to compute a metric, let's compute the Root Mean Squared Error (RMSE).

$$\text{RMSE} = \sqrt{\frac{1}{n} \sum_{i=1}^{n} (y_i - \hat{y}_i)^2}$$

where $y_i$ is the true value, $\hat{y}_i$ is the predicted value, and $n$ is the number of data points.

### Stop here: are we going to make a copy of the UnivariateOLS class and add a new method? No — we can define a new class that inherits from the UnivariateOLS class.

## Class Inheritance

- Create a new class that inherits from the UnivariateOLS class.

- Add a new method that computes the Root Mean Squared Error (RMSE).

- The new class inherits the (i) data management `__init__`, (ii) model fitting `fit`, and (iii) prediction `predict` methods from the UnivariateOLS class.

- This is pretty helpful if you want to build upon an existing class: for example, if you want to build an enhanced version of the random forest model from sklearn, you don't need to start from scratch by copying and pasting the code.

- Instead, you can inherit from the existing class and add the new functionality or override the existing functionality.

- Inheritance is a powerful feature of OOP that allows you to **focus on the changes** rather than the entire codebase.

In [19]:
class UnivariateOLSwithRMSE(UnivariateOLS):  # <-- Inherits from the UnivariateOLS class, put the "parent class" in the parentheses.
    """`UnivariateOLS` extended with a Root Mean Squared Error (RMSE) metric.

    Data management (`__init__`), fitting (`fit`) and prediction (`predict`)
    come from the parent class. This subclass adds `compute_rmse` and
    overrides `fit` so that it also prints the training-set RMSE.
    """
    # Multiple parent classes are possible, but we will not cover that in this course.

    def __init__(self, x_data: List[float], y_data: List[float], fit_intercept: bool=True) -> None:
        """Initialize through the parent class, then print the inherited slope and intercept.

        Args:
            x_data: Training x values, forwarded to the parent constructor.
            y_data: Training y values, forwarded to the parent constructor.
            fit_intercept: Forwarded to the parent constructor.
        """
        # You can access the parent class's attributes and methods using the super() function.
        # We call the parent class's __init__ method to initialize the attributes of the
        # parent class, so we don't have to repeat its set-up code here.
        super().__init__(x_data, y_data, fit_intercept)

        # after calling the parent class's __init__ method, we now have self.slope and self.intercept even though we did not define them in the UnivariateOLSwithRMSE.__init__ method.
        print(f"After calling the parent class's __init__ method, the slope is {self.slope} and the intercept is {self.intercept}.")

    def compute_rmse(self, x_data: List[float], y_data: List[float]) -> float:
        """Return the RMSE between `y_data` and the model's predictions for `x_data`.

        Args:
            x_data: x values to predict on.
            y_data: True y values, one per element of `x_data`.

        Returns:
            The square root of the mean squared prediction error.

        Raises:
            ValueError: If `y_data` is empty, or if the number of predictions
                differs from the number of true values.
        """
        # An empty input would otherwise end in an uninformative ZeroDivisionError.
        if len(y_data) == 0:
            raise ValueError("compute_rmse needs at least one data point.")

        # Note: we can use the parent class's method (i.e., self.predict) in this class even though we did not define it here.
        predictions = self.predict(x_data)

        # A mismatch would otherwise be silently truncated (or raise an IndexError),
        # so the reported RMSE would not describe the data the caller passed in.
        if len(predictions) != len(y_data):
            raise ValueError(
                f"Got {len(predictions)} predictions for {len(y_data)} true values; "
                "x_data and y_data must have the same length."
            )

        # Note: there are much faster and more elegant ways to compute the RMSE using for example numpy (next quarter).
        # But let's stick to the basics for now.
        # Squared difference between the actual and the predicted value of every data point.
        squared_errors = [
            (actual - predicted) ** 2
            for actual, predicted in zip(y_data, predictions)
        ]

        # Calculate the mean of all squared errors
        mean_squared_error = sum(squared_errors) / len(squared_errors)

        # Return the square root of the mean squared error (RMSE)
        return np.sqrt(mean_squared_error)

    # We can also override the parent class's methods, we can print out the training set RMSE after fitting the model.
    def fit(self) -> None:
        """Fit the model with the parent's `fit`, then print the training-set RMSE."""
        # We call the parent class's fit method to fit the model as usual.
        super().fit()
        # This new method also reports the training set RMSE.
        print(f"The training set RMSE is {self.compute_rmse(self.x_data, self.y_data)}")

In [None]:
# Create the subclass instance. Its __init__ forwards the arguments to the
# parent class and then prints the inherited slope and intercept.
model = UnivariateOLSwithRMSE(x_data=x, y_data=y, fit_intercept=True)

In [None]:
# Calls the overridden fit(): the parent's fit() runs first, then the
# training-set RMSE is printed.
model.fit()

In [None]:
# Suppose that we have a new set of observations: x runs from 1 to 10 and
# y_new holds the observed value for each of those x.
x_new = list(range(1, 11))
y_new = [
    12.3, 14.1, 15.8, 17.2, 25.5,
    23.1, 25.8, 29.2, 30.7, 36.1,
]
# Predictions of the fitted model at the new x values.
y_new_pred = model.predict(x_new)
print(f"The predicted y values for the new x values are: {y_new_pred=:}")

In [None]:
# Visualize the original data, new data, and predictions on one set of axes.
fig, ax = plt.subplots(figsize=(10, 6))

# Three point clouds, drawn in this order: the original training data (blue),
# the newly observed data (red) and the model's predictions for it (green).
ax.scatter(x, y, alpha=0.7, color='blue', label='Original training data')
ax.scatter(x_new, y_new, alpha=0.7, color='red', label='New actual data')
ax.scatter(x_new, y_new_pred, alpha=0.7, color='green', label='Predictions for new data')

# Dashed line through the predictions to show the trend.
ax.plot(x_new, y_new_pred, linestyle='--', color='green', alpha=0.5)

ax.set_xlabel('x')
ax.set_ylabel('y')
ax.set_title('Original Data, New Data, and Predictions')
ax.legend()
ax.grid(True, alpha=0.3)
plt.show()


In [None]:
# We can compute how well the model fits the new set of data: this is the
# RMSE between y_new and the model's predictions at x_new.
model.compute_rmse(x_new, y_new)

## Block 5 — Special Methods

- Special methods let objects integrate with Python syntax/printing

In [None]:
# As mentioned, we can use the `print` function to print any object.
# But since this is a custom class we just built, `print` cannot show it in an informative way.
# It only shows the class name and the memory address of the object.
print(model)

In [None]:
# str() gives the same uninformative text that print() showed above.
str(model)

In [None]:
# repr() is the developer-facing representation; compare it with str() above.
repr(model)

### Difference between `__str__` and `__repr__`

#### Intended Audience
- `__str__` 
    - Designed for **end users**. Its goal is to return a string that is readable and friendly — something you’d want to show in logs or a user interface.
    - Called by the built-in `str()` function and by `print(obj)`.
- `__repr__`
    - Designed for **developers / debugging**. It should return an unambiguous string representation of the object — ideally one that could be used to recreate the object if passed to `eval()` (when practical).
    - Called by the built-in `repr()` function and used by the interactive interpreter when you type an object at the prompt.

#### Fallback Behavior
- If you define only `__repr__` but not `__str__`, Python will use `__repr__` when you call `str(obj)` or `print(obj)`.
- If you define only `__str__`, the interpreter will still use the default `__repr__` for the console/debug display.


#### Conclusion
- In this course, we are both the developer and the end user. I will be defining `__repr__` here because `str()` and `print()` fall back to it when `__str__` is not defined.

### What information do you think to be helpful to report?

A few helpful pieces of information to include:
- What the model is for (e.g., "OLS model with intercept" or "OLS model without intercept")?
- Whether the model has been fitted or not?
- Some summary statistics for the training data (e.g., number of data points, mean of x, mean of y, etc.)
- The fitted parameters (slope and intercept) if available.
- A human-readable equation representation perhaps?
- For debugging: the class name and memory address (keep it for debugging purposes)

# Sample printing results:
```
# Before fitting:
UnivariateOLS(not fitted)
UnivariateOLS(fit_intercept=True, slope_=None, intercept_=None)

# After fitting:
y ≈ 2.000 + 1.500 x
UnivariateOLS(fit_intercept=True, slope_=1.5, intercept_=2.0)
```

In [28]:
# We don't need to rewrite everything; instead, we inherit from the `UnivariateOLS` class and override only
# the `__repr__` method (`str()` and `print()` fall back to `__repr__` when `__str__` is not defined).

class UnivariateOLSwithSummary(UnivariateOLS):
    """`UnivariateOLS` with an informative, multi-line `__repr__`.

    The representation reports the model type, summary statistics of the
    training data, a small ASCII scatter plot and, once fitted, the
    estimated parameters.
    """

    # Size of the ASCII plot in characters, and the maximum number of points drawn on it.
    _PLOT_WIDTH = 40
    _PLOT_HEIGHT = 8
    _PLOT_MAX_POINTS = 100

    def __repr__(self) -> str:
        """Return the multi-line summary of the model and its training data."""
        # Use the actual class name so subclasses are reported correctly.
        parts = [type(self).__name__]
        if self.fit_intercept:
            parts.append("\n - Estimated with intercept")
        else:
            parts.append("\n - Estimated without intercept (i.e., fitted through the origin)")

        n_points = len(self.x_data)
        if n_points == 0 or len(self.y_data) == 0:
            # Means, ranges and the plot are undefined without data; a repr
            # should still return a string instead of raising.
            parts.append("\n - Data summary: 0 data points (nothing to summarize or plot).")
        else:
            mean_x = sum(self.x_data) / n_points
            mean_y = sum(self.y_data) / len(self.y_data)
            # Add summary statistics of the data.
            parts.append(f"\n - Data summary: {n_points} data points, mean of x={mean_x:.3f}, mean of y={mean_y:.3f}.")
            # Add range of x and y.
            parts.append(f"\n - Data range: x in [{min(self.x_data):.3f}, {max(self.x_data):.3f}], y in [{min(self.y_data):.3f}, {max(self.y_data):.3f}].")
            # You can even add an ascii art plot of the data (less useful lol...)
            parts.append("\n - Data plot for the training data: \n")
            parts.append(self._ascii_scatter())

        # Add the fitted parameters if the model has been fitted.
        if self.slope is None:
            # the model has not been fitted yet
            parts.append("\n - Not fitted, call `fit` method to fit the model before using it.")
        elif self.fit_intercept:
            parts.append(f"\n - Fitted (slope={self.slope:.3f}, intercept={self.intercept:.3f}).")
        else:
            parts.append(f"\n - Fitted (slope={self.slope:.3f}, fitted through the origin). ")

        # finally, the __repr__ method returns the string representation of the model.
        return "".join(parts)

    def _ascii_scatter(self) -> str:
        """Render the first `_PLOT_MAX_POINTS` training points as an ASCII scatter plot.

        Requires non-empty `x_data` and `y_data`. Every returned line ends
        with a newline.
        """
        width, height = self._PLOT_WIDTH, self._PLOT_HEIGHT

        # Get data ranges for scaling
        x_min, x_max = min(self.x_data), max(self.x_data)
        y_min, y_max = min(self.y_data), max(self.y_data)

        # Create empty plot canvas
        canvas = [[' '] * width for _ in range(height)]
        limit = self._PLOT_MAX_POINTS
        for x_val, y_val in zip(self.x_data[:limit], self.y_data[:limit]):
            # Scale coordinates to canvas size; a constant coordinate is centred.
            if x_max != x_min:
                col = int((x_val - x_min) / (x_max - x_min) * (width - 1))
            else:
                col = width // 2

            # Row 0 is the top of the canvas, so the y axis is flipped.
            if y_max != y_min:
                row = height - 1 - int((y_val - y_min) / (y_max - y_min) * (height - 1))
            else:
                row = height // 2

            # Place point on canvas
            if 0 <= col < width and 0 <= row < height:
                canvas[row][col] = '*'

        # Size the y-label column to the longest label (at least 4 characters)
        # so wide values such as "-123.4" do not push the axis out of line.
        top_label, bottom_label = f"{y_max:.1f}", f"{y_min:.1f}"
        label_width = max(4, len(top_label), len(bottom_label))

        lines = []
        for row_index, cells in enumerate(canvas):
            # Only the top and bottom rows carry a y-axis label.
            if row_index == 0:
                y_label = top_label
            elif row_index == height - 1:
                y_label = bottom_label
            else:
                y_label = ""
            lines.append(f"   {y_label:>{label_width}}|" + "".join(cells) + "\n")

        # Add x-axis
        lines.append("   " + " " * label_width + "+" + "-" * width + "\n")
        # Left label starts under the first plot column; the gap is sized so
        # that the right label ends under the last plot column.
        left_label, right_label = f"{x_min:.1f}", f"{x_max:.1f}"
        gap = max(1, width - len(left_label) - len(right_label))
        lines.append("   " + " " * label_width + " " + left_label + " " * gap + right_label + "\n")
        return "".join(lines)

In [29]:
# Create the model without fitting it yet, so the summary can be inspected before fit().
# NOTE(review): this cell uses `x_data`/`y_data` while earlier cells use `x`/`y` —
# presumably both are defined in earlier cells; confirm.
model = UnivariateOLSwithSummary(x_data=x_data, y_data=y_data, fit_intercept=True)

In [None]:
# The class defines only __repr__, so str() is expected to fall back to it.
# As a cell output the string is shown with its newlines escaped as "\n".
str(model)

In [None]:
# repr() calls the __repr__ method we defined above directly.
repr(model)

In [None]:
# print() renders the newlines, so the summary and the ASCII plot are readable here.
print(model)

In [None]:
# The model changes after fitting: the summary now ends with the fitted
# parameters instead of the "Not fitted" notice.
model.fit()
print(model)

## Optional — Streaming/Online OLS with `__iadd__` and `__add__` Special Methods

- Motivation: Imagine you're running a real-time analytics system where new data points arrive continuously throughout the day. Instead of re-fitting your entire model every time a new observation comes in, you want to update your regression incrementally. This is especially important when dealing with streaming data from sensors, financial markets, or user interactions.

- Overload `__iadd__` (for `+=`) and `__add__` (for `+`) to add new data points to update the model.

- This demonstrates: custom behavior for built-in operators.

- When you write `a + b`, Python does not hard-code addition, instead, it calls the special (dunder) method: `a.__add__(b)`.

- When you write `a += b`, Python calls `a.__iadd__(b)`.

- The pattern here is: the addition method (`__add__` or `__iadd__`) of a given object is called with a single argument that is **compatible to be added to the object**, and it returns the resulting object.

### Let's define the addition here to be: adding a new pair of training data to the model.

In [34]:
class UnivariateOnlineOLS(UnivariateOLSwithSummary):
    """OLS model that accepts new `(x, y)` observations through `+=` and `+`.

    Each added observation is appended to the training data and the model
    is refitted with `fit()`.
    """

    def __iadd__(self, new_data: Tuple[float, float]) -> "UnivariateOnlineOLS":
        """Handle `model += (x, y)`: add one observation to this model and refit.

        Args:
            new_data: The new observation as an `(x, y)` pair.

        Returns:
            This same object, updated and refitted.

        Raises:
            TypeError, ValueError: If `new_data` cannot be unpacked into
                exactly two values; the model is left unchanged in that case.
        """
        # Unpack before touching any state: a malformed `new_data` must not
        # leave x_data and y_data with different lengths.
        new_x, new_y = new_data
        # Build new lists instead of appending in place. If the constructor kept
        # a reference to the caller's lists (not visible from this class),
        # appending would silently grow the caller's data as well.
        self.x_data = list(self.x_data) + [new_x]
        self.y_data = list(self.y_data) + [new_y]
        # refit the model with the new data.
        self.fit()
        # return the modified object.
        return self

    def __add__(self, new_data: Tuple[float, float]) -> "UnivariateOnlineOLS":
        """Handle `model + (x, y)`: return a new, refitted model including the observation.

        The left operand is not modified, which is what `a + b` normally
        guarantees. `model = model + (x, y)` still works because the name is
        rebound to the returned object.
        """
        # Work on an independent copy and reuse the in-place logic on it.
        result = deepcopy(self)
        result += new_data
        return result

In [None]:
# Let's see how it works: build the online model, fit it on the current
# data and print its summary as the starting point.
model = UnivariateOnlineOLS(x_data=x_data, y_data=y_data, fit_intercept=True)
model.fit()
print(model)

In [None]:
# Add a new pair of data points to the model.
# `model + (15, 5)` calls `model.__add__((15, 5))`; the returned object is rebound to `model`.
# See how the summary statistics and the fitted line change.
model = model + (15, 5)
print(model)

In [None]:
# `model += (20, 3)` calls `model.__iadd__((20, 3))`, which adds the observation and refits.
model += (20, 3)
print(model)

In [None]:
# Observations that will be fed to the model one at a time.
new_data_pairs = [
    (15, 5), (20, 3), (25, 2), (30, 1), (35, 0.5),
    (40, 0.25), (45, 0.1), (50, 0.05), (55, 0.025), (60, 0.01),
]

# Start from a freshly fitted model.
model = UnivariateOnlineOLS(x_data=x_data, y_data=y_data, fit_intercept=True)
model.fit()

# Record the parameters before any addition (step 0) and after each added
# pair, so the evolution of the fitted line can be inspected afterwards.
fitted_slopes, fitted_intercepts = [model.slope], [model.intercept]
for pair in new_data_pairs:
    model += pair
    fitted_slopes += [model.slope]
    fitted_intercepts += [model.intercept]

In [None]:
# plot the evolution of fitted lines as new data points are added
fig, ax = plt.subplots(1, 1, figsize=(12, 8))

# Plot original data points
ax.scatter(x_data, y_data, color='blue', alpha=0.6, s=30, label='Original data')

# Plot all added data points
all_added_x = [pair[0] for pair in new_data_pairs]
all_added_y = [pair[1] for pair in new_data_pairs]
ax.scatter(all_added_x, all_added_y, color='red', alpha=0.8, s=50, marker='s', label='Added data')

# Get overall x range for all data, so every fitted line spans both the
# original and the added points.
all_x = list(x_data) + all_added_x
x_min, x_max = min(all_x), max(all_x)
x_range = np.linspace(x_min, x_max, 100)

# Create color map from yellow to dark red: one color per recorded fit.
colors = plt.cm.YlOrRd(np.linspace(0.3, 1.0, len(fitted_slopes)))

# Plot each fitted line with different colors (yellow to dark red).
# Step 0 is the fit on the original data; step i is the fit after i added pairs.
for step, (slope, intercept, color) in enumerate(zip(fitted_slopes, fitted_intercepts, colors)):
    y_pred = slope * x_range + intercept
    # NOTE(review): assumes `x_data` still holds only the original points,
    # i.e. that adding data to the model did not modify this list — confirm.
    data_points = len(x_data) + step
    ax.plot(x_range, y_pred, color=color, linewidth=2,
            label=f'Step {step}: {data_points} points (slope={slope:.3f})')

ax.set_title('Evolution of Fitted Lines as New Data Points are Added', fontsize=14)
ax.set_xlabel('x')
ax.set_ylabel('y')
ax.grid(True, alpha=0.3)
# Put the (long) legend outside the axes so it does not cover the lines.
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

plt.tight_layout()
plt.show()

## References & Next Steps

**For deeper learning:**
- 📖 **Real Python OOP Guide**: Comprehensive tutorial on Python OOP concepts
  `https://realpython.com/python3-object-oriented-programming/`

- 💻 **Interactive Examples**: Companion Colab notebook (from last quarter's OOP lecture) with hands-on exercises
  `https://colab.research.google.com/drive/1lElGzDa_uOUNB2YeZ8h92QCCuCRR2esm?usp=sharing`