# PAC learning in 2D rectangular classifiers

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as patches

Let's create the function that defines the real classification rule, which in this case is a rectangle (its bottom-left and top-right corners are enough to define it).

<img src="images/rect1.png" width="500">

In [None]:
def f(x,points):
    """Real classification rule: tell whether sample `x` lies inside the rectangle.

    `points[0, :]` is the bottom-left corner and `points[1, :]` the top-right
    corner of the rectangle; points on the border count as inside.
    Returns True for samples inside the rectangle and False otherwise.
    """
    # Guard clause: reject as soon as the first coordinate falls outside.
    if not (points[0,0] <= x[0] <= points[1,0]):
        return False
    return bool(points[0,1] <= x[1] <= points[1,1])

Let's define the X space and two points of the classification rule:

In [None]:
# Real rectangle: row 0 is its bottom-left corner, row 1 its top-right corner.
points = np.array([
    [4, 2],
    [6.9, 8],
])
# Limits of the input space along each axis.
xlims = [0, 10]
ylims = [0, 10]

Now we create the dataset: the randomly sampled descriptive features `X` (2 dimensions, `x` and `y`) and the class variable `c`:

In [None]:
# Fix the seed so that the sampled dataset is reproducible.
np.random.seed(11)

N = 100
# All x coordinates are drawn first and all y coordinates afterwards; they
# become the two columns of X (one sample per row).
X = np.column_stack((np.random.uniform(xlims[0], xlims[1], N),
                     np.random.uniform(ylims[0], ylims[1], N)))
# Label every sample with the real classification rule (1 = inside the rectangle).
c = np.array([f(sample, points) for sample in X]).astype(int)

## Learning
The learning method is as simple as finding two points **among the positive samples**: 
- The left-bottom point is composed of: (the smallest `x` value, the smallest `y` value)
- The right-upper point is composed of: (the largest `x` value, the largest `y` value)

In [None]:
def learning_method(X,c):
    """Learn the tightest axis-aligned rectangle that encloses the positive samples.

    Parameters:
        X: 2-D array-like with one sample per row and one coordinate per column.
        c: 1-D array-like of class labels; samples labelled 1 are the positives.

    Returns:
        Array with two rows: row 0 holds the per-coordinate minimum of the
        positive samples (bottom-left corner) and row 1 the per-coordinate
        maximum (top-right corner).

    Raises:
        ValueError: if no sample is labelled 1, because no rectangle can be
            fitted without positive samples.
    """
    X = np.asarray(X)
    idx_pos = np.where(np.asarray(c) == 1)[0]
    if idx_pos.size == 0:
        # Fail with an explicit message instead of numpy's zero-size reduction error.
        raise ValueError("learning_method needs at least one positive sample (c == 1)")
    pos_samples = X[idx_pos,:]
    return np.array([pos_samples.min(axis=0), pos_samples.max(axis=0)])

Let's learn our classification rule, and then plot it.

In [None]:
# Fit the rectangle on the labelled dataset.
learned_points = learning_method(X=X, c=c)

In [None]:
# Samples, coloured by class.
plt.scatter(X[:, 0], X[:, 1], c=c)

# Real classification rule: dotted green outline.
plt.gca().add_patch(patches.Rectangle(
    points[0, :],  # bottom-left corner
    *(points[1, :] - points[0, :]),  # width, height
    fill=False, color='green', linestyle="dotted", linewidth=1))

# Learned classification rule: solid red outline.
plt.gca().add_patch(patches.Rectangle(
    learned_points[0, :],  # bottom-left corner
    *(learned_points[1, :] - learned_points[0, :]),  # width, height
    fill=False, color='red', linestyle="solid", linewidth=1))

plt.show()

Note that in this scenario we can quantify the error of the learned rule as the area of the real classification rule not covered by the learned one. Let's draw it first:

In [None]:
# Samples, coloured by class.
plt.scatter(X[:, 0], X[:, 1], c=c)

# Real classification rule: dotted green outline.
plt.gca().add_patch(patches.Rectangle(
    points[0, :],  # bottom-left corner
    *(points[1, :] - points[0, :]),  # width, height
    fill=False, color='green', linestyle="dotted", linewidth=1))

# Learned classification rule: solid red outline.
plt.gca().add_patch(patches.Rectangle(
    learned_points[0, :],  # bottom-left corner
    *(learned_points[1, :] - learned_points[0, :]),  # width, height
    fill=False, color='red', linestyle="solid", linewidth=1))

# Left error strip (red): full height of the real rectangle, from its left
# edge to the left edge of the learned rectangle.
plt.gca().add_patch(patches.Rectangle(
    points[0, :],
    learned_points[0, 0] - points[0, 0],  # width
    points[1, 1] - points[0, 1],  # height
    fill=True, alpha=0.25, color='red', linestyle="solid", linewidth=1))

# Bottom error strip (green): as wide as the learned rectangle, from the
# bottom edge of the real rectangle to the bottom edge of the learned one.
plt.gca().add_patch(patches.Rectangle(
    (learned_points[0, 0], points[0, 1]),
    learned_points[1, 0] - learned_points[0, 0],  # width
    learned_points[0, 1] - points[0, 1],  # height
    fill=True, alpha=0.25, color='green', linestyle="solid", linewidth=1))

# Upper error strip (blue): as wide as the learned rectangle, from the top
# edge of the learned rectangle to the top edge of the real one.
plt.gca().add_patch(patches.Rectangle(
    (learned_points[0, 0], learned_points[1, 1]),
    learned_points[1, 0] - learned_points[0, 0],  # width
    points[1, 1] - learned_points[1, 1],  # height
    fill=True, alpha=0.25, color='blue', linestyle="solid", linewidth=1))

# Right error strip (yellow): full height of the real rectangle, from the
# right edge of the learned rectangle to the right edge of the real one.
plt.gca().add_patch(patches.Rectangle(
    (learned_points[1, 0], points[0, 1]),
    points[1, 0] - learned_points[1, 0],  # width
    points[1, 1] - points[0, 1],  # height
    fill=True, alpha=0.25, color='yellow', linestyle="solid", linewidth=1))

plt.show()

For convenience, the error area is decomposed into 4 rectangles. Computing the error of the classifier is just:
1. calculating the area of each of these 4 error areas, and adding them up,
2. calculating the area of the whole input space,
3. dividing the error area by the input space area

In [None]:
def area(p1,p2):
    """Return the area of the axis-aligned rectangle with opposite corners `p1` and `p2`.

    Taking the absolute value makes the result independent of which corner is
    given first.
    """
    side_x = p2[0] - p1[0]
    side_y = p2[1] - p1[1]
    return np.abs(side_x * side_y)

In [None]:
def measure_error(real_sq, pred_sq, xlims, ylims):
    """Return the error of the learned rectangle as a fraction of the input space.

    The error region is split into four strips (left, bottom, upper, right)
    whose areas are added up and divided by the area of the input space.

    `real_sq` and `pred_sq` hold the bottom-left corner in row 0 and the
    top-right corner in row 1; `xlims` and `ylims` are the (min, max) limits
    of the input space on each axis.
    NOTE(review): the strips match the uncovered area only when `pred_sq`
    lies inside `real_sq`, as the rectangles learned in this file do.
    """
    strips = [
        (real_sq[0,:], (pred_sq[0,0], real_sq[1,1])),                  # left
        ((pred_sq[0,0], real_sq[0,1]), (pred_sq[1,0], pred_sq[0,1])),  # bottom
        ((pred_sq[0,0], pred_sq[1,1]), (pred_sq[1,0], real_sq[1,1])),  # upper
        ((pred_sq[1,0], real_sq[0,1]), real_sq[1,:]),                  # right
    ]
    err_area = sum(area(corner_a, corner_b) for corner_a, corner_b in strips)

    input_space_area = area((xlims[0], ylims[0]), (xlims[1], ylims[1]))
    return err_area / input_space_area

So, the error of the classifier is:

In [None]:
# Error of the learned rectangle, as a fraction of the input space.
measure_error(real_sq=points, pred_sq=learned_points, xlims=xlims, ylims=ylims)

## Using PAC bounds to provide a minimum sample size
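PAC learning theory establishes the minimum number of samples that guarantees that the learned rule exceeds a given maximum allowed error $\epsilon$ with probability at most $\delta$ (that is, the error is at most $\epsilon$ with probability at least $1-\delta$). It is given by the following formula: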

$$N\geq \frac{4}{\epsilon}\log{\frac{4}{\delta}}$$

<sub><sup>(the derivation of this expression is in the lecture's material)</sup></sub>

In [None]:
def pac_sample_complexity(eps, delta):
    """Return the PAC sample-size bound (4 / eps) * ln(4 / delta).

    `eps` is the maximum allowed error and `delta` the allowed probability of
    exceeding it. The result is not rounded; callers round it up to obtain a
    number of samples.
    """
    confidence_term = np.log(4 / delta)
    return (4 / eps) * confidence_term

So, if we want the error to exceed 10% in at most 5% of the times we learn a classifier, the required number of samples is:

In [None]:
# Maximum allowed error (eps) and allowed probability of exceeding it (delta).
eps, delta = 0.1, 0.05
# Round the bound up to a whole number of samples (N); shown as the cell output.
int(np.ceil(pac_sample_complexity(eps=eps, delta=delta)))

Let's now make an experiment with different $\epsilon$ and $\delta$ values. For each combination, we will repeat the data generation and learning processes a given number of times (`nreps`) to study whether the learning guarantees hold.

In [None]:
# Experiment: for every (eps, delta) pair, learn `nreps` rectangles from fresh
# random datasets of the PAC-suggested size and record how often the error
# stays below eps.
considered_eps = [0.3,0.2,0.1,0.05,0.01]
considered_delta = [0.1,0.05,0.01]
nreps = 1000 # run each experiment this amount of times
factor_of_pac_bound = 1 # fraction of the PAC-suggested sample size actually used

# res[i, j]: proportion of repetitions with error below considered_eps[i]
# when the sample size is computed for considered_delta[j].
res = np.zeros((len(considered_eps),len(considered_delta)))
for i_eps, eps in enumerate(considered_eps):
    # `i_delta` rather than `id`, so the builtin `id` is not shadowed.
    for i_delta, delta in enumerate(considered_delta):
        N = int(np.ceil(pac_sample_complexity(eps, delta)*factor_of_pac_bound))
        if N < 1:
            # With no samples the rejection loop below could never end.
            raise ValueError("factor_of_pac_bound must yield at least one sample (N >= 1)")
        errs = []
        for _ in range(nreps):
            # Resample until the dataset holds at least one positive sample,
            # since the learner needs positives to fit a rectangle.
            c = np.zeros(1)
            while np.sum(c) == 0:
                X = np.array([np.random.uniform(xlims[0],xlims[1],N),
                              np.random.uniform(ylims[0],ylims[1],N)]).T
                c = np.array([f(sample, points) for sample in X]).astype(int)

            learned_points = learning_method(X,c)
            errs.append(measure_error(points, learned_points, xlims, ylims))
        # proportion of cases in which epsilon-bound holds
        res[i_eps,i_delta] = np.mean(np.array(errs) < eps)

These are the results of the experiment:


In [None]:
# Display the result matrix: rows follow considered_eps, columns follow considered_delta.
res

showing the proportion of repetitions in which the epsilon bounds hold (for different $\epsilon$ values --per row-- and different $\delta$ values --per column).

## Questions
- What do we observe within these results? What should we observe? What can you say about the tightness of the PAC bounds for this problem?
- Play a bit with `factor_of_pac_bound` $(0 < factor\leq 1)$ to test different scenarios where we use fewer samples than the sample size suggested by the PAC bound. (A factor of 0 would mean no samples at all, so the experiment could never draw a positive sample.)
- What would the bound based on the VC dimension be? Try it!