# Introduction to Machine Learning
## Home Assignment 1

In [1]:
import tensorflow as tf
import os
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
from tensorflow.keras.preprocessing.image import ImageDataGenerator

In [4]:
# We get the data 
_URL = 'http://www.di.ens.fr/appstat/spring-2020/project/data.zip'

path_to_zip = tf.keras.utils.get_file('data.zip', origin=_URL, extract=True)

data_path = os.path.join(os.path.dirname(path_to_zip), 'data')

training_data = os.path.join(data_path, 'train')

test_data = os.path.join(data_path, 'test')

Downloading data from http://www.di.ens.fr/appstat/spring-2020/project/data.zip
4784128/Unknown - 12s 2us/step

KeyboardInterrupt: 

In [2]:
# We define the constants

number_of_letters = 6000
pixel_height = 28
pixel_width = 28
batch_size = 100
shape = (1, pixel_height*pixel_width)
# We define a flow of images: 

train_image_generator = ImageDataGenerator(rescale=1./255) 
train_data_gen = train_image_generator.flow_from_directory(batch_size=batch_size,
                                                           directory=training_path,
                                                           shuffle=True,
                                                           target_size=(pixel_height, pixel_width),
                                                           class_mode='binary')

test_image_generator = ImageDataGenerator(rescale=1./255) 
test_data_gen = test_image_generator.flow_from_directory(batch_size=batch_size,
                                                           directory=test_path,
                                                           shuffle=True,
                                                           target_size=(pixel_height, pixel_width),
                                                           class_mode='binary')


def plotImages(images_arr):
    fig, axes = plt.subplots(1, 5, figsize=(20,20))
    axes = axes.flatten()
    for img, ax in zip( images_arr, axes):
        ax.imshow(img)
        ax.axis('off')
    plt.tight_layout()
    plt.show()

    
sample_training_images, b = next(train_data_gen)
plotImages(sample_training_images[:5])
print(b)
print(type(b))

NameError: name 'training_path' is not defined

1. Formalize the problem by defining the input space $X$ , the output space $Y$ and the training data set. What are their dimension?

Having preprocessed the data we are going to use the images as vectors of pixels with size $n = pixel\_width*pixel\_height + 1$. The additional 1 is to add a bias term.Thus: \
$X$ is $[0, 1]^n$ and $Y$ is $\{-1, 1\}$ and $D = (X \times Y) $



2. a) What are the empirical risk (training error) and the true risk associated with the 0-1 loss? Why is it complicated to minimize the empirical risk in this case

The empirical risk for the 0-1 loss is : $$R_d(f) = \frac{1}{d} \sum_{i = 1}^{d} \mathbb{1}_{f(X_i) \neq Y_i}$$ and the true risk is : $$R(f) = \mathbb{E}\{\mathbb{1}_{f(X) \neq Y} | D \}$$
The empirical risk here is of the 0-1 loss function is non-convex and discontinuous, thus (sub)gradient methods cannot be applied. Any iterative method would be exponential in the input size thus the choice of the 0-1 loss would be inadequate.

2. b) Why should we use the test data to assess the performance ? 

Test data should be from the same statistical distribution as the training data, and should not have been fed to the learning model before (otherwise there is no point given the model already adjusted to that data before)

2. c) Recall the definition of the optimization problems associated with the linear least square regression andthe linear logistic regression.

We start firstly with linear regression: 
$$f_n\in argmin_{f \in \mathbb F}R_n(f) := argmin_{f \in \mathbb F}\frac{1}{n}\sum_{i = 1}^{n}l(x_i, y_i)$$
with $l$ being the loss function associated to the problem. 
In linear regression, we usually choose the square loss, and as the name states we choose a linear model: $f(x) = \theta^T x$. Thus the linear regression problem becomes: 
$$min_{\theta \in \mathbb R^n} R_n(\theta) := min_{\theta \in \mathbb R^n}\frac{1}{n}\sum_{i = 1}^{n}( \theta^T x_i - y_i)^2$$
That can simply be rewritten to: 
$$min_{\theta \in \mathbb R^n} R_n(\theta) := min_{\theta \in \mathbb R^n} \frac{1}{n} || Y - X\theta ||_2^2 $$
with $X \in \mathbb R^{d*n}$ being the design matrix having at the row $i$ $x_i^T$, and $Y \in \mathbb R^d$ being the vector of outputs with $y_i$ at the position $i$

For logistic regression we have: 
$$f_n\in argmin_{f \in \mathbb F}R_n(f) := argmin_{f \in \mathbb F}\frac{1}{n}\sum_{i = 1}^{n}l(x_i, y_i)$$
with $l$ being the loss function associated to the problem. 
In logistic regression, we try to maximize the log-likelihood of an estimator and we end up with a minimization problem using the logistic loss, we choose a linear model: $f(x) = \theta^T x$. Thus the logistic regression problem becomes: 

$$min_{\theta \in \mathbb R^n} R_n(\theta) := min_{\theta \in \mathbb R^n}\frac{1}{n}\sum_{i = 1}^{n}log(1 + e^{−y_i \theta^T x_i}))$$


In [3]:
#we define here some utility functions to compute the regression sum and the gradient

def logistic_regression_loss_on_batch (theta, x, y): 
    return 1/batch_size * np.sum(np.log(1+np.exp(-y*np.transpose(theta)@x_)), axis=1)

def logistic_gradient (theta, x, y):
    k = -y*x
    e = np.exp(np.transpose(theta)@k)
    u_ = k*e
    return 1/batch_size * np.sum(u_**2/(1+e), axis=1)


def square_loss_on_batch (theta, x, y): 
    return 1/batch_size * np.sum((np.transpose(theta)@x_ - y)**2, axis=1)

def square_gradient (theta, x, y):
    return 1/batch_size*2*x*square_loss_on_batch(theta, x, y)
    

def regression_sum (theta, stream, f):
    R = 0
    for x_, y in stream:
        x = np.append(np.reshape(x_[:,:,0], shape), [[1]], axis = 1)
        R += f(theta, stream)
    stream.reset()
    return 1/number_of_letters*batch_size*R
        
    
        

In [1]:
#here we define the gradient descent and the stochastic gradient descent with backtracking line search (to get an optimal learning rate)


epsilon = 10**-6

def backtracking_line_search (f, x, delta_x, grd_x, alpha= 0.25, beta=0.89):
    t = 1
    while (f(x+t*delta_x) >= f(x) + alptha*t*np.transpose(grd_x)@delta_x):
        t *= beta
    return t

def gradient_descent (f, grd, starting_point, epsilon = epsilon, learning_rate_function = backtracking_line_search):
    theta = starting_point
    prev_theta = starting_point
    r = regression_sum(theta, train_data_gen, f)
    while True:
        prev_theta = theta
        f_theta, delta_theta = r, -grd(theta, train_data_gen, grd)
        theta += learning_rate_function(lambda x: regression_sum(x, train_data_gen, f), theta, delta_theta, -delta_theta)*delta_theta
        r = regression_sum(theta, train_data_gen, f)
        if (r - f_theta < epsilon):
            break;
    return theta


def stochastic_gradient_descent (f, grd, starting_point, epsilon=epsilon, learning_rate_function = backtracking_line_search) :
    theta = starting_point
    prev_theta = starting_point
    for x_, y in training_data_gen:
        r = f(theta, x, y)
        x = np.append(np.reshape(x_[:,:,0], shape), [[1]], axis = 1)
        while True:
            prev_theta = theta
            f_theta, delta_theta = r, -grd(theta, x, y)
            theta += learning_rate_function(lambda t: f(t, x, y), theta, delta_theta, -delta_theta)*delta_theta
            r = f(theta, x, y)
            if (r - f_theta < epsilon):
                break;
    return theta
    

def predict (theta, X, Y):
    

In [None]:
def k_nn (k, stream, distance, cross_validation_percentage):
    return 0