In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import matplotlib.colors as colors
import seaborn as sns
# Seaborn default theme, then larger fonts and thicker lines for every figure.
sns.set()
plt.rc('axes', titlesize='xx-large', labelsize='x-large')
plt.rc('legend', fontsize='x-large')
for tick_group in ('xtick', 'ytick'):
    plt.rc(tick_group, labelsize='x-large')
plt.rc('lines', linewidth=4)

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

import warnings
# Silence every warning for the rest of the session (deprecation notices included).
warnings.filterwarnings("ignore")

# Fix NumPy's global random seed.
np.random.seed(0)
# Print arrays with 2 decimals and without scientific notation.
np.set_printoptions(suppress=True, precision=2)

from keras import regularizers, optimizers
from keras.utils.vis_utils import plot_model, model_to_dot
from IPython.display import SVG

import pickle

with open('../data/acc_grid', 'rb') as f:
    # Load the pickled accuracy grid from disk.
    # NOTE(review): pickle.load can execute arbitrary code; only open trusted files.
    acc_grid =pickle.load(f)
    
with open('../data/loss_grid', 'rb') as f:
    # Load the pickled loss grid from disk.
    loss_grid =pickle.load(f)

Using TensorFlow backend.


# An introduction to neural networks with Keras
Dr. Florent Martin (Universität Regensburg)  
March 2018

**Machine Learning**
* choose a **MODEL** which depends on **PARAMETERS**
*  learn from **DATA**
* choose model parameters that **FIT** the data

**Neural Networks**  =  family of models

**Keras** = Python Library for Neural Networks

<img style="height:600px;margin: -5px 0px 0px 100px" src="../figures/img/table1.png">

1. Logistic Regression
  1. Iris Dataset
  3. Logistic Regression with scikit-learn
  4. Logistic Regression with Keras
1. Gradient descent 
  1. Optimization
  2. Loss function
1. Neural Networks
  2. Logistic Regression again
  3. Neural Networks with hidden layers

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Part 1  
# Before Neural Networks: Logistic Regression

# 1.1. Iris dataset

In [None]:
# Load seaborn's "iris" example dataset and preview 5 randomly chosen rows.
iris  = sns.load_dataset("iris")
iris.sample(5)

In [None]:
# (number of rows, number of columns)
iris.shape

In [None]:
# Number of samples per species.
iris.species.value_counts()

<center><font size=7>Setosa</font></center> | <center><font size=7>**Versicolor**</font></center> | <center><font size=7> **Virginica** </font></center>
---|---|---
![setosa](../figures/img/setosa.jpg) | ![versicolor](../figures/img/versicolor.jpg) | ![virginica](../figures/img/virginica.jpg)

In [None]:
# Pairwise relationship plots of the dataset, coloured by species.
sns.pairplot(iris , hue="species");

### Goal: knowing the petal width, predict if the iris is a virginica  

**Input** = petal width

**Question**: Is the iris sample a virginica?

**Output** =  False / True (equivalently 0 / 1)

In [None]:
# Binary target column: 1 for virginica samples, 0 for the other species.
is_virginica = iris['species'].eq('virginica')
iris['isVirginica'] = is_virginica.astype(int)
iris.sample(5)

In [None]:
# Petal-width histogram for each class (0 = not virginica, 1 = virginica),
# both drawn on the same axes.
fig, ax = plt.subplots(figsize=(12, 7))
iris.groupby('isVirginica').hist(column='petal_width', ax=ax, bins=15)
# Legend spelling fixed: 'virginca' -> 'virginica'.
plt.legend(['not virginica', 'virginica'])
plt.xlabel('petal width')
plt.ylabel('number of samples')
# Blank out the axes title.
plt.title('');

# Is the plant a virginica?
![guess the probabilities](../figures/img/probas1.png)

# Is the plant a virginica?
![guess the probabilities](../figures/img/probas2.png)

# Is the plant a virginica?
![guess the probabilities](../figures/img/probas3.png)

# Logistic Regression

<font size=6>
**Logistic Regression** returns a function 
<br><br>
$$ P: [0,3] \to [0,1]$$
</font>
* $x = $ petal width 
* $P(x) =$ estimate of the probability that the plant is a virginica.
<br><br>
$$0\leq P(x) \leq 1$$

# Sigmoid function
<br><br>
$$ \sigma : x \mapsto  \frac{1}{1+e^{-x}}$$

In [None]:
def sigmoid(x):
    """Logistic sigmoid 1 / (1 + exp(-x)), applied element-wise by NumPy.

    Accepts a scalar or a NumPy array and returns values between 0 and 1.
    """
    denominator = 1 + np.exp(-x)
    return 1 / denominator

In [None]:
# Enlarge the tick labels for the figures that follow.
plt.rc('xtick',labelsize='xx-large')
plt.rc('ytick',labelsize='xx-large')

In [None]:
# Default figure size (width, height in inches) for the figures that follow.
plt.rc('figure',figsize=(14,7))

In [None]:
# Draw the sigmoid on [-5, 5) as a dashed red line.
x = np.arange(-5, 5, .01)
fig, ax = plt.subplots(figsize=(15, 8))
ax.plot(x, sigmoid(x), 'r--', label='sigmoid');

In [None]:
# Normalised petal-width histograms per class, with the raw (unfitted)
# sigmoid curve drawn on top.
fig, ax = plt.subplots(figsize=(15, 8))
# `density=True` replaces `normed=True`, which newer matplotlib releases no longer accept.
iris.groupby('isVirginica').hist(column='petal_width', density=True, ax=ax)
# Legend spelling fixed: 'virginca' -> 'virginica'.
plt.legend(['not virginica', 'virginica'])
plt.plot(x, sigmoid(x), 'r--');

<font size=6>
**Logistic Regression** is a model depending on **parameters**:  **W** and **B**.  
    <br>
For an **input x** it outputs the probability
<br><br>
$$P_{W,B}(x) = \sigma(Wx+B)$$
where     
$$ \sigma : x \mapsto  \frac{1}{1+e^{-x}}$$
is the **sigmoid function**.   
</font>

## 1.2 Logistic Regression with Scikit-learn

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic-regression classifier with scikit-learn's default settings.
model = LogisticRegression()

In [None]:
# Fit on a single feature (petal width); the double brackets keep the input 2-D.
model.fit( iris[['petal_width']] , iris['isVirginica'] )

In [None]:
# Learned weight (coef_) and bias (intercept_) of the fitted model.
w , b = model.coef_ , model.intercept_
print(w,b)


In [None]:
# Change the default figure size to 12 x 6 inches.
plt.rc('figure',figsize=(12,6))

In [None]:
# Predicted probability of class 1 (virginica) on a regular grid of petal widths.
petal_widths = np.arange(0, 3, 0.01)
width_column = petal_widths.reshape(-1, 1)  # one sample per row
predicted_proba = model.predict_proba(width_column)[:, 1]
plt.figure(figsize=(10, 7))
plt.plot(petal_widths, predicted_proba, 'r--')
plt.xlabel('petal_width')
plt.ylabel('predicted probability');

In [None]:
# Normalised petal-width histograms per class, with the probability curve
# predicted by the fitted model drawn on top.
fig, ax = plt.subplots(figsize=(13, 8))
# `density=True` replaces `normed=True`, which newer matplotlib releases no longer accept.
iris.groupby('isVirginica').hist(column='petal_width', density=True, ax=ax, alpha=.8)
# Legend spelling fixed: 'virginca' -> 'virginica'.
plt.legend(['not virginica', 'virginica'])
plt.plot(petal_widths, predicted_proba, 'r--');

### How to evaluate the model?  

<font size=6>
$$ \textbf{Accuracy} = \frac{\text{number of samples correctly classified}}{\text{total number of samples}}$$
</font>

In [None]:
# Mean accuracy of the fitted model on the data it was trained on.
model.score( iris[['petal_width']] , iris['isVirginica'] )

# 1.3 Logistic Regression with Keras

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Activation

# Graphical representation of Logistic Regression

### Choose the parameters:

* W 
* B 

![logistic regression](../figures/img/01-log.png)

![logistic regression](../figures/img/01-log.png)

In [None]:
# Logistic regression written as a Keras Sequential model.
model = Sequential()
model.add(Dense(1, input_dim=1))   # the affine map x -> W*x + B
model.add(Activation('sigmoid'))   # the sigmoid applied to W*x + B

In [None]:
# Train with stochastic gradient descent on the binary cross-entropy loss,
# and report accuracy as an additional metric.
model.compile(optimizer='sgd', loss='binary_crossentropy' , metrics=['acc'] )

In [None]:
# 500 passes over the data, without printing the per-epoch log.
model.fit( iris[['petal_width']] , iris['isVirginica'] , epochs=500 , verbose=False);

In [None]:
model.get_weights() # get the parameters w and b

In [None]:
# Names of the values that model.evaluate returns, in order.
model.metrics_names

In [None]:
# Loss and accuracy of the trained model on the training data.
model.evaluate( iris[['petal_width']] , iris['isVirginica'] )

In [None]:
# Normalised petal-width histograms per class, with the probability curve
# predicted by the Keras model drawn on top.
fig, ax = plt.subplots(figsize=(14, 8))
# `density=True` replaces `normed=True`, which newer matplotlib releases no longer accept.
iris.groupby('isVirginica').hist(column='petal_width', density=True, ax=ax)
plt.legend(['not virginica', 'virginica'])
# model.predict returns one column; take it as a flat array.
predicted_proba = model.predict(petal_widths.reshape(-1, 1))[:, 0]
plt.plot(petal_widths, predicted_proba, 'r--');

# Part 2
# Gradient descent

## How does Logistic Regression work? 

# 2.1 Optimization

In [None]:
# The same logistic-regression model, now with an L2 penalty (factor 0.1) on the weight W.
model = Sequential()
model.add(Dense(1, input_dim=1, kernel_regularizer=regularizers.l2(0.1)))
model.add(Activation('sigmoid'))

In [None]:
# SGD optimizer, binary cross-entropy loss, accuracy reported as a metric.
model.compile( optimizer='sgd' , loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# 500 epochs; no verbose argument is passed, so Keras uses its default logging.
model.fit( iris[['petal_width']] , iris['isVirginica'] , epochs=500 )

In [None]:
# Regular grid over [-10, 10) x [-10, 10) with step 0.1.
# x_grid is later used for the weight W and y_grid for the bias B.
x_grid, y_grid = np.mgrid[ -10:10:.1 , -10:10:.1 ]

In [None]:
def get_accuracy( w , b ):
    """Return the accuracy of the global ``model`` with weight ``w`` and bias ``b``.

    Side effect: the weights of ``model.layers[0]`` are overwritten with
    ``w`` and ``b`` and stay that way after the call.
    """
    kernel = np.array([[w]])
    offset = np.array([b])
    model.layers[0].set_weights([kernel, offset])
    # evaluate returns [loss, accuracy] for this model; index 1 is the accuracy.
    scores = model.evaluate(iris[['petal_width']], iris[['isVirginica']], verbose=0)
    return scores[1]


# Element-wise version, so that whole grids of (w, b) pairs can be passed in.
vectorize_accuracy = np.vectorize(get_accuracy)

In [None]:
import pickle
with open('../data/acc_grid', 'rb') as f:
    # Load the pickled accuracy grid from disk.
    acc_grid =pickle.load(f)

In [None]:
# Filling the grid costs one model.evaluate call per grid point, so the grid
# already loaded from ../data/acc_grid is reused when its shape matches.
# NOTE(review): assumes the cached file was produced by this same computation - confirm.
if 'acc_grid' not in globals() or np.shape(acc_grid) != x_grid.shape:
    acc_grid = vectorize_accuracy(x_grid, y_grid)

In [None]:
# Heat map of the accuracy over the (W, B) grid.
fig, ax = plt.subplots(figsize=(15, 7))
plt.pcolormesh(x_grid, y_grid, acc_grid, cmap='RdBu_r')
plt.colorbar()
ax.set(title='Accuracy', xlabel='W', ylabel='B');

# Start training from (W, B) = (-9, -9) and draw one segment of the path
# followed by the parameters after every 30 epochs.
dense_layer = model.layers[0]
dense_layer.set_weights([np.array([[-9]]), np.array([-9])])
for i in range(20):
    kernel, offset = model.get_weights()
    old_w, old_b = kernel[0][0], offset[0]
    model.fit(iris[['petal_width']], iris['isVirginica'], epochs=30, verbose=0)
    kernel, offset = model.get_weights()
    new_w, new_b = kernel[0][0], offset[0]
    plt.plot([old_w, new_w], [old_b, new_b], 'kX--', markersize=8, linewidth=2)

## Problem: the accuracy is constant on huge zones

# 2.2 Loss function

<font size=6>
$$\text{Loss} := -\frac{1}{n}\sum_{i=1}^n \big( y_i \log(p_i) + (1-y_i)\log(1-p_i) \big)$$
</font>

* <font size=6>n = number of samples</font>
*  <font size=6>$y_i\in \{0,1\}$ is the class (=species) of the i-th sample</font>

*  <font size=6>$p_i \in [0,1]$ is the predicted probability $P(x_i)$ calculated by the model</font>

If $y=1$ 
$$ -\big(y \log(p) + (1-y)\log(1-p)\big)  = -\log(p)$$

If $y=0$ 
$$ -\big(y \log(p) + (1-y)\log(1-p) \big)  =  -\log(1-p)$$

In [None]:
# Plot p -> -log(p), the loss term for a sample whose class is y = 1.
# The grid starts at 0.0001 instead of 0 so that log(0) is never evaluated.
probas = np.arange(.0001, 1, .0001)
loss = -np.log(probas)
plt.figure(figsize=(12, 7))
plt.plot(probas, loss)
plt.xlabel('p=probability')
# Raw strings: '\l' and '\m' are invalid escape sequences in ordinary string
# literals; the text passed to matplotlib is unchanged.
plt.ylabel(r'Loss = $-\log(p)$')
plt.title(r'Plot of $p \mapsto  -\log(p)$');

In [None]:
# Names of the values that model.evaluate returns, in order.
model.metrics_names

In [None]:
# Loss and accuracy of the current model on the training data.
model.evaluate( iris[['petal_width']] , iris['isVirginica'] )

In [None]:
def get_loss(weight,bias):
    """Return the loss of the global ``model`` with the given ``weight`` and ``bias``.

    Side effect: the weights of ``model.layers[0]`` are overwritten with
    ``weight`` and ``bias`` and stay that way after the call.
    """
    kernel = np.array([[weight]])
    offset = np.array([bias])
    model.layers[0].set_weights([kernel, offset])
    # evaluate returns [loss, accuracy] for this model; index 0 is the loss.
    scores = model.evaluate(iris[['petal_width']], iris[['isVirginica']], verbose=0)
    return scores[0]


# Element-wise version, so that whole grids of (weight, bias) pairs can be passed in.
vloss = np.vectorize(get_loss)

In [None]:
# Filling the grid costs one model.evaluate call per grid point, so the grid
# already loaded from ../data/loss_grid is reused when its shape matches.
# NOTE(review): assumes the cached file was produced by this same model and cell - confirm.
if 'loss_grid' not in globals() or np.shape(loss_grid) != x_grid.shape:
    loss_grid = vloss(x_grid, y_grid)

In [None]:
# Heat map of the loss over the (w, b) grid, with 20 contour levels on top.
fig, ax_loss = plt.subplots(figsize=(16, 7))
plt.pcolormesh(x_grid, y_grid, loss_grid, cmap='RdBu_r')
plt.colorbar()
plt.contour(x_grid, y_grid, loss_grid, 20)
plt.title('Loss function')
plt.xlabel('w')
plt.ylabel('b');

# Start training from (w, b) = (-9, -9) and draw one segment of the path
# followed by the parameters after every 40 epochs.
dense_layer = model.layers[0]
dense_layer.set_weights([np.array([[-9]]), np.array([-9])])
for i in range(15):
    kernel, offset = model.get_weights()
    old_weight, old_bias = kernel[0][0], offset[0]
    model.fit(iris[['petal_width']], iris[['isVirginica']], epochs=40, verbose=0)
    kernel, offset = model.get_weights()
    weight, bias = kernel[0][0], offset[0]
    plt.plot([old_weight, weight], [old_bias, bias], 'kX--', markersize=8)

# Gradient descent
* Goal: **minimize** the **loss** of the **model**
* Step by step **change** the parameters in a **direction** to **minimize** the **loss**
* **direction**: calculated with **backpropagation** (computes **gradient**)

# Part 3
# Neural Networks 

# 3.1 Logistic Regression again

# Replace the species virginica by versicolor

In [None]:
# Binary target column for this part: 1 for versicolor samples, 0 otherwise.
is_versicolor = iris['species'].eq('versicolor')
iris['isVersicolor'] = is_versicolor.apply(int)
iris.sample(5)

In [None]:
# Petal-width histogram for each class (0 = not versicolor, 1 = versicolor),
# both drawn on the same axes.
fig, ax = plt.subplots(figsize=(14, 6))
iris.groupby('isVersicolor').hist(column='petal_width', ax=ax, bins=15)
plt.legend(['not versicolor', 'versicolor'])
plt.xlabel('petal width')
plt.ylabel('number of samples')
plt.title('');

## Goal: determine if an iris is a versicolor knowing its petal width

In [None]:
# Plain logistic regression again: one affine unit followed by a sigmoid.
model = Sequential()
model.add(Dense(1, input_dim=1))
model.add(Activation('sigmoid'))

In [None]:
# SGD optimizer, binary cross-entropy loss, accuracy reported as a metric.
model.compile( optimizer='sgd' , loss='binary_crossentropy' , metrics=['accuracy'] )

In [None]:
# Train for 500 epochs on the versicolor / not-versicolor target.
model.fit( iris[['petal_width']] , iris['isVersicolor'] , epochs=500 );

In [None]:
# Normalised petal-width histograms per class, with the probability curve
# predicted by the logistic-regression model drawn on top.
fig, ax = plt.subplots(figsize=(14, 7))
# `density=True` replaces `normed=True`, which newer matplotlib releases no longer accept.
iris.groupby('isVersicolor').hist(column='petal_width', density=True, ax=ax)
plt.legend(['not versicolor', 'versicolor'])
predicted_proba = model.predict(petal_widths.reshape(-1, 1))[:, 0]
plt.plot(petal_widths, predicted_proba, 'r--')
# Dotted horizontal line at probability 0.5, spanning the current x-range.
plt.hlines(0.5, *ax.get_xlim(), linestyles='dotted');

<font size=8><center> We want</center></font>| <font size=8><center>We don't want</center> </font>
--------|-----------
![non monotonic curve](../figures/img/graph_nn.png) | ![monotonic curve](../figures/img/graph_logistic.png) 

<font size=7>**BAD NEWS**: PROBABILITIES PREDICTED BY LOGISTIC REGRESSION CANNOT GO UP AND DOWN</font>

# 3.2 Neural networks with hidden layers

![NN](../figures/img/01-log.png)

![NN](../figures/img/nn_color.png)

![NN](../figures/img/nn_color.png)

In [None]:
# Network with one hidden layer of 3 sigmoid units, followed by a single
# sigmoid output unit.
model = Sequential()
model.add(Dense(3, input_dim=1))
model.add(Activation('sigmoid'))
model.add(Dense(1))
model.add(Activation('sigmoid'))

In [None]:
# SGD with learning rate 0.1, binary cross-entropy loss, accuracy reported as a metric.
model.compile(optimizer=optimizers.SGD(lr=.1), loss='binary_crossentropy', metrics = ['acc'])

In [None]:
# Train in six rounds of 300 epochs; after each round, draw the current
# predicted-probability curve in its own panel of a 2 x 3 grid.
# The axes returned by plt.subplots are used directly, so each curve is drawn
# on the shared-axis panels that call created (no re-selection via plt.subplot).
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(18, 8), sharex=True, sharey=True)
for i, panel in enumerate(axes.flat, start=1):
    model.fit(iris[['petal_width']], iris['isVersicolor'], epochs=300, verbose=0)
    probas = model.predict(petal_widths.reshape(-1, 1))[:, 0]
    panel.plot(petal_widths, probas, 'r--', label='{} epochs'.format(i * 300))
    panel.set_ylim((0, 1))
    panel.legend()

In [None]:
# Loss and accuracy of the trained network on the training data.
model.evaluate( iris[['petal_width']] , iris['isVersicolor'] )

In [None]:
# Normalised petal-width histograms per class, with the probability curve
# predicted by the hidden-layer network drawn on top.
fig, ax = plt.subplots(figsize=(14, 7))
# `density=True` replaces `normed=True`, which newer matplotlib releases no longer accept.
iris.groupby('isVersicolor').hist(column='petal_width', density=True, ax=ax)
plt.legend(['not versicolor', 'versicolor'])
predicted_proba = model.predict(petal_widths.reshape(-1, 1))[:, 0]
plt.plot(petal_widths, predicted_proba, 'r--');

# Conclusion

* Neural Networks are **models** depending on **parameters**

* **Logistic Regression**: the simplest neural network 

* Neural Networks are made of **layers**. More layers = more expressivity

* Under the hood: **fit** the **parameters** with the **data** to **minimize** the **loss**

* **Keras**: easy interface to use Neural Networks

# <center>Thank you for your attention</center>