# Extreme Learning Machine

* Huang et al, 2004
* Single Hidden Layer Feedforward Neural Networks

Let the **input data** be $X = [x_1, x_2, \cdots, x_N]^T$ with $x_i \in \mathbb{R}^M$,

where $N$ is the number of input samples and $M$ is the number of features.

Let $L$ be the number of **hidden nodes**,

$D$ be the number of **outputs**,

and $\beta \in \mathbb{R}^{L\times D}, \beta = [\beta_1, \beta_2, \cdots, \beta_L]^T$ be the **weights** between the hidden nodes and the outputs.

In [1]:
import numpy as np
import pandas as pd

Activation function
$$h(x) = g(x, w, c)$$

* Sigmoid Function
$$g(x, w, c) = \frac{1}{1 + e^{-(wx + c)}}$$

* Gaussian Function 
$$g(x, w, c) = e^{-c\|x-w\|}$$

* Hyperbolic Tangent Function
$$g(x, w, c) = \frac{1 - e^{-(wx+c)}}{1 + e^{-(wx+c)}}$$

In [2]:
def sigmoid(x, w, c):
    """Logistic activation ``1 / (1 + exp(-(x.w + c)))``, applied element-wise.

    ``x`` is multiplied by ``w`` with ``np.dot`` and ``c`` is added by
    broadcasting before the logistic function is evaluated.
    """
    pre_activation = np.dot(x, w) + c
    return 1 / (1 + np.exp(-pre_activation))

def gaussian(x, w, c):
    """Gaussian (RBF) activation ``exp(-c * ||x_n - w_l||)`` for every sample/node pair.

    Uses the same layout as the other activations in this file: rows of
    ``x`` are samples and columns of ``w`` are hidden nodes, so the result
    has one row per sample and one column per hidden node.

    The previous implementation subtracted ``w`` from ``x`` directly and took
    a single Frobenius norm, which fails to broadcast for an (N, M) ``x``
    against an (M, L) ``w`` and could never yield an (N, L) matrix.

    Parameters
    ----------
    x : array-like, shape (N, M)
        Input samples.
    w : array-like, shape (M, L)
        Centre of each hidden node, stored column-wise.
    c : scalar or array-like, shape (L,)
        Width factor of each hidden node; broadcast across the rows.

    Returns
    -------
    numpy.ndarray, shape (N, L)
    """
    x = np.asarray(x, dtype=float)
    w = np.asarray(w, dtype=float)
    # diff[n, m, l] = x[n, m] - w[m, l]; this intermediate is N * M * L floats.
    diff = x[:, :, np.newaxis] - w[np.newaxis, :, :]
    # Euclidean distance between sample n and centre l (reduce over features).
    dist = np.linalg.norm(diff, axis=1)
    return np.exp(-c * dist)

def hyperbolic_tangent(x, w, c):
    """Activation ``(1 - exp(-z)) / (1 + exp(-z))`` with ``z = x.w + c``.

    The previous version computed the numerator from ``np.dot(w, x)`` and the
    denominator from ``np.dot(x, w)``; for an (N, M) ``x`` and an (M, L) ``w``
    the first product is a shape error. Both now use ``np.dot(x, w)``,
    matching ``sigmoid``.

    Parameters
    ----------
    x : array-like, shape (N, M)
        Input samples.
    w : array-like, shape (M, L)
        Input-to-hidden weights.
    c : scalar or array-like, shape (L,)
        Hidden-node biases, broadcast across the rows.

    Returns
    -------
    numpy.ndarray, shape (N, L)
    """
    z = np.dot(x, w) + c
    # (1 - e^-z) / (1 + e^-z) is algebraically tanh(z / 2). The tanh form
    # avoids the inf / inf = nan the explicit ratio gives for very negative z.
    return np.tanh(0.5 * z)

In [3]:
#Get function
def getActivation(name):
    """Return the activation function registered under ``name``.

    Recognised names are ``'sigmoid'``, ``'gaussian'`` and
    ``'hyperbolic_tangent'``; any other key raises ``KeyError``.
    """
    registry = dict(
        sigmoid=sigmoid,
        gaussian=gaussian,
        hyperbolic_tangent=hyperbolic_tangent,
    )
    return registry[name]

### Model
\begin{align*} 
f(x) &=  \sum_{i=1}^L \beta_i h_i(x) \\ 
 &=  h(x)\beta, \qquad h(x) = [h_1(x), \cdots, h_L(x)]
\end{align*}

Let $H$ be the hidden-layer output matrix, i.e. the activation function evaluated for every input and every hidden node.

$$H = \begin{bmatrix}
h(x_1) & \\
\vdots & \\
h(x_N) & 
\end{bmatrix} = 
\begin{bmatrix}
h_1(x_1) & \cdots & h_L(x_1) \\
\vdots & \vdots & \vdots \\
h_1(x_N) & \cdots & h_L(x_N)
\end{bmatrix}$$

$$H =  
\begin{bmatrix}
g(w_1\cdot x_1 + c_1) & \cdots & g(w_L\cdot x_1 + c_L) \\
\vdots & \vdots & \vdots \\
g(w_1\cdot x_N + c_1) & \cdots & g(w_L\cdot x_N + c_L)
\end{bmatrix}_{N\times L}$$

and let $Y \in \mathbb{R}^{N\times D}$  be an output matrix.

$$Y = \begin{bmatrix}
y_1 \\
\vdots \\
y_N
\end{bmatrix}
= 
\begin{bmatrix}
y_{11} & \cdots & y_{1D} \\
\vdots & \vdots & \vdots \\
y_{N1} & \cdots & y_{ND} 
\end{bmatrix}$$

In [4]:
x = np.random.rand(10, 5)

In [5]:
L = 2  # number of hidden nodes
M = x.shape[1]  # number of input features (columns of x)
w = np.random.normal(size=(M, L))  # input-to-hidden weights, one column per hidden node
c = np.random.normal(size=(L))  # one bias per hidden node

In [6]:
x

array([[0.29937212, 0.34926431, 0.72280354, 0.10369867, 0.03293492],
       [0.32241137, 0.49245794, 0.20098303, 0.85968763, 0.66317051],
       [0.30045341, 0.38701065, 0.77354309, 0.19583069, 0.01478452],
       [0.5886273 , 0.34771763, 0.88466116, 0.58820682, 0.24402686],
       [0.10739046, 0.19089319, 0.92735433, 0.75041296, 0.21279248],
       [0.64824813, 0.38467016, 0.57781509, 0.80134625, 0.48959671],
       [0.63418568, 0.97705802, 0.00592538, 0.15391633, 0.98192962],
       [0.02554223, 0.93609574, 0.48551297, 0.21496528, 0.02208028],
       [0.88322273, 0.44525438, 0.97835224, 0.40757616, 0.6283355 ],
       [0.43743962, 0.58341947, 0.81397852, 0.27301867, 0.45653337]])

In [7]:
w

array([[0.89065459, 0.73220386],
       [0.40063727, 0.44344866],
       [0.09428806, 0.51616951],
       [0.27317769, 0.05220332],
       [0.06365799, 0.38798314]])

In [8]:
c

array([0.85275404, 0.33936095])

In [9]:
np.dot(x, w) + c

array([[1.35789597, 1.10472392],
       [1.63322209, 1.19973045],
       [1.40277874, 1.14621197],
       [1.77595851, 1.50657082],
       [1.33086123, 1.10304997],
       [1.88879013, 1.51463093],
       [1.91415314, 1.6290548 ],
       [1.35644556, 1.04356914],
       [2.06137214, 1.95356351],
       [1.65649428, 1.52990313]])

In [10]:
sigmoid(x, w, c)

array([[0.79541752, 0.75114418],
       [0.83661055, 0.76847683],
       [0.80262446, 0.75881834],
       [0.85519711, 0.81855245],
       [0.79098306, 0.75083114],
       [0.86861752, 0.8197465 ],
       [0.87148501, 0.83604011],
       [0.7951814 , 0.73953809],
       [0.88709168, 0.87583469],
       [0.83976684, 0.82199214]])

In [11]:
def H(x, activate, L):
    """Evaluate a freshly randomised hidden layer of ``L`` nodes on ``x``.

    Weights are drawn from a standard normal with shape
    ``(x.shape[1], L)`` and biases uniformly from ``[0, 1)`` with shape
    ``(L,)``; ``activate`` is the name passed to ``getActivation``.
    """
    n_features = x.shape[1]
    weights = np.random.normal(size=(n_features, L))
    biases = np.random.rand(L)
    # The lookup is done after the random draws, in the same order as before.
    activation_fn = getActivation(activate)
    return activation_fn(x, weights, biases)

### Objective
$$\underset{\beta}{\mathrm{min}} \|H\beta - Y\|^2$$

So,

$$\beta = H^\dagger Y$$

where $H^\dagger$ is the Moore–Penrose pseudoinverse of $H$. When $H$ has full column rank,
$$H^\dagger = (H^TH)^{-1}H^T$$

#### Regularized Model

$$\underset{\beta}{\mathrm{min}} \frac{C}{2}\|H\beta - Y\|^2 + \frac{1}{2}\|\beta\|^2$$

where $C$ is a hyperparameter.

\begin{align*} 
\nabla_{\beta}\big(\frac{C}{2}\|H\beta - Y\|^2 + \frac{1}{2}\|\beta\|^2\big) & = 0\\\\
CH^T(H\beta - Y) + \beta & = 0\\\\
CH^TH\beta - CH^TY + \beta & = 0\\\\
(CH^TH + I)\beta & = CH^TY \\\\
\beta & = (H^TH + \frac{I}{C})^{-1}H^TY
\end{align*} 

In [12]:
# Toy set-up for the regularised solution, reusing x, w, c and L from earlier cells.
C = 1  # regularisation hyperparameter C of the objective
I = np.eye(L, L)  # L x L identity matrix
# NOTE: this rebinds the name H, which an earlier cell defined as the
# helper function H(x, activate, L); that function is no longer reachable
# under this name afterwards.
H = sigmoid(x, w, c)
Y = np.random.rand(10, 1)  # random targets: 10 rows, 1 output column

In [13]:
I

array([[1., 0.],
       [0., 1.]])

In [14]:
Y

array([[0.1677238 ],
       [0.01480375],
       [0.16060866],
       [0.61763789],
       [0.31681444],
       [0.96845021],
       [0.14902667],
       [0.83422112],
       [0.3922468 ],
       [0.77740615]])

In [15]:
# Solve (H^T H + I/C) Beta = H^T Y as a linear system rather than forming
# the explicit inverse; same solution, better numerical behaviour.
Beta = np.linalg.solve(H.T @ H + I / C, H.T @ Y)

In [16]:
Beta

array([[0.25462301],
       [0.2498493 ]])

In [17]:
H @ Beta

array([[0.39020445],
       [0.4050237 ],
       [0.39395689],
       [0.42226762],
       [0.38899712],
       [0.4259831 ],
       [0.43078417],
       [0.38724456],
       [0.44470064],
       [0.41919812]])

# Test on Real Data

In [18]:
import seaborn as sns
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [19]:
# Loading Iris Datasets:
iris = sns.load_dataset("iris")

In [20]:
iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [21]:
X = iris.iloc[:, :4]  # first four columns are the features
y = iris.iloc[:, -1]  # last column is the label
y = pd.get_dummies(y)  # one-hot encode the label: one indicator column per distinct value

In [23]:
X

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


In [22]:
y

Unnamed: 0,setosa,versicolor,virginica
0,1,0,0
1,1,0,0
2,1,0,0
3,1,0,0
4,1,0,0
...,...,...,...
145,0,0,1
146,0,0,1
147,0,0,1
148,0,0,1


In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)

In [25]:
class ELM:
    """Extreme Learning Machine: one random hidden layer, closed-form output weights.

    The hidden-layer weights and biases are drawn at random in ``fit`` and
    never trained; only the output weights ``Beta`` are computed, as the
    regularised least-squares solution
    ``Beta = (H^T H + I / C)^{-1} H^T Y``.

    Parameters
    ----------
    num_hidden : int
        Number of hidden nodes ``L``.
    activation : str
        Name of the activation, resolved through ``getActivation``.
    random_state : int or None
        Seed for the hidden-layer draw. ``None`` (the default) keeps the
        previous behaviour of drawing from numpy's global random state.
    """

    def __init__(self, num_hidden, activation='sigmoid', random_state=None):
        self.activation = getActivation(activation)
        self.L = num_hidden
        self.random_state = random_state

    def fit(self, X, y, C=1):
        """Draw the hidden layer and solve for the output weights.

        Parameters
        ----------
        X : array-like with a ``shape`` attribute, (N, M)
            Training inputs.
        y : array-like, (N, D)
            Training targets.
        C : positive number
            Regularisation hyperparameter; larger values regularise less.

        Raises
        ------
        ValueError
            If ``C`` is not strictly positive (``I / C`` would be undefined
            or flip the sign of the penalty).
        """
        if C <= 0:
            raise ValueError(f"C must be strictly positive, got {C!r}")

        # With no seed, draw from the global numpy state exactly as before;
        # otherwise use a private, reproducible generator.
        if self.random_state is None:
            rng = np.random
        else:
            rng = np.random.RandomState(self.random_state)

        self.X = X
        self.Y = y
        self.I = np.eye(self.L, self.L)
        self.M = X.shape[1]
        self.w = rng.normal(size=(self.M, self.L))
        self.c = rng.normal(size=(self.L))
        self.C = C

        self.H = self.activation(self.X, self.w, self.c)
        # Solve (H^T H + I/C) P = H^T instead of forming the explicit inverse.
        # The final "@ self.Y" is kept as a plain matmul against the caller's
        # target object, so the type of Beta follows y as it did before.
        regularised_gram = self.H.T @ self.H + self.I / self.C
        self.Beta = np.linalg.solve(regularised_gram, self.H.T) @ self.Y

    def predict(self, X):
        """Return ``activation(X, w, c) @ Beta`` for the fitted model.

        Raises
        ------
        AttributeError
            If called before ``fit``.
        """
        if not hasattr(self, "Beta"):
            raise AttributeError(
                "ELM instance is not fitted yet; call fit() before predict()."
            )
        H_pre = self.activation(X, self.w, self.c)
        return H_pre @ self.Beta

In [26]:
model = ELM(15)

In [27]:
model.fit(X_train, y_train, C=1.2)

In [28]:
y_pred = model.predict(X_test)

In [29]:
# Accuracy: fraction of test rows whose arg-max predicted column equals the arg-max true column.
np.mean(np.argmax(y_pred.values, axis=1) == np.argmax(y_test.values, axis=1))

0.8888888888888888

In [30]:
from sklearn.metrics import confusion_matrix

In [31]:
confusion_matrix(np.argmax(y_test.values, axis=1), np.argmax(y_pred.values, axis=1), labels=[0, 1, 2])

array([[16,  0,  0],
       [ 0, 13,  5],
       [ 0,  0, 11]], dtype=int64)