In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Neural Network



## Neuron
- A neuron has a linear part and an activation part → neuron = linear($wx+b$) + activation($\sigma$)
- A model consists of an architecture and parameters → Model = Architecture + Parameters

- Notation:
    - neuron $a_1^{[1]}$: the superscript $[1]$ is the layer the neuron belongs to, the subscript $1$ is its index within that layer
    - linear part $z_1^{[1]}$: same convention

### How to Train a Neuron
(e. g. To determine what animal is in an image)
1. initialize $w, b$
2. Find the optimal $w, b$ (train)
    
    → $L = -[y\log\hat{y}+(1-y)\log(1-\hat{y}) ]$(Loss function, y=0 or 1)
    
    → $w = w - \alpha\frac{\partial L}{\partial w}$,  $b = b - \alpha\frac{\partial L}{\partial b}$
    
3. use $\hat{y}=\sigma(wx+b)$ to predict (activation part)

(+) $\frac{\partial \mathcal{L}}{\partial \hat{y}}= \frac {\hat{y}-y}{\hat{y}(1-\hat{y})}$, $\frac{\partial \mathcal{L}}{\partial z}=\hat{y}-y$, $\frac{\partial \mathcal{L}}{\partial w}=(\hat{y}-y)x$, $\frac{\partial \mathcal{L}}{\partial b}=\hat{y}-y$

### Characteristics of a Neuron

1. is it cat or not → one neuron
2. is it cat or lion or iguana → three neurons → it can learn images that contain multiple animals
    
    - the single-neuron loss $L$ does not fit three neurons (it only represents one binary classification)
    
    - $L_{3N} = -\sum_{k=1}^3 [y_k\log\hat{y}_k+(1-y_k)\log(1-\hat{y}_k)]$  
    (binary cross-entropy, i.e. the logistic-regression loss, summed over the three neurons)
    
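3. only one animal in each image? →   softmax - $\hat{y}_1 = \frac{\exp(z_1^{[1]})}{\sum_i \exp(z_i^{[1]})}$
    
    - each output depends on the other neurons' outputs, so it is no longer an independent logistic unit
    
    - can't use the logistic (binary cross-entropy) loss above; gradient descent is still used, but with the cross-entropy loss below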
     
    - cross-entropy Loss: $L_{CE} = -\sum_{k=1}^3y_k\log \hat{y}_k$
    
4. getting an age of cat
    
    - change activation function → ReLU
    
    - change Loss function → $|y-\hat{y}|$ or squared


## Notation
- layer: a group of neurons that are not connected to each other

- hidden layer: not directly observable from input, or output

**Why hidden layers**: each neuron may detect a different characteristic of the inputs

- (e. g. house price prediction) \
inputs: bedroom, size, zip code, wealth \
→ second edges(hidden layer): family size, walkable, school quality, \
→ output: price

- Often, network can determine these features better than human
→ blackbox model, end to end learning

## Propagation Equations

(e.g. 3 neurons in the first layer, 2 in the second layer, 1 in the last layer.)

- $z^{[1]} = w^{[1]}x + b^{[1]}$
- $z^{[2]} = w^{[2]}a^{[1]}+b^{[2]}$ …
    
    → $w^{[n]}$ = [# layer [n], # layer n-1 (for n=1, input)],     $b^{[n]}$ = [# layer [n], 1]
    
    → $z^{[n]}$ = [# layer [n], 1],   $x$ = [# input, 1]
    
- $a^{[1]} = \sigma(z^{[1]})$
- $a^{[2]}= \sigma(z^{[2]})$ …
    
    → $a^{[n]}$ = same shape as $z^{[n]}$
    

For a batch of examples → parallelize the computation

- $Z^{[1]} = w^{[1]}x + b^{[1]}$
- $Z^{[2]} = w^{[2]}A^{[1]}+b^{[2]}$…
    
    → $w^{[n]}$ = [# layer [n], # layer n-1 (for n=1, input)],     $b^{[n]}$(broadcasting) = [# layer [n], m]
    
    → $Z^{[n]}$ = [# layer [n], # examples],   $x$ = [# input, # examples]
    
- $A^{[1]} = \sigma(Z^{[1]})$
- $A^{[2]}= \sigma(Z^{[2]})$ …
    
    → $A^{[n]}$ = same shape as $Z^{[n]}$
    

## Backward Propagation

Define the loss/cost function (the loss is for 1 example, the cost averages the loss over all $m$ examples)

$$
J(\hat{y}, y) = \frac1m\sum_{i=1}^m L^{(i)}, \quad L^{(i)} = -[y^{(i)}\log\hat{y}^{(i)}+(1-y^{(i)})\log(1-\hat{y}^{(i)}) ]
$$

$$
\forall l=1\dots3: \begin{cases}
w^{[l]} = w^{[l]}-\alpha\frac{\partial J}{\partial w^{[l]}} \\
b^{[l]} = b^{[l]}-\alpha\frac{\partial J}{\partial b^{[l]}}
\end{cases}
$$

We should start from the end (closest to y hat)

### Derivative of Loss function

$$
\hat{y}^{(i)} =a^{[3]}  \\
\frac{\partial L}{\partial w^{[3]}}=-[y^{(i)}\frac1{a^{[3]}}\cdot a^{[3]}(1-a^{[3]})a^{[2]T} + \\ (1-y^{(i)})\frac1{1-a^{[3]}}\cdot (-1) a^{[3]}(1-a^{[3]})a^{[2]T}] \\
= -(y^{(i)}-a^{[3]})a^{[2]T} =
\frac{\partial L}{\partial z^{[3]}} \cdot
\frac{\partial z^{[3]}}{\partial w^{[3]}}
$$

(Why $a^{[2]T}$? The derivative with respect to $w^{[3]}$ must have the shape of $w^{[3]}$, which is [1, 2], while $a^{[2]}$ is [2, 1].)

$$
\frac {\partial J}{\partial w^{[3]} } = \frac{-1}m\sum_{i=1}^m (y^{(i)}-a^{[3]})a^{[2]T}
$$

→ Backward Propagation

$$
\frac{\partial L}{\partial w^{[2]}}= \frac{\partial L}{\partial a^{[3]}} \cdot \frac{\partial a^{[3]}}{\partial z^{[3]}} \cdot
\frac{\partial z^{[3]}}{\partial a^{[2]}} \cdot
\frac{\partial a^{[2]}}{\partial z^{[2]}} \cdot
\frac{\partial z^{[2]}}{\partial w^{[2]}}
$$

$$
= (a^{[3]}-y)\cdot w^{[3]T}\cdot a^{[2]}(1-a^{[2]})\cdot a^{[1]T}
$$

→ Check it

- Result: [2, 3]
- First term: [1, 1]
- Second term: [2, 1]
- Third term: [2, 1]
- Fourth term: [1, 3]   → How to compute?
- Derivative of sigmoid(Third term) is element wise product…

→ ( Second * Third [element wise] )( First )( Fourth ) → [2, 3]

Save all values (a1 or a2 …) to avoid recomputing terms.

In [None]:
from sklearn.datasets import make_blobs

# Seed NumPy's global RNG. make_blobs below uses its own random_state, but the
# Dense layers defined later in this notebook draw their initial weights with
# np.random.randn, which this seed makes repeatable.
np.random.seed(0)
N_samples = 300
n_features = 2
n_classes = 3
X, Y = make_blobs(n_samples=N_samples, n_features=n_features, centers=n_classes,
                  cluster_std=1.5, random_state=42)

# Keep the integer labels as a column vector and build the one-hot targets:
# row i of NewY has a single 1 in column Y[i].
Y = Y.reshape(-1, 1)
NewY = np.zeros((Y.shape[0], n_classes))
NewY[np.arange(Y.shape[0]), Y.ravel()] = 1


plt.figure(figsize=(6, 5))
# Colour by the integer class label. Passing the one-hot matrix here would be
# read by matplotlib as one RGB colour per row, which leaves the colorbar
# below unrelated to the plotted points.
plt.scatter(X[:, 0], X[:, 1], c=Y.ravel(), label='Data', alpha=0.8)
plt.title('Synthetic Data for Softmax Regression (3 Classes)')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.colorbar(label='Class Label')
plt.legend()
plt.grid(True)
plt.show()

# Split data into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X, NewY, test_size=0.2, random_state=42)

In [None]:
from sklearn.datasets import load_iris  # loader for scikit-learn's bundled sample data
from sklearn.preprocessing import MinMaxScaler

iris = load_iris()
X = iris.data
Y = iris.target
Y = Y.reshape(-1, 1)  # integer class labels as a column vector

# Take the class count from the data set itself so this cell does not depend
# on a value of `n_classes` left behind by an earlier cell.
n_classes = len(iris.target_names)

# One-hot encode the labels: row i of NewY has a single 1 in column Y[i].
NewY = np.zeros((Y.shape[0], n_classes))
NewY[np.arange(Y.shape[0]), Y.ravel()] = 1

# Min-max scale every feature column.
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

plt.figure(figsize=(6, 5))
plt.scatter(X[:, 0], X[:, 1], c=Y.ravel(), label='Data', alpha=0.8)
plt.title('Iris Data (first two features, min-max scaled)')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.colorbar(label='Class Label')
plt.legend()
plt.grid(True)
plt.show()

# NOTE: no hold-out split is made here. The same samples are used for training
# and for evaluation, so downstream results measure training fit rather than
# generalisation. The disabled split is kept for reference; re-enabling it
# would make Y_test one-hot instead of the integer labels assigned below.
# X_train, X_test, Y_train, Y_test = train_test_split(X, NewY, test_size=0.2, random_state=42)

X_train = X
Y_train = NewY   # one-hot targets used for training
X_test = X
Y_test = Y       # integer labels, shape (N, 1)

In [None]:
def sigmoid(x):
    """Logistic function 1 / (1 + exp(-x)), applied element-wise."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator


def linear(x):
    """Identity activation: hand the input back unchanged."""
    return x


def ReLU(x):
    """Rectified linear unit: element-wise maximum of 0 and x."""
    return np.maximum(0, x)

def one_hot_encode(Y, num_classes):
    """Turn integer class labels into a one-hot matrix.

    Args:
        Y: array of integer labels; its first dimension is the sample count
            and it is flattened before being used as column indices.
        num_classes: number of columns in the result.

    Returns:
        Float array of shape (Y.shape[0], num_classes) with a 1 in column
        Y[i] of row i and 0 everywhere else.
    """
    sample_count = Y.shape[0]
    row_idx = np.arange(sample_count)
    col_idx = Y.flatten()  # 1-D labels are required for paired (row, col) indexing

    encoded = np.zeros((sample_count, num_classes))
    encoded[row_idx, col_idx] = 1
    return encoded

class Network:
    """Feed-forward classifier built from Dense layers and a softmax output.

    ``architecture`` lists the layer widths, input first. Every transition
    except the last becomes a ReLU ``Dense`` layer; the last transition is a
    linear ``Dense`` layer followed by ``OutSoftmax``.

    Attributes:
        architecture: the list of layer widths passed to the constructor.
        layers: the layer objects, in forward order.
        alpha: learning rate handed to every layer's ``backward``.
        loss_history: mean cross-entropy per epoch of the last
            ``learn_network`` call.
    """

    # Default learning rate, so back_propagation is usable even before
    # learn_network has set one.
    alpha = 0.01

    def __init__(self, architecture: list):
        if len(architecture) < 2:
            raise ValueError(
                "architecture must contain at least an input size and an output size"
            )
        self.architecture = architecture
        self.layers = []
        self.activations = []
        self.loss_history = []
        length = len(architecture)
        for i in range(length - 2):
            self.layers.append(Dense(architecture[i], architecture[i + 1], ReLU, i))

        # The output Dense layer continues the 0-based numbering of the hidden ones.
        self.layers.append(Dense(architecture[-2], architecture[-1], linear, length - 2))
        self.layers.append(OutSoftmax())

    def propagation(self, x):
        """Run a forward pass: feed ``x`` through every layer and return the result."""
        prev = x
        for layer in self.layers:
            prev = layer.forward(prev)
        return prev

    def back_propagation(self, x, y_hat, y_res):
        """Run one backward pass, letting each layer update its own parameters.

        Args:
            x: unused; kept for interface compatibility.
            y_hat: the TARGET values (despite the name). They are handed to the
                last layer's ``backward``, and each returned gradient is then
                passed to the layer before it.
            y_res: unused; kept for interface compatibility.
        """
        dloss = y_hat
        for layer in reversed(self.layers):
            dloss = layer.backward(dloss, self.alpha)

    def learn_network(self, x, y_true, epoch = 100, alpha=0.01):
        """Train with full-batch gradient descent for ``epoch`` iterations.

        Each iteration runs a forward pass on all of ``x``, records the mean
        cross-entropy against ``y_true`` in ``self.loss_history``, and then
        runs a backward pass with learning rate ``alpha``.
        """
        self.alpha = alpha
        self.loss_history = []
        targets = np.asarray(y_true)

        for _ in range(epoch):
            y_res = self.propagation(x)
            # Clip before the log so an exact 0 probability cannot produce -inf.
            log_probs = np.log(np.clip(y_res, 1e-12, 1.0))
            self.loss_history.append(float(-np.mean(np.sum(targets * log_probs, axis=-1))))
            self.back_propagation(x, y_true, y_res)

class Layer:
    """Abstract base class for network layers.

    Subclasses implement ``forward`` and ``backward``. ``params`` and
    ``grads`` are plain dicts that subclasses may fill with their
    parameters and gradients.
    """

    def __init__(self):
        self.params = {}
        self.grads = {}

    def forward(self, x):
        """Compute the layer output for input ``x``. Must be overridden."""
        raise NotImplementedError

    def backward(self, dout, alpha=None):
        """Propagate the gradient ``dout`` backwards. Must be overridden.

        ``alpha`` is the learning rate. It is part of the base signature
        because every concrete layer in this file is called as
        ``backward(dout, alpha)``; it defaults to ``None`` so existing
        one-argument calls keep working.
        """
        raise NotImplementedError

class Dense(Layer):
    """Fully connected layer computing ``activation(x @ w + b)``.

    ``forward`` caches its input and output; ``backward`` uses them to compute
    the gradients, stores those in ``self.grads``, and immediately applies a
    gradient-descent step to ``w`` and ``b``.

    Args:
        input_size: number of input features (rows of ``w``).
        output_size: number of units (columns of ``w``).
        activation: one of the module-level ``sigmoid``, ``ReLU`` or
            ``linear`` callables; any other callable is differentiated as if
            it were linear.
        layerindex: position of the layer in the network (stored only).
    """

    def __init__(self, input_size, output_size, activation, layerindex):
        super().__init__()
        # He initialisation: normal weights with std = sqrt(2 / fan_in).
        std_dev = np.sqrt(2 / input_size)
        self.params['w'] = np.random.randn(input_size, output_size) * std_dev
        self.params['b'] = np.zeros((1, output_size))
        self.activation = activation
        self.layerindex = layerindex

    def forward(self, x):
        """Return ``activation(x @ w + b)`` for a batch ``x`` of shape (N, input_size)."""
        self.x = x
        # b has shape (1, output_size) and broadcasts over the N rows, so no
        # explicit tiling is needed.
        out = self.activation(x @ self.params['w'] + self.params['b'])
        self.out = out
        return out

    def backward(self, dout, alpha):
        """Back-propagate ``dout`` and take one gradient-descent step.

        Args:
            dout: gradient of the loss with respect to this layer's output.
            alpha: learning rate for the parameter update.

        Returns:
            Gradient of the loss with respect to this layer's input.
        """
        # The activation derivative is written in terms of the cached output.
        if self.activation is sigmoid:
            dZ = dout * (self.out * (1 - self.out))
        elif self.activation is ReLU:
            dZ = dout * (self.out > 0)
        else:  # linear (also the fallback for any unrecognised activation)
            dZ = dout

        dw = self.x.T @ dZ
        db = np.sum(dZ, axis=0, keepdims=True)  # keeps b's (1, output_size) shape
        # dx must use the weights from the forward pass, so compute it before
        # the in-place update below.
        dx = dZ @ self.params['w'].T

        self.grads['w'] = dw
        self.grads['b'] = db
        self.params['w'] -= alpha * dw
        self.params['b'] -= alpha * db
        return dx

class OutSoftmax(Layer):
    """Softmax output layer, differentiated together with the cross-entropy loss."""

    def __init__(self):
        super().__init__()

    def forward(self, x):
        """Return the row-wise softmax of the logits ``x`` (shape (N, classes))."""
        # Softmax is unchanged by subtracting a per-row constant. Removing the
        # row maximum keeps every exponent <= 0, so np.exp cannot overflow to
        # inf and produce inf/inf = NaN for large logits.
        shifted = x - np.max(x, axis=1, keepdims=True)
        exps = np.exp(shifted)
        out = exps / np.sum(exps, axis=1, keepdims=True)
        self.out = out
        return out

    def backward(self, y_hat, alpha):
        """Return the batch-averaged gradient of cross-entropy w.r.t. the logits.

        Args:
            y_hat: the TARGET values (despite the name), same shape as the
                output cached by ``forward``.
            alpha: unused; present so all layers share one ``backward`` signature.
        """
        return (self.out - y_hat) / y_hat.shape[0]



# Size the input and output layers from the training data instead of
# hard-coding them, so the network always matches whichever data set is loaded.
# The two hidden layers keep 10 units each.
net = Network([X_train.shape[1], 10, 10, Y_train.shape[1]])
net.learn_network(X_train, Y_train, epoch=100, alpha=0.1)

In [None]:
# Forward pass of the trained network over the evaluation inputs.
Y_result = net.propagation(x=X_test)

def get_predicted_classes(Y_probs):
    """Return, for every row of ``Y_probs``, the column index of its largest value."""
    scores = np.asarray(Y_probs)
    return scores.argmax(axis=1)

# Show only the first few rows; printing every row floods the cell output.
print("Raw probabilities from network (first 5 rows):\n", Y_result[:5])

predicted_classes = get_predicted_classes(Y_result)

print("Predicted class labels:\n", predicted_classes)

# Compare with the reference labels. Y_test may hold integer labels (as a
# column vector) or one-hot rows, depending on which data cell was run.
true_classes = np.asarray(Y_test)
if true_classes.ndim == 2 and true_classes.shape[1] > 1:
    true_classes = true_classes.argmax(axis=1)
else:
    true_classes = true_classes.ravel()
accuracy = np.mean(predicted_classes == true_classes)
print(f"Accuracy on X_test: {accuracy:.3f}")

plt.figure(figsize=(6, 5))
# Use the predicted_classes (integer labels) for coloring
plt.scatter(X_test[:, 0], X_test[:, 1], c=predicted_classes, label='Data', alpha=0.8)
plt.title('Network Predictions on X_test (first two features)')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.colorbar(label='Class Label')
plt.legend()
plt.grid(True)
plt.show()
