## HW3 - Neural Net using TensorFlow
Реализовать двухслойную полносвязную нейросетку на чистом TF. Как функцию активации можете использовать что хотите. Размер тоже на ваше усмотрение. Предлагаю сделать по образу и подобию тетрадки с пары.

Для fashion mnist:

### Data Loading and preprocessing

In [1]:
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.utils import to_categorical

In [2]:
# Single seed value; it is also passed to train_test_split and to the
# tf.random.normal initialisers further down in this notebook.
SEED = 1998
tf.random.set_seed(SEED)  # fix TensorFlow's global random seed

In [3]:
# Fashion-MNIST as bundled with Keras. load_data() is unpacked here into a
# (images, labels) pair for training and another one for testing.
fashion_mnist = tf.keras.datasets.fashion_mnist
(X_train, y_train), (X_test, y_test) = fashion_mnist.load_data()

In [4]:
# Hold out 10,000 training samples as a validation set; the split is made
# reproducible through SEED.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=10_000,
    random_state=SEED,
)

# Divide by 255 (presumably mapping 8-bit pixel intensities into [0, 1]) and
# flatten every sample into a vector of 28 * 28 = 784 features.
X_train = (X_train / 255.).reshape(len(X_train), 28 * 28)
X_val = (X_val / 255.).reshape(len(X_val), 28 * 28)
X_test = (X_test / 255.).reshape(len(X_test), 28 * 28)

# Human-readable label names; position i presumably corresponds to class id i.
class_names = [
    'T-shirt/top',
    'Trouser',
    'Pullover',
    'Dress',
    'Coat',
    'Sandal',
    'Shirt',
    'Sneaker',
    'Bag',
    'Ankle boot',
]

In [5]:
from tensorflow.keras.utils import to_categorical

# One-hot encode the integer labels of every split.
# num_classes is passed explicitly: without it to_categorical infers the
# number of columns from max(y) + 1 of each array separately, so a split that
# happened to lack the highest class id would get a narrower matrix than the
# others and no longer match the model's output width.
y_train_ohe = to_categorical(y_train, num_classes=len(class_names))
y_test_ohe = to_categorical(y_test, num_classes=len(class_names))
y_val_ohe = to_categorical(y_val, num_classes=len(class_names))

### Auxiliary functions

In [6]:
def logloss(p_pred, y_true):
    """Return the mean cross-entropy between predictions and targets.

    Args:
        p_pred: Predicted probabilities. They are reduced over axis 1, so a
            2-D (samples, classes) layout is expected.
        y_true: Target weights with the same layout as ``p_pred``
            (presumably one-hot encoded labels).

    Returns:
        A scalar tensor: the per-sample sums of ``-y_true * log(p_pred)``
        averaged over all samples.
    """
    # Keep probabilities inside [1e-9, 1] so that log() never receives 0.
    safe_probs = tf.clip_by_value(p_pred, 1e-9, 1.)
    log_likelihood = tf.reduce_sum(y_true * tf.math.log(safe_probs), axis=1)
    return -tf.reduce_mean(log_likelihood)

In [7]:
def accuracy(predict_proba, target):
    """Return the fraction of samples whose predicted class matches the target.

    Both arguments are reduced with argmax over their last axis, so they are
    expected to hold one score per class for every sample.

    Args:
        predict_proba: Per-class scores produced by the model.
        target: Per-class target scores (presumably one-hot labels).

    Returns:
        Number of matching class indices divided by the number of samples.
    """
    true_labels = tf.math.argmax(target, axis=-1, output_type=tf.dtypes.int64)
    predicted_labels = tf.math.argmax(
        predict_proba, axis=-1, output_type=tf.dtypes.int64
    )
    # Count matches on the NumPy side and normalise by the sample count.
    hits = (predicted_labels == true_labels).numpy()
    return hits.sum() / len(true_labels)

### Constructing a model

In [8]:
def _small_normal(shape):
    """Draw a tensor of ``shape`` from a normal with mean 0 and stddev 0.01."""
    return tf.random.normal(shape, mean=0, stddev=0.01, seed=SEED)


# NOTE: the four draws below share one op-level seed, so their order is kept
# exactly as W1, b1, W2, b2 to reproduce the same initial values.

# First layer: 784 input features -> 256 hidden units.
W1 = tf.Variable(_small_normal([784, 256]), name='weight_1')
b1 = tf.Variable(_small_normal([1, 256]), name='bias_1')

# Second layer: 256 hidden units -> 10 outputs.
W2 = tf.Variable(_small_normal([256, 10]), name='weight_2')
b2 = tf.Variable(_small_normal([1, 10]), name='bias_2')

In [9]:
def softmax(x):
    """Return the softmax of ``x`` taken along its last axis.

    Compared with a naive ``exp(x) / sum(exp(x))`` this version subtracts the
    per-row maximum before exponentiating, so large finite logits no longer
    overflow to ``inf`` (which would yield ``nan`` after the division). The
    reductions keep their axis, which makes the function valid for inputs of
    any rank instead of relying on transposes that only work for rank <= 2.

    Args:
        x: Tensor (or array-like) of logits; classes lie on the last axis.

    Returns:
        A tensor with the shape of ``x`` whose entries along the last axis are
        non-negative and sum to 1. A row whose exponentials sum to 0 is
        returned as all zeros.
    """
    row_max = tf.reduce_max(x, axis=-1, keepdims=True)
    # Do not shift by a non-finite maximum: (-inf) - (-inf) would create NaNs
    # where the unshifted computation yields a well-defined zero row.
    row_max = tf.where(
        tf.math.is_finite(row_max), row_max, tf.zeros_like(row_max)
    )
    # Softmax is invariant to a constant shift per row, so the shift is
    # treated as a constant for differentiation.
    shifted = x - tf.stop_gradient(row_max)

    exp = tf.math.exp(shifted)
    exp_sum = tf.reduce_sum(exp, axis=-1, keepdims=True)
    # divide_no_nan maps x / 0 to 0 instead of NaN.
    return tf.math.divide_no_nan(exp, exp_sum)

In [10]:
def model(X):
    """Forward pass of the two-layer fully connected network.

    Args:
        X: Batch of flattened inputs; its last dimension must match the
            first dimension of ``W1``.

    Returns:
        The softmax of the second layer's output for every row of ``X``.
    """
    # First affine layer followed by clipping to [0, float32 max], which
    # zeroes out negative pre-activations.
    hidden = X @ W1 + b1
    hidden = tf.clip_by_value(hidden, 0., tf.float32.max)

    # Second affine layer produces one logit per class.
    logits = hidden @ W2 + b2
    return softmax(logits)

In [11]:
# X = X_test[:5]
# Y = y_test_ohe[:5]
# predict_proba = model(X)
# logloss(predict_proba, Y)
# accuracy(predict_proba, Y)

### Fitting the model on our dataset

In [12]:
# Adam optimizer with learning rate 0.01; model_train() uses this module-level
# instance to apply the gradients.
optimizer = tf.optimizers.Adam(learning_rate=0.01)

In [13]:
def model_train(X, Y):
    """Perform one optimisation step on (X, Y) and return the loss.

    The loss is evaluated inside the gradient tape, i.e. before the weights
    are updated, so the returned value describes the pre-update parameters.

    Args:
        X: Batch of inputs passed to ``model``.
        Y: Targets passed to ``logloss`` together with the predictions.

    Returns:
        The loss as a NumPy value.
    """
    params = [W1, b1, W2, b2]

    # Record the forward pass so the tape can differentiate the loss.
    with tf.GradientTape() as tape:
        loss = logloss(model(X), Y)

    grads = tape.gradient(loss, params)
    optimizer.apply_gradients(zip(grads, params))

    return loss.numpy()

In [14]:
epochs = 201

for i in range(epochs):

    # Take one gradient-descent step on the whole training set.
    loss = model_train(X_train, y_train_ohe)

    # Metrics are reported only on every 100th step.
    if i % 100:
        continue

    # Training accuracy with the freshly updated weights. Note that `loss`
    # above was computed by model_train before that update.
    predict_proba = model(X_train)
    metric_train = accuracy(predict_proba, y_train_ohe)

    # Validation accuracy and loss.
    predict_proba = model(X_val)
    metric_val = accuracy(predict_proba, y_val_ohe)
    loss_val = logloss(predict_proba, y_val_ohe).numpy()

    print(f"step: {i}, train loss: {loss}, valid loss: {loss_val}")
    print(f'step: {i}, train_accuracy: {metric_train}, valid_accuracy: {metric_val}')

step: 0, train loss: 2.303663969039917, valid loss: 2.0003325939178467
step: 0, train_accuracy: 0.40064, valid_accuracy: 0.403
step: 100, train loss: 0.2902778387069702, valid loss: 0.3322506844997406
step: 100, train_accuracy: 0.89796, valid_accuracy: 0.8784
step: 200, train loss: 0.2068111002445221, valid loss: 0.3105197250843048
step: 200, train_accuracy: 0.92562, valid_accuracy: 0.8904
