In [78]:
import numpy as np
import matplotlib.pyplot as plt
import h5py
import copy
from lr_utils import load_dataset
import torch
import tensorflow as tf

## 1. Import Data

In [7]:
# Unpack the five values returned by load_dataset(): raw train/test images,
# their label arrays, and the class list.
(
    train_set_x_orig,
    train_set_y,
    test_set_x_orig,
    test_set_y,
    classes,
) = load_dataset()

In [10]:
# Axis 0 of the raw image arrays indexes examples; axis 1 is the image side length.
m_train, num_px = train_set_x_orig.shape[:2]
m_test = test_set_x_orig.shape[0]

In [11]:
# Summarise the dataset sizes and the raw array shapes.
print("Number of training examples: m_train = ", m_train, sep="")
print("Number of testing examples: m_test = ", m_test, sep="")
print("Height/Width of each image: num_px = ", num_px, sep="")
print("Each image is of size: (", num_px, ", ", num_px, ", 3)", sep="")
print("train_set_x shape: ", train_set_x_orig.shape, sep="")
print("train_set_y shape: ", train_set_y.shape, sep="")
print("test_set_x shape: ", test_set_x_orig.shape, sep="")
print("test_set_y shape: ", test_set_y.shape, sep="")

Number of training examples: m_train = 209
Number of testing examples: m_test = 50
Height/Width of each image: num_px = 64
Each image is of size: (64, 64, 3)
train_set_x shape: (209, 64, 64, 3)
train_set_y shape: (1, 209)
test_set_x shape: (50, 64, 64, 3)
test_set_y shape: (1, 50)


## 2. Reshape training and test examples

In [12]:
# Flatten every image into a single column: after the transpose, rows are
# features and columns are examples.
train_set_x_flatten = train_set_x_orig.reshape((m_train, -1)).transpose()
test_set_x_flatten = test_set_x_orig.reshape((m_test, -1)).transpose()

print("train_set_x_flatten shape: ", train_set_x_flatten.shape, sep="")
print("train_set_y shape: ", train_set_y.shape, sep="")
print("test_set_x_flatten shape: ", test_set_x_flatten.shape, sep="")
print("test_set_y shape: ", test_set_y.shape, sep="")

train_set_x_flatten shape: (12288, 209)
train_set_y shape: (1, 209)
test_set_x_flatten shape: (12288, 50)
test_set_y shape: (1, 50)


### Normalize features (scale pixel values by 1/255)

In [13]:
# Divide by 255 so the features are on a small, common scale
# (assumes pixel intensities are in 0..255 -- TODO confirm against load_dataset).
train_set_x = train_set_x_flatten / 255.0
test_set_x = test_set_x_flatten / 255.0

# Build Model

<a name='3'></a>
## 3 - General Architecture of the learning algorithm ##

It's time to design a simple algorithm to distinguish cat images from non-cat images.

You will build a Logistic Regression, using a Neural Network mindset. The following Figure explains why **Logistic Regression is actually a very simple Neural Network!**

<img src="images/LogReg_kiank.png" style="width:650px;height:400px;">

**Mathematical expression of the algorithm**:

For one example $x^{(i)}$:
$$z^{(i)} = w^T x^{(i)} + b \tag{1}$$
$$\hat{y}^{(i)} = a^{(i)} = sigmoid(z^{(i)})\tag{2}$$ 
$$ \mathcal{L}(a^{(i)}, y^{(i)}) =  - y^{(i)}  \log(a^{(i)}) - (1-y^{(i)} )  \log(1-a^{(i)})\tag{3}$$

The cost is then computed by averaging the loss over all training examples:
$$ J = \frac{1}{m} \sum_{i=1}^m \mathcal{L}(a^{(i)}, y^{(i)})\tag{6}$$

**Key steps**:
In this exercise, you will carry out the following steps:

- Initialize the parameters of the model
- Learn the parameters for the model by minimizing the cost
- Use the learned parameters to make predictions (on the test set)
- Analyse the results and conclude

## 1. Sigmoid

In [16]:
def sigmoid_np(x):
    """
    Logistic sigmoid, 1 / (1 + exp(-x)), computed with NumPy.

    Argument:
    x -- a scalar or numpy array; arrays are processed element-wise

    Returns:
    the sigmoid of x, with the same shape as x
    """
    denominator = 1 + np.exp(-x)
    return 1 / denominator

def sigmoid_torch(x):
    """
    Logistic sigmoid, 1 / (1 + exp(-x)), computed with PyTorch.

    Argument:
    x -- a torch tensor; processed element-wise

    Returns:
    a tensor holding the sigmoid of every element of x
    """
    denominator = 1 + torch.exp(-x)
    return 1 / denominator

@tf.function
def sigmoid_tf(x):
    """
    Logistic sigmoid, 1 / (1 + exp(-x)), computed with TensorFlow.

    Argument:
    x -- a TensorFlow tensor; processed element-wise

    Returns:
    a tensor holding the sigmoid of every element of x
    """
    denominator = 1 + tf.math.exp(-x)
    return 1 / denominator

## 2. Initializing parameters

In [140]:
def initialize_with_zeros_np(dim):
    """
    Build zero-valued starting parameters for logistic regression (NumPy).

    Argument:
    dim -- number of weights to create (one per input feature)

    Returns:
    w -- numpy array of zeros with shape (dim, 1)
    b -- the bias, a Python float equal to 0.0
    """
    weights = np.zeros(shape=(dim, 1))
    bias = 0.0
    return weights, bias

def initialize_with_zeros_torch(dim):
    """
    Build zero-valued starting parameters for logistic regression (PyTorch).

    Argument:
    dim -- number of weights to create (one per input feature)

    Returns:
    w -- tensor of zeros with shape (dim, 1)
    b -- tensor of zeros with shape (1,) holding the bias
    """
    weights = torch.zeros(dim, 1)
    bias = torch.zeros(1)
    return weights, bias

def initialize_with_zeros_tf(dim):
    """
    Build zero-valued starting parameters for logistic regression (TensorFlow).

    Argument:
    dim -- number of weights to create (one per input feature)

    Returns:
    w -- float64 tensor of zeros with shape (dim, 1)
    b -- float64 tensor of zeros with shape (1,) holding the bias
    """
    weights = tf.zeros(shape=(dim, 1), dtype=tf.float64)
    bias = tf.zeros(shape=(1,), dtype=tf.float64)
    return weights, bias

## 3. Forward and backward propagation

Forward Propagation:
- You get X
- You compute $A = \sigma(w^T X + b) = (a^{(1)}, a^{(2)}, ..., a^{(m-1)}, a^{(m)})$
- You calculate the cost function: $J = -\frac{1}{m}\sum_{i=1}^{m}(y^{(i)}\log(a^{(i)})+(1-y^{(i)})\log(1-a^{(i)}))$

Backward Propagation — here are the two gradient formulas you will be using:

$$ \frac{\partial J}{\partial w} = \frac{1}{m}X(A-Y)^T\tag{7}$$
$$ \frac{\partial J}{\partial b} = \frac{1}{m} \sum_{i=1}^m (a^{(i)}-y^{(i)})\tag{8}$$

In [143]:
def propagate_np(w, b, X, Y):
    """
    Compute the logistic-regression cost and its gradients for one pass over X.

    Arguments:
    w -- weights, a numpy array of size (num_px * num_px * 3, 1)
    b -- bias, a scalar
    X -- data of size (num_px * num_px * 3, number of examples)
    Y -- true "label" vector (containing 0 if non-cat, 1 if cat) of size (1, number of examples)

    Returns:
    grads -- dictionary with
             "dw": gradient of the cost with respect to w, same shape as w
             "db": gradient of the cost with respect to b, a scalar
    cost -- negative log-likelihood cost for logistic regression
    """

    m = X.shape[1]

    # FORWARD PROPAGATION (FROM X TO COST)

    A = sigmoid_np(np.dot(w.T, X) + b)

    # In floating point the sigmoid saturates to exactly 0 or 1 for large |z|,
    # which makes log() return -inf and the cost nan (0 * -inf). Clip the
    # activations used inside the logs by one machine epsilon so the cost stays
    # finite; activations that are not saturated are left untouched.
    eps = float(np.finfo(A.dtype).eps)
    A_safe = np.clip(A, eps, 1.0 - eps)
    cost = (-1 / m) * np.sum(Y * np.log(A_safe) + (1 - Y) * np.log(1 - A_safe))

    # BACKWARD PROPAGATION (TO FIND GRAD)
    # The gradients use the unclipped activations: A - Y is well defined even
    # when A is exactly 0 or 1.

    dw = (1 / m) * np.dot(X, (A - Y).T)
    db = (1 / m) * np.sum(A - Y)

    cost = np.squeeze(cost)

    grads = {"dw": dw,
             "db": db}

    return grads, cost

<object data="images/Forward_bacward_notes.pdf" type="application/pdf" width="700px" height="700px">
    <embed src="images/Forward_bacward_notes.pdf">
        <p>This browser does not support PDFs. Please download the PDF to view it: <a href="images/Forward_bacward_notes.pdf">Download PDF</a>.</p>
    </embed>
</object>


In [144]:
def propagate_torch(w, b, X, Y):
    """
    Compute the logistic-regression cost and its gradients for one pass over X (PyTorch).

    Arguments:
    w -- weights, a 2-D tensor of shape (number of features, 1)
    b -- bias, a scalar or a tensor that broadcasts against a (1, number of examples) tensor
    X -- data, a 2-D tensor of shape (number of features, number of examples)
    Y -- true labels (0 or 1), a tensor of shape (1, number of examples)

    Returns:
    grads -- dictionary with
             "dw": gradient of the cost with respect to w, same shape as w
             "db": gradient of the cost with respect to b, a 0-dim tensor
    cost -- negative log-likelihood cost, a 0-dim tensor
    """

    m = X.shape[1]

    # FORWARD PROPAGATION (FROM X TO COST)

    A = sigmoid_torch(torch.mm(w.T, X) + b)

    # In floating point the sigmoid saturates to exactly 0 or 1 for large |z|,
    # which makes log() return -inf and the cost nan (0 * -inf). Clamp the
    # activations used inside the logs by one machine epsilon so the cost stays
    # finite; activations that are not saturated are left untouched.
    eps = torch.finfo(A.dtype).eps
    A_safe = torch.clamp(A, min=eps, max=1.0 - eps)
    cost = (-1 / m) * torch.sum(Y * torch.log(A_safe) + (1 - Y) * torch.log(1 - A_safe))

    # BACKWARD PROPAGATION (TO FIND GRAD)
    # The gradients use the unclamped activations: A - Y is well defined even
    # when A is exactly 0 or 1.

    dw = (1 / m) * torch.mm(X, (A - Y).T)
    db = (1 / m) * torch.sum(A - Y)

    cost = torch.squeeze(cost)

    grads = {"dw": dw,
             "db": db}

    return grads, cost

In [145]:
@tf.function
def propagate_tf(w, b, X, Y):
    """
    Compute the logistic-regression cost and its gradients for one pass over X (TensorFlow).

    Arguments:
    w -- weights, a tensor of shape (number of features, 1)
    b -- bias, a scalar or a tensor that broadcasts against a (1, number of examples) tensor
    X -- data, a tensor of shape (number of features, number of examples)
    Y -- true labels (0 or 1), a tensor of shape (1, number of examples)
         (presumably must share the floating dtype of w and X -- verify against callers)

    Returns:
    grads -- dictionary with
             "dw": gradient of the cost with respect to w, same shape as w
             "db": gradient of the cost with respect to b, a scalar tensor
    cost -- negative log-likelihood cost, a scalar tensor
    """

    m = X.shape[1]

    # FORWARD PROPAGATION (FROM X TO COST)

    A = sigmoid_tf(tf.tensordot(tf.transpose(w), X, axes=1) + b)

    # In floating point the sigmoid saturates to exactly 0 or 1 for large |z|,
    # which makes log() return -inf and the cost nan (0 * -inf). Clip the
    # activations used inside the logs by one machine epsilon so the cost stays
    # finite; activations that are not saturated are left untouched.
    eps = float(np.finfo(A.dtype.as_numpy_dtype).eps)
    A_safe = tf.clip_by_value(A, eps, 1.0 - eps)
    cost = (-1 / m) * tf.math.reduce_sum(
        Y * tf.math.log(A_safe) + (1 - Y) * tf.math.log(1 - A_safe))

    # BACKWARD PROPAGATION (TO FIND GRAD)
    # The gradients use the unclipped activations: A - Y is well defined even
    # when A is exactly 0 or 1.

    dw = (1 / m) * tf.tensordot(X, tf.transpose(A - Y), axes=1)
    db = (1 / m) * tf.math.reduce_sum(A - Y)

    grads = {"dw": dw,
             "db": db}

    return grads, cost

### tests

In [146]:
# Tiny hand-made problem: 2 features, 3 examples.
w = np.array([[1.0], [2.0]])
b = 1.5
X = np.array([[1.0, -2.0, -1.0],
              [3.0, 0.5, -3.2]])
Y = np.array([[1, 1, 0]])

# The same fixture as PyTorch tensors.
w_torch = torch.Tensor(w)
b_torch = torch.Tensor([b])
X_torch = torch.Tensor(X)
Y_torch = torch.Tensor(Y)

# The same fixture as TensorFlow tensors; the integer labels are cast to
# float64 explicitly.
w_tf = tf.convert_to_tensor(w)
b_tf = tf.convert_to_tensor([b])
X_tf = tf.convert_to_tensor(X)
Y_tf = tf.convert_to_tensor(Y, dtype=tf.float64)

In [147]:
# Run the NumPy implementation on the small fixture and display its gradients and cost.
grads, cost = propagate_np(w, b, X, Y)
grads, cost

({'dw': array([[ 0.25071532],
         [-0.06604096]]),
  'db': -0.1250040450043965},
 0.15900537707692405)

In [148]:
# Run the PyTorch implementation on the same fixture.
# Note: the bias is passed as the Python float `b`, not the tensor `b_torch`.
grads, cost = propagate_torch(w_torch, b, X_torch, Y_torch)
grads, cost

({'dw': tensor([[ 0.2507],
          [-0.0660]]),
  'db': tensor(-0.1250)},
 tensor(0.1590))

In [149]:
# Run the TensorFlow implementation on the same fixture.
# Note: the bias is passed as the Python float `b`, not the tensor `b_tf`.
grads, cost = propagate_tf(w_tf, b, X_tf, Y_tf)
grads, cost

({'dw': <tf.Tensor: shape=(2, 1), dtype=float64, numpy=
  array([[ 0.25071532],
         [-0.06604096]])>,
  'db': <tf.Tensor: shape=(), dtype=float64, numpy=-0.1250040450043965>},
 <tf.Tensor: shape=(), dtype=float64, numpy=0.15900537707692405>)

## 4. Optimize

In [83]:
def optimize_np(w, b, X, Y, num_iterations=100, learning_rate=0.009, print_cost=False):
    """
    This function optimizes w and b by running a gradient descent algorithm

    Arguments:
    w -- weights, a numpy array of size (num_px * num_px * 3, 1)
    b -- bias, a scalar
    X -- data of shape (num_px * num_px * 3, number of examples)
    Y -- true "label" vector (containing 0 if non-cat, 1 if cat), of shape (1, number of examples)
    num_iterations -- number of iterations of the optimization loop
    learning_rate -- learning rate of the gradient descent update rule
    print_cost -- True to print the cost every 100 iterations

    Returns:
    params -- dictionary containing the weights w and bias b
    grads -- dictionary containing the gradients "dw" and "db" from the last
             iteration; both are None when num_iterations is 0
    costs -- list of the costs recorded every 100 iterations, this will be used to plot the learning curve.
    """

    # Work on copies so the caller's parameters are never modified.
    w = copy.deepcopy(w)
    b = copy.deepcopy(b)

    costs = []

    # Bound before the loop so that num_iterations=0 returns a well-formed
    # result instead of raising UnboundLocalError.
    dw, db = None, None

    for i in range(num_iterations):
        # Cost and gradient calculation
        grads, cost = propagate_np(w, b, X, Y)

        # Retrieve derivatives from grads
        dw = grads["dw"]
        db = grads["db"]

        # Gradient descent update rule
        w = w - learning_rate * dw
        b = b - learning_rate * db

        # Record the costs
        if i % 100 == 0:
            costs.append(cost)

            # Print the cost every 100 training iterations
            if print_cost:
                print ("Cost after iteration %i: %f" %(i, cost))

    params = {"w": w,
              "b": b}

    grads = {"dw": dw,
             "db": db}

    return params, grads, costs

In [80]:
def optimize_torch(w, b, X, Y, num_iterations=100, learning_rate=0.009, print_cost=False):
    """
    Optimize w and b by running gradient descent, using propagate_torch for the
    cost and gradients.

    Arguments:
    w -- weights, a tensor of shape (number of features, 1)
    b -- bias, a scalar or tensor
    X -- data, a tensor of shape (number of features, number of examples)
    Y -- true labels (0 or 1), a tensor of shape (1, number of examples)
    num_iterations -- number of iterations of the optimization loop
    learning_rate -- learning rate of the gradient descent update rule
    print_cost -- True to print the cost every 100 iterations

    Returns:
    params -- dictionary containing the weights w and bias b
    grads -- dictionary containing the gradients "dw" and "db" from the last
             iteration; both are None when num_iterations is 0
    costs -- list of the costs recorded every 100 iterations
    """

    costs = []

    # Bound before the loop so that num_iterations=0 returns a well-formed
    # result instead of raising UnboundLocalError.
    dw, db = None, None

    for i in range(num_iterations):
        # Cost and gradient calculation
        grads, cost = propagate_torch(w, b, X, Y)

        # Retrieve derivatives from grads
        dw = grads["dw"]
        db = grads["db"]

        # Gradient descent update rule; rebinding the names creates new tensors,
        # so the caller's w and b are not modified.
        w = w - learning_rate * dw
        b = b - learning_rate * db

        # Record the costs
        if i % 100 == 0:
            costs.append(cost)

            # Print the cost every 100 training iterations
            if print_cost:
                print ("Cost after iteration %i: %f" %(i, cost))

    params = {"w": w,
              "b": b}

    grads = {"dw": dw,
             "db": db}

    return params, grads, costs

In [81]:
def optimize_tf(w, b, X, Y, num_iterations=100, learning_rate=0.009, print_cost=False):
    """
    Optimize w and b by running gradient descent, using propagate_tf for the
    cost and gradients.

    Arguments:
    w -- weights, a tensor of shape (number of features, 1)
    b -- bias, a scalar or tensor
    X -- data, a tensor of shape (number of features, number of examples)
    Y -- true labels (0 or 1), a tensor of shape (1, number of examples)
    num_iterations -- number of iterations of the optimization loop
    learning_rate -- learning rate of the gradient descent update rule
    print_cost -- True to print the cost every 100 iterations

    Returns:
    params -- dictionary containing the weights w and bias b
    grads -- dictionary containing the gradients "dw" and "db" from the last
             iteration; both are None when num_iterations is 0
    costs -- list of the costs recorded every 100 iterations
    """

    costs = []

    # Bound before the loop so that num_iterations=0 returns a well-formed
    # result instead of raising UnboundLocalError.
    dw, db = None, None

    for i in range(num_iterations):
        # Cost and gradient calculation
        grads, cost = propagate_tf(w, b, X, Y)

        # Retrieve derivatives from grads
        dw = grads["dw"]
        db = grads["db"]

        # Gradient descent update rule; rebinding the names creates new tensors,
        # so the caller's w and b are not modified.
        w = w - learning_rate * dw
        b = b - learning_rate * db

        # Record the costs
        if i % 100 == 0:
            costs.append(cost)

            # Print the cost every 100 training iterations
            if print_cost:
                print ("Cost after iteration %i: %f" %(i, cost))

    params = {"w": w,
              "b": b}

    grads = {"dw": dw,
             "db": db}

    return params, grads, costs

### tests

In [85]:
# Run 100 gradient-descent steps with the NumPy implementation on the small fixture.
params, grads, costs = optimize_np(w, b, X, Y, num_iterations=100, learning_rate=0.009, print_cost=False)
params, grads, costs

({'w': array([[0.80956046],
         [2.0508202 ]]),
  'b': 1.5948713189708588},
 {'dw': array([[ 0.17860505],
         [-0.04840656]]),
  'db': -0.08888460336847771},
 [0.15900537707692405])

In [86]:
# Run 100 gradient-descent steps with the PyTorch implementation on the same fixture
# (bias passed as the Python float `b`).
params, grads, costs = optimize_torch(w_torch, b, X_torch, Y_torch, num_iterations=100, learning_rate=0.009, print_cost=False)
params, grads, costs

({'w': tensor([[0.8096],
          [2.0508]]),
  'b': tensor(1.5949)},
 {'dw': tensor([[ 0.1786],
          [-0.0484]]),
  'db': tensor(-0.0889)},
 [tensor(0.1590)])

In [87]:
# Run 100 gradient-descent steps with the TensorFlow implementation on the same fixture
# (bias passed as the Python float `b`).
params, grads, costs = optimize_tf(w_tf, b, X_tf, Y_tf, num_iterations=100, learning_rate=0.009, print_cost=False)
params, grads, costs

({'w': <tf.Tensor: shape=(2, 1), dtype=float64, numpy=
  array([[0.80956046],
         [2.0508202 ]])>,
  'b': <tf.Tensor: shape=(), dtype=float64, numpy=1.5948713189708588>},
 {'dw': <tf.Tensor: shape=(2, 1), dtype=float64, numpy=
  array([[ 0.17860505],
         [-0.04840656]])>,
  'db': <tf.Tensor: shape=(), dtype=float64, numpy=-0.08888460336847771>},
 [<tf.Tensor: shape=(), dtype=float64, numpy=0.15900537707692405>])

## 5. Predict

In [95]:
def predict_np(w, b, X):
    '''
    Predict whether the label is 0 or 1 using learned logistic regression parameters (w, b)

    Arguments:
    w -- weights, a numpy array of size (num_px * num_px * 3, 1)
    b -- bias, a scalar
    X -- data of size (num_px * num_px * 3, number of examples)

    Returns:
    Y_prediction -- a float64 numpy array of shape (1, number of examples)
                    containing the predictions (0. or 1.) for the examples in X
    '''

    m = X.shape[1]
    w = w.reshape(X.shape[0], 1)

    # Compute vector "A" predicting the probabilities of a cat being present in the picture
    A = sigmoid_np(np.dot(w.T, X) + b)

    # Threshold every probability in one vectorized step instead of a Python
    # loop: strictly greater than 0.5 maps to 1, everything else to 0.
    Y_prediction = (A[0] > 0.5).astype(np.float64).reshape(1, m)

    return Y_prediction

In [96]:
def predict_torch(w, b, X):
    '''
    Predict whether the label is 0 or 1 using learned logistic regression parameters (w, b)

    Arguments:
    w -- weights, a tensor with X.shape[0] elements (reshaped to a column here)
    b -- bias, a scalar or a tensor that broadcasts against a (1, number of examples) tensor
    X -- data, a 2-D tensor of shape (number of features, number of examples)

    Returns:
    Y_prediction -- a tensor of shape (1, number of examples) containing the
                    predictions (0. or 1.) for the examples in X
    '''

    m = X.shape[1]
    w = w.reshape(X.shape[0], 1)

    # Compute vector "A" predicting the probabilities of a cat being present in the picture
    A = sigmoid_torch(torch.mm(w.T, X) + b)

    # Threshold every probability in one vectorized step instead of a Python
    # loop. Writing the boolean mask into the zero tensor casts True/False to
    # 1/0 while keeping the dtype and device that torch.zeros() chose.
    Y_prediction = torch.zeros((1, m))
    Y_prediction[0] = A[0] > 0.5

    return Y_prediction

In [123]:
def predict_tf(w, b, X):
    '''
    Predict whether the label is 0 or 1 using learned logistic regression parameters (w, b)

    Arguments:
    w -- weights, a tensor with X.shape[0] elements (reshaped to a column here)
    b -- bias, a scalar or a tensor that broadcasts against a (1, number of examples) tensor
    X -- data, a tensor of shape (number of features, number of examples)

    Returns:
    Y_prediction -- a float64 numpy array (not a TensorFlow tensor) of shape
                    (1, number of examples) containing the predictions (0. or 1.)
    '''

    m = X.shape[1]
    w = tf.reshape(w, (X.shape[0], 1))

    # Compute vector "A" predicting the probabilities of a cat being present in the picture
    A = sigmoid_tf(tf.tensordot(tf.transpose(w), X, axes=1) + b)

    # Pull the probabilities out of TensorFlow once and threshold them in a
    # single vectorized step, instead of indexing the tensor per example.
    # .numpy() needs eager execution, as did the per-element `if` it replaces.
    probabilities = A.numpy()
    Y_prediction = (probabilities[0] > 0.5).astype(np.float64).reshape(1, m)

    return Y_prediction

### tests

In [109]:
# Predict labels for the small fixture with the NumPy implementation.
predict_np(w, b, X)

array([[1., 1., 0.]])

In [110]:
# Predict labels for the small fixture with the PyTorch implementation.
predict_torch(w_torch, b, X_torch)

tensor([[1., 1., 0.]])

In [124]:
# Predict labels for the small fixture with the TensorFlow implementation.
predict_tf(w_tf, b, X_tf)

array([[1., 1., 0.]])

## 6. Model

In [126]:
def model_np(X_train, Y_train, X_test, Y_test, num_iterations=2000, learning_rate=0.5, print_cost=False):
    """
    Train and evaluate the NumPy logistic regression model end to end:
    zero-initialize the parameters, run gradient descent, then predict on
    both the training and the test set.

    Arguments:
    X_train -- training set, a numpy array of shape (num_px * num_px * 3, m_train)
    Y_train -- training labels, a numpy array of shape (1, m_train)
    X_test -- test set, a numpy array of shape (num_px * num_px * 3, m_test)
    Y_test -- test labels, a numpy array of shape (1, m_test)
    num_iterations -- number of gradient descent iterations
    learning_rate -- step size used by the update rule in optimize_np()
    print_cost -- when True, print the cost every 100 iterations and the
                  final train/test accuracies

    Returns:
    d -- dictionary with keys "costs", "Y_prediction_test",
         "Y_prediction_train", "w", "b", "learning_rate" and "num_iterations"
    """
    # Start from all-zero parameters, one weight per input feature.
    n_features = X_train.shape[0]
    w_init, b_init = initialize_with_zeros_np(n_features)

    # Gradient descent; the final gradients are not needed here.
    learned, _, cost_history = optimize_np(
        w_init, b_init, X_train, Y_train, num_iterations, learning_rate, print_cost)
    w, b = learned['w'], learned['b']

    # Predictions for both splits with the learned parameters.
    Y_prediction_test = predict_np(w, b, X_test)
    Y_prediction_train = predict_np(w, b, X_train)

    # Accuracy = 100 - mean absolute error (in %) between 0/1 predictions and labels.
    if print_cost:
        train_accuracy = 100 - np.mean(np.abs(Y_prediction_train - Y_train)) * 100
        print("train accuracy: {} %".format(train_accuracy))
        test_accuracy = 100 - np.mean(np.abs(Y_prediction_test - Y_test)) * 100
        print("test accuracy: {} %".format(test_accuracy))

    return {
        "costs": cost_history,
        "Y_prediction_test": Y_prediction_test,
        "Y_prediction_train": Y_prediction_train,
        "w": w,
        "b": b,
        "learning_rate": learning_rate,
        "num_iterations": num_iterations,
    }

In [135]:
def model_torch(X_train, Y_train, X_test, Y_test, num_iterations=2000, learning_rate=0.5, print_cost=False):
    """
    Train and evaluate the PyTorch logistic regression model end to end:
    zero-initialize the parameters, run gradient descent, then predict on
    both the training and the test set.

    Arguments:
    X_train -- training set, a tensor of shape (number of features, m_train)
    Y_train -- training labels, a tensor of shape (1, m_train)
    X_test -- test set, a tensor of shape (number of features, m_test)
    Y_test -- test labels, a tensor of shape (1, m_test)
    num_iterations -- number of gradient descent iterations
    learning_rate -- step size used by the update rule in optimize_torch()
    print_cost -- when True, print the cost every 100 iterations and the
                  final train/test accuracies

    Returns:
    d -- dictionary with keys "costs", "Y_prediction_test",
         "Y_prediction_train", "w", "b", "learning_rate" and "num_iterations"
    """
    # Start from all-zero parameters, one weight per input feature.
    n_features = X_train.shape[0]
    w_init, b_init = initialize_with_zeros_torch(n_features)

    # Gradient descent; the final gradients are not needed here.
    learned, _, cost_history = optimize_torch(
        w_init, b_init, X_train, Y_train, num_iterations, learning_rate, print_cost)
    w, b = learned['w'], learned['b']

    # Predictions for both splits with the learned parameters.
    Y_prediction_test = predict_torch(w, b, X_test)
    Y_prediction_train = predict_torch(w, b, X_train)

    # Accuracy = 100 - mean absolute error (in %) between 0/1 predictions and labels.
    if print_cost:
        train_accuracy = 100 - torch.mean(torch.abs(Y_prediction_train - Y_train)) * 100
        print("train accuracy: {} %".format(train_accuracy))
        test_accuracy = 100 - torch.mean(torch.abs(Y_prediction_test - Y_test)) * 100
        print("test accuracy: {} %".format(test_accuracy))

    return {
        "costs": cost_history,
        "Y_prediction_test": Y_prediction_test,
        "Y_prediction_train": Y_prediction_train,
        "w": w,
        "b": b,
        "learning_rate": learning_rate,
        "num_iterations": num_iterations,
    }

In [130]:
def model_tf(X_train, Y_train, X_test, Y_test, num_iterations=2000, learning_rate=0.5, print_cost=False):
    """
    Train and evaluate the TensorFlow logistic regression model end to end:
    zero-initialize the parameters, run gradient descent, then predict on
    both the training and the test set.

    Arguments:
    X_train -- training set of shape (number of features, m_train)
    Y_train -- training labels of shape (1, m_train)
    X_test -- test set of shape (number of features, m_test)
    Y_test -- test labels of shape (1, m_test)
    num_iterations -- number of gradient descent iterations
    learning_rate -- step size used by the update rule in optimize_tf()
    print_cost -- when True, print the cost every 100 iterations and the
                  final train/test accuracies

    Returns:
    d -- dictionary with keys "costs", "Y_prediction_test",
         "Y_prediction_train", "w", "b", "learning_rate" and "num_iterations"
    """
    # Start from all-zero parameters, one weight per input feature.
    n_features = X_train.shape[0]
    w_init, b_init = initialize_with_zeros_tf(n_features)

    # Gradient descent; the final gradients are not needed here.
    learned, _, cost_history = optimize_tf(
        w_init, b_init, X_train, Y_train, num_iterations, learning_rate, print_cost)
    w, b = learned['w'], learned['b']

    # Predictions for both splits with the learned parameters.
    Y_prediction_test = predict_tf(w, b, X_test)
    Y_prediction_train = predict_tf(w, b, X_train)

    # Accuracy = 100 - mean absolute error (in %) between 0/1 predictions and labels.
    if print_cost:
        train_accuracy = 100 - np.mean(np.abs(Y_prediction_train - Y_train)) * 100
        print("train accuracy: {} %".format(train_accuracy))
        test_accuracy = 100 - np.mean(np.abs(Y_prediction_test - Y_test)) * 100
        print("test accuracy: {} %".format(test_accuracy))

    return {
        "costs": cost_history,
        "Y_prediction_test": Y_prediction_test,
        "Y_prediction_train": Y_prediction_train,
        "w": w,
        "b": b,
        "learning_rate": learning_rate,
        "num_iterations": num_iterations,
    }

In [131]:
# Train the NumPy model on the scaled image data for 2000 iterations, printing costs and accuracies.
logistic_regression_model = model_np(train_set_x, train_set_y, test_set_x, test_set_y, num_iterations=2000, learning_rate=0.005, print_cost=True)

Cost after iteration 0: 0.693147
Cost after iteration 100: 0.584508
Cost after iteration 200: 0.466949
Cost after iteration 300: 0.376007
Cost after iteration 400: 0.331463
Cost after iteration 500: 0.303273
Cost after iteration 600: 0.279880
Cost after iteration 700: 0.260042
Cost after iteration 800: 0.242941
Cost after iteration 900: 0.228004
Cost after iteration 1000: 0.214820
Cost after iteration 1100: 0.203078
Cost after iteration 1200: 0.192544
Cost after iteration 1300: 0.183033
Cost after iteration 1400: 0.174399
Cost after iteration 1500: 0.166521
Cost after iteration 1600: 0.159305
Cost after iteration 1700: 0.152667
Cost after iteration 1800: 0.146542
Cost after iteration 1900: 0.140872
train accuracy: 99.04306220095694 %
test accuracy: 70.0 %


In [136]:
# Convert the NumPy arrays to torch tensors before handing them to model_torch.
train_set_x_torch = torch.Tensor(train_set_x)
train_set_y_torch = torch.Tensor(train_set_y)
test_set_x_torch = torch.Tensor(test_set_x)
test_set_y_torch = torch.Tensor(test_set_y)

# Same hyperparameters as the NumPy run, so the printed costs can be compared.
# NOTE(review): this rebinds `logistic_regression_model`, replacing the NumPy result.
logistic_regression_model = model_torch(
    train_set_x_torch,
    train_set_y_torch,
    test_set_x_torch,
    test_set_y_torch,
    num_iterations=2000,
    learning_rate=0.005,
    print_cost=True,
)

Cost after iteration 0: 0.693147
Cost after iteration 100: 0.584508
Cost after iteration 200: 0.466949
Cost after iteration 300: 0.376007
Cost after iteration 400: 0.331463
Cost after iteration 500: 0.303273
Cost after iteration 600: 0.279880
Cost after iteration 700: 0.260042
Cost after iteration 800: 0.242941
Cost after iteration 900: 0.228004
Cost after iteration 1000: 0.214820
Cost after iteration 1100: 0.203078
Cost after iteration 1200: 0.192544
Cost after iteration 1300: 0.183033
Cost after iteration 1400: 0.174399
Cost after iteration 1500: 0.166521
Cost after iteration 1600: 0.159305
Cost after iteration 1700: 0.152667
Cost after iteration 1800: 0.146542
Cost after iteration 1900: 0.140872
train accuracy: 99.04306030273438 %
test accuracy: 70.0 %
