<a href="https://colab.research.google.com/github/squeze/my_udacity_deep_learning_solutions/blob/master/intro-neural-networks/gradient_descent.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Implementing the gradient descent algorithm


This notebook is based on the gradient descent exercise from the Udacity Deep Learning Nanodegree, which can be found here:

https://github.com/udacity/deep-learning-v2-pytorch/blob/master/intro-neural-networks/gradient-descent/GradientDescent.ipynb

The original version is implemented with Python and NumPy; I reimplement it here using only Swift, as an exercise to learn Swift.

## Loading dataset from github
The original dataset is located here:

https://raw.githubusercontent.com/udacity/deep-learning-v2-pytorch/master/intro-neural-networks/gradient-descent/data.csv

In [0]:
import Foundation

// Address of the raw CSV dataset that is downloaded below.
let url = "https://raw.githubusercontent.com/udacity/deep-learning-v2-pytorch/master/intro-neural-networks/gradient-descent/data.csv"

// author of this query function: https://gist.github.com/groz/85b95f663f79ba17946269ea65c2c0f4
/// Synchronously downloads the resource at `address` and returns its body decoded as UTF-8.
///
/// The calling thread blocks on a semaphore until the data task's completion handler has run.
///
/// - Parameter address: The URL of the resource, given as a string.
/// - Returns: The response body as text, or an empty string when `address` is not a valid URL,
///   the request delivers no data, or the body is not valid UTF-8 (the reason is printed).
func query(address: String) -> String {
    guard let url = URL(string: address) else {
        print("query: invalid URL '\(address)'")
        return ""
    }

    let semaphore = DispatchSemaphore(value: 0)
    var result = ""

    let task = URLSession.shared.dataTask(with: url) { data, _, error in
        // Signal on every exit path; otherwise the waiting caller would block forever.
        defer { semaphore.signal() }

        guard let data = data else {
            print("query: request failed: \(error?.localizedDescription ?? "unknown error")")
            return
        }
        guard let text = String(data: data, encoding: .utf8) else {
            print("query: response body is not valid UTF-8")
            return
        }
        result = text
    }

    task.resume()
    semaphore.wait()
    return result
}

// Download the dataset as one string; `query` blocks until the request has completed.
let rawData = query(address: url)

### Convert data
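Parse the CSV text: each row's two feature columns become a `Tensor<Double>` (rather than a plain array), and the third column is collected as the target.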

In [2]:
import TensorFlow

// Split the downloaded CSV text into lines and each line into its comma-separated columns.
// Rows whose first column is empty (e.g. the empty line after a trailing newline) are dropped.
let rows = rawData.components(separatedBy: "\n")
let featuresAndTargetsAsString = rows
  .map { $0.components(separatedBy: ",") }
  .filter { $0[0] != "" }

// Each row is expected to hold two feature columns followed by one target column.
var targets = [Double]()
var features = [Tensor<Double>]()
for featureWithTarget in featuresAndTargetsAsString {
  // Trim stray whitespace (such as "\r" from CRLF line endings) so it does not break the
  // numeric conversion.
  let columns = featureWithTarget.prefix(3).map { $0.trimmingCharacters(in: .whitespacesAndNewlines) }

  // Skip rows that are too short or not numeric instead of crashing on a force-unwrap.
  guard columns.count == 3,
        let x1 = Double(columns[0]),
        let x2 = Double(columns[1]),
        let y = Double(columns[2]) else {
    print("Skipping malformed row: \(featureWithTarget)")
    continue
  }

  targets.append(y)
  features.append(Tensor<Double>([x1, x2]))
}
print(features)
print(targets)

[[  0.78051, -0.063669], [0.28774, 0.29139], [0.40714, 0.17878], [0.2923, 0.4217], [0.50922, 0.35256], [0.27785, 0.10802], [0.27527, 0.33223], [0.43999, 0.31245], [0.33557, 0.42984], [0.23448, 0.24986], [0.0084492,   0.13658], [0.12419, 0.33595], [0.25644, 0.42624], [ 0.4591, 0.40426], [0.44547, 0.45117], [0.42218, 0.20118], [0.49563, 0.21445], [0.30848, 0.24306], [0.39707, 0.44438], [0.32945, 0.39217], [0.40739, 0.40271], [ 0.3106, 0.50702], [0.49638, 0.45384], [0.10073, 0.32053], [0.69907, 0.37307], [0.29767, 0.69648], [0.15099, 0.57341], [0.16427, 0.27759], [ 0.33259, 0.055964], [0.53741, 0.28637], [0.19503, 0.36879], [ 0.40278, 0.035148], [0.21296, 0.55169], [0.48447, 0.56991], [0.25476, 0.34596], [0.21726, 0.28641], [0.67078, 0.46538], [0.3815, 0.4622], [0.53838, 0.32774], [ 0.4849, 0.26071], [0.37095, 0.38809], [0.54527, 0.63911], [0.32149, 0.12007], [0.42216, 0.61666], [ 0.10194, 0.060408], [0.15254,  0.2168], [0.45558, 0.43769], [0.28488, 0.52142], [0.27633, 0.21264], [0.39748,

## Sigmoid activation function
$$\sigma(x) = \frac{1}{1+e^{-x}}$$

In [0]:
/// Applies the sigmoid function `1 / (1 + e^(-x))` to the tensor `x`.
///
/// - Parameter x: The input tensor.
/// - Returns: The result of `1 / (1 + exp(-x))`.
func mySigmoid(_ x: Tensor<Double>) -> Tensor<Double> {
  let denominator = 1 + exp(-x)
  return 1 / denominator
}

## Output (prediction) formula
$$\hat{y} = \sigma(w_1 x_1 + w_2 x_2 + b)$$

In [0]:
/// Computes the prediction `sigmoid(sum(features * weights) + bias)` as a plain `Double`.
///
/// - Parameters:
///   - features: The input values of one sample.
///   - weights: The weights multiplied element-wise with `features`.
///   - bias: The bias added to the summed product.
/// - Returns: The scalar value of the activated sum.
func myOutputFormula(_ features: Tensor<Double>, _ weights: Tensor<Double>, _ bias: Tensor<Double>) -> Double {
  // Element-wise product reduced with sum(), i.e. w1*x1 + w2*x2 for two features.
  let weightedSum = (features * weights).sum()
  let activation = mySigmoid(weightedSum + bias)
  // `scalar` is optional; the force-unwrap traps if it is nil.
  return activation.scalar!
}

## Error function

$$Error(y, \hat{y}) = - y \log(\hat{y}) - (1-y) \log(1-\hat{y})$$

In [0]:
/// Binary cross-entropy error `-y * log(ŷ) - (1 - y) * log(1 - ŷ)` for a single sample.
///
/// - Parameters:
///   - y: The target label.
///   - ŷ: The predicted probability. It is clamped into the open interval (0, 1) before the
///     logarithms are taken, so a fully saturated prediction gives a finite error.
/// - Returns: The error value; always finite for finite `y`.
func myErrorFormula(_ y: Double, _ ŷ: Double) -> Double {
  // At exactly 0 or 1 one of the logarithms is -infinity, and multiplying it by a zero
  // factor (e.g. y == 1, ŷ == 1) yields NaN, which would poison the accumulated loss.
  let epsilon = Double.ulpOfOne
  let clamped = min(max(ŷ, epsilon), 1 - epsilon)

  // Split into two sub-expressions because the single expression
  //   -y * log(ŷ) - ((1-y) * log(1-ŷ))
  // results in:
  //error: the compiler is unable to type-check this expression in reasonable time; try breaking up the expression into distinct sub-expressions
  let expression1 = y * log(clamped)
  let expression2 = (1 - y) * log(1 - clamped)
  return -expression1 - expression2
}

## Gradient descent step

$$w_i \longrightarrow w_i + \alpha (y - \hat{y}) x_i$$

$$b \longrightarrow b + \alpha (y - \hat{y})$$

In [0]:
/// Performs one gradient descent step for a single sample.
///
/// Implements `w + learningRate * (y - ŷ) * x` for the weights and
/// `b + learningRate * (y - ŷ)` for the bias, where `ŷ` comes from `myOutputFormula`.
///
/// - Parameters:
///   - features: The input values `x` of the sample.
///   - targets: The target label `y` of the sample.
///   - weights: The current weights.
///   - bias: The current bias.
///   - learningRate: The step size.
/// - Returns: The tuple `(updatedWeights, updatedBias)`.
func myUpdateWeights(_ features: Tensor<Double>, _ targets: Double, _ weights: Tensor<Double>, _ bias: Tensor<Double>, _ learningRate: Double) -> (Tensor<Double>, Tensor<Double>) {
  let prediction = myOutputFormula(features, weights, bias)
  // The scaled residual is shared by the weight update and the bias update.
  let step = learningRate * (targets - prediction)
  return (weights + (step * features), bias + step)
}

## Training function
Initialization of weights in the course

`weights = np.random.normal(scale=1 / n_features**.5, size=n_features)`

differs from the one in my version, which does not apply the `1 / sqrt(n_features)` scaling:

`var weights = Tensor<Double>(randomNormal: [2])`

In [0]:
/// Trains the model with per-sample gradient descent and prints progress reports.
///
/// Weights are initialised with `Tensor<Double>(randomNormal:)` using the shape of the first
/// feature tensor; the bias starts at zero. Each pass visits every `(feature, target)` pair,
/// accumulates the error of the prediction made *before* the update, and then applies
/// `myUpdateWeights`. Roughly every tenth epoch the mean loss, the accuracy and the summed
/// error are printed. Nothing is returned; the trained parameters are local to this function.
///
/// - Parameters:
///   - features: One feature tensor per sample. Must not be empty.
///   - targets: The label for each sample, paired with `features` via `zip`.
///   - epochs: Upper bound of the inclusive epoch range `0...epochs`, so `epochs + 1` passes
///     are run and the last report is printed at `epoch == epochs`. Must not be negative.
///   - learningRate: The step size. It is a `Double` because Swift does not multiply
///     `Float` with `Double` implicitly.
func train(_ features: [Tensor<Double>], _ targets: [Double], epochs: Int, learningRate: Double) { //make learningRate Double because swift's won't multiply Float with Double
  // `features[0]` and `0...epochs` would trap on empty input or a negative epoch count.
  guard let firstSample = features.first, epochs >= 0 else {
    print("Nothing to train: features must not be empty and epochs must not be negative")
    return
  }

  let numberRecords = Double(features.count)
  var weights = Tensor<Double>(randomNormal: firstSample.shape)
  var bias = Tensor<Double>.zero
  var lastLoss = Double.infinity

  // `epochs / 10` is 0 for fewer than 10 epochs and `%` by zero traps, so report every
  // epoch in that case.
  let reportInterval = max(1, epochs / 10)

  for epoch in 0...epochs {
    var errors = 0.0
    var correctPredictions = 0.0

    for (x, y) in zip(features, targets) {
      let output = myOutputFormula(x, weights, bias)
      errors += myErrorFormula(y, output)
      (weights, bias) = myUpdateWeights(x, y, weights, bias, learningRate)

      // Threshold the probability at 0.5 to obtain the predicted class.
      let prediction = output > 0.5 ? 1.0 : 0.0
      if prediction == y {
        correctPredictions += 1
      }
    }

    let loss = errors / numberRecords

    if epoch % reportInterval == 0 {
      print("Epoch: \(epoch)")

      // Compared against the loss of the previous report, not of the previous epoch.
      let warning = lastLoss < loss ? "WARNING - Loss increasing" : ""
      print("Train loss: \(loss) \(warning)")
      lastLoss = loss

      let accuracy = correctPredictions / numberRecords
      print("Accuracy: \(accuracy)")

      print("Errors: \(errors)")
    }
  }
}

## Train

In [11]:
train(features, targets, epochs: 100, learningRate: 0.01)

Epoch: 0
Train loss: 0.8829083619477006 
Accuracy: 0.5
Errors: 88.29083619477007
Epoch: 10
Train loss: 0.6743747883272507 
Accuracy: 0.56
Errors: 67.43747883272508
Epoch: 20
Train loss: 0.5959861768943281 
Accuracy: 0.77
Errors: 59.59861768943281
Epoch: 30
Train loss: 0.5343250043715391 
Accuracy: 0.86
Errors: 53.432500437153905
Epoch: 40
Train loss: 0.4858426265239904 
Accuracy: 0.92
Errors: 48.58426265239904
Epoch: 50
Train loss: 0.44713395487810526 
Accuracy: 0.93
Errors: 44.71339548781052
Epoch: 60
Train loss: 0.4157170299358725 
Accuracy: 0.93
Errors: 41.57170299358725
Epoch: 70
Train loss: 0.38981249980610433 
Accuracy: 0.93
Errors: 38.981249980610436
Epoch: 80
Train loss: 0.3681404088268396 
Accuracy: 0.93
Errors: 36.81404088268396
Epoch: 90
Train loss: 0.34977007224210405 
Accuracy: 0.93
Errors: 34.9770072242104
Epoch: 100
Train loss: 0.33401522011459245 
Accuracy: 0.93
Errors: 33.40152201145924


For comparison, the output of the original Python implementation:

========== Epoch 80 ==========

Train loss:  0.35459973368161973

Accuracy:  0.94

========== Epoch 90 ==========

Train loss:  0.3379273658879921

Accuracy:  0.94