In [1]:
%install-location $cwd/swift-install
%install '.package(path: "$cwd/SchwiftyNotebook_matrix_multiplication")' SchwiftyNotebook_matrix_multiplication

Installing packages:
	.package(path: "/home/ubuntu/workspace/fast-ai-swift/SchwiftyNotebook_matrix_multiplication")
		SchwiftyNotebook_matrix_multiplication
With SwiftPM flags: []
Working in: /tmp/tmpn6v0l09h/swift-install
/home/ubuntu/swift/usr/bin/swift: /home/ubuntu/anaconda3/envs/swift-env/lib/libuuid.so.1: no version information available (required by /home/ubuntu/swift/usr/bin/swift)
/home/ubuntu/swift/usr/bin/swift: /home/ubuntu/anaconda3/envs/swift-env/lib/libuuid.so.1: no version information available (required by /home/ubuntu/swift/usr/bin/swift)
/home/ubuntu/swift/usr/bin/swift: /home/ubuntu/anaconda3/envs/swift-env/lib/libuuid.so.1: no version information available (required by /home/ubuntu/swift/usr/bin/swift)
/home/ubuntu/swift/usr/bin/swift: /home/ubuntu/anaconda3/envs/swift-env/lib/libuuid.so.1: no version information available (required by /home/ubuntu/swift/usr/bin/swift)
/home/ubuntu/swift/usr/bin/swift: /home/ubuntu/anaconda3/envs/swift-env/lib/libuuid.so.1: no vers

In [2]:
//export
import Path
import TensorFlow
import SchwiftyNotebook_matrix_multiplication

In [3]:
let thingo = Tensor([-2, -1, 0, 1, 2])

In [4]:
time(repeating: 2){ relu(thingo) }

average: 0.2309215 ms,   min: 0.175269 ms,   max: 0.286574 ms


In [5]:
//export
/// Rectified linear unit activation.
///
/// - Parameter tensor: The values to rectify.
/// - Returns: The result of `max(tensor, 0)`.
public func reLU<T>(tensor: Tensor<T>) -> Tensor<T>
    where T: FloatingPoint, T: TensorFlowScalar
{
    let rectified: Tensor<T> = max(tensor, 0)
    return rectified
}

In [6]:
//export
/// Computes `matmul(inputs, weights) + bias`.
///
/// - Parameters:
///   - inputs: Left-hand operand of the matrix multiplication.
///   - weights: Right-hand operand of the matrix multiplication.
///   - bias: Added to the matrix product.
/// - Returns: The matrix product of `inputs` and `weights`, plus `bias`.
public func linearCombination<T>(
    inputs: Tensor<T>,
    weights: Tensor<T>,
    bias: Tensor<T>
) -> Tensor<T> where T: FloatingPoint, T: TensorFlowScalar {
    let weightedInputs = matmul(inputs, weights)
    return weightedInputs + bias
}

# Normalization

In [7]:
let (xTrainingData, yTrainingData, xValidationData, yValidationData) = loadMNISTData(path: mnistPath)

In [8]:
let xTrainingDataNormalized = normalizeTensor(tensor: xTrainingData)
                                        .reshaped(to: [xTrainingData.shape[0], 784])
let xValidationDataNormalized = normalizeTensor(tensor: xValidationData)
                                        .reshaped(to: [xValidationData.shape[0], 784])

In [9]:
// export
/// Shorthand for a tensor of `Float` scalars.
public typealias TensorFloat = Tensor<Float>

/// Debug-asserts that `tensor` is below `threshold`.
///
/// Only the upper bound is checked (`tensor < threshold`); no absolute value
/// is taken, so this does not reject values far below zero.
///
/// - Parameters:
///   - tensor: The value expected to be close to zero.
///   - threshold: Exclusive upper bound; defaults to `1e-3`.
public func assertNearZero(_ tensor: TensorFloat, _ threshold: Float = 1e-3) {
    assert(
        tensor < threshold,
        "Expected \(tensor) to be less than \(threshold)"
    )
}

In [10]:
assertNearZero(xTrainingDataNormalized.mean())
assertNearZero(xValidationDataNormalized.mean())
assertNearZero(1 - xTrainingDataNormalized.standardDeviation())
assertNearZero(1 - xValidationDataNormalized.standardDeviation())

In [11]:
let (numberOfImages, numberOfPixels) = (xTrainingDataNormalized.shape[0], xTrainingDataNormalized.shape[1])
let numberOfClasses = 10
let layerOutput = 50
print(numberOfImages, numberOfPixels, numberOfClasses)

60000 784 10


# Initialization

## From Hand

In [12]:
let parameterLayerOne = TensorFloat(randomNormal: [numberOfPixels, layerOutput]) / sqrt(Float(numberOfPixels))
let parameterLayerTwo = TensorFloat(randomNormal: [layerOutput, 1]) / sqrt(Float(layerOutput))

In [13]:
assertNearZero(parameterLayerOne.mean())
assertNearZero(parameterLayerOne.standardDeviation() - 1 / sqrt(Float(numberOfPixels)))

In [14]:
let biasLayerOne = TensorFloat(zeros: [layerOutput])
let biasLayerTwo = TensorFloat(zeros: [1])

In [15]:
assertNearZero(biasLayerOne.mean())
assertNearZero(biasLayerOne.standardDeviation() - 1 / sqrt(Float(numberOfImages)))

In [16]:
print(xValidationDataNormalized.shape, parameterLayerOne.shape, biasLayerOne.shape)

[10000, 784] [784, 50] [50]


### Side Adventure: Timing

In [17]:
//export
import Dispatch

/// Formats a duration given in nanoseconds using the largest unit that fits.
///
/// - Parameter nanoSeconds: The duration, in nanoseconds.
/// - Returns: The duration scaled to ns, µs, ms or s, followed by the unit name.
func getTimeUnit(_ nanoSeconds: Double) -> String {
    // Compare against the unit boundaries directly instead of switching on
    // floor(log10(nanoSeconds)): durations below 10 ns (including 0, whose
    // logarithm is -infinity) used to miss the nanosecond case and were
    // reported in seconds.
    switch nanoSeconds {
        case 0..<1e3:
            return "\(nanoSeconds) ns"
        case 1e3..<1e6:
            return "\(nanoSeconds/1e3) µs"
        case 1e6..<1e9:
            return "\(nanoSeconds/1e6) ms"
        default:
            // One second or more; negative and NaN inputs also land here.
            return "\(nanoSeconds/1e9) s"
    }
}

/// Runs `f` once, prints how long the call took, and returns its result.
///
/// - Parameter f: The work to time.
/// - Returns: The value produced by `f`.
public func withTime<T>(_ f: () -> T) -> T {
    let startedAt = DispatchTime.now().uptimeNanoseconds
    defer {
        // Runs right after `f` has returned, before the result is handed back.
        let finishedAt = DispatchTime.now().uptimeNanoseconds
        print("elapsed time: \(getTimeUnit(Double(finishedAt - startedAt)))")
    }
    return f()
}

In [18]:
withTime{
    print("yeet")
}

yeet
elapsed time: 33.034 µs


In [19]:
let linearWomboCombo = time(repeating: 10) {
    withDevice(.gpu){
        linearCombination(inputs: xValidationDataNormalized, 
                                         weights: parameterLayerOne, 
                                         bias: biasLayerOne)
    }
}

average: 0.0557584 ms,   min: 0.051998 ms,   max: 0.077631 ms


In [20]:
let cpuLinearWomboCombo = time(repeating: 10) {
    withDevice(.cpu){
        linearCombination(inputs: xValidationDataNormalized, 
                                         weights: parameterLayerOne, 
                                         bias: biasLayerOne)
    }
}

average: 16.6687776 ms,   min: 13.010301 ms,   max: 31.511705 ms


In [21]:
let defaultLinearWomboCombo = time(repeating: 10) {
    linearCombination(inputs: xValidationDataNormalized, 
                                     weights: parameterLayerOne, 
                                     bias: biasLayerOne)
}

average: 0.0464918 ms,   min: 0.044205 ms,   max: 0.055001 ms


In [22]:
let thing = withTime { linearCombination(inputs: xValidationDataNormalized, 
                                         weights: parameterLayerOne, 
                                         bias: biasLayerOne) }

elapsed time: 171.581 µs


---

In [23]:
let linearWomboCombo = withTime { linearCombination(inputs: xValidationDataNormalized, 
                                         weights: parameterLayerOne, 
                                         bias: biasLayerOne) }

elapsed time: 174.985 µs


In [24]:
(linearWomboCombo.mean(), linearWomboCombo.standardDeviation())

▿ 2 elements
  - .0 : -0.016522393
  - .1 : 0.99615026


In [25]:
import Python
%include "EnableIPythonDisplay.swift"
IPythonDisplay.shell.enable_matplotlib("inline")

('inline', 'module://ipykernel.pylab.backend_inline')


In [26]:
public let plotter = Python.import("matplotlib.pyplot")

In [27]:
linearWomboCombo.shape

▿ [10000, 50]
  ▿ dimensions : 2 elements
    - 0 : 10000
    - 1 : 50


In [28]:
let activations = withTime { reLU(tensor: linearWomboCombo) }

elapsed time: 353.992924 ms


In [29]:
print(activations.mean(), activations.standardDeviation()) //relu zeros out all of the stuff below zero (so only ~half remain)

0.3865898 0.57958156


### Kaiming Initialization

In [30]:
// Random-normal weights of shape [numberOfPixels, layerOutput], scaled by 2 / sqrt(numberOfPixels).
// NOTE(review): Kaiming/He initialization is usually written as sqrt(2 / fanIn);
// 2 / sqrt(fanIn) is a factor of sqrt(2) larger — confirm which is intended.
let kaimingInitParamOne = TensorFloat(randomNormal: [numberOfPixels, layerOutput]) * (2.0/sqrt(Float(numberOfPixels)))

In [31]:
(kaimingInitParamOne.mean(), kaimingInitParamOne.standardDeviation())

▿ 2 elements
  - .0 : -0.00043776532
  - .1 : 0.07149513


In [32]:
let kaimingWomboCombo = withTime{ linearCombination(inputs: xValidationDataNormalized, 
                                          weights: kaimingInitParamOne, 
                                          bias: biasLayerOne ) }

elapsed time: 205.522 µs


In [33]:
(kaimingWomboCombo.mean(), kaimingWomboCombo.standardDeviation())

▿ 2 elements
  - .0 : -0.26568612
  - .1 : 1.9591593


In [34]:
let kaimingActivations = reLU(tensor: kaimingWomboCombo)

In [35]:
(kaimingActivations.mean(), kaimingActivations.standardDeviation())

▿ 2 elements
  - .0 : 0.6503207
  - .1 : 1.0455549


### Basic Model

In [36]:
// Weights for the two linear layers: random-normal samples scaled by 2 / sqrt(number of layer inputs).
// NOTE(review): Kaiming/He initialization is usually written as sqrt(2 / fanIn);
// 2 / sqrt(fanIn) is a factor of sqrt(2) larger — confirm which is intended.
let kaimingParamOne = TensorFloat(randomNormal: [numberOfPixels, layerOutput]) * (2.0/sqrt(Float(numberOfPixels)))
let kaimingParamTwo = TensorFloat(randomNormal: [layerOutput, 1]) * (2.0/sqrt(Float(layerOutput)))
// Biases for both layers start at zero.
let biasLayerOne = TensorFloat(zeros: [layerOutput])
let biasLayerTwo = TensorFloat(zeros: [1])

In [37]:
/// Forward pass of the basic model: linear layer, ReLU, linear layer.
///
/// Each of the three steps is wrapped in `withTime`, so three elapsed-time
/// lines are printed per call.
///
/// - Parameter inputTensor: The inputs fed to the first linear layer.
/// - Returns: The output of the second linear layer.
func basicBitch(_ inputTensor: TensorFloat) -> TensorFloat {
    let hiddenLinear = withTime {
        linearCombination(inputs: inputTensor, weights: kaimingParamOne, bias: biasLayerOne)
    }
    let hiddenActivations = withTime {
        reLU(tensor: hiddenLinear)
    }
    return withTime {
        linearCombination(inputs: hiddenActivations, weights: kaimingParamTwo, bias: biasLayerTwo)
    }
}

In [38]:
let prediction = basicBitch(xValidationDataNormalized)

elapsed time: 198.402 µs
elapsed time: 353.806392 ms
elapsed time: 153.554 µs


🤔 Why is reLU so sloooow???

Turns out it's just slow the first time...

In [39]:
// Time the same ReLU call ten times in a row; only the timings matter,
// so the loop index and the computed tensor are explicitly discarded.
for _ in 1...10 {
    _ = withTime { relu(kaimingWomboCombo) }
}

elapsed time: 354.019586 ms
elapsed time: 233.955 µs
elapsed time: 144.737 µs
elapsed time: 122.231 µs
elapsed time: 113.141 µs
elapsed time: 112.822 µs
elapsed time: 112.04 µs
elapsed time: 122.88 µs
elapsed time: 110.666 µs
elapsed time: 111.393 µs


### Calculating Loss

In [42]:
(prediction.shape, yValidationData.shape)

▿ 2 elements
  ▿ .0 : [10000, 1]
    ▿ dimensions : 2 elements
      - 0 : 10000
      - 1 : 1
  ▿ .1 : [10000]
    ▿ dimensions : 1 element
      - 0 : 10000


In [52]:
prediction.squeezingShape(at: -1).shape

▿ [10000]
  ▿ dimensions : 1 element
    - 0 : 10000


In [61]:
(prediction.squeezingShape(at: -1) - TensorFloat(yValidationData)).squared().mean()

42.89958


In [65]:
/// Mean of the squared differences between `y` and `yHat`.
///
/// Axis 1 of `yHat` is squeezed away before the subtraction.
///
/// - Parameters:
///   - y: The values to compare against.
///   - yHat: The values whose axis 1 is squeezed before comparison.
/// - Returns: `(y - squeezed yHat).squared().mean()`.
func _meanSquaredError(_ y: TensorFloat, _ yHat: TensorFloat) -> TensorFloat {
    let flattenedYHat = yHat.squeezingShape(at: [1])
    let residuals = y - flattenedYHat
    return residuals.squared().mean()
}

In [71]:
let trainingPredictions = basicBitch(xTrainingDataNormalized)

elapsed time: 200.534 µs
elapsed time: 353.580599 ms
elapsed time: 126.535 µs


In [72]:
(yTrainingData.shape, trainingPredictions.squeezingShape(at: [1]).shape)

▿ 2 elements
  ▿ .0 : [60000]
    ▿ dimensions : 1 element
      - 0 : 60000
  ▿ .1 : [60000]
    ▿ dimensions : 1 element
      - 0 : 60000


In [73]:
print(_meanSquaredError(TensorFloat(yValidationData), prediction))
_meanSquaredError(TensorFloat(yTrainingData), trainingPredictions)

42.89958


42.570126


# Auto Diff

In [77]:
for x in stride(from: 0.0, 
                through: 1, by: 0.1){
    print(cos(x), sin(x))
}

1.0 0.0
0.9950041652780258 0.09983341664682815
0.9800665778412416 0.19866933079506122
0.955336489125606 0.2955202066613396
0.9210609940028851 0.3894183423086505
0.8775825618903728 0.479425538604203
0.8253356149096782 0.5646424733950355
0.7648421872844884 0.6442176872376911
0.6967067093471654 0.7173560908995228
0.6216099682706644 0.7833269096274834
0.5403023058681398 0.8414709848078965


**Note:** `in` is the arrow of the closure: it separates the closure's parameters from its body.

In [86]:
let mathyBoiGradientFunction = gradient { (x: Double) in x * 9 }

In [87]:
mathyBoiGradientFunction(69)

9.0


The derivative of $9x$ with respect to $x$ is just 9

In [88]:
let sineGraident = gradient { (x: Double) in sin(x)}
let cosineGraident = gradient { (x: Double) in cos(x)}

for x in stride(from: 0.0, 
                through: 1, by: 0.1){
    print(cosineGraident(x), sineGraident(x))
}

-0.0 1.0
-0.09983341664682815 0.9950041652780258
-0.19866933079506122 0.9800665778412416
-0.2955202066613396 0.955336489125606
-0.3894183423086505 0.9210609940028851
-0.479425538604203 0.8775825618903728
-0.5646424733950355 0.8253356149096782
-0.6442176872376911 0.7648421872844884
-0.7173560908995228 0.6967067093471654
-0.7833269096274834 0.6216099682706644
-0.8414709848078965 0.5403023058681398


The derivative of sine is cosine, and the derivative of cosine is -sine. 2cool4Me

### Basics Of Mean Squared Error

In [89]:
/// Prints `functionName`.
///
/// - Parameter functionName: Defaults to `#function`, so a call without
///   arguments prints the name of the context it was called from.
func tracer(functionName: String = #function) { //I'm already tracer
    print(functionName)
}

In [91]:
let closureDude = {
    tracer()
}

closureDude()

__lldb_expr_713


In [93]:
/// Demonstrates `tracer()`'s default argument: calls it without arguments,
/// then prints a fixed message.
func imAlreadyTracer() {
    // No argument is passed, so tracer falls back to its `#function` default.
    tracer()
    print("maybe I'll be tracer")
}

imAlreadyTracer()

imAlreadyTracer()
maybe I'll be tracer


In [107]:
let a = TensorFloat([[1, 2, 3],[4,5,6]])
a.shape

▿ [2, 3]
  ▿ dimensions : 2 elements
    - 0 : 2
    - 1 : 3


In [128]:
a.sum(squeezingAxes: [0, 1])

21.0


In [130]:
/// Returns `x * x`, printing a trace line first.
///
/// - Parameter x: The tensor to square.
/// - Returns: The product of `x` with itself.
func squareTensor(_ x: TensorFloat) -> TensorFloat {
    // Called without arguments, so tracer uses its `#function` default.
    tracer()
    let squared: TensorFloat = x * x
    return squared
}

/// Hand-written derivative of `x * x` with respect to `x`.
///
/// - Parameter x: The point at which to evaluate the derivative.
/// - Returns: `2 * x`.
func 𝛁squareTensor(_ x: TensorFloat) -> TensorFloat {
    let doubled: TensorFloat = 2 * x
    return doubled
}

/// Mean over all elements of a rank-2 tensor.
///
/// The sum over both axes is divided by the total element count
/// (`scalarCount`) rather than by the first dimension alone, so the result is
/// the mean even when the second dimension is larger than 1. For `[n, 1]`
/// inputs this is the same as dividing by `n`.
///
/// - Parameter x: The tensor to average; axes 0 and 1 are summed.
/// - Returns: The sum of all elements divided by the number of elements.
func meanOfTens(_ x: TensorFloat) -> TensorFloat {
    return x.sum(squeezingAxes: [0, 1]) / Float(x.scalarCount)
}

// The derivative of the mean with respect to each element of x is just 1/n, because the mean is sum(x)/n
// (in the same way that the derivative of x/2 with respect to x is just 1/2).
// Here n is the total number of elements, matching the divisor used by meanOfTens.
func 𝛁meanOfTens(_ x: TensorFloat) -> TensorFloat {
    return TensorFloat(ones: [1]) / Float(x.scalarCount)
}


In [131]:
(prediction.mean(), meanOfTens(prediction))

▿ 2 elements
  - .0 : -0.60979706
  - .1 : -0.60979706
