# Multiple Dispatch:

In [1]:
"""
    test(a::Integer)

Return `a + 1`. This method is selected for every argument whose type is a
subtype of `Integer`.
"""
test(a::Integer) = a + 1
"""
    test(a::Float32)

Method of `test` specialised for `Float32` arguments: add the `Float32`
literal one so that the result stays a `Float32`.
"""
test(a::Float32) = a + 1.0f0

test (generic function with 2 methods)

In [2]:
# Every distinct list of argument types gets its own compiled specialization!
# Show the LLVM IR generated for an Int, an Int32 and a Float32 argument.
@code_llvm test(1)
@code_llvm test(Int32(1))
@code_llvm test(1.0f0)
# List the methods that are defined for `test`.
methods(test)


define i64 @julia_test_71548(i64) #0 {
top:
  %1 = add i64 %0, 1
  ret i64 %1
}

define i64 @julia_test_71559(i32) #0 {
top:
  %1 = sext i32 %0 to i64
  %2 = add nsw i64 %1, 1
  ret i64 %2
}

define float @julia_test_71565(float) #0 {
top:
  %1 = fadd float %0, 1.000000e+00
  ret float %1
}


In [3]:
# Print the native assembly generated for the call test(1).
@code_native test(1)

	.text
Filename: In[1]
	pushq	%rbp
	movq	%rsp, %rbp
Source line: 2
	leaq	1(%rdi), %rax
	popq	%rbp
	retq
	nopw	(%rax,%rax)


# C

In [4]:
# compile some C code (taken from Steven G. Johnson's https://github.com/stevengj/18S096-iap17/blob/master/lecture1/Boxes-and-registers.ipynb)
# The C function below adds up the n doubles in X with a plain sequential loop.
c_code = """
#include <stddef.h>
double c_sum(size_t n, double *X) {
    double s = 0.0;
    for (size_t i = 0; i < n; ++i) {
        s += X[i];
    }
    return s;
}
"""
# compile to a shared library by piping c_code to gcc:
# Clib is a fresh temporary path; gcc writes the library to that path plus "."
# and Libdl.dlext.
const Clib = tempname()
# Print the compiler version so the benchmark numbers can be put into context.
println(readstring(`gcc --version`))
# `-xc -` makes gcc read C source from stdin; the do-block writes c_code to it.
open(`gcc -fPIC -O3 -msse3 -xc -shared -o $(Clib * "." * Libdl.dlext) -`, "w") do f
    print(f, c_code)
end
# Julia wrapper: hands the element count and the array to the C function via ccall.
# NOTE(review): ccall is given Clib without the extension used for the output
# file -- presumably the library loader appends it; confirm.
c_sum(X::Array{Float64}) = ccall(("c_sum", Clib), Float64, (Csize_t, Ptr{Float64}), length(X), X)

gcc (Ubuntu 6.2.0-5ubuntu12) 6.2.0 20161005
Copyright (C) 2016 Free Software Foundation, Inc.
This is free software; see the source for copying conditions.  There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.




c_sum (generic function with 1 method)

# Python

In [6]:
using PyCall
# Print the Python version reported by PyCall.
println(PyCall.pyversion)

# numpy's `sum`, looked up on the imported numpy module.
np_sum = pyimport("numpy")["sum"]

# Define a pure-Python summation loop through PyCall's py"..." string macro.
py"""
def mysum(a):
    s = 0.0
    for x in a:
        s = s + x
    return s
"""
# Reference to the Python function `mysum` defined above.
py_sum = py"mysum";

2.7.13


# Julia

In [8]:
"""
    jl_sum(A)

Add up the elements of `A` one after another in a plain loop, starting from
`zero(eltype(A))`, and return the total.
"""
function jl_sum(A)
    total = zero(eltype(A))
    for x in A
        # works with every type that defines `+` (duck typing)
        total = total + x
    end
    return total
end
"""
    jlsimd_sum(A)

Add up the elements of `A` in a loop annotated with `@simd`, starting from
`zero(eltype(A))`, and return the total.
"""
function jlsimd_sum(A)
    total = zero(eltype(A))
    # @simd gives the compiler more freedom to vectorize the loop
    @simd for x in A
        total = total + x
    end
    return total
end

jlsimd_sum (generic function with 1 method)

In [9]:
using BenchmarkTools
a = rand(10^7) # array of random numbers in [0,1)
# NOTE(review): apy_numpy is not used by any benchmark in this cell -- confirm whether it is still needed.
apy_numpy = PyObject(a)

# Benchmark every implementation on the same array; `$a` interpolates the
# global into the benchmarked expression.
c_bench = @benchmark c_sum($a)
py_bench = @benchmark py_sum($a)
np_bench = @benchmark np_sum($a)
jl_bench1 = @benchmark jlsimd_sum($a)
jl_bench2 = @benchmark jl_sum($a)
jl_bench3 = @benchmark sum($a)
# End the cell with `nothing` so that no result is displayed.
nothing

In [11]:
# Benchmark results and their display labels, in matching order.
trials = [c_bench, py_bench, np_bench, jl_bench1, jl_bench2, jl_bench3]
names = ["gcc", "pure python", "numpy", "julia SIMD", "julia", "Base.sum (präziser)"]
# Permutation that orders the trials by their minimum estimate.
idx = sortperm(trials, by = minimum)

# the reason why Julia is mostly written in Julia!
for i in idx
    # Right-align the label in a 21-character column, followed by the minimum estimate.
    @printf("%21s: %s\n", names[i], string(minimum(trials[i])))
end

           julia SIMD: TrialEstimate(5.285 ms)
  Base.sum (präziser): TrialEstimate(5.330 ms)
                numpy: TrialEstimate(5.624 ms)
                  gcc: TrialEstimate(11.872 ms)
                julia: TrialEstimate(11.981 ms)
          pure python: TrialEstimate(1.323 s)


# Threads

In [11]:
# start 2 processes, each with 4 threads in its thread pool
# NOTE(review): the variable is set before addprocs, presumably so that the new workers pick it up -- confirm.
ENV["JULIA_NUM_THREADS"] = 4 # (2 physical cores, 2 hyper-threads)
addprocs(2)
# Two random input arrays and an output buffer shaped like `a`.
a = rand(10^7)
b = rand(10^7)
out = similar(a)
# @everywhere loads functions and imports in every process
@everywhere using BenchmarkTools

# Scalar benchmark kernel, defined on every process via @everywhere.
# Mixes sin, exp and log10 of the two inputs with two random offsets and
# returns atan2 of the two intermediate values; the result is therefore not
# deterministic.
@everywhere function foo(a, b)
    first_term = a + b / sin(b) * exp(a) + rand()
    second_term = log10(first_term / b) + rand()
    return atan2(first_term, second_term)
end
nothing



In [12]:
# threaded_map!(f, C, A, B)
#
# Store f(A[i], B[i]) in C[i] for every i in 1:length(C), with the loop split
# across the threads of the calling process, and return C.
# Defined on every process via @everywhere.
#
# Throws a DimensionMismatch if A or B has fewer elements than C: the loop body
# runs under @inbounds, so a shorter input would otherwise be read out of
# bounds without any check.
@everywhere function threaded_map!(f, C, A, B)
    N = length(C)
    if length(A) < N || length(B) < N
        throw(DimensionMismatch(string(
            "inputs must have at least length(C) = ", N, " elements, got ",
            length(A), " and ", length(B))))
    end
    # @threads is experimental!!!
    # In theory this should also run with AVX-512 on all cores of a Xeon Phi
    Base.Threads.@threads for i in 1:N
        # Safe to skip bounds checks: all three arrays were validated above.
        @inbounds C[i] = f(A[i], B[i])
    end
    return C
end

# Main process, which has only one thread
jl_bench1 = @benchmark threaded_map!($foo, $out, $a, $b)

# run a benchmark inside a closure on a worker (process id 2) with 4 threads
jl_bench4 = remotecall_fetch(2, a, b, out) do a, b, out
    @benchmark threaded_map!($foo, $out, $a, $b)
end
# broadcast assignment into `out` for comparison (the original note called this Base.map!)
jl_bench = @benchmark $out .= $(foo).($a, $b)
# End the cell with `nothing` so that no result is displayed.
nothing

In [14]:
# Speed-up: minimum time on the main process divided by the minimum time on the worker.
minimum(jl_bench1).time / minimum(jl_bench4).time # good scaling!

1.6821415854808244

In [14]:
# Run the computation in parallel!
# Ideally on a cluster
# Each @spawn schedules one threaded_map! + sum; `a` and `b` now hold the
# handles to those results instead of the earlier arrays.
a = @spawn sum(threaded_map!(foo, Array(Float64, 1000), rand(1000), rand(1000)))
b = @spawn sum(threaded_map!(foo, Array(Float64, 1000), rand(1000), rand(1000)))
# Wait for both results and add them up.
fetch(a) + fetch(b)

2224.942204603455

# Zusammenfassung:
* Julia bietet first class support für Closures/Lambdas und erlaubt selbst inlining und SIMD für higher order functions
* Threading unterstützt Knights Landing
* Kombination von parallelem Programmieren, Multi-Threading und SIMD für optimale Performance