# Floating Point Arithmetic

UC Berkeley Math 124, Per-Olof Persson <persson@berkeley.edu>


### Overflow/underflow

In [1]:
1.0

1.0

In [2]:
-1.0

-1.0

In [3]:
1e100

1.0e100

In [4]:
1e-100

1.0e-100

In [5]:
1e308

1.0e308

In [6]:
1e308 * 2

Inf

In [7]:
1e-308

1.0e-308

In [8]:
1e-308 / 2^51

5.0e-324

In [9]:
1e-308 / 2^52

0.0

### Cancellation

In [10]:
x = rand()
y = rand()
z = x - y

-0.03183006760111673

In [11]:
# Shift both operands by 1e10 before subtracting. Mathematically the
# difference is unchanged, but the shifted values have fewer digits left for
# the fractional part, so `z1` agrees with `z` only in its leading digits.
x1 = x + 1e10
y1 = y + 1e10
z1 = x1 - y1

-0.03183174133300781

In [12]:
z1 - z

-1.6737318910831078e-6

### Epsilon

In [13]:
1 + 1e-20

1.0

In [14]:
(1 + 1e-20) - 1

0.0

In [15]:
1 + 1e-16

1.0

In [16]:
1 + 2e-16

1.0000000000000002

In [17]:
(1 + 2e-16) - 1

2.220446049250313e-16

In [18]:
# Estimate the rounding threshold at 1 by repeated halving: keep halving `e`
# for as long as adding it to 1 still gives a result greater than 1.
# Every halved value is printed, so the last line printed is the first `e`
# for which `1 + e > 1` no longer holds (half of the `eps()` value).
e = 1.0
while 1 + e > 1
    # `global` makes this assignment update the top-level `e` under hard-scope
    # rules as well (script / `include`), not only in notebook or REPL soft
    # scope; without it the loop body would see an undefined local `e` there.
    global e = e/2
    println(e)
end

0.5
0.25
0.125
0.0625
0.03125
0.015625
0.0078125
0.00390625
0.001953125
0.0009765625
0.00048828125
0.000244140625
0.0001220703125
6.103515625e-5
3.0517578125e-5
1.52587890625e-5
7.62939453125e-6
3.814697265625e-6
1.9073486328125e-6
9.5367431640625e-7
4.76837158203125e-7
2.384185791015625e-7
1.1920928955078125e-7
5.960464477539063e-8
2.9802322387695312e-8
1.4901161193847656e-8
7.450580596923828e-9
3.725290298461914e-9
1.862645149230957e-9
9.313225746154785e-10
4.656612873077393e-10
2.3283064365386963e-10
1.1641532182693481e-10
5.820766091346741e-11
2.9103830456733704e-11
1.4551915228366852e-11
7.275957614183426e-12
3.637978807091713e-12
1.8189894035458565e-12
9.094947017729282e-13
4.547473508864641e-13
2.2737367544323206e-13
1.1368683772161603e-13
5.684341886080802e-14
2.842170943040401e-14
1.4210854715202004e-14
7.105427357601002e-15
3.552713678800501e-15
1.7763568394002505e-15
8.881784197001252e-16
4.440892098500626e-16
2.220446049250313e-16
1.1102230246251565e-16


In [19]:
eps()

2.220446049250313e-16

In [20]:
b = 2.0^100

1.2676506002282294e30

In [21]:
(b + eps()*b) - b

2.81474976710656e14

In [22]:
(b + eps()/2*b) - b

0.0

### Signed zeros

In [23]:
0.0

0.0

In [24]:
+0.0

0.0

In [25]:
-0.0

-0.0

### Infinity

In [26]:
1.0/0.0

Inf

In [27]:
-1.0/0.0

-Inf

In [28]:
0.0/0.0

NaN

In [29]:
Inf

Inf

In [30]:
1/Inf

0.0

In [31]:
-1/Inf

-0.0

In [32]:
-1/-Inf

0.0

In [33]:
2*Inf

Inf

In [34]:
Inf + Inf

Inf

In [35]:
Inf^Inf

Inf

### NaN (Not-a-Number)

In [36]:
Inf-Inf

NaN

In [37]:
Inf/Inf

NaN

In [38]:
0.0/0.0

NaN

In [39]:
NaN + 123

NaN

### Check for NaN

In [40]:
# NaN compares unequal to every value, including itself, so `==` cannot be
# used to detect it; use `isnan` instead.
x = NaN
x == NaN

false

In [41]:
x == x

false

In [42]:
isnan.([1, 2, 3, NaN, Inf])

5-element BitArray{1}:
 0
 0
 0
 1
 0

In [43]:
isinf.([1, 2, 3, NaN, Inf])

5-element BitArray{1}:
 0
 0
 0
 0
 1

### Round to even

In [44]:
# `e` is half of machine epsilon, so `1 + e` lies exactly halfway between 1
# and the next larger Float64; the tie is resolved to 1.
e = eps()/2
1 + e

1.0

In [45]:
1 + 2*e

1.0000000000000002

In [46]:
((1 + 2*e)-1) / e

2.0

In [47]:
((1 + 3*e)-1) / e

4.0

In [48]:
((1 + 4*e)-1) / e

4.0

In [49]:
# For each multiplier 0, 1, ..., 16, add that many copies of `e` to 1,
# subtract the 1 again, and print the multiplier next to the result measured
# in units of `e`. Uses the top-level `e` assigned in an earlier cell.
for mul in 0:16
    recovered = ((1 + mul*e) - 1) / e
    println([mul, recovered])
end

[0.0, 0.0]
[1.0, 0.0]
[2.0, 2.0]
[3.0, 4.0]
[4.0, 4.0]
[5.0, 4.0]
[6.0, 6.0]
[7.0, 8.0]
[8.0, 8.0]
[9.0, 8.0]
[10.0, 10.0]
[11.0, 12.0]
[12.0, 12.0]
[13.0, 12.0]
[14.0, 14.0]
[15.0, 16.0]
[16.0, 16.0]


### View hex/bin representations

In [50]:
using Printf
# Split a 32-character bit string into three space-separated fields:
# character 1, characters 2-9 and characters 10-32 (for a Float32 bit string
# these are the sign, exponent and fraction bits).
function split32(s)
    head, middle, tail = s[1], s[2:9], s[10:32]
    return string(head, " ", middle, " ", tail)
end

# Print every element of `xs` on its own line, followed by the split bit
# pattern of its Float32 conversion. Returns `nothing`.
function showbits(xs)
    for x in xs
        bits = bitstring(Float32(x))
        @printf("%10.8g  =  %s\n", x, split32(bits))
    end
    return nothing
end

# Special values: zero, the infinities and NaN. The integer literal `-0` is
# plain 0, so the first two rows come out identical.
showbits([0, -0, Inf, -Inf, NaN, -NaN])
# The integers 1 through 10.
showbits(1:10)
# 1 plus 0, 1, ..., 10 steps of 2^-23: the values just above 1.
showbits(1 .+ (0:10).*2^-23)
# 2 minus 10, 9, ..., 0 steps of 2^-23: the values leading up to 2.
showbits(2 .- (10:-1:0).*2^-23)

         0  =  0 00000000 00000000000000000000000
         0  =  0 00000000 00000000000000000000000
       Inf  =  0 11111111 00000000000000000000000
      -Inf  =  1 11111111 00000000000000000000000
       NaN  =  0 11111111 10000000000000000000000
       NaN  =  1 11111111 10000000000000000000000
         1  =  0 01111111 00000000000000000000000
         2  =  0 10000000 00000000000000000000000
         3  =  0 10000000 10000000000000000000000
         4  =  0 10000001 00000000000000000000000
         5  =  0 10000001 01000000000000000000000
         6  =  0 10000001 10000000000000000000000
         7  =  0 10000001 11000000000000000000000
         8  =  0 10000010 00000000000000000000000
         9  =  0 10000010 00100000000000000000000
        10  =  0 10000010 01000000000000000000000
         1  =  0 01111111 00000000000000000000000
 1.0000001  =  0 01111111 00000000000000000000001
 1.0000002  =  0 01111111 00000000000000000000010
 1.0000004  =  0 01111111 00000000000000000000011
