# Market prediction using Flux.jl

In [1]:
## Uncomment if you are missing Parameters
# import Pkg; Pkg.add("Parameters")

using Flux
using DelimitedFiles
using Statistics
using Flux.Optimise: update!
using Parameters #: @with_kw

## Define Hyperparameters

In [6]:
# Struct to define hyperparameters

@with_kw mutable struct Hyperparams
    learning_rate::Float64 = 0.1       # learning rate
    split_ratio::Float64 = 0.1         # train/test split ratio 0.1 => 90/10
end
file_data_1 = "data/united-states.ism-manufacturing-employment.csv"

"data/united-states.ism-manufacturing-employment.csv"

In [9]:
isfile(file_data_1) || # does file exist locally?
    download(
            "https://raw.githubusercontent.com/UkiDLucas/MarketIndicators.jl/master/data/united-states.ism-manufacturing-employment.csv", # URL
            file_data_1)                                                              # save it as name

rawdata = readdlm(file_data_1)

83×4 Array{Any,2}:
 "Date"          "ActualValue"    "ForecastValue"    "PreviousValue"
 "2020.08.03"  44.3             34.4               42.1
 "2020.07.01"  42.1             38.1               32.1
 "2020.06.01"  32.1             34.1               27.5
 "2020.05.01"  27.5             44.8               43.8
 "2020.04.01"  43.8             45.4               46.9
 "2020.03.02"  46.9             43.9               46.6
 "2020.02.03"  46.6             43.3               45.2
 "2020.01.03"  45.1             53.5               46.6
 "2019.12.02"  46.6             54.3               47.7
 "2019.11.01"  47.7             54.7               46.3
 "2019.10.01"  46.3             55.3               47.4
 "2019.09.03"  47.4             56.0               51.7
 ⋮                                                 
 "2014.10.01"  54.6             58.1                 ""
 "2014.09.02"  58.1             58.2                 ""
 "2014.08.01"  58.2             52.8                 ""
 "2014.07.01"  52.8 

In [5]:
rawdata = rawdata' # flip the matrix

14×505 LinearAlgebra.Adjoint{Float64,Array{Float64,2}}:
   0.02731    0.02729    0.03237  …    0.06076    0.10959    0.04741
   0.0        0.0        0.0           0.0        0.0        0.0
   7.07       7.07       2.18         11.93      11.93      11.93
   0.0        0.0        0.0           0.0        0.0        0.0
   0.469      0.469      0.458         0.573      0.573      0.573
   6.421      7.185      6.998    …    6.976      6.794      6.03
  78.9       61.1       45.8          91.0       89.3       80.8
   4.9671     4.9671     6.0622        2.1675     2.3889     2.505
   2.0        2.0        3.0           1.0        1.0        1.0
 242.0      242.0      222.0         273.0      273.0      273.0
  17.8       17.8       18.7      …   21.0       21.0       21.0
 396.9      392.83     394.63        396.9      393.45     396.9
   9.14       4.03       2.94          5.64       6.48       7.88
  21.6       34.7       33.4          23.9       22.0       11.9

# Independent Variables (features)

In [6]:
x = rawdata[1:13,:]     # independent variables: all rows before last

13×505 Array{Float64,2}:
   0.02731    0.02729    0.03237  …    0.06076    0.10959    0.04741
   0.0        0.0        0.0           0.0        0.0        0.0
   7.07       7.07       2.18         11.93      11.93      11.93
   0.0        0.0        0.0           0.0        0.0        0.0
   0.469      0.469      0.458         0.573      0.573      0.573
   6.421      7.185      6.998    …    6.976      6.794      6.03
  78.9       61.1       45.8          91.0       89.3       80.8
   4.9671     4.9671     6.0622        2.1675     2.3889     2.505
   2.0        2.0        3.0           1.0        1.0        1.0
 242.0      242.0      222.0         273.0      273.0      273.0
  17.8       17.8       18.7      …   21.0       21.0       21.0
 396.9      392.83     394.63        396.9      393.45     396.9
   9.14       4.03       2.94          5.64       6.48       7.88

# Dependent Variable (price)

In [7]:
y = rawdata[14:14,:]          # Dependent Variable (price) last ROW

1×505 Array{Float64,2}:
 21.6  34.7  33.4  36.2  28.7  22.9  …  16.8  22.4  20.6  23.9  22.0  11.9

In [33]:
mean(x, dims = 2) # calculate mean values for each feature in the 2-dimentional matrix

13×1 Array{Float64,2}:
 -2.5985814160533365e-16
 -3.0844413931665243e-16
 -3.138633467536841e-15
  3.6516444433710597e-16
  2.7279608714972782e-15
 -1.268918963303507e-14
 -1.3243531683845432e-15
  1.222564403948613e-15
  5.285101289502725e-16
  1.3942642420144046e-15
 -1.0211413672829558e-14
  9.97705768386908e-15
 -1.7829522237049545e-16

In [9]:
std(x, dims = 2) # sample standard deviation https://docs.julialang.org/en/v1/stdlib/Statistics/

13×1 Array{Float64,2}:
   8.608571806365033
  23.343703578872134
   6.85586835100403
   0.25422718805492533
   0.1159901880831873
   0.7031946661682126
  28.17637120551297
   2.1077570604702918
   8.70755324858419
 168.62999211798004
   2.162520022863353
  91.36778721047762
   7.139950350319604

# Normalize the independent variables

In [34]:
# Normalize the independent variables
x = (x .- mean(x, dims = 2)) ./ std(x, dims = 2) # math on 13× Arrays

13×505 Array{Float64,2}:
 -0.417416  -0.417418  -0.416828  …  -0.41353   -0.407858  -0.415081
 -0.486234  -0.486234  -0.486234     -0.486234  -0.486234  -0.486234
 -0.595732  -0.595732  -1.30899       0.11315    0.11315    0.11315
 -0.272618  -0.272618  -0.272618     -0.272618  -0.272618  -0.272618
 -0.739098  -0.739098  -0.833934      0.15753    0.15753    0.15753
  0.194741   1.28121    1.01528   …   0.983996   0.725177  -0.361293
  0.366208  -0.265527  -0.808535      0.795646   0.735312   0.433641
  0.556346   0.556346   1.0759       -0.771891  -0.66685   -0.611768
 -0.868939  -0.868939  -0.754097     -0.983782  -0.983782  -0.983782
 -0.987128  -0.987128  -1.10573      -0.803294  -0.803294  -0.803294
 -0.306024  -0.306024   0.110158  …   1.17373    1.17373    1.17373
  0.441136   0.396591   0.416291      0.441136   0.403377   0.441136
 -0.494157  -1.20985   -1.36251      -0.984357  -0.866709  -0.670629

In [11]:
records = size(x,2) # number of columns

505

In [12]:
args = Hyperparams()

Hyperparams
  learning_rate: Float64 0.1
  split_ratio: Float64 0.1


In [14]:
split_ratio = args.split_ratio

split_index = floor(Int, records * split_ratio)

50

In [15]:
x_train = x[:,1:split_index]           # training features
y_train = y[:,1:split_index]           # training results
x_test = x[:,split_index+1:records]  # testing features
y_test = y[:,split_index+1:records]  # testing results

1×455 Array{Float64,2}:
 20.5  25.0  23.4  18.9  35.4  24.7  …  16.8  22.4  20.6  23.9  22.0  11.9

In [16]:
train_data = (x_train, y_train) # tuples
test_data = (x_test, y_test)
size(test_data[1])

(13, 455)

In [17]:
function get_processed_data(args) # expects struct Hyperparams

    isfile("housing.data") ||
        download(
            "https://raw.githubusercontent.com/MikeInnes/notebooks/master/housing.data",
            "housing.data")

    rawdata = readdlm("housing.data")'

    # The last feature is our target -- the price of the house.
    split_ratio = args.split_ratio # For the train/test split

    x = rawdata[1:13,:]
    y = rawdata[14:14,:]

    # Normalise the data
    x = (x .- mean(x, dims = 2)) ./ std(x, dims = 2)

    # Split into train and test sets
    split_index = floor(Int,size(x,2)*split_ratio)
    x_train = x[:,1:split_index]
    y_train = y[:,1:split_index]
    x_test = x[:,split_index+1:size(x,2)]
    y_test = y[:,split_index+1:size(x,2)]

    train_data = (x_train, y_train)
    test_data = (x_test, y_test)

    return train_data,test_data
end

get_processed_data (generic function with 1 method)

In [18]:
# Struct to define model
mutable struct model
    W::AbstractArray
    b::AbstractVector
end

In [19]:
# Function to predict output from given parameters

predict(x, m) = m.W*x .+ m.b

predict (generic function with 1 method)

# Mean Squared Error (MSE)

<center><span style="font-size:x-large;" >$ MSE = \sum \limits _{i=1} ^{n} {   \frac{(ŷ_i - y)^2}{n} }$</span></center>

In [20]:
n = size(y, 2) # e.g. 505 columns

# Mean Squared Error
meansquarederror(ŷ, y) = sum((ŷ .- y).^2)/n

meansquarederror (generic function with 1 method)

In [43]:
function train(; kws...)
    # Initialize the Hyperparamters
    args = Hyperparams(; kws...)
    
    # Load the data
    (x_train,y_train),(x_test,y_test) = get_processed_data(args)
    
    test_data = (x_test,y_test)
    
    # The model
    m = model((randn(1,13)),[0.])
    
    loss(x, y) = meansquarederror(predict(x, m), y)

    ## Training
    η = args.learning_rate
    θ = params([m.W, m.b])

    for i = 1:1000
      g = gradient(() -> loss(x_train, y_train), θ)
      for x in θ
        update!(x, -g[x]*η)
      end
      if i%100==0
          @show loss(x_train, y_train)
        end
    end
    
    # Predict the RMSE on the test set
    err = meansquarederror(predict(x_test, m),y_test)
    println("error: ", err)
    return m , test_data# model
end

train (generic function with 1 method)

In [44]:
cd(@__DIR__)
resulting_model, test_data = train()
resulting_model.W

loss(x_train, y_train) = 2.4034405350456793e7
loss(x_train, y_train) = 1.3009620052606883e13
loss(x_train, y_train) = 7.042319983925728e18
loss(x_train, y_train) = 3.812123486980557e24
loss(x_train, y_train) = 2.0635650638207685e30
loss(x_train, y_train) = 1.1170416664548396e36
loss(x_train, y_train) = 6.046730032761353e41
loss(x_train, y_train) = 3.273194294098097e47
loss(x_train, y_train) = 1.7718338389292494e53
loss(x_train, y_train) = 9.591227622617485e58
error: 6.903824874228445e59


1×13 Array{Float64,2}:
 1.03671e29  7.2696e28  1.76632e29  7.63667e28  …  -7.60829e28  -2.42896e27

In [45]:
resulting_model.b

1-element Array{Float64,1}:
 -2.8012337591815173e29

In [46]:
function test(model, test)
    # Testing model performance on test data 
    X_test, y_test = test
    #accuracy_score = accuracy(X_test, y_test, model)

    #println("\nAccuracy: $accuracy_score")

    # Sanity check.
    #@assert accuracy_score > 0.8

    # To avoid confusion, here is the definition of a Confusion Matrix: https://en.wikipedia.org/wiki/Confusion_matrix
    println("\nConfusion Matrix:\n")
    #display(confusion_matrix(X_test, y_test, model))
end

test (generic function with 1 method)

In [47]:
test(model, test_data)
features = test_data[1]


Confusion Matrix:



13×455 Array{Float64,2}:
 -0.415551  -0.414362  -0.414802  …  -0.41353   -0.407858  -0.415081
  0.413366   0.413366   0.413366     -0.486234  -0.486234  -0.486234
 -0.804312  -0.804312  -0.804312      0.11315    0.11315    0.11315
 -0.272618  -0.272618  -0.272618     -0.272618  -0.272618  -0.272618
 -0.997741  -0.997741  -0.997741      0.15753    0.15753    0.15753
 -0.240416   0.322728  -0.4068    …   0.983996   0.725177  -0.361293
 -0.198094  -1.68516   -1.67451       0.795646   0.735312   0.433641
  1.43292    1.43292    1.43292      -0.771891  -0.66685   -0.611768
 -0.639254  -0.639254  -0.639254     -0.983782  -0.983782  -0.983782
 -0.981198  -0.981198  -0.981198     -0.803294  -0.803294  -0.803294
 -0.768447  -0.768447  -0.768447  …   1.17373    1.17373    1.17373
  0.409068   0.441136   0.441136      0.441136   0.403377   0.441136
 -0.453541  -1.03478   -0.593598     -0.984357  -0.866709  -0.670629

In [48]:
one_record = features[:,1]

13-element Array{Float64,1}:
 -0.41555052511828655
  0.41336649593291624
 -0.8043120350954587
 -0.2726180910206019
 -0.997740591030696
 -0.24041622337926227
 -0.1980944997389795
  1.432917219342502
 -0.6392538150218523
 -0.9811979699603633
 -0.7684470713096608
  0.4090678444064483
 -0.4535406083877912

In [49]:
results = test_data[1][1,:]

455-element Array{Float64,1}:
 -0.41555052511828655
 -0.4143621746891903
 -0.4148024335480637
 -0.41900870618125785
 -0.41906562619203563
 -0.4182013713345111
 -0.4189250686144006
 -0.4026471071648149
 -0.4085911825760424
 -0.40324302482867264
 -0.40064212882598776
 -0.4077792011978036
 -0.4058938710448971
  ⋮
 -0.3881127566576276
 -0.3997964600944315
 -0.38694763888599165
 -0.3894126238425335
 -0.3928115616289809
 -0.39993120950770145
 -0.3945238084838078
 -0.413313220204856
 -0.4153298148724133
 -0.41353044555211016
 -0.4078581922331687
 -0.4150812254375877

In [50]:
records = size(results)[1]

455

In [51]:
get_price(data, model) = model.W * data .+ model.b

get_price (generic function with 1 method)

In [52]:
get_price(one_record, resulting_model)

1-element Array{Float64,1}:
 -1.1831470566909782e30

In [53]:
for i in 1:records # 455
    record = features[:,i] # 13-element Array{Float64,1}:
    result = get_price(record, resulting_model)
    println(i, " ", result, " =? ", results[1])
end

1 [-1.1831470566909782e30] =? -0.41555052511828655
2 [-1.231521261515786e30] =? -0.41555052511828655
3 [-1.2871889118161498e30] =? -0.41555052511828655
4 [-1.0333334400402845e30] =? -0.41555052511828655
5 [-1.268321345798619e30] =? -0.41555052511828655
6 [-1.3231698358086788e30] =? -0.41555052511828655
7 [-1.0845943306267699e30] =? -0.41555052511828655
8 [-1.24637800263805e30] =? -0.41555052511828655
9 [-1.1700047924665444e30] =? -0.41555052511828655
10 [-1.1709734917406999e30] =? -0.41555052511828655
11 [-1.0430603494628936e30] =? -0.41555052511828655
12 [-1.090389104060263e30] =? -0.41555052511828655
13 [-1.1675245439148674e30] =? -0.41555052511828655
14 [-1.5228861690064825e30] =? -0.41555052511828655
15 [-1.0289158546315072e30] =? -0.41555052511828655
16 [-1.0562421590305645e30] =? -0.41555052511828655
17 [-1.2494998395615842e30] =? -0.41555052511828655
18 [-1.248286535603115e30] =? -0.41555052511828655
19 [-1.2237721560000868e30] =? -0.41555052511828655
20 [-1.0920266437616808e30]