# preprocess_TEMPLATE


**Source**

- [Yahoo finance](https://finance.yahoo.com/quote/%5EVIX/history?period1=475804800&period2=1601251200&interval=1d&filter=history&frequency=1d)

## Set Up

In [1]:
# --- Input dataset ------------------------------------------------------------
dataset_file_name = "NVDA.csv"
# Date layout used in the original file (same for ^VIX, ^DJI, AAPL, NVDA).
date_original_format = "yyyy-mm-dd"
#date_original_format = "yyyy.mm.dd"
# Column number in the original file to keep, e.g. High or ActualValue.
column_to_keep = 3

# --- Run options --------------------------------------------------------------
predict_days = 30  # number of days to predict
# verbose = true
verbose = false

# --- Data folders -------------------------------------------------------------
path_data_original = "../Data/original/"
path_data_processed = "../Data/processed/"

# --- Project helper functions used by the cells below -------------------------
include("../Julia/functions.jl")
include("../Julia/function_toFloat64.jl")
println()





<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Set-Up" data-toc-modified-id="Set-Up-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Set Up</a></span></li><li><span><a href="#Read-Data" data-toc-modified-id="Read-Data-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Read Data</a></span><ul class="toc-item"><li><span><a href="#Sample-the-data" data-toc-modified-id="Sample-the-data-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Sample the data</a></span></li></ul></li><li><span><a href="#Extract-right-columns-and-sort" data-toc-modified-id="Extract-right-columns-and-sort-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Extract right columns and sort</a></span></li><li><span><a href="#Insert-Rata-Die-Column" data-toc-modified-id="Insert-Rata-Die-Column-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Insert Rata Die Column</a></span><ul class="toc-item"><li><span><a href="#Insert-Future-Date" data-toc-modified-id="Insert-Future-Date-4.1"><span class="toc-item-num">4.1&nbsp;&nbsp;</span>Insert Future Date</a></span></li><li><span><a href="#Update-the-Rata-Die-Column" data-toc-modified-id="Update-the-Rata-Die-Column-4.2"><span class="toc-item-num">4.2&nbsp;&nbsp;</span>Update the Rata Die Column</a></span></li><li><span><a href="#Fill-in-Empty-Dates-and-Sort" data-toc-modified-id="Fill-in-Empty-Dates-and-Sort-4.3"><span class="toc-item-num">4.3&nbsp;&nbsp;</span>Fill in Empty Dates and Sort</a></span></li><li><span><a href="#Fill:-Draw-Plots" data-toc-modified-id="Fill:-Draw-Plots-4.4"><span class="toc-item-num">4.4&nbsp;&nbsp;</span>Fill: Draw Plots</a></span></li></ul></li><li><span><a href="#Quantize-the-values" data-toc-modified-id="Quantize-the-values-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Quantize the values</a></span></li><li><span><a href="#Averages" data-toc-modified-id="Averages-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Averages</a></span><ul class="toc-item"><li><span><a href="#Averages:-Draw-Plots" 
data-toc-modified-id="Averages:-Draw-Plots-6.1"><span class="toc-item-num">6.1&nbsp;&nbsp;</span>Averages: Draw Plots</a></span></li><li><span><a href="#Insert-averages-to-DataFrame" data-toc-modified-id="Insert-averages-to-DataFrame-6.2"><span class="toc-item-num">6.2&nbsp;&nbsp;</span>Insert averages to DataFrame</a></span></li></ul></li><li><span><a href="#Save-DataFrame-to-CSV-file" data-toc-modified-id="Save-DataFrame-to-CSV-file-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>Save DataFrame to CSV file</a></span><ul class="toc-item"><li><span><a href="#Save:-Describe-before-saving" data-toc-modified-id="Save:-Describe-before-saving-7.1"><span class="toc-item-num">7.1&nbsp;&nbsp;</span>Save: Describe before saving</a></span></li><li><span><a href="#Write-as-CSV-file" data-toc-modified-id="Write-as-CSV-file-7.2"><span class="toc-item-num">7.2&nbsp;&nbsp;</span>Write as CSV file</a></span></li></ul></li></ul></div>

## Read Data

In [2]:
## List the datasets that can be loaded
#data = available_datasets() # uncomment to see all available datasets

# Load the original CSV file into a DataFrame.
df = fetch_dataset(dataset_file_name, date_original_format, path_data_original)

# Optional preview of what was loaded.
verbose && preview_data(df)

println()

Fetched ../Data/original/NVDA.csv, record count 5508



### Sample the data

In [3]:
# Summary statistics of the freshly loaded data; skipped unless `verbose` is enabled.
if verbose
    using Statistics
    describe(df)  # last expression in the cell, so its value is the cell result
end

## Extract right columns and sort

In [4]:


using DataFrames

# Rebuild `df` with only the columns used downstream. `Original` and `Quantized`
# are two separate conversions of the same source column (`column_to_keep`).
df = DataFrame(
    Date = df[:, 1],
    Original = toFloat64(df[:, column_to_keep]),
    Quantized = toFloat64(df[:, column_to_keep]),
)

# Order the rows by date.
df = sort(df, :Date)

if verbose
    columns = preview_data(df)
end
println()




## Insert Rata Die Column

In [5]:
## Add an Int64 Rata Die column, filled with zeros, as the first column
record_count = size(df, 1)
insert_location = 1
insertcols!(df, insert_location, :Rata_Die => zeros(Int64, record_count); makeunique=true)
println()




### Insert Future Date

In [6]:
using Dates
columns = names(df)
if verbose
    println(columns)
end

# Append one placeholder row dated `predict_days` after today.
# Row layout follows the column order: Rata_Die, Date, Original, Quantized.
future_rata_die = Dates.datetime2rata(today()) + predict_days  # integer Rata Die day number
future_date = Dates.rata2datetime(future_rata_die)  # DateTime at midnight (not a Date)
println(future_date)
push!(df, [ future_rata_die future_date 0.0 0.0 ])

## sort by Day (Rata Die)
df = sort(df, [:Rata_Die]);
if verbose
    ## show last row; `display` is needed because a bare expression that is not
    ## the final statement of the cell is evaluated and silently discarded
    display(df[end, :])
end
println()

2021-01-08T00:00:00



### Update the Rata Die Column

In [7]:
# Project helper called with column positions 1 (Rata_Die) and 2 (Date).
# NOTE(review): presumably fills column 1 from the dates in column 2 — confirm in functions.jl.
update_rata_die!(df, 1, 2)

if verbose
    # `display` is required here: without it the preview is computed and then
    # discarded, because it is not the last statement of the cell.
    display(first(df, 6))
    println("Inserted Rata Die")
end

### Fill in Empty Dates and Sort

In [8]:
# Project helper that mutates `df`; the frame is re-sorted by Rata Die afterwards.
populate_missing_dates!(df)

df = sort(df, :Rata_Die)

## drop the final row
## NOTE(review): presumably the future-date placeholder pushed earlier — confirm.
record_count = size(df, 1)
df = df[1:end-1, :]

if verbose
    println(preview_data(df))
    println()
end

### Fill: Draw Plots
- If the indicator is updated only periodically (bi-weekly, monthly, quarterly), then the graph will appear blocky.

In [9]:
# Plot the `Quantized` column over all rows (verbose runs only).
if verbose
    using Plots
    # The row range is built straight from the frame size. Binding a global
    # named `count` would shadow `Base.count` for the rest of the session.
    rows = 1:size(df, 1)
    dates = format_dates( df[rows,2] , "m/d/yy")

    gr()
    plot(          dates, # x-axis: dates
                   [ df[rows,:Quantized]    ], # y-axis
        label    = [ "Quantized"  ""],
        legend   =:topleft, 
                  # :right, :left, :top, :bottom, :inside, :best, :legend, :topright, :topleft, :bottomleft, :bottomright
        xlabel   = "time",
        ylabel   = "indicators",
        size     = (980, 400), # width, height
        layout = (1, 1) # number of graphs: vertically, horizontally
        )
end

## Quantize the values

- Quantization is a process of normalizing the data
- I have decided to normalize the data for Int8 as I might try to use the Google Coral NPU
- minimum = -128.0
- maximum = 127.0

In [10]:
# Copy of the `Original` column (column 3), kept for display comparison later.
data_original = df[:, :Original]

# Quantize column 4 (`Quantized`) with the project's mutating helper.
quantize_column!(df, 4)

if verbose
    using Statistics
    describe(df)
end

In [11]:
# Plot the `Quantized` column after quantization (verbose runs only).
if verbose
    using Plots

    # The row range is built straight from the frame size. Binding a global
    # named `count` would shadow `Base.count` for the rest of the session.
    rows = 1:size(df, 1)
    dates = format_dates( df[rows,2] , "m/d/yy")

    gr()
    plot(          dates, # x-axis: dates
                   [  df[rows,:Quantized]    ], # y-axis
        label    = [  "Quantized" ""   ]  ,
        legend   =:topleft, 
                  # :right, :left, :top, :bottom, :inside, :best, :legend, :topright, :topleft, :bottomleft, :bottomright
        xlabel   = "time",
        ylabel   = "indicators",
        size     = (980, 400), # width, height
        layout = (1, 1) # number of graphs: vertically, horizontally
        )
end

## Averages

In [12]:
# Averages of the `Quantized` column for six window sizes; how each window is
# applied is defined by the project helper `calculate_average`.
averages005, averages030, averages060, averages090, averages120, averages180 =
    map(window -> calculate_average(df, window, :Quantized), (5, 30, 60, 90, 120, 180))
println()




### Averages: Draw Plots

In [13]:
# Plot the most recent rows of `Quantized` together with every computed average
# (verbose runs only).
if verbose
    using Plots

    record_count = size(df, 1)
    days_back = 700
    # Clamp the first row to 1 so a frame with `days_back` rows or fewer does
    # not produce a zero or negative start index (BoundsError).
    rows = max(1, record_count - days_back):record_count
    dates = format_dates( df[rows,2] , "m/d/yy")

    gr()
    plot( dates, # x-axis: dates
        # y-axis: one column per series. `hcat` is spelled out because a line
        # break inside a bracketed matrix literal would start a new row.
        hcat(
            df[rows, :Quantized],
            averages005[rows],
            averages030[rows],
            averages060[rows],
            averages090[rows], # previously computed and saved but never plotted
            averages120[rows],
            averages180[rows],
        ),
        label    = [ "Quantized" "averages005" "averages030" "averages060" "averages090" "averages120" "averages180" ],
        legend   =:topleft, 
                  # :right, :left, :top, :bottom, :inside, :best, :legend, :topright, :topleft, :bottomleft, :bottomright
        xlabel   = "time",
        ylabel   = "indicators",
        size     = (980, 400), # width, height
        layout = (1, 1) # number of graphs: vertically, horizontally
        )
end

### Insert averages to DataFrame

- if the frequency of data is e.g. 30 days, averages below 30 days do not add value

In [14]:
# Insert each average as its own column, at positions 5 through 10 in this order.
for (position, (column_name, column_data)) in zip(5:10, (
        :Avg005 => averages005,
        :Avg030 => averages030,
        :Avg060 => averages060,
        :Avg090 => averages090,
        :Avg120 => averages120,
        :Avg180 => averages180,
    ))
    insertcols!(df, position, column_name => column_data, makeunique=true)
end

if verbose
    using Statistics
    describe(df)
end

[back to top](#Table-of-Contents)
<hr/>

## Save DataFrame to CSV file

### Save: Describe before saving

In [15]:
## Always show
## Unlike the earlier summaries this one is not guarded by `verbose`: the
## per-column statistics are produced on every run, right before the file is written.
using Statistics
describe(df)

Unnamed: 0_level_0,variable,mean,min,median,max,nunique,nmissing,eltype
Unnamed: 0_level_1,Symbol,Union…,Any,Union…,Any,Union…,Nothing,DataType
1,Rata_Die,733786.0,729776,733786.0,737797,,,Int64
2,Date,,1999-01-22,,2021-01-07,8022.0,,Date
3,Original,57.5841,1.42188,15.6,589.07,,,Float64
4,Quantized,-103.632,-128.0,-122.0,127.0,,,Float64
5,Avg005,-103.626,-128.0,-122.0,124.2,,,Float64
6,Avg030,-103.589,-128.0,-122.0,111.13,,,Float64
7,Avg060,-103.545,-128.0,-121.98,109.25,,,Float64
8,Avg090,-103.507,-128.0,-121.98,107.82,,,Float64
9,Avg120,-103.469,-128.0,-121.8,105.28,,,Float64
10,Avg180,-103.359,-128.0,-121.645,95.22,,,Float64


### Write as CSV file

In [16]:
# Hand the finished frame to the project's save helper, using the same file
# name and the processed-data folder, then report completion.
save_dataset(df, dataset_file_name, path_data_processed)
println(stdout, "Finished and saved to ", dataset_file_name)

Finished and saved to NVDA.csv
