# Uber DataFrame
- This notebook combines multiple Market indicators into one massive DataFrame
- That DataFrame will be used for Machine Learning

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Set-Up" data-toc-modified-id="Set-Up-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Set Up</a></span></li><li><span><a href="#Fetch-Data" data-toc-modified-id="Fetch-Data-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Fetch Data</a></span><ul class="toc-item"><li><span><a href="#Explore-Dictionary-data" data-toc-modified-id="Explore-Dictionary-data-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Explore Dictionary data</a></span></li></ul></li><li><span><a href="#Trim--Common-Date-Span" data-toc-modified-id="Trim--Common-Date-Span-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Trim  Common Date Span</a></span></li><li><span><a href="#Build-UBER-DataFrame" data-toc-modified-id="Build-UBER-DataFrame-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Build UBER DataFrame</a></span></li><li><span><a href="#Visualize-the-dependencies" data-toc-modified-id="Visualize-the-dependencies-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Visualize the dependencies</a></span></li><li><span><a href="#Divide-Training-and-Prediction-data-sets" data-toc-modified-id="Divide-Training-and-Prediction-data-sets-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Divide Training and Prediction data sets</a></span><ul class="toc-item"><li><span><a href="#Save-Datasets" data-toc-modified-id="Save-Datasets-6.1"><span class="toc-item-num">6.1&nbsp;&nbsp;</span>Save Datasets</a></span></li></ul></li></ul></div>

## Set Up

In [1]:
## set up

indicators_file_name = "Indicators.csv"
indicators_date_format = "yyyy-mm-dd"
indicators_file_dir = "../DATA/"

original_dir = "../DATA/original/"

include("../Julia/IndicatorData.jl") 
include("../Julia/format_rata_die_to_us_date.jl")
include("../Julia/trim_DataFrames.jl")
include("../Julia/IndicatorData.jl") # addIndicator!()
include("../Julia/fetch_local_CSV_as_DataFrame.jl")
println()




## Fetch Data

In [2]:
## Create a Dictionary data with all Market Indicators, each indicator is a spreadsheet!

data = Dict{String, IndicatorData}()

## Read from the CSV file

using CSV, DataFrames

df = fetch_local_CSV_as_DataFrame(
    indicators_file_name, 
    indicators_date_format, 
    indicators_file_dir)

## For each row, add indicator to the Dictionary data

row_count = size(df)[1]

for row in 1:row_count
    indicator_full_name  = df[row, :Full_Name]
    url                  = df[row, :URL]
    symbol               = df[row, :Symbol]
    original_date_format = df[row, :Date_Format]
    column_to_keep       = df[row, :Column]

    print( symbol, ", "  ) 
    ## Do NOT use "^" in names as in "^VIX"
    addIndicator!(data, symbol, original_dir)

end # for loop
df = nothing # free the memory for gc()

print()

AAPL, 

LoadError: MethodError: Cannot `convert` an object of type Date to an object of type Int64
Closest candidates are:
  convert(::Type{T}, !Matched::T) where T<:Number at number.jl:6
  convert(::Type{T}, !Matched::Number) where T<:Number at number.jl:7
  convert(::Type{T}, !Matched::Ptr) where T<:Integer at pointer.jl:23
  ...

### Explore Dictionary data

In [3]:
## Show content of the Dict{String,IndicatorData} with x entries

# data

In [4]:
# data["AAPL"].min_rata_die # 723161

In [5]:
using Statistics
describe(data["AAPL"].df)

LoadError: KeyError: key "AAPL" not found

## Trim  Common Date Span

Example:
- for all the DataFrames
- find lowest dates
- select the highest (common) of them

In [6]:
## Trim 

trim_DataFrames!(data) # include("../Julia/function_trim_DataFrames.jl")

LoadError: ArgumentError: reducing over an empty collection is not allowed

## Build UBER DataFrame

In [7]:
## Print POSSIBLE fields for UBER DataFrame

for key in keys(data) 
    #df = data[key].df
    columns = names(data[key].df) # Symbols, not Strings
    for i in 1:length(columns)
        if occursin("Avg", string(columns[i]) ) # include only averages
            println(",", data[key].name, "_",columns[i], 
                " = data[\"", data[key].name, "\"].df[:,:", columns[i], "] ") 
        end # if
    end
end

In [8]:
using DataFrames

## example column_name = data["SYMBOL"].df[:,:column_name] 

uber = DataFrame(  

 Rata_Die = data["VIX"].df[:,:Rata_Die] 
,Date = data["VIX"].df[:,:Date] 
    
,VIX_Original = data["VIX"].df[:,:Original] 
,VIX_Quantized = data["VIX"].df[:,:Quantized] 
,VIX_Avg005 = data["VIX"].df[:,:Avg005] 
,VIX_Avg030 = data["VIX"].df[:,:Avg030] 
,VIX_Avg060 = data["VIX"].df[:,:Avg060] 
,VIX_Avg090 = data["VIX"].df[:,:Avg090] 
,VIX_Avg120 = data["VIX"].df[:,:Avg120] 
,VIX_Avg180 = data["VIX"].df[:,:Avg180] 
    

,US_GDP_Q_Original = data["US_GDP_Q"].df[:,:Original] 
,US_GDP_Q_Quantized = data["US_GDP_Q"].df[:,:Quantized] 
,US_GDP_Q_Avg005 = data["US_GDP_Q"].df[:,:Avg005] 
,US_GDP_Q_Avg030 = data["US_GDP_Q"].df[:,:Avg030] 
,US_GDP_Q_Avg060 = data["US_GDP_Q"].df[:,:Avg060] 
,US_GDP_Q_Avg090 = data["US_GDP_Q"].df[:,:Avg090] 
,US_GDP_Q_Avg120 = data["US_GDP_Q"].df[:,:Avg120] 
,US_GDP_Q_Avg180 = data["US_GDP_Q"].df[:,:Avg180] 
    

,US_ISM_MFC_PMI_Original = data["US_ISM_MFC_PMI"].df[:,:Original] 
,US_ISM_MFC_PMI_Quantized = data["US_ISM_MFC_PMI"].df[:,:Quantized] 
,US_ISM_MFC_PMI_Avg030 = data["US_ISM_MFC_PMI"].df[:,:Avg030] 
,US_ISM_MFC_PMI_Avg060 = data["US_ISM_MFC_PMI"].df[:,:Avg060] 
,US_ISM_MFC_PMI_Avg090 = data["US_ISM_MFC_PMI"].df[:,:Avg090] 
,US_ISM_MFC_PMI_Avg180 = data["US_ISM_MFC_PMI"].df[:,:Avg180] 
,US_ISM_MFC_PMI_Avg365 = data["US_ISM_MFC_PMI"].df[:,:Avg365]
    
 
,NIO_Original = data["NIO"].df[:,:Original] 
,NIO_Quantized = data["NIO"].df[:,:Quantized] 
,NIO_Avg005 = data["NIO"].df[:,:Avg005] 
,NIO_Avg030 = data["NIO"].df[:,:Avg030] 
,NIO_Avg060 = data["NIO"].df[:,:Avg060] 
,NIO_Avg090 = data["NIO"].df[:,:Avg090] 
,NIO_Avg120 = data["NIO"].df[:,:Avg120] 
,NIO_Avg180 = data["NIO"].df[:,:Avg180] 
    

,NVDA_Original = data["NVDA"].df[:,:Original] 
,NVDA_Quantized = data["NVDA"].df[:,:Quantized] 
,NVDA_Avg005 = data["NVDA"].df[:,:Avg005] 
,NVDA_Avg030 = data["NVDA"].df[:,:Avg030] 
,NVDA_Avg060 = data["NVDA"].df[:,:Avg060] 
,NVDA_Avg090 = data["NVDA"].df[:,:Avg090] 
,NVDA_Avg120 = data["NVDA"].df[:,:Avg120] 
,NVDA_Avg180 = data["NVDA"].df[:,:Avg180] 
    

,DJIA_Original = data["DJIA"].df[:,:Original] 
,DJIA_Quantized = data["DJIA"].df[:,:Quantized] 
,DJIA_Avg005 = data["DJIA"].df[:,:Avg005] 
,DJIA_Avg030 = data["DJIA"].df[:,:Avg030] 
,DJIA_Avg060 = data["DJIA"].df[:,:Avg060] 
,DJIA_Avg090 = data["DJIA"].df[:,:Avg090] 
,DJIA_Avg120 = data["DJIA"].df[:,:Avg120] 
,DJIA_Avg180 = data["DJIA"].df[:,:Avg180] 
    

,AAPL_Original = data["AAPL"].df[:,:Original] 
,AAPL_Quantized = data["AAPL"].df[:,:Quantized] 
,AAPL_Avg005 = data["AAPL"].df[:,:Avg005] 
,AAPL_Avg030 = data["AAPL"].df[:,:Avg030] 
,AAPL_Avg060 = data["AAPL"].df[:,:Avg060] 
,AAPL_Avg090 = data["AAPL"].df[:,:Avg090] 
,AAPL_Avg120 = data["AAPL"].df[:,:Avg120] 
,AAPL_Avg180 = data["AAPL"].df[:,:Avg180] 
    

,US_INIT_JOBLESS_Original = data["US_INIT_JOBLESS"].df[:,:Original] 
,US_INIT_JOBLESS_Quantized = data["US_INIT_JOBLESS"].df[:,:Quantized] 
,US_INIT_JOBLESS_Avg005 = data["US_INIT_JOBLESS"].df[:,:Avg005] 
,US_INIT_JOBLESS_Avg030 = data["US_INIT_JOBLESS"].df[:,:Avg030] 
,US_INIT_JOBLESS_Avg060 = data["US_INIT_JOBLESS"].df[:,:Avg060] 
,US_INIT_JOBLESS_Avg090 = data["US_INIT_JOBLESS"].df[:,:Avg090] 
,US_INIT_JOBLESS_Avg120 = data["US_INIT_JOBLESS"].df[:,:Avg120] 
,US_INIT_JOBLESS_Avg180 = data["US_INIT_JOBLESS"].df[:,:Avg180] 
    

,US_ISM_MFC_EMP_Original = data["US_ISM_MFC_EMP"].df[:,:Original] 
,US_ISM_MFC_EMP_Quantized = data["US_ISM_MFC_EMP"].df[:,:Quantized] 
,US_ISM_MFC_EMP_Avg030 = data["US_ISM_MFC_EMP"].df[:,:Avg030] 
,US_ISM_MFC_EMP_Avg060 = data["US_ISM_MFC_EMP"].df[:,:Avg060] 
,US_ISM_MFC_EMP_Avg090 = data["US_ISM_MFC_EMP"].df[:,:Avg090] 
,US_ISM_MFC_EMP_Avg120 = data["US_ISM_MFC_EMP"].df[:,:Avg120] 
,US_ISM_MFC_EMP_Avg180 = data["US_ISM_MFC_EMP"].df[:,:Avg180] 
,US_ISM_MFC_EMP_Avg365 = data["US_ISM_MFC_EMP"].df[:,:Avg365] 
    

,US_HOUS_STRT_M_Original = data["US_HOUS_STRT_M"].df[:,:Original] 
,US_HOUS_STRT_M_Quantized = data["US_HOUS_STRT_M"].df[:,:Quantized] 
,US_HOUS_STRT_M_Avg005 = data["US_HOUS_STRT_M"].df[:,:Avg005] 
,US_HOUS_STRT_M_Avg030 = data["US_HOUS_STRT_M"].df[:,:Avg030] 
,US_HOUS_STRT_M_Avg060 = data["US_HOUS_STRT_M"].df[:,:Avg060] 
,US_HOUS_STRT_M_Avg090 = data["US_HOUS_STRT_M"].df[:,:Avg090] 
,US_HOUS_STRT_M_Avg120 = data["US_HOUS_STRT_M"].df[:,:Avg120] 
,US_HOUS_STRT_M_Avg180 = data["US_HOUS_STRT_M"].df[:,:Avg180]
)
using Statistics
#describe(uber)

LoadError: KeyError: key "VIX" not found

## Visualize the dependencies

In [9]:
## print columns in format to be taken by predict_ file.

include("../Julia/function_print_columns_features.jl")
#columns = print_colunms(uber)
columns =  print_columns_features(uber)
println()

LoadError: could not open file /Users/uki/REPOS/MarketIndicators.jl/src/Julia/function_print_columns_features.jl

In [10]:
using Plots

record_count = size(uber)[1]
rows = 1:record_count
dates = format_dates( uber[rows,2] , "m/d/yy")

gr()
plot( dates, # x-axis: dates
     [ 
        uber[rows,8] uber[rows,15] uber[rows,20] uber[rows,28] uber[rows,39] uber[rows,45] uber[rows,53] uber[rows,57] uber[rows,65]  uber[rows,78]         
    ], # y-axis
    label = [      
        columns[8]     columns[15]   columns[20]   columns[28] columns[39]   columns[45]   columns[53] columns[57] columns[65] columns[78] "" 
    ],
    legend   =:topleft, 
              # :right, :left, :top, :bottom, :inside, :best, :legend, :topright, :topleft, :bottomleft, :bottomright
    xlabel   = "time",
    ylabel   = "indicators",
    size     = (980, 400), # width, height
    layout = (1, 1) # number of graphs: vertically, horizontally
    )

LoadError: UndefVarError: uber not defined

In [11]:
savefig("../../indicators.png")

LoadError: No current plot/subplot

## Divide Training and Prediction data sets

In [12]:
record_count = size(uber)[1]
today_rata = Dates.datetime2rata( today() )
today_id = find_day(uber, today_rata)

# I am putting overlap in the sets to see the behavior
df_training   = uber[1:today_id-5,:] # train up 5 days ago, I want to see real prediction for last few days.
df_prediction = uber[today_id-90:record_count,:] # show predictions up to 90 days ago

println()

LoadError: UndefVarError: uber not defined

### Save Datasets

In [13]:
save_dataset(df_training,   "uber_training.csv"  , "../Data/processed/");
save_dataset(df_prediction, "uber_prediction.csv", "../Data/processed/");

LoadError: UndefVarError: save_dataset not defined