In [35]:
import yfinance as yf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime

In [36]:
def calc_sma_and_bollinger_bands(df, sma_length):
    #Create a sma column for the df, setting all the values to NaN
    df["SMA"+str(sma_length)] = np.nan
    #Create an upper and lower bolinger band columns for the df, setting all the values to NaN
    df["Upper_Bollinger_Band"] = np.nan
    df["Lower_Bollinger_Band"] = np.nan

    #Create variables for the sum and sma
    sma_sum = 0
    sma = 0
    std_dev_sum = 0

    #Loop through the df
    for i in range(0, len(df.index)):
        #If i is less then 20 just sum the value and let the sma stay NaN
        if i < sma_length:
            sma_sum += df.iloc[i]["Close_VET"]

        #If i is greater than 20 add the next value and subtract the value from 20 places before
        elif i > sma_length:
            #Add the next day
            sma_sum += df.iloc[i]["Close_VET"]

            #Subtract the day 20 days before
            sma_sum -= df.iloc[i-sma_length]["Close_VET"]

            #Get the sma and set the sma column
            sma = sma_sum / float(sma_length)
            df["SMA"+str(sma_length)].iat[i] = sma
            
            #Reset standard deviation variables
            std_dev_sum = 0
            
            #Calculate std deviation
            for j in range(20):
                #Find average distance from moving average squared
                std_dev_sum += (df["Close_VET"][i-j] - sma)**2
            
            #Divide sum by num elements in the sum and take the square root
            std_dev = (std_dev_sum / float(sma_length))**0.5
            
            
            #Set the upper and lower bollinger bands 2 std deviations from the average
            df["Upper_Bollinger_Band"].iat[i] = sma + 2 * std_dev
            df["Lower_Bollinger_Band"].iat[i] = sma - 2 * std_dev

In [37]:
def generate_structured_data(hist, num_hist_days):
    #Create empty list of col names
    column_names = []
    #Create columns beforehand to use iat later
    for j in range(num_hist_days):
            for key in hist.keys():    
                column_names.append("Target-" + str(num_hist_days-j) + "_" + str(key))


    # Create empty df to push data into
    full_data = pd.DataFrame(columns=column_names, index=range(hist.shape[0]-num_hist_days))        

    # Include target columns
    full_data["Regression_Target"] = np.nan
    full_data["Classification_Target"] = np.nan
    
    #Control the row number of full_data
    for i in range(hist.shape[0]-num_hist_days):
        #Variables to hold the row and col of the original df indicies
        hist_row = i
        hist_col = 0
        #Control the column number of full_data
        for j in range(full_data.shape[1]):
            #For debugging if needed
            #print("full_data:", i, j, "hist:", hist_row, hist_col)

            #Place the original data into the full_data df
            full_data.iat[i, j] = hist.iat[hist_row, hist_col]

            #Increment the col where the data is pulled from
            hist_col += 1

            #When the col indexer for the original df reaches the end, increment the row and reset the col to 0
            if(hist_col == hist.shape[1]):
                hist_row += 1
                hist_col = 0

        # Create target columns
        full_data.loc[:,"Regression_Target"].iat[i] = hist.iloc[i+num_hist_days]["Close_VET"]

        #If the close > open, positive candle => 1 for classification
        if(hist.iloc[i+num_hist_days]["Close_VET"] > hist.iloc[i+num_hist_days]["Open_VET"]):
            full_data.loc[:,"Classification_Target"].iat[i] = 1
        else:
            full_data.loc[:,"Classification_Target"].iat[i] = 0
            
    return full_data

In [38]:
#Get the data
vet = yf.Ticker("VET-USD")
btc = yf.Ticker("BTC-USD")

#Get max daily data
hist_vet = yf.download("VET-USD", start="2018-08-03", end="2021-11-12")
hist_btc = yf.download("BTC-USD", start="2018-08-03", end="2021-11-12")

#Drop unecessary cols
hist_vet.drop(["Adj Close"], axis=1, inplace=True)
hist_btc.drop(["Adj Close"], axis=1, inplace=True)

#Combine the two dfs into 1
hist = hist_vet.join(hist_btc, lsuffix="_VET", rsuffix="_BTC")
print(hist_vet.head(1))
print(hist_btc.head(1))
print(hist.head(1))

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
                Open     High       Low     Close    Volume
Date                                                       
2018-08-03  0.015041  0.01573  0.014938  0.015307  46250600
                   Open         High          Low        Close      Volume
Date                                                                      
2018-08-03  7562.140137  7562.140137  7328.649902  7434.390137  4627150000
            Open_VET  High_VET   Low_VET  Close_VET  Volume_VET     Open_BTC  \
Date                                                                           
2018-08-03  0.015041   0.01573  0.014938   0.015307    46250600  7562.140137   

               High_BTC      Low_BTC    Close_BTC  Volume_BTC  
Date                                                           
2018-08-03  7562.140137  7328.649902  7434.390137  4627150000  


In [39]:
#Define sma size
sma_length = 20

#Get Bollinger Bands and SMA
calc_sma_and_bollinger_bands(hist, sma_length)

#Drop the front of the df with the NaN vals from calculating the sma
hist = hist.iloc[sma_length+1:]

In [40]:
#Define the number of historical days to use as features
num_hist_days = 20

data_for_csv = generate_structured_data(hist, num_hist_days)
data_for_csv

Unnamed: 0,Target-20_Open_VET,Target-20_High_VET,Target-20_Low_VET,Target-20_Close_VET,Target-20_Volume_VET,Target-20_Open_BTC,Target-20_High_BTC,Target-20_Low_BTC,Target-20_Close_BTC,Target-20_Volume_BTC,...,Target-1_Open_BTC,Target-1_High_BTC,Target-1_Low_BTC,Target-1_Close_BTC,Target-1_Volume_BTC,Target-1_SMA20,Target-1_Upper_Bollinger_Band,Target-1_Lower_Bollinger_Band,Regression_Target,Classification_Target
0,0.014743,0.014743,0.013981,0.014535,19980000,6551.52002,6719.959961,6498.640137,6719.959961,4097820000,...,6317.009766,6363.870117,6265.089844,6351.799805,4064230000,0.016273,0.019782,0.012764,0.014060,1.0
1,0.014552,0.015735,0.014244,0.015413,24453700,6719.950195,6789.629883,6700.959961,6763.189941,3312600000,...,6354.240234,6535.410156,6354.240234,6517.310059,4210910000,0.016249,0.019811,0.012688,0.013978,0.0
2,0.01539,0.015446,0.014513,0.015233,20503900,6754.640137,6774.75,6620.75,6707.259766,3295500000,...,6515.410156,6596.100098,6456.169922,6512.709961,4076220000,0.016178,0.019857,0.012499,0.014275,1.0
3,0.015222,0.017856,0.01521,0.017824,53968100,6710.799805,6884.640137,6689.709961,6884.640137,4019000000,...,6509.399902,6561.720215,6493.549805,6543.200195,3216300000,0.01613,0.019879,0.01238,0.014170,0.0
4,0.017839,0.018756,0.017549,0.01798,45607800,6891.080078,7109.560059,6882.339844,7096.279785,4659940000,...,6536.680176,6544.330078,6460.100098,6517.180176,3273730000,0.015947,0.019699,0.012196,0.012725,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1139,0.118551,0.119335,0.114771,0.116163,337277736,62043.164062,64434.535156,61622.933594,64261.992188,40471196346,...,61554.921875,63326.988281,61432.488281,63326.988281,24726754302,0.137296,0.160627,0.113964,0.175682,1.0
1140,0.116161,0.123003,0.115251,0.122075,413430313,64284.585938,66930.390625,63610.675781,65992.835938,40788955582,...,63344.066406,67673.742188,63344.066406,67566.828125,41125608330,0.140272,0.167012,0.113531,0.171228,0.0
1141,0.12222,0.135935,0.122192,0.129184,1045478292,66002.234375,66600.546875,62117.410156,62210.171875,45908121370,...,67549.734375,68530.335938,66382.0625,66971.828125,42357991721,0.142729,0.171312,0.114147,0.163078,0.0
1142,0.129659,0.136943,0.126778,0.130023,688702052,62237.890625,63715.023438,60122.796875,60692.265625,38434082775,...,66953.335938,68789.625,63208.113281,64995.230469,48730828378,0.144424,0.173613,0.115235,0.165503,1.0


In [41]:
data_for_csv.to_csv("Structured_data.csv")