### Downloading the Price datasets

BTC: https://www.investing.com/crypto/bitcoin/historical-data <br>
SNP500: https://www.investing.com/indices/us-spx-500-historical-data <br>
VIX: https://www.investing.com/indices/volatility-s-p-500-historical-data <br>
GOLD: https://www.investing.com/commodities/gold-historical-data

In [73]:
import pandas as pd

# Read the BTC, ETH and SOL price CSVs into one DataFrame each
btc, eth, sol = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/price/btc.csv',
        'data/price/eth.csv',
        'data/price/sol.csv',
    )
)

#### Creating functions for pre-processing
# 1. Removing the Comma from the price action 
def comma_form(column):
    """Convert a price column containing thousands separators to numbers.

    Parameters
    ----------
    column : pandas.Series
        Values such as ``'69,349.9'``. A column that is already numeric
        (for example one ``read_csv`` parsed directly because it held no
        commas) is returned unchanged.

    Returns
    -------
    pandas.Series
        Numeric values; entries that cannot be parsed become NaN.
    """
    # The `.str` accessor only exists on string-like columns, so a column
    # that is already numeric would raise AttributeError below.
    if pd.api.types.is_numeric_dtype(column):
        return column

    # regex=False: the comma is a literal character, not a pattern.
    without_commas = column.str.replace(',', '', regex=False)
    return pd.to_numeric(without_commas, errors='coerce')

# 2. Converting abbreviated volume strings (K/M/B suffixes) to numbers
def convert_volume(volume):
    """Expand an abbreviated volume such as ``'56.45K'`` into a float.

    Parameters
    ----------
    volume : str or number
        A string optionally ending in ``K`` (thousand), ``M`` (million) or
        ``B`` (billion). Anything that is not a string (an existing number,
        NaN, ...) is returned untouched.

    Returns
    -------
    float or the original non-string value
        The numeric volume. An empty string or a lone ``'-'`` yields NaN.

    Raises
    ------
    ValueError
        If the numeric part of the string cannot be parsed by ``float``.
    """
    if not isinstance(volume, str):
        # Not a string: assume it is already numeric and pass it through.
        return volume

    text = volume.strip().replace(',', '')
    if text in ('', '-'):
        # Nothing to parse; report it as missing instead of failing in float('').
        return float('nan')

    multipliers = {'K': 10**3, 'M': 10**6, 'B': 10**9}
    factor = multipliers.get(text[-1])
    if factor is None:
        # No recognised suffix: the whole string is the number. Stripping
        # the last character here would silently drop a digit.
        return float(text)
    return float(text[:-1]) * factor
    
# 3. Adjusting the Change % Column 
def percent(percent):
    """Turn percentage strings such as ``'50%'`` into fractions (``0.5``)."""
    # Drop the percent sign, parse what is left, then rescale to a fraction.
    stripped = percent.str.replace('%', '')
    as_number = pd.to_numeric(stripped)
    return as_number / 100

# 4. Date Selection
def date_filter(df, start_date_str, end_date_str):
    """Return the rows of ``df`` whose 'Date' falls within the given range.

    Both bounds are parsed with ``dayfirst=True`` and are inclusive.
    """
    lower, upper = (
        pd.to_datetime(bound, dayfirst=True)
        for bound in (start_date_str, end_date_str)
    )
    # between() is inclusive on both ends by default.
    in_range = df['Date'].between(lower, upper)
    return df.loc[in_range]

### BTC PRICES

In [74]:
# Clean the raw BTC frame in one pass:
#   - Date                 -> datetime
#   - Price/Open/High/Low  -> numeric (thousands separators removed)
#   - 'Vol.'               -> numeric 'Volume'
#   - 'Change %'           -> fractional 'pct_change'
# The raw 'Vol.' and 'Change %' columns are dropped once replaced.
btc = (
    btc.assign(
        Date=pd.to_datetime(btc['Date']),
        Price=comma_form(btc['Price']),
        Open=comma_form(btc['Open']),
        High=comma_form(btc['High']),
        Low=comma_form(btc['Low']),
        Volume=btc['Vol.'].apply(convert_volume),
        pct_change=percent(btc['Change %']),
    )
    .drop(columns=['Vol.', 'Change %'])
)

btc

Unnamed: 0,Date,Price,Open,High,Low,Volume,pct_change
0,2024-06-08,69349.9,69347.0,69572.1,69222.4,56450.0,0.0000
1,2024-06-07,69347.9,70793.4,71956.5,68620.7,82620.0,-0.0204
2,2024-06-06,70791.5,71083.6,71616.1,70178.7,49790.0,-0.0041
3,2024-06-05,71083.7,70550.9,71744.4,70397.1,67060.0,0.0076
4,2024-06-04,70549.2,68808.0,71034.2,68564.3,75690.0,0.0253
...,...,...,...,...,...,...,...
885,2022-01-05,43425.9,45833.1,47019.4,42535.1,83740.0,-0.0526
886,2022-01-04,45837.3,46435.7,47505.4,45602.1,55590.0,-0.0128
887,2022-01-03,46430.2,47293.9,47556.0,45704.0,41060.0,-0.0186
888,2022-01-02,47311.8,47738.7,47944.9,46718.2,27020.0,-0.0089


#### ETH PRICES

In [75]:
# Clean the raw ETH frame in one pass:
#   - Date                 -> datetime
#   - Price/Open/High/Low  -> numeric (thousands separators removed)
#   - 'Vol.'               -> numeric 'Volume'
#   - 'Change %'           -> fractional 'pct_change'
# The raw 'Vol.' and 'Change %' columns are dropped once replaced.
eth = (
    eth.assign(
        Date=pd.to_datetime(eth['Date']),
        Price=comma_form(eth['Price']),
        Open=comma_form(eth['Open']),
        High=comma_form(eth['High']),
        Low=comma_form(eth['Low']),
        Volume=eth['Vol.'].apply(convert_volume),
        pct_change=percent(eth['Change %']),
    )
    .drop(columns=['Vol.', 'Change %'])
)

eth

Unnamed: 0,Date,Price,Open,High,Low,Volume,pct_change
0,2024-06-08,3687.34,3678.36,3709.44,3670.34,353040.0,0.0024
1,2024-06-07,3678.37,3812.95,3840.88,3608.44,384780.0,-0.0353
2,2024-06-06,3812.95,3865.15,3878.28,3765.23,251820.0,-0.0135
3,2024-06-05,3865.14,3810.35,3885.15,3778.13,288180.0,0.0144
4,2024-06-04,3810.35,3767.10,3831.41,3743.11,247960.0,0.0115
...,...,...,...,...,...,...,...
1250,2021-01-05,1099.52,1042.48,1131.56,976.91,3250000.0,0.0548
1251,2021-01-04,1042.40,977.76,1158.27,894.24,5190000.0,0.0692
1252,2021-01-03,974.97,774.54,1008.49,769.57,4020000.0,0.2588
1253,2021-01-02,774.50,729.00,787.26,715.15,2250000.0,0.0622


#### SOL PRICES

In [76]:
# Clean the raw SOL frame in one pass:
#   - Date       -> datetime
#   - 'Vol.'     -> numeric 'Volume'
#   - 'Change %' -> fractional 'pct_change'
# The price columns are not passed through comma_form in this cell.
# The raw 'Vol.' and 'Change %' columns are dropped once replaced.
sol = (
    sol.assign(
        Date=pd.to_datetime(sol['Date']),
        Volume=sol['Vol.'].apply(convert_volume),
        pct_change=percent(sol['Change %']),
    )
    .drop(columns=['Vol.', 'Change %'])
)

sol

Unnamed: 0,Date,Price,Open,High,Low,Volume,pct_change
0,2024-06-08,161.119,162.504,163.716,158.516,5150000.0,-0.0085
1,2024-06-07,162.504,170.114,172.583,155.049,5110000.0,-0.0447
2,2024-06-06,170.106,173.491,174.371,167.754,3070000.0,-0.0195
3,2024-06-05,173.491,171.784,175.565,171.207,3800000.0,0.0100
4,2024-06-04,171.778,164.898,171.859,164.446,3290000.0,0.0413
...,...,...,...,...,...,...,...
1249,2021-01-05,2.155,2.489,2.489,2.093,,-0.1339
1250,2021-01-04,2.489,2.161,2.489,1.945,,0.1518
1251,2021-01-03,2.161,1.796,2.295,1.796,,0.2027
1252,2021-01-02,1.796,1.837,1.986,1.733,,-0.0222


In [80]:
# Load the gold, S&P 500 and VIX price CSVs
gold = pd.read_csv('data/price/gold.csv')
snp = pd.read_csv('data/price/snp.csv')
vix = pd.read_csv('data/price/vix.csv')

# Gold and S&P get the full treatment: dates parsed, price columns made
# numeric, 'Vol.' expanded into 'Volume', 'Change %' turned into a
# fractional 'pct_change', and the raw columns dropped.
gold, snp = (
    frame.assign(
        Date=pd.to_datetime(frame['Date']),
        Price=comma_form(frame['Price']),
        Open=comma_form(frame['Open']),
        High=comma_form(frame['High']),
        Low=comma_form(frame['Low']),
        Volume=frame['Vol.'].apply(convert_volume),
        pct_change=percent(frame['Change %']),
    ).drop(columns=['Vol.', 'Change %'])
    for frame in (gold, snp)
)

# VIX only has its dates and percentage change converted; no 'Volume'
# column is created and its price columns are left as loaded.
vix = vix.assign(
    Date=pd.to_datetime(vix['Date']),
    pct_change=percent(vix['Change %']),
).drop(columns=['Vol.', 'Change %'])

# Inner-join the three series on Date. Overlapping column names get the
# default suffixes: gold -> _x, snp -> _y, vix -> no suffix.
ext = gold.merge(snp, on='Date').merge(vix, on='Date')

ext


Unnamed: 0,Date,Price_x,Open_x,High_x,Low_x,Volume_x,pct_change_x,Price_y,Open_y,High_y,Low_y,Volume_y,pct_change_y,Price,Open,High,Low,pct_change
0,2024-03-14,2165.55,2179.8,2181.3,2157.1,,-0.0070,5150.48,5175.14,5176.85,5123.30,,-0.0029,14.40,13.75,15.33,13.42,0.0473
1,2024-03-13,2180.80,2163.6,2185.6,2161.3,232270.0,0.0068,5165.31,5173.49,5179.14,5151.88,,-0.0019,13.75,13.89,14.04,13.67,-0.0065
2,2024-03-12,2166.10,2189.1,2190.8,2156.2,316250.0,-0.0103,5175.27,5134.30,5179.87,5114.48,,0.0112,13.84,14.97,15.20,13.81,-0.0907
3,2024-03-11,2188.60,2187.6,2195.5,2180.9,242420.0,0.0014,5117.94,5111.96,5124.66,5091.14,,-0.0011,15.22,15.51,16.04,15.13,0.0326
4,2024-03-08,2185.50,2166.6,2203.0,2161.2,389680.0,0.0094,5123.69,5164.46,5189.26,5117.50,,-0.0065,14.74,14.22,15.53,13.97,0.0208
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1555,2018-01-08,1320.40,1321.8,1323.0,1315.7,246090.0,-0.0014,2747.71,2742.67,2748.51,2737.60,,0.0017,9.52,9.61,9.89,9.32,0.0325
1556,2018-01-05,1322.30,1324.4,1324.7,1314.6,330230.0,0.0005,2743.15,2731.33,2743.45,2727.92,,0.0070,9.22,9.10,9.54,9.00,0.0000
1557,2018-01-04,1321.60,1315.5,1327.3,1307.1,369850.0,0.0024,2723.99,2719.31,2729.29,2719.07,,0.0040,9.22,9.01,9.31,8.92,0.0077
1558,2018-01-03,1318.50,1319.0,1323.0,1308.9,353460.0,0.0018,2713.06,2697.85,2714.37,2697.77,,0.0064,9.15,9.56,9.65,8.94,-0.0635


##### Creating the `Tomorrow` target variable (1 if the next day's price is higher, else 0)

In [78]:
def tomorrow(df):
    """Add a binary 'Tomorrow' column: 1 if the next day's Price is higher.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Date' (anything ``pd.to_datetime`` accepts) and
        'Price'. The frame passed in is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy sorted by 'Date' descending (newest first), with 'Date' as
        datetimes and an integer 'Tomorrow' column.
    """
    # assign() builds a new frame, so the caller's 'Date' column is not
    # overwritten as a side effect.
    ordered = df.assign(Date=pd.to_datetime(df['Date'])).sort_values(
        by='Date', ascending=False
    )

    # With newest-first ordering, shift(1) pulls the following day's price
    # down onto each row.
    next_price = ordered['Price'].shift(1)

    # NOTE: the newest row has no following day; NaN > Price is False, so
    # it is labelled 0 rather than left missing.
    ordered['Tomorrow'] = (next_price > ordered['Price']).astype(int)
    return ordered

# Label each crypto frame with its next-day direction
btc, eth, sol = (tomorrow(frame) for frame in (btc, eth, sol))

Unnamed: 0,Date,Price,Open,High,Low,Volume,pct_change,Tomorrow
0,2024-06-08,161.119,162.504,163.716,158.516,5150000.0,-0.0085,0
1,2024-06-07,162.504,170.114,172.583,155.049,5110000.0,-0.0447,0
2,2024-06-06,170.106,173.491,174.371,167.754,3070000.0,-0.0195,0
3,2024-06-05,173.491,171.784,175.565,171.207,3800000.0,0.0100,0
4,2024-06-04,171.778,164.898,171.859,164.446,3290000.0,0.0413,1
...,...,...,...,...,...,...,...,...
1249,2021-01-05,2.155,2.489,2.489,2.093,,-0.1339,0
1250,2021-01-04,2.489,2.161,2.489,1.945,,0.1518,0
1251,2021-01-03,2.161,1.796,2.295,1.796,,0.2027,1
1252,2021-01-02,1.796,1.837,1.986,1.733,,-0.0222,1


##### Join Gold, SNP, and VIX to Crypto Price Dataset

In [None]:
def join_other(df):
    df = pd.merge(df, )