# Boring Pre-req loading data and packages

In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch


In [9]:
df = pd.read_csv('/Users/spencerfonbuena/Desktop/ES_1min_continuous_adjusted.txt', sep=',', index_col=0, header=None, names=["Date", 'Open', 'High', 'low', 'Close', 'Volume'])

In [10]:
df['Close']

Date
2008-01-02 06:00:00    1317.50
2008-01-02 06:01:00    1317.75
2008-01-02 06:02:00    1318.25
2008-01-02 06:03:00    1318.25
2008-01-02 06:04:00    1318.00
                        ...   
2023-06-09 16:55:00    4350.00
2023-06-09 16:56:00    4350.25
2023-06-09 16:57:00    4350.25
2023-06-09 16:58:00    4349.75
2023-06-09 16:59:00    4348.75
Name: Close, Length: 5381922, dtype: float64

In [11]:
df

Unnamed: 0_level_0,Open,High,low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2008-01-02 06:00:00,1316.25,1317.75,1316.00,1317.50,2317
2008-01-02 06:01:00,1317.25,1317.75,1317.00,1317.75,777
2008-01-02 06:02:00,1318.00,1318.25,1317.75,1318.25,593
2008-01-02 06:03:00,1318.25,1318.50,1318.00,1318.25,472
2008-01-02 06:04:00,1318.25,1318.25,1318.00,1318.00,57
...,...,...,...,...,...
2023-06-09 16:55:00,4349.50,4350.25,4349.50,4350.00,189
2023-06-09 16:56:00,4350.00,4350.50,4350.00,4350.25,357
2023-06-09 16:57:00,4350.00,4350.75,4350.00,4350.25,294
2023-06-09 16:58:00,4350.00,4350.25,4349.25,4349.75,613


### Moving averages

In [12]:
#50 period moving average
df['50SMA'] = df['Close'].rolling(50).mean()

#200 period moving average
df['200SMA'] = df['Close'].rolling(200).mean()

### RSI

In [13]:
def RSI(df, lookback):
    deltas = np.diff(df)
    seed = deltas[:lookback+1]
    up = seed[seed>= 0].sum()/lookback
    down = -seed[seed < 0].sum()/lookback
    rs = up/down
    rsi = np.zeros_like(df)
    
    for i in range(lookback, len(df)):
        delta = deltas[i-1]

        if delta > 0:
            upval = delta
            downval = 0.
        if delta < 0:
            upval = 0
            downval=abs(delta)
        up = (up * (lookback - 1) + upval) / lookback
        down = (down * (lookback - 1) + downval) / lookback

        rs = up/down
        rsi[i] = 100. - 100./(1. +rs)

    return rsi
df['RSI'] = RSI(df['Close'], 14)
    


In [14]:
df

Unnamed: 0_level_0,Open,High,low,Close,Volume,50SMA,200SMA,RSI
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2008-01-02 06:00:00,1316.25,1317.75,1316.00,1317.50,2317,,,0.000000
2008-01-02 06:01:00,1317.25,1317.75,1317.00,1317.75,777,,,0.000000
2008-01-02 06:02:00,1318.00,1318.25,1317.75,1318.25,593,,,0.000000
2008-01-02 06:03:00,1318.25,1318.50,1318.00,1318.25,472,,,0.000000
2008-01-02 06:04:00,1318.25,1318.25,1318.00,1318.00,57,,,0.000000
...,...,...,...,...,...,...,...,...
2023-06-09 16:55:00,4349.50,4350.25,4349.50,4350.00,189,4351.255,4351.60000,49.222864
2023-06-09 16:56:00,4350.00,4350.50,4350.00,4350.25,357,4351.235,4351.59000,51.656802
2023-06-09 16:57:00,4350.00,4350.75,4350.00,4350.25,294,4351.215,4351.58125,54.029826
2023-06-09 16:58:00,4350.00,4350.25,4349.25,4349.75,613,4351.195,4351.57125,48.863670


## Method for creating the labels

In [15]:
def create_labels(df):

    A = 0
    C = 0
    labels = np.array([])
    
    print(df)
    for i in range(0, (len(df))):


        #find 1 percent and 2 percent above and below
        #print(df[A])
        one_low = df[A] * .99
        two_low = df[A] * .98
        one_high = df[A] * 1.01
        two_high = df[A] * 1.02

        #print(f'1 low: {one_low} | 2 low: {two_low} | 1 high: {one_high} | 2 high: {two_high}')
        #initialize the label counter
        label_counter = A

        #this is to make sure that once it either enters the "gone up by one percent" or "gone down by 1 percent"
        #it doesn't enter the other while loops
        pathway = 0

        try:
            #look for the instance when the price increases or decreases by 1 percent
            while df[label_counter] >= one_low and df[label_counter] <= one_high:
                label_counter += 1
                #print(df[label_counter])
            #If the price moved up 1 pecent first, this while loop will trigger and check if it is a two to one, or a one to one trade
            while df[label_counter] >= one_low and df[label_counter] <= two_high:
                label_counter += 1
                pathway = 1
                #print(df[label_counter])
            #Check if price has increased two percent
            if df[label_counter] >= two_high:
                labels = np.append(labels, 2)
                pathway = 1
                #print(df[label_counter])
            #check if price has reversed back down to the one percent marker
            if df[label_counter] <= one_low and pathway == 1:
                labels = np.append(labels, 1)
                #print(df[label_counter])
            
            #if the price moved down 1 pecent first, this will check if it is a two to one, or a one to one trade
            while df[label_counter] <= one_high and df[label_counter] >= two_low and pathway != 1:
                label_counter += 1
                pathway = 2
                #print(df[label_counter])
        
            #check if the price has continued down two percent
            if df[label_counter] <= two_low and pathway != 1:
                labels = np.append(labels, 0)
                #print(df[label_counter])
            #check if price reversed back up to the 1 percent above marker
            if df[label_counter] >= one_high and pathway != 1:
                labels = np.append(labels, 1)
                #print(df[label_counter])
            
            #temporarily store the last label that was added to "labels=[]"
            C = labels[-1]

        except:
            break
        #increment the graph by one time interval
        A += 1 

    #Create an array with the last value before the classification algorithm stopped
    array_append = []
    while A < len(df):
        array_append = np.append(array_append, C)
        A += 1
        

    labels = np.append(labels, array_append)
    return labels
df['Labels'] = create_labels(df['Close'])

Date
2008-01-02 06:00:00    1317.50
2008-01-02 06:01:00    1317.75
2008-01-02 06:02:00    1318.25
2008-01-02 06:03:00    1318.25
2008-01-02 06:04:00    1318.00
                        ...   
2023-06-09 16:55:00    4350.00
2023-06-09 16:56:00    4350.25
2023-06-09 16:57:00    4350.25
2023-06-09 16:58:00    4349.75
2023-06-09 16:59:00    4348.75
Name: Close, Length: 5381922, dtype: float64


## Create a secondary dataframe store to create normalized dataset

In [49]:
dataframe = pd.DataFrame()

In [50]:
dataframe['Open'] = df['Open'].pct_change()
dataframe['High'] = df['High'].pct_change()
dataframe['Low'] = df['low'].pct_change()
dataframe['Close'] = df['Close'].pct_change()
dataframe['Volume'] = df['Volume']
dataframe['50SMA'] = df['50SMA'].pct_change()
dataframe['200SMA'] = df['200SMA'].pct_change()
dataframe['RSI'] = df['RSI'].pct_change()
dataframe['Labels'] = df['Labels']

In [51]:
dataframe['Labels'].value_counts()

3.0    56147
0.0    43426
1.0    22729
2.0    21490
Name: Labels, dtype: int64

In [52]:
dataframe

Unnamed: 0_level_0,Open,High,Low,Close,Volume,50SMA,200SMA,RSI,Labels
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2005-01-03 08:00:00,,,,,143200,,,,0.0
2005-01-03 08:30:00,-0.000247,0.000000,0.000082,-0.000082,284200,,,,0.0
2005-01-03 09:00:00,-0.000247,0.000164,-0.000658,0.000082,247900,,,,0.0
2005-01-03 09:30:00,0.000165,0.001151,0.000165,-0.000329,5974100,,,,0.0
2005-01-03 10:00:00,-0.000247,-0.000986,-0.004857,-0.004444,8743800,,,,0.0
...,...,...,...,...,...,...,...,...,...
2023-06-09 17:30:00,0.000093,0.000070,0.000023,-0.000023,18912,0.000139,0.000092,-0.002274,3.0
2023-06-09 18:00:00,0.000070,-0.000209,0.000023,-0.000047,4337,0.000156,0.000093,-0.004875,3.0
2023-06-09 18:30:00,-0.000070,0.000070,0.000047,0.000070,14252,0.000162,0.000088,0.007142,3.0
2023-06-09 19:00:00,0.000047,0.000023,0.000047,0.000093,604,0.000123,0.000095,0.009991,3.0


In [53]:
dataframe.to_csv('/Users/spencerfonbuena/Documents/Python/Trading Models/models/mach1/datasets/SPY_30mins.txt')

## Method for windowing the data

In [None]:
datafile = '/Users/spencerfonbuena/Documents/Python/Trading Models/models/mach1/datasets/AAPL_1hour_corrected.txt'
df = pd.read_csv(datafile, delimiter=',', index_col=0)

In [None]:
def window_dataset(df, window_size):
    A = 0
    window_set = []
    for i in range(len(df) - 100):
        example = df[i: window_size + i]
        window_set.append(np.expand_dims(example, 0))
        if i % 1000 == 0:
            print(i)
    return torch.tensor(np.vstack(window_set)).transpose(-1,-2)
df_expand = window_dataset(df, 100)