In [1]:
import os
import sys
import requests
from datetime import datetime
import numpy as np
import pandas as pd
import yfinance as yf

In [2]:
file_name = '../data/price_1m.csv'
start_date = '2024-05-01'
# end_date = '2023-01-01'
end_date = datetime.now().strftime('%Y-%m-%d')


def grab_price_data(start_date,end_date):
    #tickers_list = ['JPM', 'COST', 'IBM', 'HD', 'ARWR']
    tickers_list = ['GOOG','JPM', 'COST', 'IBM', 'HD', 'ARWR']

# Store multiple result sets.
    
    full_price_history = []

    for ticker in tickers_list:
        price_history = yf.Ticker(ticker).history(period='max', start=start_date, end=end_date, interval='1d')

        for index, row in price_history.iterrows():
            row_data = row.to_dict()
            row_data['symbol'] = ticker
            row_data['datetime'] = index.strftime('%Y-%m-%d')  # Convert Pandas Timestamp to datetime string
            full_price_history.append(row_data)


    price_data = pd.DataFrame(full_price_history)
    
    
    price_data_ro = price_data
    price_data = price_data_ro[['datetime', 'symbol', 'Open', 'High', 'Low', 'Close', 'Volume', 'Dividends', 'Stock Splits']] # rearrange column here
    price_data.to_csv('../data/inital_price_data.csv', index=False)
    price_data.head()


if os.path.exists('../data/inital_price_data.csv'):
    
    # Load the data
    price_data = pd.read_csv('../data/inital_price_data.csv')

else:

    # Grab the data and store it.
    grab_price_data(start_date, end_date)

    # Load the data
    price_data = pd.read_csv('../data/inital_price_data.csv')

# Display the head before moving on.
price_data.head()


def calculate_price_change():
    # Sort the data by symbol and datetime
    price_data.sort_values(by = ['symbol','datetime'], inplace = True)

    # calculate the change in price
    price_data['change_in_price'] = price_data['Close'].diff()

    return price_data.head()



def calculate_price_change2():
    mask = price_data['symbol'] != price_data['symbol'].shift(1)

    # For those rows, let's make the value null
    price_data['change_in_price'] = np.where(mask == True, np.nan, price_data['change_in_price'])

    # print the rows that have a null value, should only be 5
    return price_data[price_data.isna().any(axis = 1)]


def smoothed_df():
    # define the number of days out you want to predict
    days_out = 30

    # Group by symbol, then apply the rolling function and grab the Min and Max.
    price_data_smoothed = price_data.groupby(['symbol'])[['Close','Low','High','Open','Volume']].transform(lambda x: x.ewm(span = days_out).mean())

    # Join the smoothed columns with the symbol and datetime column from the old data frame.
    smoothed_df = pd.concat([price_data[['symbol','datetime']], price_data_smoothed], axis=1, sort=False)

    

    # create a new column that will house the flag, and for each group calculate the diff compared to 30 days ago. Then use Numpy to define the sign.
    smoothed_df['Signal_Flag'] = smoothed_df.groupby('symbol')['Close'].transform(lambda x : np.sign(x.diff(days_out)))

    # print the first 50 rows
    return smoothed_df.head(50)



def calculate_RSI():
    # Calculate the 14 day RSI
    n = 14

    # First make a copy of the data frame twice
    up_df, down_df = price_data[['symbol','change_in_price']].copy(), price_data[['symbol','change_in_price']].copy()

    # For up days, if the change is less than 0 set to 0.
    up_df.loc['change_in_price'] = up_df.loc[(up_df['change_in_price'] < 0), 'change_in_price'] = 0

    # For down days, if the change is greater than 0 set to 0.
    down_df.loc['change_in_price'] = down_df.loc[(down_df['change_in_price'] > 0), 'change_in_price'] = 0

    # We need change in price to be absolute.
    down_df['change_in_price'] = down_df['change_in_price'].abs()

    # Calculate the EWMA (Exponential Weighted Moving Average), meaning older values are given less weight compared to newer values.
    ewma_up = up_df.groupby('symbol')['change_in_price'].transform(lambda x: x.ewm(span = n).mean())
    ewma_down = down_df.groupby('symbol')['change_in_price'].transform(lambda x: x.ewm(span = n).mean())

    # Calculate the Relative Strength
    relative_strength = ewma_up / ewma_down

    # Calculate the Relative Strength Index
    relative_strength_index = 100.0 - (100.0 / (1.0 + relative_strength))

    # Add the info to the data frame.
    price_data['down_days'] = down_df['change_in_price']
    price_data['up_days'] = up_df['change_in_price']
    price_data['RSI'] = relative_strength_index

    # Display the head.
    return price_data.head(30)



def calculate_Stoc_Osc():
    # Calculate the Stochastic Oscillator
    n = 14

    # Make a copy of the high and low column.
    low_14, high_14 = price_data[['symbol','Low']].copy(), price_data[['symbol','High']].copy()

    # Group by symbol, then apply the rolling function and grab the Min and Max.
    low_14 = low_14.groupby('symbol')['Low'].transform(lambda x: x.rolling(window = n).min())
    high_14 = high_14.groupby('symbol')['High'].transform(lambda x: x.rolling(window = n).max())

    # Calculate the Stochastic Oscillator.
    k_percent = 100 * ((price_data['Close'] - low_14) / (high_14 - low_14))

    # Add the info to the data frame.
    price_data['low_14'] = low_14
    price_data['high_14'] = high_14
    price_data['k_percent'] = k_percent

    # Display the head.
    return price_data.head(30)

def calculate_williams_R():
    # Calculate the Williams %R
    n = 14

    # Make a copy of the high and low column.
    low_14, high_14 = price_data[['symbol','Low']].copy(), price_data[['symbol','High']].copy()

    # Group by symbol, then apply the rolling function and grab the Min and Max.
    low_14 = low_14.groupby('symbol')['Low'].transform(lambda x: x.rolling(window = n).min())
    high_14 = high_14.groupby('symbol')['High'].transform(lambda x: x.rolling(window = n).max())

    # Calculate William %R indicator.
    r_percent = ((high_14 - price_data['Close']) / (high_14 - low_14)) * - 100

    # Add the info to the data frame.
    price_data['r_percent'] = r_percent

    # Display the head.
    return price_data.head(30)


def calculate_MACD():
    # Calculate the MACD
    ema_26 = price_data.groupby('symbol')['Close'].transform(lambda x: x.ewm(span = 26).mean())
    ema_12 = price_data.groupby('symbol')['Close'].transform(lambda x: x.ewm(span = 12).mean())
    macd = ema_12 - ema_26

    # Calculate the EMA
    ema_9_macd = macd.ewm(span = 9).mean()

    # Store the data in the data frame.
    price_data['MACD'] = macd
    price_data['MACD_EMA'] = ema_9_macd

    # Print the head.
    return price_data.head(30)

def calculate_price_rate_of_change():
    # Calculate the Price Rate of Change
    n = 9

    # Calculate the Rate of Change in the Price, and store it in the Data Frame.
    price_data['Price_Rate_Of_Change'] = price_data.groupby('symbol')['Close'].transform(lambda x: x.pct_change(periods = n))

    # Print the first 30 rows
    return price_data.head(30)

def obv(group):
    
    # Grab the volume and close column.
    volume = group['Volume']
    change = group['Close'].diff()

    # intialize the previous OBV
    prev_obv = 0
    obv_values = []

    # calculate the On Balance Volume
    for i, j in zip(change, volume):

        if i > 0:
            current_obv = prev_obv + j
        elif i < 0:
            current_obv = prev_obv - j
        else:
            current_obv = prev_obv

        # OBV.append(current_OBV)
        prev_obv = current_obv
        obv_values.append(current_obv)
    
    # Return a panda series.
    return pd.Series(obv_values, index = group.index)
        
def calculate_obv():
# apply the function to each group
    obv_groups = price_data.groupby('symbol').apply(obv)

    # add to the data frame, but drop the old index, before adding it.
    price_data['On Balance Volume'] = obv_groups.reset_index(level=0, drop=True)

    # display the data frame.
    return price_data.head(30)

'''
    In this case, let's create an output column that will be 1 if the closing price at time 't' is greater than 't-1' and 0 otherwise.
    In other words, if the today's closing price is greater than yesterday's closing price it would be 1.
'''

# Create a column we wish to predict
def calculate_prediction():
    
# Group by the `Symbol` column, then grab the `Close` column.
    close_groups = price_data.groupby('symbol')['Close']

    # Apply the lambda function which will return -1.0 for down, 1.0 for up and 0.0 for no change.
    close_groups = close_groups.transform(lambda x : np.sign(x.diff()))

    # add the data to the main dataframe.
    price_data['Prediction'] = close_groups

    # for simplicity in later sections I'm going to make a change to our prediction column. To keep this as a binary classifier I'll change flat days and consider them up days.
    price_data.loc[price_data['Prediction'] == 0.0] = 1.0

    # print the head
    return price_data.head(50)
    # OPTIONAL CODE: Dump the data frame to a CSV file to examine the data yourself.
    # price_data.to_csv('final_metrics.csv')



#-----------------------------

calculate_price_change()
calculate_price_change2()
smoothed_df()

calculate_RSI()
calculate_Stoc_Osc()
calculate_williams_R()
calculate_MACD()
calculate_price_rate_of_change()
calculate_obv()
calculate_prediction()


Unnamed: 0,datetime,symbol,Open,High,Low,Close,Volume,Dividends,Stock Splits,change_in_price,...,RSI,low_14,high_14,k_percent,r_percent,MACD,MACD_EMA,Price_Rate_Of_Change,On Balance Volume,Prediction
95,2024-05-01,ARWR,22.719999,23.84,22.66,23.17,1113600.0,0.0,0.0,,...,,,,,,0.0,0.0,,0.0,
96,2024-05-02,ARWR,23.459999,24.299999,22.73,24.129999,887600.0,0.0,0.0,0.959999,...,100.0,,,,,0.021538,0.011966,,887600.0,1.0
97,2024-05-03,ARWR,25.030001,25.235001,24.120001,24.440001,630800.0,0.0,0.0,0.310001,...,100.0,,,,,0.037057,0.022249,,1518400.0,1.0
98,2024-05-06,ARWR,24.559999,25.24,24.24,25.16,619900.0,0.0,0.0,0.719999,...,100.0,,,,,0.069203,0.038155,,2138300.0,1.0
99,2024-05-07,ARWR,25.15,25.379999,24.860001,25.0,732000.0,0.0,0.0,-0.16,...,90.254421,,,,,0.078846,0.05026,,1406300.0,-1.0
100,2024-05-08,ARWR,24.73,25.27,24.299999,25.219999,906000.0,0.0,0.0,0.219999,...,91.559471,,,,,0.092569,0.061728,,2312300.0,1.0
101,2024-05-09,ARWR,25.34,25.34,24.59,24.719999,707500.0,0.0,0.0,-0.5,...,67.763162,,,,,0.077124,0.065624,,1604800.0,-1.0
102,2024-05-10,ARWR,25.48,25.48,21.790001,22.08,2540200.0,0.0,0.0,-2.639999,...,26.230321,,,,,-0.06003,0.035427,,-935400.0,-1.0
103,2024-05-13,ARWR,22.34,22.76,21.65,22.059999,1949600.0,0.0,0.0,-0.02,...,26.090534,,,,,-0.155086,-0.008582,,-2885000.0,-1.0
104,2024-05-14,ARWR,22.389999,24.219999,22.389999,22.91,2046000.0,0.0,0.0,0.85,...,41.403523,,,,,-0.176477,-0.0462,-0.011221,-839000.0,1.0


In [3]:
print('Before NaN Drop we have {} rows and {} columns'.format(price_data.shape[0], price_data.shape[1]))

# Any row that has a `NaN` value will be dropped.
price_data = price_data.dropna()

# Display how much we have left now.
print('After NaN Drop we have {} rows and {} columns'.format(price_data.shape[0], price_data.shape[1]))

# Print the head.
price_data.tail()



Before NaN Drop we have 114 rows and 22 columns
After NaN Drop we have 36 rows and 22 columns


Unnamed: 0,datetime,symbol,Open,High,Low,Close,Volume,Dividends,Stock Splits,change_in_price,...,RSI,low_14,high_14,k_percent,r_percent,MACD,MACD_EMA,Price_Rate_Of_Change,On Balance Volume,Prediction
33,2024-05-21,JPM,197.0,199.899994,196.600006,199.520004,14420800.0,0.0,0.0,3.940002,...,54.790297,188.460007,205.880005,63.490234,-36.509766,0.945536,0.901467,0.01978,34255400.0,1.0
34,2024-05-22,JPM,199.0,200.940002,197.690002,198.309998,9425300.0,0.0,0.0,-1.210007,...,50.916479,188.460007,205.880005,56.544155,-43.455845,0.803762,0.881926,0.004101,24830100.0,-1.0
35,2024-05-23,JPM,197.809998,198.300003,196.070007,196.919998,8069400.0,0.0,0.0,-1.389999,...,46.553689,189.820007,205.880005,44.209165,-55.790835,0.599646,0.82547,-0.009307,16760700.0,-1.0
36,2024-05-24,JPM,197.75,200.759995,197.559998,200.710007,7355400.0,0.0,0.0,3.790009,...,57.902199,191.0,205.880005,65.2554,-34.7446,0.674923,0.795361,0.009963,24116100.0,1.0
37,2024-05-28,JPM,199.860001,200.410004,198.660004,199.5,6909700.0,0.0,0.0,-1.210007,...,53.701667,191.0,205.880005,57.123637,-42.876363,0.646945,0.765678,-0.009975,17206400.0,-1.0


In [4]:
price_data.to_csv(file_name)