In [1]:
from pyspark.sql.types import StructType, StructField, StringType
from pyspark.sql import SparkSession
from dotenv import load_dotenv
from datetime import datetime
import yfinance as yf
import pandas as pd
import os

load_dotenv(override=True)

postgres_v = os.getenv("POSTGRES_VERSION")
postgres_url = os.getenv("POSTGRES_URL")
postgres_user = os.getenv("POSTGRES_USER")
postgres_pass = os.getenv("POSTGRES_PASSWORD")
postgres_table = os.getenv("POSTGRES_TABLE")
format_file = os.getenv("FORMAT_FILE")
_mode = os.getenv("MODE")

config_ = postgres_v

column_1_name = os.getenv("COLUMN_1")
column_2_name = os.getenv("COLUMN_2")
column_3_name = os.getenv("COLUMN_3")
column_4_name = os.getenv("COLUMN_4")
column_5_name = os.getenv("COLUMN_5")
column_6_name = os.getenv("COLUMN_6")
column_7_name = os.getenv("COLUMN_7")
column_8_name = os.getenv("COLUMN_8")

list_remove = ['GEV','SOLV','VLTO','BF.B','BRK.B']

class ListSAndP500:
    def __init__(self):
        """
        Initialize the ListSAndP500 class

        Attributes:
        tickers_string (list): A list of stock symbols in string format
        tickers_list (list): A list of stock symbols in list format

        """
        _tickers = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')[0]
        _tickers = _tickers.Symbol.to_list()
        self.tickers_string = [i.replace('.','-') for i in _tickers]
        _tickers_list_transform_ = [i if i not in list_remove else False for i in _tickers]
        self.tickers_list = [i for i in _tickers_list_transform_ if i]
        
list_of_symbols__ = ListSAndP500().tickers_list

class YahooFinance:
    def __init__(self, list_of_symbols, start, end):
        self.schema = StructType([
            StructField(column_1_name, StringType(), True),
            StructField(column_2_name, StringType(), True),
            StructField(column_3_name, StringType(), True),
            StructField(column_4_name, StringType(), True),
            StructField(column_5_name, StringType(), True),
            StructField(column_6_name, StringType(), True),
            StructField(column_7_name, StringType(), True)
            #StructField(column_8_name, StringType(), True)
        ])

        self.symbols = list_of_symbols
        self.interval = '1d'
        self.start = start
        self.end = end
        self.results = self.process_data()

    def process_data(self):
        """
        Process the historical stock data for the stock symbols
        """
        data = self.get_data()
        return self.transform_data(data)

    def get_data(self):
        """
        Get historical stock data from Yahoo Finance API using yfinance library

        Returns:
        DataFrame: A DataFrame containing historical stock data
        """
        try:
            data = yf.download(
                self.symbols,
                start=self.start,
                end=self.end,
                interval=self.interval,
                ignore_tz=True,
                threads=5,
                timeout=60,
                progress=True
            )
            return data
        except Exception as e:
            print(f"Error downloading data: {e}")
            return None

    def transform_data(self, df):
        """
        Transform the historical stock data into a format that can be stored in a database FactPrices table

        Args:
        df (DataFrame): A DataFrame containing historical stock data

        Returns:
        DataFrame: A DataFrame containing transformed historical stock data with the following columns:
        - stock_id (str): The stock symbol
        - date (str): The date of the stock data
        - open (float): The opening price of the stock
        - high (float): The highest price of the stock
        - low (float): The lowest price of the stock
        - close (float): The closing price of the stock
        - volume (int): The volume of the stock
        - adjusted_close (float): The adjusted closing price of the stock

        """
        # Reset the index to turn the MultiIndex into columns
        df = df.reset_index()

        # Create a list to store transformed records
        records = []

        # Iterate over each row and stock symbol
        for index, row in df.iterrows():
            date = row[('Date', '')]
            for stock in self.symbols:
                try:
                    record = {
                        column_1_name: stock,
                        column_2_name: date,
                        column_3_name: row[('Open', stock)],
                        column_4_name: row[('High', stock)],
                        column_5_name: row[('Low', stock)],
                        column_6_name: row[('Close', stock)],
                        column_7_name: row[('Volume', stock)]
                        #column_8_name: row[('Adj Close', stock)]
                    }
                    records.append(record)
                except KeyError as e:
                    print(f"KeyError: {e} for stock: {stock} on date: {date}")

        # Convert the list of records into a DataFrame
        return pd.DataFrame(records)

In [3]:
transformed_data = YahooFinance(list_of_symbols__, '2014-01-01', datetime.now().strftime('%Y-%m-%d')).results
print(transformed_data.head())

[*********************100%%**********************]  498 of 498 completed


  stock_id       date        open        high         low       close  \
0      MMM 2014-01-02  115.426422  116.220734  115.058525  115.493309   
1      AOS 2014-01-02   26.965000   26.990000   26.535000   26.660000   
2      ABT 2014-01-02   38.090000   38.400002   38.000000   38.230000   
3     ABBV 2014-01-02   52.119999   52.330002   51.520000   51.980000   
4      ACN 2014-01-02   81.500000   81.919998   81.089996   81.129997   

      volume  
0  3650312.0  
1  1297000.0  
2  4967500.0  
3  4569100.0  
4  2405400.0  


In [4]:
len(transformed_data)

1311732

In [19]:
transformed_data_today = transformed_data.sort_values(by=['date', 'high','volume'],ascending=False)
transformed_data_today_highest = transformed_data_today.head(20)
transformed_data_today_highest

Unnamed: 0,stock_id,date,open,high,low,close,volume
1311581,NVR,2024-06-20,7507.569824,7562.299805,7422.330078,7514.879883,22500.0
1311303,BKNG,2024-06-20,3995.0,4004.399902,3961.280029,3972.080078,211600.0
1311337,CMG,2024-06-20,3445.580078,3445.580078,3188.0,3214.419922,844100.0
1311285,AZO,2024-06-20,2979.469971,3029.879883,2979.469971,3008.22998,287400.0
1311308,AVGO,2024-06-20,1803.459961,1804.75,1722.219971,1734.560059,6595500.0
1311546,MTD,2024-06-20,1466.77002,1470.72998,1452.27002,1456.589966,119400.0
1311424,FICO,2024-06-20,1428.25,1428.609985,1401.670044,1412.76001,138300.0
1311680,TDG,2024-06-20,1347.869995,1349.0,1325.26001,1329.670044,208400.0
1311516,LRCX,2024-06-20,1091.920044,1093.390015,1050.219971,1061.310059,1461200.0
1311583,ORLY,2024-06-20,1059.5,1083.23999,1059.5,1080.569946,724300.0


## Transform Data

In [None]:
total_data_dict = dict()
fileroute="./dataset/"
fileroute_ticket="./dataset/tickets/"
transformed_data.to_csv(fileroute + "Summary.csv", index=False, encoding='utf-8')

for i in range(len(list_of_symbols__)):
    filtered_data = transformed_data[transformed_data['stock_id'] == list_of_symbols__[i]]
    filtered_data = filtered_data.reset_index()
    total_data_dict[list_of_symbols__[i]] = filtered_data
    filtered_data.to_csv(fileroute_ticket + list_of_symbols__[i] + ".csv", index=False, encoding='utf-8')

## Checking the missing data

In [5]:
unique_dates = pd.DatetimeIndex(transformed_data['date'].unique())

In [6]:
complete_dates = pd.date_range(start='2022-01-01', end=datetime.now().strftime('%Y-%m-%d'))

missing_dates = complete_dates.difference(unique_dates)

# Convert missing dates to a DataFrame for display
missing_dates_df = pd.DataFrame(missing_dates, columns=['missing_dates'])

print(len(missing_dates_df))
print(missing_dates_df.tail(20))
print(len(complete_dates))

284
    missing_dates
264    2024-04-21
265    2024-04-27
266    2024-04-28
267    2024-05-04
268    2024-05-05
269    2024-05-11
270    2024-05-12
271    2024-05-18
272    2024-05-19
273    2024-05-25
274    2024-05-26
275    2024-05-27
276    2024-06-01
277    2024-06-02
278    2024-06-08
279    2024-06-09
280    2024-06-15
281    2024-06-16
282    2024-06-19
283    2024-06-20
902
