In [13]:
import os
import pandas as pd
import yahoo_fin.stock_info as si

In [14]:
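# Function that takes a company's historical price data and returns a new dataframe
# with one row per pair of consecutive trading days (rows are numbered, not dated).
# Data generated/retained for each row:
    # sector of the company
    # overnight change percentage (previous close -> next open)
    # following day change percentage (that day's open -> that day's close)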
def feature_label(historical_prices: pd.DataFrame, filename: str) -> pd.DataFrame:
    """Build the overnight / intraday percentage-change table for one company.

    Every pair of consecutive rows ``(i, i + 1)`` of ``historical_prices``
    produces one output row:

    * ``premarket_change`` -- ``(open[i + 1] - close[i]) / close[i] * 100``
    * ``day_change`` -- ``(close[i + 1] - open[i + 1]) / open[i + 1] * 100``

    Args:
        historical_prices: Frame with ``'open'`` and ``'close'`` columns. Rows
            are paired positionally, so the frame's own index is ignored.
            NOTE(review): rows are presumably in chronological order -- this is
            not checked here.
        filename: Name of the file the prices came from. The company ticker is
            taken to be the base name without its extension
            (``'AAPL.csv'`` -> ``'AAPL'``).

    Returns:
        A frame with columns ``sector``, ``premarket_change`` and
        ``day_change``, indexed ``0 .. len(historical_prices) - 2``. It is
        empty (but keeps those columns) when fewer than two rows are supplied.

    Raises:
        KeyError: if ``'open'`` or ``'close'`` is missing from
            ``historical_prices``. Anything raised by
            ``si.get_company_info`` propagates unchanged.

    Note:
        The changes are computed with vectorised pandas arithmetic, so a zero
        price yields ``inf``/``NaN`` in the output instead of raising
        ``ZeroDivisionError``.
    """
    # Ticker symbol = file name without directory or extension. splitext only
    # strips the final suffix, so names such as 'BRK.B.csv' keep their inner dot.
    company = os.path.splitext(os.path.basename(filename))[0]

    # The company-info table is indexed by field name and has a 'Value' column.
    sector = si.get_company_info(company).loc['sector']['Value']

    # Line up each day's close with the following day's open/close. The index
    # is reset so the three series align by position, whatever index the
    # input frame carried.
    previous_close = historical_prices['close'].iloc[:-1].reset_index(drop=True)
    next_open = historical_prices['open'].iloc[1:].reset_index(drop=True)
    next_close = historical_prices['close'].iloc[1:].reset_index(drop=True)

    # Build the whole frame in one go rather than appending row by row.
    result_data = pd.DataFrame({
        # change in price overnight / previous close price
        'premarket_change': ((next_open - previous_close) / previous_close) * 100,
        # change in price during day / opening price of day
        'day_change': ((next_close - next_open) / next_open) * 100,
    })
    result_data.insert(0, 'sector', sector)

    return result_data

In [None]:
# Convert every raw price file in `directory` into a feature/label CSV with
# the same file name under `output_directory`.
directory = 'raw_stock_data'
output_directory = 'clean_stock_data'

# DataFrame.to_csv does not create missing parent directories.
os.makedirs(output_directory, exist_ok=True)

# Only regular .csv files are processed: os.listdir also returns things like
# '.ipynb_checkpoints' or '.DS_Store', which pd.read_csv cannot parse.
# Sorting makes the processing order (and the progress output) reproducible.
csv_files = sorted(
    name for name in os.listdir(directory)
    if name.lower().endswith('.csv')
    and os.path.isfile(os.path.join(directory, name))
)
total = len(csv_files)

n = 1
for filename in csv_files:

    # progress indicator; the total comes from the listing instead of being hard-coded
    print(f"getting data for {filename}... {n}/{total}")
    n += 1

    data = pd.read_csv(os.path.join(directory, filename))
    result = feature_label(data, filename)
    result.to_csv(path_or_buf=os.path.join(output_directory, filename))