### This file preps the data to feed into the training 

The ML model predicts the EPS that will be declared on a filing date
---
## Inputs:
- past 30 day normalised stock price metrics for all SPX stocks 
- past 4 Q EPS values for all SPX stocks

---
# Steps
1. Identify the model type to use 
2. Identify the effect of open close and volume on the prediction
3. Identify key values in the data sent to the model  - dimensionality reduction 


In [14]:
import pandas as pd
import numpy as np
import matplotlib
from datetime import datetime,timedelta
import os
# Load the extracted filings; the period boundaries and the filing date are parsed as datetimes.
root_df = pd.read_csv(
    'extracted_data.csv',
    parse_dates=['start', 'end', 'filed'],
)

In [16]:
# Work on a copy so the raw filings in root_df stay untouched.
df = root_df.copy()

# Rows where the headline diluted EPS figure was not reported.
missingeps = pd.isna(df['EarningsPerShareDiluted'])

# Where the headline figure is missing, fall back to continuing + discontinued diluted EPS.
# (Previously this fallback was written to 'Diluted EPS PA' and then overwritten by the
# annualisation line, so those rows always ended up NaN.)
diluted_eps = df['EarningsPerShareDiluted'].copy()
diluted_eps.loc[missingeps] = (
    df.loc[missingeps, 'IncomeLossFromContinuingOperationsPerDilutedShare']
    + df.loc[missingeps, 'IncomeLossFromDiscontinuedOperationsNetOfTaxPerDilutedShare']
)

# Annualise: scale the EPS of the reporting period (end - start, in days) to a 365.25-day year.
period_days = (df['end'] - df['start']).dt.days
df['Diluted EPS PA'] = diluted_eps * 365.25 / period_days

Let's start by deciding where the data should begin.
Based on the code below, the analysis starts in 2012 (more than 85% of tickers had made their first filing by 2011).
'''
start_days = df.pivot_table(index = 'ticker',values='filed',aggfunc='min')
df1 = start_days.groupby( start_days['filed'].dt.year, as_index=True).count()
'''

In [10]:
def daily_data(start, end, ticker, columns=['Close', 'High', 'Low', 'Open', 'Volume'], folder='daily-data'):
    """
    Return one row per calendar day of daily data for ``ticker`` between ``start`` and ``end``.

    The data is read from ``<cwd>/<folder>/<ticker>.csv``, which must have a 'Date' column and a
    'Close' column. Calendar days without a recorded row (weekends, holidays) are synthesised:
    every column is set to the 'Close' of the most recent earlier recorded day and 'Volume'
    (when present) is set to 0.

    Args:
        start (str or datetime): First day of the range (inclusive).
        end (str or datetime): Last day of the range (inclusive).
        ticker (str): Ticker symbol; used as the CSV file name.
        columns (list, optional): Columns to return.
        folder (str, optional): Folder holding the per-ticker CSV files.

    Returns:
        pd.DataFrame: Rows for every calendar day in the range, restricted to ``columns``.

    Raises:
        ValueError: If a day in the range has no recorded row on or before it, so there is
            nothing to back-fill from (the previous implementation looped forever here).
    """
    filepath = os.path.join(os.getcwd(), folder, f'{ticker}.csv')
    data = pd.read_csv(filepath, index_col='Date', parse_dates=True).sort_index()

    start_date = pd.to_datetime(start)
    end_date = pd.to_datetime(end)

    calendar = pd.date_range(start=start_date, end=end_date)
    missing_days = calendar.difference(data.index)

    if len(missing_days) > 0:
        # Position of the latest recorded row strictly before each missing day
        # (a missing day is never in the index, so insertion point - 1 is that row).
        previous_pos = data.index.searchsorted(missing_days) - 1
        if (previous_pos < 0).any():
            raise ValueError(
                f"No recorded row for {ticker} on or before {missing_days[0].date()}; cannot back-fill."
            )

        # Every column of a synthetic row carries the previous close; no shares traded that day.
        previous_close = data['Close'].to_numpy()[previous_pos]
        filler = pd.DataFrame({column: previous_close for column in data.columns}, index=missing_days)
        if 'Volume' in filler.columns:
            filler['Volume'] = 0

        data = pd.concat([data, filler]).sort_index()

    return data.loc[start_date:end_date, columns]

# Rough wall-clock timing of the loop-based loader on two months of AAPL data.
time = datetime.now()
daily_data(start='2019-11-01', end='2019-12-31', ticker='AAPL')
print(datetime.now() - time)

0:00:00.032938


In [17]:
def eff_daily_data(start, end, ticker, columns=['Close', 'High', 'Low', 'Open', 'Volume'], folder='daily-data'):
    """
    Retrieves daily stock data for a given ticker and date range, efficiently handling missing data.

    The data is read from ``<cwd>/<folder>/<ticker>.csv`` (indexed by its 'Date' column) and
    expanded to one row per calendar day. For days without a recorded row:
      * 'Volume' (if requested) is set to 0;
      * every other requested column is forward-filled from the most recent recorded day,
        including days before ``start`` so a range that begins on a weekend is still filled.
    Days that precede the first recorded row stay NaN.

    NOTE: unlike ``daily_data``, which writes the previous 'Close' into every column of a
    missing day, this function forward-fills each column from its own previous value.

    Args:
        start (str or datetime): Start date of the desired data range (inclusive).
        end (str or datetime): End date of the desired data range (inclusive).
        ticker (str): Stock ticker symbol.
        columns (list, optional): List of columns to include in the output.
                                   Defaults to ['Close', 'High', 'Low', 'Open', 'Volume'].
        folder (str, optional): Name of the folder where data files are stored. Defaults to 'daily-data'.

    Returns:
        pd.DataFrame: One row per calendar day in the range, restricted to ``columns``.
    """
    filepath = os.path.join(os.getcwd(), folder, f'{ticker}.csv')
    data = pd.read_csv(filepath, index_col='Date', parse_dates=True).sort_index()

    start_date = pd.to_datetime(start)
    end_date = pd.to_datetime(end)
    full_date_range = pd.date_range(start=start_date, end=end_date)

    columns = list(columns)
    # 'Volume' may appear anywhere in ``columns`` (or not at all); everything else is a price column.
    price_columns = [column for column in columns if column != 'Volume']

    # Fill on the union of recorded days and requested days so the forward fill can reach
    # back to the last recorded day before ``start``.
    extended_index = data.index.union(full_date_range)
    filled = data[columns].reindex(extended_index)
    if price_columns:
        filled[price_columns] = filled[price_columns].ffill()
    if 'Volume' in columns:
        filled['Volume'] = filled['Volume'].fillna(0)

    return filled.reindex(full_date_range)
# Rough wall-clock timing of the reindex-based loader on the same two months of AAPL data.
time = datetime.now()
eff_daily_data(start='2019-11-01', end='2019-12-31', ticker='AAPL')
print(datetime.now() - time)

0:00:00.007760


These are the parameters used to generate the training data

In [18]:
# Parameters for training data generation

# --- Which filings to use ---
start_year = 2014          # earliest filing year kept
ticker = 'AAPL'
forms = list()             # e.g. ['10-K', '10-Q']
Limit_by_marketcap = 0     # 0 for no limit

# --- Look-back windows ---
pastdays = 30              # days before the filing date
pastqtrs = 4
daily_metrics = ['Close', 'Volume']

# --- Aggregation levels ---
Industry_level = True
Sector_level = False
Index_level = False        # needs info on historical index data



Filter the filing data down to the relevant rows and columns, based on the parameters declared above

In [30]:
# Generate training data by removing unwanted columns and rows.
# Rows: filings made in start_year or later (optionally also: & (df['ticker'] == ticker)).
# .copy() makes Training_data an independent frame, so adding columns to it below does not
# raise SettingWithCopyWarning and can never write through to df.
filing_columns = ['start', 'end', 'filed', 'ticker', 'form', 'Diluted EPS PA']
recent_filings = df['filed'].dt.year >= start_year
Training_data = df.loc[recent_filings, filing_columns].copy()

# Length of the reporting period in days.
Training_data['duration'] = (Training_data['end'] - Training_data['start']).dt.days


In [31]:
# Add one empty float column per (days-ago, metric) pair, ordered day by day:
# <metric0>_0_days_ago, <metric1>_0_days_ago, <metric0>_1_days_ago, ...
past_day_column_names = [
    f'{metric}_{days_ago}_days_ago'
    for days_ago in range(pastdays + 1)
    for metric in daily_metrics
]
for new_column in past_day_column_names:
    Training_data[new_column] = pd.Series(dtype=float)

# Move the current index into a regular column and remember the final column order.
Training_data = Training_data.reset_index()
columns_list = Training_data.columns.tolist()
           

In [32]:
#Fill past days
# Column position where the flattened window starts; it does not change between rows.
paste_col = columns_list.index(f'{daily_metrics[0]}_0_days_ago')
skipped_tickers = set()

# enumerate() gives the true row position for .iloc instead of relying on label == position.
for position, (_, row) in enumerate(Training_data.iterrows()):
    row_ticker = row['ticker']
    if row_ticker in skipped_tickers:
        continue
    try:
        # Use the ticker of THIS filing; the global `ticker` would attach one company's
        # prices to every row whenever the ticker filter is disabled.
        stock_prices = daily_data(row['filed'] - timedelta(days=pastdays), row['filed'], row_ticker, daily_metrics)
    except (FileNotFoundError, ValueError) as err:
        print(f"Warning: no usable daily data for ticker: {row_ticker} ({err}). Skipping...")
        skipped_tickers.add(row_ticker)
        continue

    # Newest day first, so the k-th day of the window lands in '<metric>_k_days_ago'.
    out = stock_prices.sort_index(ascending=False).to_numpy(dtype=float).reshape(-1)
    Training_data.iloc[position, paste_col:paste_col + len(out)] = out

Training_data = Training_data.sort_values(by='filed', ascending=True).reset_index(drop=True)

: 

: 

In [40]:
#AI optimised Code

def generate_training_data(df, start_year, pastdays, daily_metrics, daily_data_folder='daily-data'):
    """
    Generates training data from a DataFrame of financial filings and a set of daily stock data CSV files.

    For every filing made in ``start_year`` or later, the ``pastdays + 1`` calendar days ending on
    the filing date are read from ``<cwd>/<daily_data_folder>/<ticker>.csv`` and written to the
    columns ``'<metric>_<k>_days_ago'`` (``k = 0`` is the filing date itself).

    Calendar days without a recorded value (weekends, holidays) are filled as follows:
      * 'Volume' is set to 0;
      * every other metric is set to the most recent recorded 'Close' on or before that day.
        The look-back reaches before the start of the window, so a window that begins on a
        non-trading day is filled too. Days before the first recorded close stay NaN.

    Tickers whose CSV cannot be loaded (FileNotFoundError or ValueError from ``read_csv``) are
    skipped with a warning and keep NaN in all past-day columns.

    Args:
        df (pd.DataFrame): DataFrame containing financial filing data with columns:
                           'start', 'end', 'filed', 'ticker', 'form', 'Diluted EPS PA'.
        start_year (int): The earliest year to include data from (based on 'filed' date).
        pastdays (int): Number of past days of daily data to include for each filing.
        daily_metrics (list): List of daily metrics to include (e.g., ['Close', 'High', 'Low']).
        daily_data_folder (str): Folder containing the daily stock data CSV files.

    Returns:
        pd.DataFrame: Training data with historical daily data appended, sorted by 'filed'
        (ascending) with a fresh integer index.
    """

    # 1. Filter and prepare data (.copy() keeps the caller's frame untouched).
    filing_columns = ['start', 'end', 'filed', 'ticker', 'form', 'Diluted EPS PA']
    Training_data = df.loc[df['filed'].dt.year >= start_year, filing_columns].copy()
    Training_data['duration'] = (Training_data['end'] - Training_data['start']).dt.days
    Training_data['filing delay'] = (Training_data['filed'] - Training_data['end']).dt.days
    # NOTE: depends on the wall clock, so this column changes from run to run.
    Training_data['filing age'] = (datetime.now() - Training_data['filed']).dt.days

    # 2. Create all past-day columns in one go (NaN until filled).
    past_days_columns = [f'{metric}_{i}_days_ago' for i in range(pastdays + 1) for metric in daily_metrics]
    Training_data = Training_data.reindex(columns=Training_data.columns.tolist() + past_days_columns)

    # Metrics that are back-filled with the last close (everything except 'Volume').
    cols_to_fill = [col for col in daily_metrics if col != 'Volume']

    # 3. Load each ticker's daily data once and process all of its filings.
    for ticker, group_data in Training_data.groupby('ticker'):
        print(f"Processing data for ticker: {ticker}...")
        try:
            filepath = os.path.join(os.getcwd(), daily_data_folder, f'{ticker}.csv')
            daily_stock_data = pd.read_csv(filepath, parse_dates=True, index_col='Date')
        except (ValueError, FileNotFoundError):
            print(f"Warning: Data file not found for ticker: {ticker}. Skipping...")
            continue

        # reindex() needs a sorted, duplicate-free index; keep the last row of a repeated date.
        daily_stock_data = daily_stock_data.sort_index()
        daily_stock_data = daily_stock_data[~daily_stock_data.index.duplicated(keep='last')]

        # Recorded closes only; used to look up "last close on or before day X".
        recorded_close = daily_stock_data['Close'].dropna()

        for row in group_data.itertuples():
            start_date = row.filed - timedelta(days=pastdays)
            end_date = row.filed

            # Full calendar window, including weekends/holidays.
            full_date_range = pd.date_range(start=start_date, end=end_date)
            past_data = daily_stock_data.reindex(full_date_range)

            # No trading on missing days.
            if 'Volume' in daily_metrics:
                past_data['Volume'] = past_data['Volume'].fillna(0)

            # Last recorded close on or before each day of the window; method='ffill' looks
            # at the whole history, not only at rows inside the window.
            last_valid_close = recorded_close.reindex(full_date_range, method='ffill')
            for col in cols_to_fill:
                past_data[col] = past_data[col].fillna(last_valid_close)

            # Newest day first so position k matches '<metric>_k_days_ago'.
            past_data = past_data.sort_index(ascending=False)
            past_data = past_data[daily_metrics]

            # Flatten and assign data
            out = np.array(past_data).reshape(-1)
            try:
                Training_data.loc[row.Index, past_days_columns] = out.astype(float)
            except Exception as e:
                # Report the offending window and keep going with the remaining filings.
                print(e)
                print(str(start_date) + ticker+ str(end_date))
                print(out.shape)
                print(len(past_days_columns))

    # 4. Sort Data
    Training_data = Training_data.sort_values(by='filed', ascending=True).reset_index(drop=True)

    return Training_data
# Build the feature table from the filings using the parameters set above, then persist it.
Training_data = generate_training_data(
    df=df,
    start_year=start_year,
    pastdays=pastdays,
    daily_metrics=daily_metrics,
)
Training_data.to_csv('training_data.csv', index=False)



In [42]:
# Write the current Training_data to disk (without the index column).
Training_data.to_csv(path_or_buf='training_data.csv', index=False)

In [24]:
# Inspect a ticker file that fails to load: the recorded output below shows read_csv
# raising ValueError because 'Date' is not among the file's columns.
filepath = './daily-data/BRK.B.csv'
a = pd.read_csv(
    filepath,
    index_col='Date',
    parse_dates=True,
)
a.index

ValueError: 'Date' is not in list

In [None]:

# Create columns for past results: for each of the last `pastqtrs` quarters, one column for
# the annualised diluted EPS and one for how many days ago it was filed.
for quarter in range(pastqtrs):
    eps_column = f'PAdilutedEPS_{quarter}'
    Training_data[eps_column] = pd.Series(dtype=float)
    Training_data[f'{eps_column}_days_ago'] = pd.Series(dtype=int)  # filed
