This code is to perform some time series analysis on the data 

In [94]:
import pandas as pd
from datetime import datetime,timedelta
import yfinance as yf
import os
import numpy as np
eps_df = pd.read_csv('eps_data.csv',parse_dates=['start','end','filed'])





The block below downloads data for the range of dates covered by EPS. 

In [56]:
for ticker in eps_df['ticker'].unique():
    # ticker = 'AAPL'
    start = eps_df[eps_df['ticker'] == ticker]['start'].min()
    end = eps_df[eps_df['ticker'] == ticker]['end'].max()

    try:
        if not os.path.exists(f'./stock_daily_data/{ticker}.csv'):
            data = yf.download(ticker, start - timedelta(days=3),
                           end + timedelta(days=3), progress=False)
            data.to_csv(f'./stock_daily_data/{ticker}.csv')
            print("Downloaded data for ", ticker)
    except Exception as e:
        print(e)
        print(f"Failed to download {ticker}")
        continue
    

    # print("Sleeping for 5 seconds")
    # time.sleep(60*60/2000)

This code pads them all to a unified single file 

In [126]:
iterables = [eps_df['ticker'].unique(), ['Close','High','Low','Open','Volume','Diluted EPS trailing','PE Ratio','filing_type','filing_dEPS','filing_period_start','filing_period_end']]
full_df = pd.DataFrame(index=pd.date_range(eps_df['start'].min(), datetime.today(), freq='d'), columns=pd.MultiIndex.from_product(iterables, names=['ticker', 'metric']))
full_df.index.name = 'date'

In [127]:
#eps_df contains eps values for a given time period for each ticker
#this code runs through each line and generates a unified time series data set for each ticker
for i, row in eps_df.iterrows():
    date_range = pd.date_range(start=row['start'], end=row['end'])
    # limit to quarterly earnings only
    if len(date_range) >85 and len(date_range) < 95:
        #df.loc[date_range, row['ticker']] = row['EarningsPerShareDiluted']
        full_df.loc[date_range, (row['ticker'],'Diluted EPS trailing')] = row['EarningsPerShareDiluted']*365/(row['end']-row['start']).days
        full_df.loc[row['filed'], (row['ticker'],'filing_type')] = row['form']
        full_df.loc[row['filed'], (row['ticker'],'filing_dEPS')] = row['EarningsPerShareDiluted']
        full_df.loc[row['filed'], (row['ticker'],'filing_period_start')] = row['start']
        full_df.loc[row['filed'], (row['ticker'],'filing_period_end')] = row['end']
#df.to_csv('eps_by_date.csv')


In [128]:
#fill values from the downloaded stock data
for ticker in eps_df['ticker'].unique():
    try:
        data = pd.read_csv(f'./stock_daily_data/{ticker}.csv', index_col='Date', parse_dates=True,header=2)
        data.columns = ['Close','High','Low','Open','Volume']
        full_df.loc[data.index, (ticker,'Close')] = data['Close']
        full_df.loc[data.index, (ticker,'High')] = data['High']
        full_df.loc[data.index, (ticker,'Low')] = data['Low']
        full_df.loc[data.index, (ticker,'Open')] = data['Open']
        full_df.loc[data.index, (ticker,'Volume')] = data['Volume']
    except Exception as e:
        print(e)
        print(f"Failed to load {ticker}")

"[Timestamp('2005-12-29 00:00:00'), Timestamp('2005-12-30 00:00:00')] not in index"
Failed to load AES
"[Timestamp('2005-12-29 00:00:00'), Timestamp('2005-12-30 00:00:00')] not in index"
Failed to load MDLZ


In [125]:
#calculate PE ratio
for ticker in eps_df['ticker'].unique():
    for i, row in full_df.iterrows():
        if not row[(ticker,'Diluted EPS')]>0  or np.isnan(row[(ticker,'Close')]):
            continue
        else:
            full_df.loc[i,(ticker,'PE Ratio')] = full_df.loc[i,(ticker,'Close')]/full_df.loc[i,(ticker,'Diluted EPS')]#calculate PE ratio


In [130]:

# Assuming full_df is your DataFrame and eps_df contains the tickers

for ticker in eps_df['ticker'].unique():
    # Filter rows where 'Diluted EPS' > 0 and 'Close' is not NaN
    valid_rows = full_df[(full_df[(ticker, 'Diluted EPS trailing')] > 0) & (~full_df[(ticker, 'Close')].isna())]
    
    # Calculate 'PE Ratio' for the filtered rows
    full_df.loc[valid_rows.index, (ticker, 'PE Ratio')] = valid_rows[(ticker, 'Close')] / valid_rows[(ticker, 'Diluted EPS trailing')]

# Now full_df has the 'PE Ratio' calculated only for the valid rows

In [134]:
print(full_df.loc['2022-01-07',('AAPL','PE Ratio')])
full_df.head(5)

27.479394804081995


ticker,A,A,A,A,A,A,A,A,A,A,...,ZTS,ZTS,ZTS,ZTS,ZTS,ZTS,ZTS,ZTS,ZTS,ZTS
metric,Close,High,Low,Open,Volume,Diluted EPS trailing,PE Ratio,filing_type,filing_dEPS,filing_period_start,...,High,Low,Open,Volume,Diluted EPS trailing,PE Ratio,filing_type,filing_dEPS,filing_period_start,filing_period_end
2006-01-01,,,,,,,,,,,...,,,,,,,,,,
2006-01-02,,,,,,,,,,,...,,,,,,,,,,
2006-01-03,,,,,,,,,,,...,,,,,,,,,,
2006-01-04,,,,,,,,,,,...,,,,,,,,,,
2006-01-05,,,,,,,,,,,...,,,,,,,,,,


In [30]:
#Checking for missing values. sucessful
# Assuming new_df is your DataFrame
missing_values = {}
new_df = df.copy()
for column in new_df.columns:
    # Drop leading and trailing NaNs
    trimmed_col = new_df[column].dropna()
    first_valid_index = trimmed_col.first_valid_index()
    last_valid_index = trimmed_col.last_valid_index()
    
    # Slice the column to exclude leading and trailing NaNs
    if first_valid_index is not None and last_valid_index is not None:
        trimmed_col = trimmed_col.loc[first_valid_index:last_valid_index + timedelta(days=1)]
    
    # Count the number of missing values in the trimmed column
    missing_count = trimmed_col.isna().sum()
    missing_values[column] = missing_count

# Print the number of missing values for each column
for ticker, count in missing_values.items():
    if count > 0:
        print(f"{ticker}: {count} missing values")