In [2]:
import numpy as np
import pandas as pd
import requests
import datetime
import time

In [3]:
# Ticker universe: the top 500 stocks by market cap (from Fidelity) on March 22, 2022.
# NOTE(review): the frame displayed in the next cell has several thousand rows,
# so the "top 500" description may be stale - confirm against the screener export.
# ISSUE: the market-cap ranking is taken on that later date, so it does not line up
# with the year of the financial statements collected for each ticker.
# Only the three columns used by the download loop are kept.
stock_list = pd.read_csv('screener_results.csv').loc[:, ['Symbol', 'Sector', 'Industry']]

In [4]:
# Display the Symbol / Sector / Industry table as a sanity check on the load.
stock_list

Unnamed: 0,Symbol,Sector,Industry
0,AAPL,Information Technology,"Technology Hardware, Storage & Peripherals"
1,MSFT,Information Technology,Software
2,AMZN,Consumer Discretionary,Internet & Direct Marketing Retail
3,TSLA,Consumer Discretionary,Automobiles
4,GOOGL,Communication Services,Interactive Media & Services
...,...,...,...
9775,SPRQF,--,--
9776,BIAF,Health Care,Biotechnology
9777,MOB,Information Technology,Communications Equipment
9778,MATEF,Financials,Capital Markets


In [1]:
# Restore the download loop's state from IPython's %store database so an
# interrupted run can be resumed in a fresh kernel:
#   error_load / error_date - tickers that were skipped by the loop
#   master_df               - rows collected so far
#   x                       - running ticker counter
# A "no stored variable" message just means nothing has been saved yet.
# NOTE(review): the initialisation cell below rebinds all four names, so running
# it after this cell discards whatever was restored here.
%store -r error_load
%store -r error_date
%store -r master_df
%store -r x 

no stored variable or alias error_load
no stored variable or alias error_date
no stored variable or alias master_df
no stored variable or alias x


In [16]:
# State shared with the Alpha Vantage download loop below.

# one row per ticker that was processed successfully
master_df = pd.DataFrame()

# tickers that were skipped: statements could not be loaded / price dates were missing
error_load, error_date = (pd.Series([], dtype=str) for _ in range(2))

# counter (corresponds to stock_list index)
x = 0

In [2]:
from secrets import Special_API_key

In [None]:
# keep track of ranges (underlying facts about the data)
# comparing first_date, the Friday 5 to 11 days after the report's fiscalDateEnding, to
# second_date, which is 48 weeks later (341 to 347 days after fiscalDateEnding)

REQUEST_TIMEOUT_SECONDS = 30  # without a timeout requests.get can block forever on a stalled connection
API_PAUSE_SECONDS = 60        # pause after every ticker (four API calls); presumably for the API call limit
WEEKS_BETWEEN_PRICES = 48     # whole weeks, so second_date falls on the same weekday as first_date


def _alphavantage_json(function, symbol, extra_params=''):
    """Query one Alpha Vantage endpoint for `symbol` and return the decoded JSON.

    `function` is the value of the API's `function` query parameter and
    `extra_params` is an optional, already '&'-prefixed query-string fragment.
    Raises requests.exceptions.RequestException on transport errors/timeouts and
    ValueError when the response body is not valid JSON.
    """
    url = (f'https://www.alphavantage.co/query?function={function}'
           f'&symbol={symbol}{extra_params}&apikey={Special_API_key}')
    return requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS).json()


def _price_dates(fiscal_date_ending):
    """Return (first_date, second_date) as 'YYYY-MM-DD' strings.

    first_date is the Friday that lies 5 to 11 days after `fiscal_date_ending`
    ('YYYY-MM-DD'); second_date is WEEKS_BETWEEN_PRICES weeks after first_date.
    """
    start = datetime.datetime.strptime(fiscal_date_ending, '%Y-%m-%d')
    weekday = start.isoweekday() % 7  # Sunday is 0 and Saturday is 6.
    # Sunday -> +5, Saturday -> +6, Friday -> +7, ..., Monday -> +11: always a Friday
    first = start + datetime.timedelta(days=5 + (-weekday) % 7)
    # has to be a multiple of 7 days (Ex. 7 * 48 = 336)
    # second_date should be before next years annual earnings announcement
    second = first + datetime.timedelta(weeks=WEEKS_BETWEEN_PRICES)
    return first.strftime('%Y-%m-%d'), second.strftime('%Y-%m-%d')


# NOTE(review): the slice start (12) is hard-coded while x is initialised separately,
# so x only matches the stock_list index if it was restored/set to 12 - confirm.
for stock, sector, industry in stock_list.values[12:]:

    symbol = stock

    try:
        # if symbol is not valid then it will return an empty json object
        income_data = _alphavantage_json('INCOME_STATEMENT', symbol)
        cash_data = _alphavantage_json('CASH_FLOW', symbol)
        balance_data = _alphavantage_json('BALANCE_SHEET', symbol)
        price_data = _alphavantage_json('TIME_SERIES_WEEKLY_ADJUSTED', symbol, '&outputsize=full')
    except (requests.exceptions.RequestException, ValueError) as exc:
        # Timeout, connection problem or a non-JSON body: record the ticker and move on
        # instead of aborting the whole run. Only the exception type is printed because
        # the exception text can contain the request URL, which includes the API key.
        print(f"Error: request failed ({type(exc).__name__}) could not load financial statements for \t\t {symbol}")

        error_load = pd.concat([error_load, pd.Series(symbol)])
        x += 1
        time.sleep(API_PAUSE_SECONDS)
        continue

    try:
        # extract financial statements: the last entry of each 'annualReports' list
        # NOTE(review): the three statements are not checked to share the same
        # fiscalDateEnding - confirm they always line up.
        inc_data = income_data['annualReports'][-1].copy()
        cas_data = cash_data['annualReports'][-1].copy()
        bal_data = balance_data['annualReports'][-1].copy()

    except (KeyError, IndexError):
        # KeyError: 'annualReports' is missing; IndexError: it is present but empty.
        # len() should return 3 for each one, should return 0 if a valid api call was not performed, should return 1 if api call limit has been reached
        print(f"Error: {len(income_data)}{len(cash_data)}{len(balance_data)} could not load financial statements for \t\t {symbol}")

        error_load = pd.concat([error_load, pd.Series(symbol)])
        x += 1
        time.sleep(API_PAUSE_SECONDS)
        continue  # skips rest of code in this loop

    earnings_date = inc_data['fiscalDateEnding']
    currency = inc_data['reportedCurrency']

    first_date, second_date = _price_dates(earnings_date)

    try:
        weekly_prices = price_data['Weekly Adjusted Time Series']
        first_price = weekly_prices[first_date]['5. adjusted close']
        second_price = weekly_prices[second_date]['5. adjusted close']
    except KeyError:
        # len(price_data) should return 2, should return 0 if a valid api call was not performed, should return 1 if api call limit has been reached
        print(f"Error: {len(price_data)} price_data does not contain {first_date} or {second_date} \t\t {symbol}")

        error_date = pd.concat([error_date, pd.Series(symbol)])
        x += 1
        time.sleep(API_PAUSE_SECONDS)
        continue

    first_price = float(first_price)
    second_price = float(second_price)

    perc_change = (second_price - first_price) / first_price

    df_details = pd.DataFrame({'ticker': symbol,
                               'sector': sector,
                               'industry': industry,
                               'earnings_date': earnings_date,
                               'currency': currency,
                               'current_price': first_price,
                               'following_price': second_price,
                               'first_price_date': first_date,
                               'second_price_date': second_date,
                               'percent_change': perc_change},
                              index=[0])

    # the date and currency already live in df_details, so drop them from each statement;
    # 'netIncome' is also dropped from the cash-flow frame
    df_inc = pd.DataFrame(inc_data, index=[0]).drop(['fiscalDateEnding', 'reportedCurrency'], axis=1)
    df_bal = pd.DataFrame(bal_data, index=[0]).drop(['fiscalDateEnding', 'reportedCurrency'], axis=1)
    df_cas = pd.DataFrame(cas_data, index=[0]).drop(['fiscalDateEnding', 'reportedCurrency', 'netIncome'], axis=1)

    # one wide row with a two-level column index: (group, field)
    df_all = pd.concat([df_details, df_inc, df_cas, df_bal], keys=['details', 'income', 'cash', 'balance'], axis=1)

    # appended on every iteration so an interrupted run keeps what was collected so far
    master_df = pd.concat([master_df, df_all], ignore_index=True)

    print(x, symbol)
    x += 1

    time.sleep(API_PAUSE_SECONDS)

In [17]:
# Persist the download loop's state to IPython's %store database so it can be
# reloaded with `%store -r` in a later kernel session.
%store error_load 
%store error_date 
%store master_df
%store x 

Stored 'error_load' (Series)
Stored 'error_date' (Series)
Stored 'master_df' (DataFrame)
Stored 'x' (int)


In [None]:
# replaces any 'None' values (which arrive as strings and can not be cast) with NaN
# and then changes the numeric columns to floats.
#
# Columns are selected by name instead of by position: the first ten columns are the
# 'details' fields, and the positional slice iloc[:, 8:] also covered
# 'second_price_date' (position 8), a 'YYYY-MM-DD' string that astype(float) rejects.
# Matching on the innermost column level works whether or not the column index has
# already been flattened.
NON_NUMERIC_DETAILS = ['ticker', 'sector', 'industry', 'earnings_date', 'currency',
                       'first_price_date', 'second_price_date']

numeric_cols = master_df.columns[~master_df.columns.get_level_values(-1).isin(NON_NUMERIC_DETAILS)]

# assign column by column so each column's dtype really becomes float
for col in numeric_cols:
    master_df[col] = master_df[col].replace('None', np.nan).astype(float)

In [25]:
# flattens column index: keep only the innermost level (the field names) and drop
# the 'details' / 'income' / 'cash' / 'balance' group level.
# Level -1 is used instead of level 1 so the cell can be re-run safely: on an already
# flat index level 1 does not exist (IndexError), while level -1 returns the same labels.
master_df.columns = master_df.columns.get_level_values(-1)

In [26]:
# Write the collected table to financial_statements.csv in the working directory,
# leaving out the row index.
master_df.to_csv('./financial_statements.csv', index=False)