#### SET-UP

In [None]:
import ealib

import logging
import pandas as pd
from collections import Counter
from typing import List
import random
from typing import Tuple
import yfinance as yf
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# EDGAR API requires a request header; this dict is passed to every ealib request helper below
req_header = {"User-Agent": "roberto.brera.24@outlook.com"}

In [None]:
# Configure the root logger: INFO level, timestamped messages.
# force=True removes handlers already attached to the root logger, so re-running this cell takes effect.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', force=True)
logging.info("logging info")  # smoke test: shows that the configuration is active

In [None]:
# Build the tickers DataFrame through ealib (later cells read its "ticker" and "cik_str" columns)
# and preview the first rows
tickers_df = ealib.get_tickers_df(req_header)
tickers_df.head()

#### Download company filing example

In [None]:
# Find the CIK for a company given a substring of its name
query_substr =  "adeco"
# Only the first match is kept; presumably .iloc[0] fails when nothing matches — verify in ealib
query_ticker = ealib.find_title_substring(tickers_df, query_substr).iloc[0]
query_ticker["cik_str"]

In [None]:
# Find and download the 20-F filing(s) of the company selected above
# NOTE(review): query_forms=["20"] is presumably matched against the "form" column as a
# substring/prefix — confirm in ealib.filter_filings
curr_comp_mtd = ealib.get_response_dict(ealib.metadata_url(query_ticker["cik_str"]), req_header, mrps=8)
# The recent-filings section of the metadata response becomes one row per filing
curr_filings_df = pd.DataFrame.from_dict(curr_comp_mtd["filings"]["recent"])
curr_select_filings = ealib.filter_filings(curr_filings_df, "filingDate", "form", query_forms=["20"], max_days=360)
ealib.download_company_filings(req_header, mrps=8, comp_dir="Adecoagro S.A. 20-F", select_filings=curr_select_filings, cik=query_ticker["cik_str"], write_txt=False, write_pdf=True)

#### Filtering: Ticker df (yfinance, e.g. marketCap)

In [None]:
# Suppose we are iterating through tickers_df: take one arbitrary row (position 1000)
# and open the matching yfinance Ticker object
curr_ticker = tickers_df.iloc[1000]
curr_yticker = yf.Ticker(curr_ticker["ticker"])

In [None]:
# Search the yfinance info dict for keys containing a given substring
ealib.find_dict_key_substr(curr_yticker.info, ["cap"])
ealib.find_dict_key_substr(curr_yticker.info, ["currency"])

# Check other yfinance info, to then compare with SEC
# (only this last expression is displayed as the cell output)
ealib.find_keys_containing_all_substrs(curr_yticker.info, ["cash", "operating"])

In [None]:
# Look up individual info fields for the ticker, returning NA if not found
# (only the last expression is displayed as the cell output)
ealib.yf_info(curr_ticker["ticker"], "marketCap")
ealib.yf_info(curr_ticker["ticker"], "currency")
ealib.yf_info(curr_ticker["ticker"], "operatingCashflow")

In [None]:
# Generate an info series for a df of tickers (only the first 10 rows here, one lookup per ticker)
marketCap_series = tickers_df[:10]["ticker"].apply(lambda x: ealib.yf_info(x, "marketCap"))

# How many NaNs have we got?
marketCap_series.isna().sum()

In [None]:
# Now apply filtering: keep tickers whose market cap is below the threshold (15e9)
market_cap_threshhold = 15*(10**9)
# Comparisons against NaN evaluate to False, so tickers without a market cap are dropped as well
mask = marketCap_series < market_cap_threshhold
filtered_series = marketCap_series[mask]

# How many tickers pass the filter?
mask.sum()

#### SAVED search parameters for already-searched company facts

In [None]:
# IMPORTANT: fact-name search lists that have already been worked out
# Shares outstanding
shares_outstanding_query_str = ["NumberOfSharesOutstanding", "EntityCommonStockSharesOutstanding", "CommonStockSharesOutstanding"]
# Operating cash flow
ocf_query_str = ["NetCashProvidedByUsedInOperatingActivities", "CashFlowsFromUsedInOperatingActivities"]
# Use together with: sufficient = True (as the refined searches below do)


#### Analyze structure of comp_fact dictionaries

In [None]:
# Survey which top-level keys appear under "facts" in the companyfacts responses of a
# slice of tickers_df, and how often shares outstanding can be located under "dei".
tot_tickers = 100
start_ticker = 1000
keys_counter = Counter()  # outcome label / facts key -> number of companies
index_dict = {}           # outcome label / facts key -> last tickers_df index where it was seen
for index, row in tickers_df[start_ticker:start_ticker + tot_tickers].iterrows():
    # IMPORTANT: Always handle None returns i.e. unsuccessful requests
    cfacts = ealib.get_response_dict(ealib.companyfacts_url(row["cik_str"]), req_header, mrps=8)
    if cfacts is None:
        keys_counter["FAILED_REQS"] += 1
        index_dict["FAILED_REQS"] = index
        continue

    # cfacts is not None from here on; read "facts" defensively once and reuse it,
    # so a response without that key is counted instead of raising KeyError
    facts = cfacts.get("facts", {})
    shares_out_info = facts.get("dei", {}).get("EntityCommonStockSharesOutstanding")
    keys_counter["cfacts found BUT SO not found" if shares_out_info is None else "cfacts found AND SO found"] += 1

    # Count all the locatable keys in cfacts["facts"]
    for key in facts:
        keys_counter[key] += 1
        index_dict[key] = index

print(keys_counter)
print(index_dict)
print(f'tot_tickers = {tot_tickers}')
print(f'us-gaap + ifrs + failed reqs = {keys_counter["FAILED_REQS"] + keys_counter["us-gaap"] + keys_counter["ifrs-full"]}')


#### Procedure to locate & retrieve some arbitrary company fact (testing of comp_fact_df function)

In [None]:
# Start with a broad substring list to discover candidate fact keys, then refine the search
curr_ticker = ealib.find_title_substring(tickers_df, "adeco").iloc[0]

# Broad search
query_fact_substr = ["seg"]
sufficient = False
# Refined search (swap in once the exact fact names are known):
# query_fact_substr = ["NumberOfSharesOutstanding", "EntityCommonStockSharesOutstanding", "CommonStockSharesOutstanding"]
# sufficient = True

comp_facts = ealib.get_response_dict(ealib.companyfacts_url(curr_ticker["cik_str"]), req_header, 8)
if comp_facts is None:
    logging.warning(f'Failed request when attempting to retrieve company facts for ticker {curr_ticker["ticker"]}, or comp_facts dictionary empty')
    # Nothing to search in: carry on with no matches rather than handing None to comp_facts_df
    res = []
else:
    # Extract desired company fact; the result is iterated below as
    # (units, as_key, match_fact, dataframe) tuples
    res = ealib.comp_facts_df(
        comp_facts,
        query_fact_substr,
        sufficient,
    )

# Log all matches (identified by ticker: no random index exists in this cell)
for units, as_key, match_fact, comp_fact_df in res:
    logging.info(f'Results with as_key = {as_key}, match_fact = {match_fact}, units = {units} for ticker {curr_ticker["ticker"]}')

if res:
    # Select the tuple whose matched fact name is the shortest
    res_units, res_as_key, res_match_fact, res_comp_fact_df = min(res, key=lambda x: len(x[2]))

#### Graphing a Company Fact Over Time

In [None]:
# Plot one company fact over time for a single ticker
ticker = "WATT"

desired_match_fact = ['EntityCommonStockSharesOutstanding'] # forces only one fact
sufficient = True
max_days = 360 # day-range for filings

curr_ticker = ealib.find_ticker(tickers_df, ticker).iloc[0]
comp_facts = ealib.get_response_dict(ealib.companyfacts_url(curr_ticker["cik_str"]), req_header, 8)
if comp_facts is None:
    logging.warning(f'Failed request when attempting to retrieve company facts for ticker {curr_ticker["ticker"]}, or comp_facts dictionary empty')
    # Nothing to plot: carry on with no matches rather than handing None to comp_facts_df
    res = []
else:
    # Extract desired company fact
    res = ealib.comp_facts_df(comp_facts, query_fact_substr=desired_match_fact, sufficient=sufficient)

if res:
    # Select the tuple whose matched fact name is the shortest
    res_units, res_as_key, res_match_fact, res_comp_fact_df = min(res, key=lambda x: len(x[2]))

    # Alternative: pick the match by a substring from the saved search params
    # desired_substr = "NumberOfSharesOutstanding"
    # sel_units, sel_as_key, sel_match_fact, sel_comp_fact_df = [t for t in res if desired_substr.lower() in t[2].lower()][0]

    # Now filter the fact dataframe on its "end" date column
    filt_so_df = ealib.filter_filings(res_comp_fact_df, filing_date_col="end", form_col="form", query_forms=[""], max_days=max_days)

    # Plotting graph:
    plt.figure(figsize=(10, 5))  # Set the figure size (optional)
    plt.plot( filt_so_df['end'], filt_so_df['val'], marker='o')  # Line plot with markers
    plt.title(f'{res_match_fact} for {ticker} Inc.')  # Adding a title to the graph
    plt.xlabel('Filing end date')  # Label for the x-axis
    plt.ylabel('Reported Number of Shares Outstanding')  # Label for the y-axis
    plt.grid(True)  # Enable grid for easier readability
    plt.xticks(rotation=45)
    plt.show()  # Display the plot


#### Randomized Company Fact Retrieval Test

In [None]:
# Then refine the search criteria, and test on a larger, randomly sampled set of tickers

query_fact_substr = ["NumberOfSharesOutstanding", "EntityCommonStockSharesOutstanding", "CommonStockSharesOutstanding"] # Refined list
sufficient = True

num_tests = 100
len_counter = Counter()  # "failed reqs" or number of matches (as a string) -> number of tests
for _ in range(num_tests):
    # Sample a valid row position: randrange excludes its upper bound, so this can neither
    # run past the end of tickers_df nor skip row 0
    random_number = random.randrange(len(tickers_df))
    curr_ticker = tickers_df.iloc[random_number]

    # Request comp facts dictionary
    comp_facts = ealib.get_response_dict(ealib.companyfacts_url(curr_ticker["cik_str"]), req_header, 8)
    if comp_facts is None:
        logging.warning(f'Failed request when attempting to retrieve company facts for ticker {curr_ticker["ticker"]}, or comp_facts dictionary empty')
        len_counter['failed reqs'] += 1
        # A failed request has nothing to search; skip it so each test is counted exactly once
        continue

    # Extract desired company fact
    res = ealib.comp_facts_df(
        comp_facts,
        query_fact_substr,
        sufficient,
    )
    # Record the number of matches
    len_counter[f'{len(res)}'] += 1

    for units, as_key, match_fact, comp_fact_df in res:
        logging.info(f'Results with as_key = {as_key}, match_fact = {match_fact} for test with ticker number {random_number}')
    # Now extract the tuple with the shortest match_fact
    if res:
        res_units, res_as_key, res_match_fact, res_comp_fact_df = min(res, key=lambda x: len(x[2]))
        # Optionally print it out:
        # print(f'Shortest match fact tuple selected:\n\t as_key = {res_as_key}, match_fact = {res_match_fact} for test with ticker number {random_number}\n\t {res_comp_fact_df}')

print(len_counter)

#### Calculating cash burn rate

In [None]:
# Checking cash burn rate results on a few tickers
test_ticker_str = "AAPL"
max_days = 180

comp_facts = ealib.get_response_dict(ealib.companyfacts_url(ealib.find_ticker(tickers_df, test_ticker_str)["cik_str"].iloc[0]), req_header, 8)
if comp_facts is None:
    # Report the ticker this cell actually queried (curr_ticker belongs to earlier cells)
    logging.warning(f'Failed request when attempting to retrieve company facts for ticker {test_ticker_str}')
    # Nothing to compute: carry on with no matches rather than handing None to comp_facts_df
    res = []
else:
    # Extract the operating-cash-flow fact
    res = ealib.comp_facts_df(
        comp_facts,
        ["NetCashProvidedByUsedInOperatingActivities", "CashFlowsFromUsedInOperatingActivities"],
        True
    )

if res:
    # Select the tuple whose matched fact name is the shortest
    res_units, res_as_key, res_match_fact, res_comp_fact_df = min(res, key=lambda x: len(x[2]))
    ocf_df_filt = ealib.filter_filings(res_comp_fact_df, filing_date_col="filed", form_col="form", query_forms=[""], max_days=max_days)
    print(ealib.ocf_average_daily_burn_rate(ocf_df_filt))


#### Converting between currencies

In [None]:
# Build a currency-pair ticker of the form "<FROM><TO>=X" and look up its previous close
# NOTE(review): both currencies are "USD" here, giving "USDUSD=X" — set from_currency to the
# currency being converted before relying on the result
from_currency = "USD"
to_currency = "USD"
forex_ticker =  f"{from_currency}{to_currency}=X"

ealib.yf_info(forex_ticker, "previousClose")

#### Overall filtering function

In [None]:
# Main parameter setting and function call
# The call returns two DataFrames: the companies that passed the screen and the
# companies for which data was missing (see the sheet names used when saving below)
comp_out_df, missing_data_df = ealib.screen_select_companies(
    # general parameters:
        req_header=req_header, 
        mrps=8, 
        tickers_df=tickers_df, 
        root_dir="Selected filings", 
    # filtering parameters:
        query_forms = ["424B5", "S-3"], 
        max_days = 180, 
        max_market_cap = 15*(10**9), 
        max_ocf_daily_burn_rate = 0, 
        ocf_max_days = 180, 
        ocf_filing_date_col = "filed",
    # download parameters:
        out_df_sort_key = "Avg yearly OCF burn / Market Cap", 
        write_txt = False, 
        write_pdf = True
)

In [None]:
# Persist both screening outputs in one workbook, one sheet per DataFrame
file_name = "Selected filings.xlsx"
sheets = {
    'Verified Companies': comp_out_df,
    'Companies with missing data': missing_data_df,
}
with pd.ExcelWriter(file_name, engine='openpyxl') as writer:
    for sheet_name, frame in sheets.items():
        frame.to_excel(writer, sheet_name=sheet_name, index=False)

#### Reconstruct missing_data_df with yfinance

In [None]:
# First extract the original ticker: for values containing ":", keep the part after the
# first colon; values without a colon are kept unchanged
missing_data_ticks = missing_data_df["CIQ ticker"].apply(lambda x: x.split(":")[1] if ":" in x else x)
missing_data_ticks

In [None]:
# Look up the operating cash flow through yfinance, one lookup per ticker
operatingCashflow_series = missing_data_ticks.apply(lambda x: ealib.yf_info(x, "operatingCashflow"))

# How many missing values have we still got?
operatingCashflow_series.isna().sum()

In [None]:
# Now write the enriched data frame back to Excel
# The series name becomes the header of the column appended by concat
operatingCashflow_series.name = "OCF (yf)"
# axis=1 aligns on the index, placing the yfinance values next to the original rows
enriched_df = pd.concat([missing_data_df, operatingCashflow_series], axis=1)
enriched_df.to_excel("enriched missing data.xlsx", sheet_name='Enriched missing data (yf)', index=False)

#### Filtering: remove pharma

In [None]:
# Reload both sheets of a previously saved screening workbook
file_name = "Selected filings companies 18-07-24 copy.xlsx"
header_row = 1  # zero-based row of the sheet that holds the column names

with pd.ExcelFile(file_name, engine='openpyxl') as reader:
    # One DataFrame per sheet, read in the order the names are listed
    comp_out_df, missing_data_df = (
        pd.read_excel(reader, sheet_name=sheet, header=header_row)
        for sheet in ('Verified Companies', 'Companies with missing data')
    )

In [None]:
# Clean the dataframes: remove the unnamed leading column from both, in place
for frame in (comp_out_df, missing_data_df):
    frame.drop(columns=['Unnamed: 0'], inplace=True)

In [None]:
# Reconstruct original tickers: keep the part after the first ":" of "CIQ ticker"
# (values without a colon are copied unchanged) and store it in a "ticker" column
for frame in (comp_out_df, missing_data_df):
    frame["ticker"] = frame["CIQ ticker"].apply(
        lambda ciq: ciq.split(":")[1] if ":" in ciq else ciq
    )

In [None]:
# Compute sic codes, and add as new columns
def get_sic(tickers_df, ticker, req_header, mrps):
    """Return the "sic" entry of a ticker's SEC metadata, or None when unavailable.

    Args:
        tickers_df: Tickers DataFrame searched with ealib.find_ticker.
        ticker: Ticker symbol to look up.
        req_header: Request header forwarded to ealib.get_response_dict.
        mrps: Forwarded to ealib.get_response_dict as its ``mrps`` argument.

    Returns:
        The value stored under "sic" in the metadata response; None when the
        ticker is not found, the request fails, or the response has no "sic".
    """
    found_ticker = ealib.find_ticker(tickers_df, ticker)
    if found_ticker.empty:
        return None
    metadata = ealib.get_response_dict(ealib.metadata_url(found_ticker.iloc[0]["cik_str"]), req_header, mrps=mrps)
    # Always handle None returns (unsuccessful requests) before indexing into the response
    if metadata is None:
        logging.warning(f'Failed request when attempting to retrieve metadata for ticker {ticker}')
        return None
    return metadata.get("sic")

# Store the codes under "sic" directly (the column used to be created as "sid"),
# so the workbook saved right after this cell carries the correct header
comp_out_df["sic"] = comp_out_df["ticker"].apply(lambda ticker: get_sic(tickers_df, ticker, req_header, 8))
missing_data_df["sic"] = missing_data_df["ticker"].apply(lambda ticker: get_sic(tickers_df, ticker, req_header, 8))

In [None]:
# First save to Excel while all companies are still present, one sheet per DataFrame
with pd.ExcelWriter("Select Comps with sic.xlsx", engine='openpyxl') as writer:
    for sheet_name, frame in (
        ('Verified Companies', comp_out_df),
        ('Companies with missing data', missing_data_df),
    ):
        frame.to_excel(writer, sheet_name=sheet_name, index=False)

In [None]:
# Rename the "sid" column to "sic" in both frames, in place
# (rename ignores labels that are not present in the frame)
for frame in (comp_out_df, missing_data_df):
    frame.rename(columns={"sid": "sic"}, inplace=True)

In [None]:
# Share of rows (in percent) whose "sic" value is missing, per frame
comp_out_df_na_percentage, missing_data_df_na_percentage = (
    frame["sic"].isna().mean() * 100 for frame in (comp_out_df, missing_data_df)
)

logging.info(f"Percentage of NA/NaN/None values in comp_out_df['sic']: {comp_out_df_na_percentage:.2f}%")
logging.info(f"Percentage of NA/NaN/None values in missing_data_df['sic']: {missing_data_df_na_percentage:.2f}%")

In [None]:
# Tally how often each SIC code (as a string) occurs across both frames combined
combined_df = pd.concat([comp_out_df, missing_data_df], ignore_index=True)
sic_counter = Counter()
for _, record in combined_df.iterrows():
    sic_counter[f"{record['sic']}"] += 1

# Print the distinct codes, one per line (the counts stay available in sic_counter)
for sic_code in sic_counter:
    print(f"{sic_code}")


In [None]:
# SIC codes treated as pharma/biotech; rows carrying any of them are removed
pharma_biotech_sic_codes = [
    '2833', '2834', '2835', '2836', '8731', '8734', '3841', '3842', '3845'
]

# Boolean masks marking the rows to exclude; the codes are compared as strings
comp_out_is_pharma = comp_out_df['sic'].isin(pharma_biotech_sic_codes)
missing_data_is_pharma = missing_data_df['sic'].isin(pharma_biotech_sic_codes)

comp_out_df_filt = comp_out_df.loc[~comp_out_is_pharma]
missing_data_df_filt = missing_data_df.loc[~missing_data_is_pharma]

In [None]:
# Write the pharma/biotech-free frames back to Excel, one sheet per DataFrame
file_name = "No pharma&biotech Select Comps.xlsx"
filtered_sheets = {
    'Verified Companies': comp_out_df_filt,
    'Companies with missing data': missing_data_df_filt,
}
with pd.ExcelWriter(file_name, engine='openpyxl') as writer:
    for sheet_name, frame in filtered_sheets.items():
        frame.to_excel(writer, sheet_name=sheet_name, index=False)

#### Filtering: Change in Company Fact over time (e.g. common shares outstanding percent increase)

In [None]:
# Reload the pharma/biotech-free workbook written earlier
file_name = "No pharma&biotech Select Comps.xlsx"
header_row = 0  # column names are on the first row of each sheet

with pd.ExcelFile(file_name, engine='openpyxl') as reader:
    # One DataFrame per sheet, read in the order the names are listed
    comp_out_df, missing_data_df = (
        pd.read_excel(reader, sheet_name=sheet, header=header_row)
        for sheet in ('Verified Companies', 'Companies with missing data')
    )

In [None]:
# Apply on whole dataframes 
def perc_change_shares_out(tickers_df, ticker, req_header, mrps):
    """Return the average change in shares outstanding for a ticker, or None.

    Args:
        tickers_df: Tickers DataFrame searched with ealib.find_ticker.
        ticker: Ticker symbol to look up.
        req_header: Request header forwarded to ealib.get_response_dict.
        mrps: Forwarded to ealib.get_response_dict.

    Returns:
        The result of ``ealib.comp_fact_avg_change(res, 180, 360)`` for the
        matched shares-outstanding facts; None when the ticker is not found
        or the company facts request fails.
    """
    found_ticker = ealib.find_ticker(tickers_df, ticker)
    if found_ticker.empty:
        return None
    comp_facts = ealib.get_response_dict(ealib.companyfacts_url(found_ticker.iloc[0]["cik_str"]), req_header, mrps)
    if comp_facts is None:
        # Log the plain ticker symbol (found_ticker is a DataFrame) and stop here, in line
        # with the not-found case, rather than handing None to comp_facts_df
        logging.warning(f'Failed request when attempting to retrieve company facts for ticker {ticker}, or comp_facts dictionary empty')
        return None
    # Extract the shares-outstanding fact(s)
    res = ealib.comp_facts_df(
        comp_facts,
        query_fact_substr=["NumberOfSharesOutstanding", "EntityCommonStockSharesOutstanding", "CommonStockSharesOutstanding"],
        sufficient=True
    )
    # NOTE(review): 180 and 360 are presumably day windows — confirm in ealib.comp_fact_avg_change
    return ealib.comp_fact_avg_change(res, 180, 360)

# Add the change in shares outstanding as a new column on both frames (one request per row)
for frame in (comp_out_df, missing_data_df):
    frame["% change in shares outstanding (~6 months)"] = frame["ticker"].apply(
        lambda ticker: perc_change_shares_out(tickers_df, ticker, req_header, 8)
    )

In [None]:
# Share of rows (in percent) whose shares-outstanding change is missing, per frame
comp_out_df_na_percentage, missing_data_df_na_percentage = (
    frame["% change in shares outstanding (~6 months)"].isna().mean() * 100
    for frame in (comp_out_df, missing_data_df)
)

logging.info(f"Percentage of NA/NaN/None values in comp_out_df['% change in shares outstanding (~6 months)']: {comp_out_df_na_percentage:.2f}%")
logging.info(f"Percentage of NA/NaN/None values in missing_data_df['% change in shares outstanding (~6 months)']: {missing_data_df_na_percentage:.2f}%")

In [None]:
# Move non-verifiable rows (missing SIC code or missing shares-outstanding change)
# from comp_out_df to missing_data_df.
# A single combined mask moves each row exactly once, even when both values are missing.
unverifiable_mask = (
    comp_out_df['sic'].isna()
    | comp_out_df['% change in shares outstanding (~6 months)'].isna()
)
missing_data_df = pd.concat([missing_data_df, comp_out_df[unverifiable_mask]], ignore_index=True)
comp_out_df = comp_out_df[~unverifiable_mask]


In [None]:
# Only keep rows with a positive change in shares outstanding, largest change first
change_col = "% change in shares outstanding (~6 months)"
has_positive_change = comp_out_df[change_col] > 0
comp_out_df = comp_out_df[has_positive_change].sort_values(by=change_col, ascending=False)
# Store the SIC codes as integers
comp_out_df["sic"] = comp_out_df["sic"].astype(int)

#### Filtering: market cap > 100mn

In [None]:
# Minimum market cap: 100 million
mcap_min = 10**8

# Verified companies must be strictly above the floor
above_floor = comp_out_df["Market Cap (USD)"] > mcap_min
comp_out_df = comp_out_df[above_floor]

# Companies with missing data are dropped only when known to be below the floor;
# rows whose market cap is NaN compare False and are therefore kept
below_floor = missing_data_df["Market Cap (USD)"] < mcap_min
missing_data_df = missing_data_df[~below_floor]

In [None]:
# Order the verified companies by 'Avg yearly OCF burn / Market Cap', smallest value first
comp_out_df = comp_out_df.sort_values(by='Avg yearly OCF burn / Market Cap', ascending=True)

In [None]:
# Save the final screening result back to Excel, one sheet per DataFrame
file_name = "Updated Screening 19-07-2024.xlsx"
final_sheets = {
    'Verified Companies': comp_out_df,
    'Companies with missing data': missing_data_df,
}
with pd.ExcelWriter(file_name, engine='openpyxl') as writer:
    for sheet_name, frame in final_sheets.items():
        frame.to_excel(writer, sheet_name=sheet_name, index=False)

### COMPANY CONCEPT

In [None]:
# Request a single company concept (us-gaap Revenues) for one company
# NOTE(review): query_cik and mrps are not assigned in any cell visible above — define them
# (e.g. from query_ticker["cik_str"] and the mrps=8 used elsewhere) before running this cell
rev_concept = ealib.get_response_dict(
    ealib.companyconcept_url(query_cik, "/us-gaap/Revenues"),
    req_header,
    mrps=mrps
)

# Always handle None returns (unsuccessful requests) before indexing into the response
if rev_concept is None:
    logging.warning(f'Failed request when attempting to retrieve the us-gaap Revenues concept for cik {query_cik}')
    rev_concept_df = pd.DataFrame()
else:
    # Expected to coincide with the data returned by the company facts request
    rev_concept_df = pd.DataFrame(rev_concept.get("units", {}).get("USD", []))
rev_concept_df