### General Set Up

In [1]:
import ealib
import logging
import pandas as pd
from collections import Counter
from typing import List
import random
from typing import Tuple
import yfinance as yf
import matplotlib.pyplot as plt
import numpy as np

# Identification header sent with every EDGAR API request
req_header = {"User-Agent": "roberto.brera.24@outlook.com"}

# Rate of requests; must stay below 10
mrps = 8

# Notebook-wide logging: level plus "<time> - <level> - <message>" line layout.
# force=True replaces any handlers that were already installed on the root logger.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    force=True,
)

# Table with information about all tickers currently available on the SEC database
tickers_df = ealib.get_tickers_df(req_header)

2024-07-20 18:52:49,507 - INFO - Request to https://www.sec.gov/files/company_tickers.json returned successfully. Response code: 200


### Filtering steps

#### Main filtering function

In [None]:
# Main parameter setting and function call.
# Produces two tables: the companies that passed the screen (comp_out_df) and
# the companies set aside in missing_data_df.
comp_out_df, missing_data_df = ealib.screen_select_companies(
    # general parameters:
        req_header=req_header,
        mrps=mrps,  # reuse the request rate chosen in the set-up cell rather than repeating the literal
        tickers_df=tickers_df,
        root_dir="Selected filings",
    # filtering parameters:
        query_forms = ["424B5", "S-3"],
        max_days = 180,
        max_market_cap = 15*(10**9),
        max_ocf_daily_burn_rate = 0,
        ocf_max_days = 180,
        ocf_filing_date_col = "filed",
    # download parameters:
        out_df_sort_key = "Avg yearly OCF burn / Market Cap",
        write_txt = False,
        write_pdf = True
)

In [None]:
# Export both result tables to one workbook, one sheet per table
file_name = "Selected filings.xlsx"
with pd.ExcelWriter(file_name, engine='openpyxl') as writer:
    for sheet_name, frame in (
        ('Verified Companies', comp_out_df),
        ('Companies with missing data', missing_data_df),
    ):
        frame.to_excel(writer, sheet_name=sheet_name, index=False)

#### Attempt to retrieve OCF with yfinance

In [None]:
# First extract the original ticker.
# When a "CIQ ticker" value contains ":", keep the segment right after the first
# colon; otherwise keep the value as is. Non-string cells (e.g. NaN) are passed
# through unchanged instead of raising TypeError on the `":" in ...` check.
missing_data_ticks = missing_data_df["CIQ ticker"].apply(
    lambda ciq: ciq.split(":")[1] if isinstance(ciq, str) and ":" in ciq else ciq
)
missing_data_ticks

In [None]:
# Look up the "operatingCashflow" field for every extracted ticker via ealib's yfinance helper
operatingCashflow_series = missing_data_ticks.apply(ealib.yf_info, args=("operatingCashflow",))

# Number of tickers that still have no value
operatingCashflow_series.isna().sum()

In [None]:
# Attach the yfinance OCF values as an extra column and export the enriched table
operatingCashflow_series.name = "OCF (yf)"
enriched_df = pd.concat(objs=[missing_data_df, operatingCashflow_series], axis="columns")
enriched_df.to_excel(
    "enriched missing data.xlsx",
    sheet_name='Enriched missing data (yf)',
    index=False,
)

#### Remove pharma/biotech

In [None]:
file_name = "Selected filings companies 18-07-24 copy.xlsx"
header_row = 1

# Read both sheets back from the workbook, taking column names from row `header_row`
with pd.ExcelFile(file_name, engine='openpyxl') as reader:
    comp_out_df, missing_data_df = (
        pd.read_excel(reader, sheet_name=sheet, header=header_row)
        for sheet in ('Verified Companies', 'Companies with missing data')
    )

In [None]:
# Clean the dataframes: discard the 'Unnamed: 0' column when it is present.
# errors="ignore" keeps this cell safe to re-run (and safe on sheets that never
# had the column) instead of raising KeyError; rebinding avoids in-place mutation.
comp_out_df = comp_out_df.drop(columns=['Unnamed: 0'], errors="ignore")
missing_data_df = missing_data_df.drop(columns=['Unnamed: 0'], errors="ignore")

In [None]:
# Reconstruct original tickers.
# When a "CIQ ticker" value contains ":", keep the segment right after the first
# colon; otherwise keep the value as is. Non-string cells (e.g. NaN) are passed
# through unchanged instead of raising TypeError on the `":" in ...` check.
comp_out_df["ticker"] = comp_out_df["CIQ ticker"].apply(
    lambda ciq: ciq.split(":")[1] if isinstance(ciq, str) and ":" in ciq else ciq
)
missing_data_df["ticker"] = missing_data_df["CIQ ticker"].apply(
    lambda ciq: ciq.split(":")[1] if isinstance(ciq, str) and ":" in ciq else ciq
)

In [None]:
# Compute sic codes, and add as new columns
def get_sic(tickers_df, ticker, req_header, mrps):
    """Return the ``"sic"`` entry of a company's SEC metadata response.

    Args:
        tickers_df: Ticker table handed to ``ealib.find_ticker``.
        ticker: Ticker symbol to look up.
        req_header: Request headers forwarded to ``ealib.get_response_dict``.
        mrps: Request-rate setting forwarded to ``ealib.get_response_dict``.

    Returns:
        The value stored under ``"sic"`` in the metadata response, or ``None``
        when the ticker is not found, no response is returned, or the response
        has no ``"sic"`` entry.
    """
    found_ticker = ealib.find_ticker(tickers_df, ticker)
    if found_ticker.empty:
        return None

    # The first matching row supplies the CIK used to build the metadata URL
    cik = found_ticker.iloc[0]["cik_str"]
    metadata = ealib.get_response_dict(ealib.metadata_url(cik), req_header, mrps=mrps)

    # A failed request must not abort the surrounding .apply(); report it and leave the code empty
    if metadata is None:
        logging.warning(f"No metadata response for ticker {ticker}; SIC code left empty")
        return None

    # .get() so a response without a "sic" entry also yields None instead of KeyError
    return metadata.get("sic")

# Store the looked-up code directly under "sic" (the column name the later cells
# read and filter on), and throttle with the notebook-wide `mrps` setting
# instead of a repeated literal.
comp_out_df["sic"] = comp_out_df["ticker"].apply(lambda ticker: get_sic(tickers_df, ticker, req_header, mrps))
missing_data_df["sic"] = missing_data_df["ticker"].apply(lambda ticker: get_sic(tickers_df, ticker, req_header, mrps))

In [None]:
# Snapshot of both tables while they still contain every company, one sheet per table
with pd.ExcelWriter("Select Comps with sic.xlsx", engine='openpyxl') as writer:
    for sheet_name, frame in (
        ('Verified Companies', comp_out_df),
        ('Companies with missing data', missing_data_df),
    ):
        frame.to_excel(writer, sheet_name=sheet_name, index=False)

In [None]:
# Rename the "sid" column to "sic" in both tables (in place)
for frame in (comp_out_df, missing_data_df):
    frame.rename(columns={"sid": "sic"}, inplace=True)

In [None]:
# Share of rows (in percent) whose 'sic' value is missing, per table
sic_na_share = {
    "comp_out_df": comp_out_df["sic"].isna().mean() * 100,
    "missing_data_df": missing_data_df["sic"].isna().mean() * 100,
}
comp_out_df_na_percentage = sic_na_share["comp_out_df"]
missing_data_df_na_percentage = sic_na_share["missing_data_df"]

for frame_name, na_share in sic_na_share.items():
    logging.info(f"Percentage of NA/NaN/None values in {frame_name}['sic']: {na_share:.2f}%")

In [None]:
# Tally the stringified 'sic' value of every row across both tables
all_companies = pd.concat([comp_out_df, missing_data_df], ignore_index=True)
sic_counter = Counter(f"{row['sic']}" for _, row in all_companies.iterrows())

# List each distinct code once, in order of first appearance (counts stay in sic_counter)
for key in sic_counter:
    print(f"{key}")


In [None]:
# SIC codes to exclude: rows carrying any of these are removed from both tables
pharma_biotech_sic_codes = [
    '2833', '2834', '2835', '2836', '8731', '8734', '3841', '3842', '3845'
]

# Compare numerically so the filter matches whether 'sic' holds strings such as
# '2834' or numbers such as 2834 / 2834.0 (a string-only isin() silently matches
# nothing on a numeric column). Missing or unparseable codes become NaN and are kept.
excluded_sic_numbers = [int(code) for code in pharma_biotech_sic_codes]

comp_out_df_filt     = comp_out_df[~pd.to_numeric(comp_out_df['sic'], errors="coerce").isin(excluded_sic_numbers)]
missing_data_df_filt = missing_data_df[~pd.to_numeric(missing_data_df['sic'], errors="coerce").isin(excluded_sic_numbers)]

In [None]:
# Export the filtered tables to a new workbook, one sheet per table
file_name = "No pharma&biotech Select Comps.xlsx"
with pd.ExcelWriter(file_name, engine='openpyxl') as writer:
    for sheet_name, frame in (
        ('Verified Companies', comp_out_df_filt),
        ('Companies with missing data', missing_data_df_filt),
    ):
        frame.to_excel(writer, sheet_name=sheet_name, index=False)

#### Require increase in Shares Outstanding for last 6 months

In [None]:
file_name = "No pharma&biotech Select Comps.xlsx"
header_row = 0

# Read both sheets back from the workbook, taking column names from row `header_row`
with pd.ExcelFile(file_name, engine='openpyxl') as reader:
    comp_out_df, missing_data_df = (
        pd.read_excel(reader, sheet_name=sheet, header=header_row)
        for sheet in ('Verified Companies', 'Companies with missing data')
    )

In [None]:
# Apply on whole dataframes 
def perc_change_shares_out(tickers_df, ticker, req_header, mrps):
    """Compute the average change in shares outstanding for one ticker.

    Args:
        tickers_df: Ticker table handed to ``ealib.find_ticker``.
        ticker: Ticker symbol to look up.
        req_header: Request headers forwarded to ``ealib.get_response_dict``.
        mrps: Request-rate setting forwarded to ``ealib.get_response_dict``.

    Returns:
        The result of ``ealib.comp_fact_avg_change`` for the matched
        shares-outstanding facts, or ``None`` when the ticker is not found or
        the company-facts request returns nothing.
    """
    found_ticker = ealib.find_ticker(tickers_df, ticker)
    if found_ticker.empty:
        return None

    # The first matching row supplies the CIK used to build the company-facts URL
    cik = found_ticker.iloc[0]["cik_str"]
    comp_facts = ealib.get_response_dict(ealib.companyfacts_url(cik), req_header, mrps)
    if comp_facts is None:
        # Log the plain ticker symbol and stop here: there is nothing to extract facts from
        logging.warning(f'Failed request when attempting to retrieve company facts for ticker {ticker}, or comp_facts dictionary empty')
        return None

    # Extract the desired company fact, matching any of the shares-outstanding names
    res = ealib.comp_facts_df(
        comp_facts,
        query_fact_substr=["NumberOfSharesOutstanding", "EntityCommonStockSharesOutstanding", "CommonStockSharesOutstanding"],
        sufficient=True
    )

    # 180 and 360 are passed through unchanged (presumably day windows; confirm in ealib)
    return ealib.comp_fact_avg_change(res, 180, 360)

# Compute the change for every ticker of both tables, throttled with the
# notebook-wide `mrps` setting instead of a repeated literal.
comp_out_df["% change in shares outstanding (~6 months)"] = comp_out_df["ticker"].apply(lambda ticker: perc_change_shares_out(tickers_df, ticker, req_header, mrps))
missing_data_df["% change in shares outstanding (~6 months)"] = missing_data_df["ticker"].apply(lambda ticker: perc_change_shares_out(tickers_df, ticker, req_header, mrps))

In [None]:
# Share of rows (in percent) whose share-change value is missing, per table
change_col = "% change in shares outstanding (~6 months)"
change_na_share = {
    "comp_out_df": comp_out_df[change_col].isna().mean() * 100,
    "missing_data_df": missing_data_df[change_col].isna().mean() * 100,
}
comp_out_df_na_percentage = change_na_share["comp_out_df"]
missing_data_df_na_percentage = change_na_share["missing_data_df"]

for frame_name, na_share in change_na_share.items():
    logging.info(f"Percentage of NA/NaN/None values in {frame_name}['{change_col}']: {na_share:.2f}%")

In [None]:
# Move non verifiable data to missing_data_df.
# One combined mask is used so that a row lacking BOTH the SIC code and the
# share-change value is moved exactly once; appending the two subsets
# separately would copy such a row into missing_data_df twice.
unverifiable_rows = (
    comp_out_df['sic'].isna()
    | comp_out_df['% change in shares outstanding (~6 months)'].isna()
)

missing_data_df = pd.concat([missing_data_df, comp_out_df[unverifiable_rows]], ignore_index=True)
comp_out_df = comp_out_df[~unverifiable_rows]


In [None]:
# Keep only the rows with a positive change in shares outstanding, order them
# from largest to smallest change, and store 'sic' as integers
comp_out_df = (
    comp_out_df
    .loc[lambda df: df['% change in shares outstanding (~6 months)'] > 0]
    .sort_values(by="% change in shares outstanding (~6 months)", ascending=False)
    .assign(sic=lambda df: df["sic"].astype(int))
)

#### Require market cap > 100mn

In [None]:
# Minimum market cap: 10**8, compared against the "Market Cap (USD)" column
mcap_min = 10**8

# Verified companies must be strictly above the minimum
comp_out_df = comp_out_df.loc[lambda df: df["Market Cap (USD)"].gt(mcap_min)]
# Companies with missing data are only removed when their cap is below the
# minimum; rows where the comparison is False (including NaN caps) are kept
missing_data_df = missing_data_df.loc[lambda df: ~df["Market Cap (USD)"].lt(mcap_min)]

In [None]:
# Order the verified companies by OCF burn relative to market cap, smallest value first
comp_out_df = comp_out_df.sort_values('Avg yearly OCF burn / Market Cap')

In [None]:
# Export the final tables to a new workbook, one sheet per table
file_name = "Updated Screening 19-07-2024.xlsx"
with pd.ExcelWriter(file_name, engine='openpyxl') as writer:
    for sheet_name, frame in (
        ('Verified Companies', comp_out_df),
        ('Companies with missing data', missing_data_df),
    ):
        frame.to_excel(writer, sheet_name=sheet_name, index=False)