In [4]:
import pandas as pd
import numpy as np
import plotly.io as pio
import plotly.express as px
import plotly.graph_objects as go
from tqdm.auto import tqdm
from datetime import datetime

from bs4 import BeautifulSoup
import requests
from lxml import etree
from time import sleep

from concurrent.futures import ThreadPoolExecutor
from concurrent.futures import as_completed

In [5]:
# Load ./data/nasdaq_list.csv; thread_scrape reads its 'Name', 'Symbol' and
# 'Industry' columns by row label.
nasdaq_df = pd.read_csv('./data/nasdaq_list.csv')

In [56]:
def scrape_marketwatch(symbol: str):
    """Download and parse the MarketWatch income-statement table for one ticker.

    The page ``/investing/stock/<symbol>/financials/income`` is requested (the
    symbol is lower-cased for the URL), the table with class
    ``table table--overflow align--right`` is located, and pandas parses it.

    Args:
        symbol: Ticker symbol to look up.

    Returns:
        The parsed table as a DataFrame with its last column removed, or
        ``None`` when the request or the parsing step fails. A short message
        naming the symbol is printed on failure; no exception is raised.
    """
    from io import StringIO  # local import: keeps this cell self-contained

    headers = {'User-Agent': 'Mozilla/5.0'}
    try:
        # URL building stays inside the try so a non-string symbol (e.g. NaN
        # from the CSV) is reported as a failure instead of killing the worker.
        url = f'https://www.marketwatch.com/investing/stock/{symbol.lower()}/financials/income'
        resp = requests.get(url, headers=headers, timeout=15)
        # Treat 4xx/5xx as request failures rather than parsing an error page.
        resp.raise_for_status()
        # Name the parser explicitly so bs4 does not have to guess (and warn).
        soup = BeautifulSoup(resp.text, 'lxml')
    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
        print(symbol, "request failed.")
        return None

    try:
        financial_table = soup.find('table', attrs={'class': 'table table--overflow align--right'})
        if financial_table is None:
            # Page loaded but contains no financials table.
            print(symbol, "parsing failed.")
            return None
        # read_html expects a file-like object; passing literal HTML as a
        # plain string is deprecated in recent pandas releases.
        df = pd.read_html(StringIO(str(financial_table)))[0]
        df = df.drop(df.columns[[-1]], axis=1)  # drop 5-year trend column
    except Exception:
        print(symbol, "parsing failed.")
        return None

    return df

In [69]:
def thread_scrape(start: int, end: int):
    """Scrape income statements for rows ``start`` to ``end - 1`` of ``nasdaq_df``.

    For each row label in ``range(start, end)`` the 'Name', 'Symbol' and
    'Industry' values are read from the module-level ``nasdaq_df``, the symbol
    is passed to ``scrape_marketwatch``, and the returned table is tagged with
    three outer index levels (Name, Symbol, Industry). Symbols for which
    ``scrape_marketwatch`` returns ``None`` are skipped.

    Args:
        start: First row label to process (inclusive).
        end: Row label at which to stop (exclusive).

    Returns:
        One DataFrame holding all successfully scraped tables stacked
        vertically, or an empty DataFrame when nothing was scraped.
    """
    # Collect the per-symbol frames and concatenate once at the end; growing a
    # DataFrame with concat inside the loop re-copies every earlier row.
    frames = []
    for i in tqdm(range(start, end)):
        name = nasdaq_df.loc[i, 'Name']
        symbol = nasdaq_df.loc[i, 'Symbol']
        industry = nasdaq_df.loc[i, 'Industry']

        df = scrape_marketwatch(symbol)
        if df is None:
            continue

        # Add the index levels one at a time, innermost first, so the final
        # order is Name > Symbol > Industry > original row number.
        df = pd.concat([df], keys=[industry], names=['Industry'])
        df = pd.concat([df], keys=[symbol], names=['Symbol'])
        df = pd.concat([df], keys=[name], names=['Name'])

        frames.append(df)

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)

In [72]:
# Split the rows of nasdaq_df into consecutive (start, end) ranges of at most
# CHUNK_SIZE rows; each range becomes one task for the thread pool.
CHUNK_SIZE = 500
work_list = [(i, min(i + CHUNK_SIZE, len(nasdaq_df))) for i in range(0, len(nasdaq_df), CHUNK_SIZE)]
print(work_list)

with ThreadPoolExecutor(max_workers=8) as executor:
    futures = [executor.submit(thread_scrape, start, end) for start, end in work_list]
    # Gather in submission order (not completion order) so the row order of
    # the saved CSV is the same on every run. The per-chunk DataFrame dump is
    # dropped; the tqdm bars already report progress.
    chunk_results = [future.result() for future in futures]

# Concatenate once, skipping chunks in which every symbol failed.
scraped_chunks = [chunk for chunk in chunk_results if not chunk.empty]
df = pd.concat(scraped_chunks) if scraped_chunks else pd.DataFrame()

df.to_csv('./data/nasdaq_full2.csv')
df

[(0, 500), (500, 1000), (1000, 1500), (1500, 2000), (2000, 2500), (2500, 3000), (3000, 3500), (3500, 4000), (4000, 4500), (4500, 4634)]


  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

BOWXU parsing failed.
RWAY parsing failed.
FSTX parsing failed.
NAAC parsing failed.
DCRCU parsing failed.
FNVT parsing failed.
LBBBU parsing failed.
SVOK parsing failed.
TBLD parsing failed.
MSON parsing failed.
CMPS parsing failed.
TLC parsing failed.
GRVI parsing failed.
GWACU parsing failed.
ENCP parsing failed.
EM parsing failed.
ACGLP parsing failed.
AHAC parsing failed.
MCAA parsing failed.
ASCAU parsing failed.
ROIV parsing failed.
CMCA parsing failed.
IONR parsing failed.
AMYT parsing failed.
MCADU parsing failed.
TOUR parsing failed.
PPGH parsing failed.
IQMDU parsing failed.
HORIU parsing failed.
MYNZ parsing failed.
MDCA parsing failed.
GXGX parsing failed.
VINP parsing failed.
UTRS parsing failed.
CDAQU parsing failed.
CHSCN parsing failed.
TLGY parsing failed.
HSAQ parsing failed.
CDRO parsing failed.
IVCB parsing failed.
PLXP parsing failed.
NBN parsing failed.
CRSA parsing failed.
PFHC parsing failed.
MOGO parsing failed.
SPK parsing failed.
GLBL parsing failed.
RGTI pa

  0%|          | 0/500 [00:00<?, ?it/s]

 parsing failed.
ABVC parsing failed.
ALRM parsing failed.
KYMR parsing failed.
VRAY parsing failed.
LAZY parsing failed.
HHGC parsing failed.
WAFDP parsing failed.
PTON parsing failed.
NEO parsing failed.
FWP parsing failed.
LQDT parsing failed.
EBONANGN parsing failed.
 parsing failed.
REVE parsing failed.
MMSI parsing failed.
NGM parsing failed.
NVTS parsing failed.
DWSN parsing failed.
DALN parsing failed.
BLKBCMPR parsing failed.
RMGCU parsing failed.
GDST  parsing failed.
parsing failed.
ADIL parsing failed.
WILC parsing failed.
NBSE parsing failed.
GDNRU parsing failed.
CLOVBHF parsing failed.
QRTEB parsing failed.
 parsing failed.
OSPNMEOAU parsing failed.
 parsing failed.
CNTX parsing failed.
PAFO parsing failed.
UROY parsing failed.
ARVL parsing failed.
ZIONO parsing failed.
PCHBAND parsing failed.
 parsing failed.
REVBU parsing failed.
CDTX parsing failed.
ADMP parsing failed.
AFAQU parsing failed.
GABC parsing failed.
GLAQU parsing failed.
GRFS parsing failed.
AFIB parsing 

  0%|          | 0/134 [00:00<?, ?it/s]

DILA parsing failed.
ELDN parsing failed.
VTAQ parsing failed.
AVO parsing failed.
EPIX parsing failed.
CRBP parsing failed.
ENG parsing failed.
TANH parsing failed.
NCSM parsing failed.
TPBA parsing failed.
RVNC parsing failed.
CMCTP parsing failed.
DCTH parsing failed.
MKDWTMAU parsing failed.
 parsing failed.
AACG parsing failed.
FRSGU parsing failed.
EQBK parsing failed.
AMSF parsing failed.
PIRSSOHO parsing failed.
 parsing failed.
SSKN parsing failed.
NVFY parsing failed.
ISAA parsing failed.
MCAF parsing failed.
GEVO parsing failed.
MOMO parsing failed.
AQST parsing failed.
AMAOU parsing failed.
ASPALPTH parsing failed.
 parsing failed.
AFRI parsing failed.
CCBG parsing failed.
MYSZ parsing failed.
VMEO parsing failed.
GLYC parsing failed.
ATOS parsing failed.
LHDX parsing failed.
TWCBAEI parsing failed.
 parsing failed.
KERN parsing failed.
CPSI parsing failed.
LMAT parsing failed.
BRQS parsing failed.
RNDB parsing failed.
SLNG parsing failed.
SZZL parsing failed.
CLNE parsing 

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Item Item,2017,2018,2019,2020,2021,2022,2016
Name,Symbol,Industry,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Orthofix Medical Inc,OFIX,헬스케어 장비 및 용품,0,Sales/Revenue Sales/Revenue,433.82M,453.04M,459.96M,406.56M,464.48M,,
Orthofix Medical Inc,OFIX,헬스케어 장비 및 용품,1,Sales Growth Sales Growth,-,4.43%,1.53%,-11.61%,14.25%,,
Orthofix Medical Inc,OFIX,헬스케어 장비 및 용품,2,Cost of Goods Sold (COGS) incl. D&A Cost of Go...,93.04M,96.63M,105.68M,108.69M,122.82M,,
Orthofix Medical Inc,OFIX,헬스케어 장비 및 용품,3,COGS Growth COGS Growth,-,3.86%,9.37%,2.85%,13.00%,,
Orthofix Medical Inc,OFIX,헬스케어 장비 및 용품,4,COGS excluding D&A COGS excluding D&A,72.91M,77.97M,80.98M,78.14M,93.22M,,
...,...,...,...,...,...,...,...,...,...,...,...
SAB Biotherapeutics Inc,SABS,생명과학 및 메디컬 리서치,52,EPS (Diluted) Growth EPS (Diluted) Growth,,,,-,-150.89%,,
SAB Biotherapeutics Inc,SABS,생명과학 및 메디컬 리서치,53,Diluted Shares Outstanding Diluted Shares Outs...,,,,25.97M,43.49M,,
SAB Biotherapeutics Inc,SABS,생명과학 및 메디컬 리서치,54,EBITDA EBITDA,,,,(34.13M),(72.62M),,
SAB Biotherapeutics Inc,SABS,생명과학 및 메디컬 리서치,55,EBITDA Growth EBITDA Growth,,,,-,-112.74%,,


In [None]:
import re

def _conv_to_float(s):
    if s == '-':
        return None

    if s[-1] == '%':
        s = s.replace('%', '')
    if s[-1] in list('BMK'):
        powers = {'B': 10 ** 9, 'M': 10 ** 6, 'K': 10 ** 3, '': 1}
        m = re.search("([0-9\.]+)(M|B|K|)", s)
        if m:
            val, mag = m.group(1), m.group(2)
            return float(val) * powers[mag]
    try:
        result = float(s)
    except:
        result = None
    return result

In [None]:
# Read the market-cap CSV and discard its first column.
# NOTE(review): the first column looks like a row index written by an earlier
# to_csv call -- confirm against the file.
df = pd.read_csv('./data/nasdaq_marketcap_full.csv')
df = df.drop(columns=df.columns[0])

# Columns holding formatted text that _conv_to_float turns into floats.
conv_list = ['MarketCap', 'Income', 'Sales', 'GrossMargin', 'OperatingMargin', 'ProfitMargin']
df = df.assign(**{name: df[name].apply(_conv_to_float) for name in conv_list})

# Keep only the rows that have a MarketCap value after conversion.
nasdaq_df_proc = df[df['MarketCap'].notna()]

nasdaq_df_proc