# Scraper

- All non-scrapy scrappers are here

In [1]:
import os
from lxml import etree
import bs4 as bs
import requests
import time
import sys
from selenium import webdriver
import pandas as pd
from datetime import datetime

# Scrape Tickers

- This section is just to get a sets of tickers (the other info collected are not that necessary)

### SnP500

In [None]:
snp_ticker_df = pd.read_html('http://en.wikipedia.org/wiki/List_of_S%26P_500_companies', match= 'GICS')[0]

In [None]:
snp_ticker_df = snp_ticker_df[['Symbol', 'Security', 'GICS Sector', 'GICS Sub Industry', 'CIK']]
snp_ticker_df.head()

In [None]:
# if not df not saved yet, do this
# snp_ticker_df.to_csv('data_out/snp_ticker_df.csv')

In [2]:
snp_ticker_df = pd.read_csv('data_out/snp_ticker_df.csv', index_col=0)

In [None]:
'''
Long winded method using BS4

def get_sp500_tickers():
    resp = requests.get('http://en.wikipedia.org/wiki/List_of_S%26P_500_companies')
    soup = bs.BeautifulSoup(resp.text, 'lxml')
    table = soup.find('table', {'class': 'wikitable sortable'})
    tickers = []
    for row in table.findAll('tr')[1:]:
        ticker = row.findAll('td')[0].text
        if ticker[-1:]=='\n':
            ticker=ticker[:-1]
        ticker=ticker.replace('.','')
        tickers.append(ticker)
    return tickers
'''
pass

### Russell 3000

- could not find reliable source with tickers and companies

##### Source 1
- problem with this method is need to get tickers

In [None]:
import tabula

In [None]:
all_ticker_tables_df_list = tabula.read_pdf('data_in/russell3000.pdf', pages="all")

In [None]:
russell_tickers_df2 = pd.concat(all_ticker_tables_df_list).reset_index(drop=True)
russell_tickers_df2.head()

##### Source 2

 - this shd be better, but it is from a 3rd party source

In [None]:
russell_tickers_df = pd.read_html('http://www.kibot.com/Historical_Data/Russell_3000_Historical_Intraday_Data.aspx')[1]

In [None]:
russell_tickers_df = russell_tickers_df.rename(columns=russell_tickers_df.iloc[0]).drop(0)[
                        ['Symbol', 'Description', 'Industry', 'Sector']].reset_index(drop=True)
russell_tickers_df.head()

In [None]:
# if not df not saved yet, do this
# russell_tickers_df.to_csv('data_out/russell_tickers_df.csv')

In [None]:
russell_tickers_df = pd.read_csv('data_out/russell_tickers_df.csv', index_col=0)
russell_tickers_df.head()

### STI

In [None]:
sti_ticker_df = pd.read_html('https://en.wikipedia.org/wiki/Straits_Times_Index', match= 'Stock Symbol')[0]

In [None]:
sti_ticker_df.head()

In [None]:
sti_ticker_df.to_csv('data_out/sti_ticker_df.csv')

# Description, Sector, Industry scraping

- can try selenium grid for multiprocessing

### Selenium

In [3]:
import time
from selenium import webdriver
from tqdm import tqdm # If ur using Jupyter Lab
from tqdm.notebook import tqdm # If you are using Jupyter Notebook
from multiprocessing import Pool
from IPython.display import display, HTML

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

##### Desc Scraping

In [None]:
# %%time
ticker_list = snp_ticker_df.Symbol[:2]
# ticker_list = ['GOOG', 'GOOGL']
ticker_desc_df = pd.DataFrame(columns = ['Ticker', 'Description', 'Sector', 'Industry'])
wrong_ticker_list = []

options = webdriver.ChromeOptions()
options.add_argument('--ignore-certificate-errors')
options.add_argument('--headless')
options.add_argument('--no-sandbox') # Bypass OS security model

# If you print, tqdm bar will not work
for ticker in tqdm(ticker_list):
    url = 'https://finance.yahoo.com/quote/'+ticker+'/profile?p='+ticker
    print(url)    
    
    driver = webdriver.Chrome(options=options)
    driver.get(url)
    
    desc_xpath = '//*[@id="Col1-0-Profile-Proxy"]/section/section[2]/p'
    element_present = EC.presence_of_element_located(
                (By.XPATH, desc_xpath))
    try:
        WebDriverWait(driver, 10).until(element_present)
    except:
        print(ticker)
        wrong_ticker_list.append(ticker)
        continue
    
    desc = driver.find_element_by_xpath(desc_xpath).text
    sector = driver.find_element_by_xpath('//*[@id="Col1-0-Profile-Proxy"]/section/div[1]/div/div/p[2]/span[2]').text
    industry = driver.find_element_by_xpath('//*[@id="Col1-0-Profile-Proxy"]/section/div[1]/div/div/p[2]/span[4]').text
    
    ticker_desc_df.loc[len(ticker_desc_df)] = [ticker, desc, sector, industry]
    driver.close() #need?
display(ticker_desc_df)
wrong_ticker_list

In [None]:
ticker_desc_df.to_csv('data_out/ticker_desc_df.csv', encoding='utf-8-sig')

Comparison to Scrapy
- Sometime when running scrapy spider, it causes some links to stop be able to be accessed by this computer, which is a odd phenomenon
- after a while it is okay

Timing
- Scrapy: 78s
- Selenium: 1hr ++
    - Running headless makes it about 30% faster
    - Rruns faster than the original code (profs code) as well

##### Try using multithreading with Selenium

In [None]:
'''%%time

import multithreading_test

ticker_list = snp_ticker_df.Symbol.head(5)
ticker_desc = pd.DataFrame(columns = ['Ticker', 'Description', 'Sector', 'Industry'])

p = Pool(processes = 4)
results = p.map_async(multithreading_test.get_ticker_desc_3, ticker_list)
output = results.get()

ticker_desc_df = pd.DataFrame(output, columns = ['Ticker', 'Description', 'Sector', 'Industry'])
display(ticker_desc_df)
'''
pass

##### Price Scraping (Method 1 : by clicking Downloads)

- Have to use selenium (and not scrapy) for price scraping as you need to interact with the webpage to extract the prices (i.e. clicking button or scrolling)
    - even if use pd.read_HTML(), it only outputs 100 elements (no interactivity)

In [None]:
ticker_list = snp_ticker_df.Symbol
DOWNLOAD_FOLDER = os.getcwd() + os.path.sep + 'data_out\\price_csv_files\\'
wrong_ticker_list = []

date1 = datetime.strptime('20190101', "%Y%m%d")
date2 = datetime.strptime('20200101', "%Y%m%d")

time_str1 = str(int(datetime.timestamp(date1)))
time_str2 = str(int(datetime.timestamp(date2)))

options = webdriver.ChromeOptions()
options.add_argument('--ignore-certificate-errors')
prefs = {
            "profile.default_content_settings.popups": 0,
            "download.default_directory": DOWNLOAD_FOLDER,
            "directory_upgrade": True
        }
options.add_experimental_option('prefs', prefs)
options.add_argument('--headless')
options.add_argument('--no-sandbox') # Bypass OS security model

for ticker in tqdm(ticker_list):
    print('Processing: %s' %ticker)
    
    # Skip counters which have already been downloaded
    if '%s.csv' %ticker in os.listdir(DOWNLOAD_FOLDER):
        continue 
    
    url= 'https://finance.yahoo.com/quote/%s/history?' \
         'period1=%s&period2=%s&interval=1d&filter=history&frequency=1d' %(ticker, time_str1, time_str2)
    print(url)
    
    driver = webdriver.Chrome(chrome_options=options)
    driver.get(url)
    
    dload_button_xpath = '//*[@id="Col1-1-HistoricalDataTable-Proxy"]/section/div[1]/div[2]/span[2]/a'
    element_present = EC.presence_of_element_located(
                (By.XPATH, dload_button_xpath))
    try:
        WebDriverWait(driver, 10).until(element_present)
    except:
        print(ticker)
        wrong_ticker_list.append([ticker, "Driver Wait too long"])
        continue
        
    dload_button = driver.find_element_by_xpath(dload_button_xpath)
    dload_button.click()
    
    try:
        error_msg = driver.find_element_by_xpath('/html/body/pre').text
        print('Ticker Error: %s, %s' %(ticker, error_msg))
        wrong_ticker_list.append([ticker, error_msg])
        continue
    except Exception as e:
        pass
        
    # Wait for download to complete by checking for csv file locally
    # Note that repeated files not downloaded
    while ('%s.csv' %ticker not in os.listdir(DOWNLOAD_FOLDER)):
        time.sleep(0.1)
    driver.close()
wrong_ticker_list

- To run faster, restart the kernel and run again
- Sometime might get this error, just retry and it should be okay
    - WebDriverException: Message: unknown error: unable to discover open pages (FIXED)
    - SessionNotCreatedException: Message: session not created from tab crashed (Session info: headless chrome=84.0.4147.135)

- Error examples for wrong tickers
    - 404 Not Found: No data found, symbol may be delisted
    - 404 Not Found: Timestamp data missing
    - 400 Bad Request: Data doesn't exist for startDate = 1546272000, endDate = 1577808000

- Sometimes a valid symbol may not retrieve the data (if is in the bad_ticker list, with label "Driver Wait too long", but just rerun the code above and it should dload the data properly

- Bad Ticker Data (from above)
    - [['BRK.B', '404 Not Found: No data found, symbol may be delisted'],
    - ['BF.B', '404 Not Found: Timestamp data missing.'],
    - ['CARR', "400 Bad Request: Data doesn't exist for startDate = 1546272000, endDate = 1577808000"],
    - ['OTIS', "400 Bad Request: Data doesn't exist for startDate = 1546272000, endDate = 1577808000"]]

In [None]:
# Concat all dfs to form list of all prices
list_of_dfs = []
for filename in os.listdir(DOWNLOAD_FOLDER):
    df = pd.read_csv(DOWNLOAD_FOLDER + filename, index_col=0)[['Adj Close']]
    ticker = filename.split('.')[0]
    df.rename({'Adj Close':ticker}, axis=1, inplace=True)
    list_of_dfs.append(df)
ticker_price_df = pd.concat(list_of_dfs, axis=1)
ticker_price_df.to_csv('data_out/ticker_price_df.csv')
ticker_price_df

In [None]:
# OPTIONAL: Remove all downloaded files

# for filename in os.listdir(DOWNLOAD_FOLDER):
#     os.remove(DOWNLOAD_FOLDER + filename)

##### Price Scraping (Method 2: By scrolling down the page and scraping all values)

- Method 1 is better
- will take longer for longer pages 
- difficult to scrape dates, as some dates are for dividends (not scraped in this case)
- other than dates, output should be the same as Method 1


In [4]:
ticker_list = snp_ticker_df.Symbol
all_prices_dict = {}

date1 = datetime.strptime('20190101', "%Y%m%d")
date2 = datetime.strptime('20200101', "%Y%m%d")

# Yahoo webpage only shows 100 prices at a time
NUM_TIMES_TO_SCROLL = (date2 - date1).days//100 + 1

time_str1 = str(int(datetime.timestamp(date1)))
time_str2 = str(int(datetime.timestamp(date2)))

options = webdriver.ChromeOptions()
options.add_argument('--ignore-certificate-errors')
options.add_argument('--no-sandbox') # Bypass OS security model
options.add_argument('--headless')

for ticker in tqdm(ticker_list):
    print('Processing: %s' %ticker)
    
    url= 'https://finance.yahoo.com/quote/%s/history?' \
         'period1=%s&period2=%s&interval=1d&filter=history&frequency=1d' %(ticker, time_str1, time_str2)
    print(url)
    
    driver = webdriver.Chrome(chrome_options=options)
    driver.get(url)

    SCROLL_PAUSE_TIME = 0.1

    # getting scrollHeight using javascript dosent work for YahooFinance pg
    for _ in range(NUM_TIMES_TO_SCROLL):
        driver.execute_script("window.scrollTo(0, 100000);")
        time.sleep(SCROLL_PAUSE_TIME)
    
    items = driver.find_elements_by_xpath(
        '//*[@id="Col1-1-HistoricalDataTable-Proxy"]/section/div[2]/table/tbody/tr/td[6]/span')
    prices = list(reversed(list(map(lambda x: x.text, items))))
    all_prices_dict[ticker] = prices
    driver.close()
print(pd.DataFrame(all_prices_dict))
ticker_price_df2 = pd.DataFrame(all_prices_dict)
ticker_price_df2.to_csv('data_out/ticker_price_df2.csv')
display(ticker_price_df2)

HBox(children=(FloatProgress(value=0.0, max=505.0), HTML(value='')))

Processing: MMM
https://finance.yahoo.com/quote/MMM/history?period1=1546272000&period2=1577808000&interval=1d&filter=history&frequency=1d




Processing: ABT
https://finance.yahoo.com/quote/ABT/history?period1=1546272000&period2=1577808000&interval=1d&filter=history&frequency=1d
Processing: ABBV
https://finance.yahoo.com/quote/ABBV/history?period1=1546272000&period2=1577808000&interval=1d&filter=history&frequency=1d
Processing: ABMD
https://finance.yahoo.com/quote/ABMD/history?period1=1546272000&period2=1577808000&interval=1d&filter=history&frequency=1d
Processing: ACN
https://finance.yahoo.com/quote/ACN/history?period1=1546272000&period2=1577808000&interval=1d&filter=history&frequency=1d
Processing: ATVI
https://finance.yahoo.com/quote/ATVI/history?period1=1546272000&period2=1577808000&interval=1d&filter=history&frequency=1d
Processing: ADBE
https://finance.yahoo.com/quote/ADBE/history?period1=1546272000&period2=1577808000&interval=1d&filter=history&frequency=1d
Processing: AMD
https://finance.yahoo.com/quote/AMD/history?period1=1546272000&period2=1577808000&interval=1d&filter=history&frequency=1d
Processing: AAP
https://fi

Processing: BLL
https://finance.yahoo.com/quote/BLL/history?period1=1546272000&period2=1577808000&interval=1d&filter=history&frequency=1d
Processing: BAC
https://finance.yahoo.com/quote/BAC/history?period1=1546272000&period2=1577808000&interval=1d&filter=history&frequency=1d
Processing: BK
https://finance.yahoo.com/quote/BK/history?period1=1546272000&period2=1577808000&interval=1d&filter=history&frequency=1d
Processing: BAX
https://finance.yahoo.com/quote/BAX/history?period1=1546272000&period2=1577808000&interval=1d&filter=history&frequency=1d
Processing: BDX
https://finance.yahoo.com/quote/BDX/history?period1=1546272000&period2=1577808000&interval=1d&filter=history&frequency=1d
Processing: BRK.B
https://finance.yahoo.com/quote/BRK.B/history?period1=1546272000&period2=1577808000&interval=1d&filter=history&frequency=1d
Processing: BBY
https://finance.yahoo.com/quote/BBY/history?period1=1546272000&period2=1577808000&interval=1d&filter=history&frequency=1d
Processing: BIO
https://finance.

Processing: CXO
https://finance.yahoo.com/quote/CXO/history?period1=1546272000&period2=1577808000&interval=1d&filter=history&frequency=1d
Processing: COP
https://finance.yahoo.com/quote/COP/history?period1=1546272000&period2=1577808000&interval=1d&filter=history&frequency=1d
Processing: ED
https://finance.yahoo.com/quote/ED/history?period1=1546272000&period2=1577808000&interval=1d&filter=history&frequency=1d
Processing: STZ
https://finance.yahoo.com/quote/STZ/history?period1=1546272000&period2=1577808000&interval=1d&filter=history&frequency=1d
Processing: COO
https://finance.yahoo.com/quote/COO/history?period1=1546272000&period2=1577808000&interval=1d&filter=history&frequency=1d
Processing: CPRT
https://finance.yahoo.com/quote/CPRT/history?period1=1546272000&period2=1577808000&interval=1d&filter=history&frequency=1d
Processing: GLW
https://finance.yahoo.com/quote/GLW/history?period1=1546272000&period2=1577808000&interval=1d&filter=history&frequency=1d
Processing: CTVA
https://finance.y

Processing: EXPE
https://finance.yahoo.com/quote/EXPE/history?period1=1546272000&period2=1577808000&interval=1d&filter=history&frequency=1d
Processing: EXPD
https://finance.yahoo.com/quote/EXPD/history?period1=1546272000&period2=1577808000&interval=1d&filter=history&frequency=1d
Processing: EXR
https://finance.yahoo.com/quote/EXR/history?period1=1546272000&period2=1577808000&interval=1d&filter=history&frequency=1d
Processing: XOM
https://finance.yahoo.com/quote/XOM/history?period1=1546272000&period2=1577808000&interval=1d&filter=history&frequency=1d
Processing: FFIV
https://finance.yahoo.com/quote/FFIV/history?period1=1546272000&period2=1577808000&interval=1d&filter=history&frequency=1d
Processing: FB
https://finance.yahoo.com/quote/FB/history?period1=1546272000&period2=1577808000&interval=1d&filter=history&frequency=1d
Processing: FAST
https://finance.yahoo.com/quote/FAST/history?period1=1546272000&period2=1577808000&interval=1d&filter=history&frequency=1d
Processing: FRT
https://fina

Processing: HBAN
https://finance.yahoo.com/quote/HBAN/history?period1=1546272000&period2=1577808000&interval=1d&filter=history&frequency=1d
Processing: HII
https://finance.yahoo.com/quote/HII/history?period1=1546272000&period2=1577808000&interval=1d&filter=history&frequency=1d
Processing: IEX
https://finance.yahoo.com/quote/IEX/history?period1=1546272000&period2=1577808000&interval=1d&filter=history&frequency=1d
Processing: IDXX
https://finance.yahoo.com/quote/IDXX/history?period1=1546272000&period2=1577808000&interval=1d&filter=history&frequency=1d
Processing: INFO
https://finance.yahoo.com/quote/INFO/history?period1=1546272000&period2=1577808000&interval=1d&filter=history&frequency=1d
Processing: ITW
https://finance.yahoo.com/quote/ITW/history?period1=1546272000&period2=1577808000&interval=1d&filter=history&frequency=1d
Processing: ILMN
https://finance.yahoo.com/quote/ILMN/history?period1=1546272000&period2=1577808000&interval=1d&filter=history&frequency=1d
Processing: INCY
https://f

Processing: MPC
https://finance.yahoo.com/quote/MPC/history?period1=1546272000&period2=1577808000&interval=1d&filter=history&frequency=1d
Processing: MKTX
https://finance.yahoo.com/quote/MKTX/history?period1=1546272000&period2=1577808000&interval=1d&filter=history&frequency=1d
Processing: MAR
https://finance.yahoo.com/quote/MAR/history?period1=1546272000&period2=1577808000&interval=1d&filter=history&frequency=1d
Processing: MMC
https://finance.yahoo.com/quote/MMC/history?period1=1546272000&period2=1577808000&interval=1d&filter=history&frequency=1d
Processing: MLM
https://finance.yahoo.com/quote/MLM/history?period1=1546272000&period2=1577808000&interval=1d&filter=history&frequency=1d
Processing: MAS
https://finance.yahoo.com/quote/MAS/history?period1=1546272000&period2=1577808000&interval=1d&filter=history&frequency=1d
Processing: MA
https://finance.yahoo.com/quote/MA/history?period1=1546272000&period2=1577808000&interval=1d&filter=history&frequency=1d
Processing: MKC
https://finance.ya

Processing: PKG
https://finance.yahoo.com/quote/PKG/history?period1=1546272000&period2=1577808000&interval=1d&filter=history&frequency=1d
Processing: PH
https://finance.yahoo.com/quote/PH/history?period1=1546272000&period2=1577808000&interval=1d&filter=history&frequency=1d
Processing: PAYX
https://finance.yahoo.com/quote/PAYX/history?period1=1546272000&period2=1577808000&interval=1d&filter=history&frequency=1d
Processing: PAYC
https://finance.yahoo.com/quote/PAYC/history?period1=1546272000&period2=1577808000&interval=1d&filter=history&frequency=1d
Processing: PYPL
https://finance.yahoo.com/quote/PYPL/history?period1=1546272000&period2=1577808000&interval=1d&filter=history&frequency=1d
Processing: PNR
https://finance.yahoo.com/quote/PNR/history?period1=1546272000&period2=1577808000&interval=1d&filter=history&frequency=1d
Processing: PBCT
https://finance.yahoo.com/quote/PBCT/history?period1=1546272000&period2=1577808000&interval=1d&filter=history&frequency=1d
Processing: PEP
https://fina

Processing: LUV
https://finance.yahoo.com/quote/LUV/history?period1=1546272000&period2=1577808000&interval=1d&filter=history&frequency=1d
Processing: SWK
https://finance.yahoo.com/quote/SWK/history?period1=1546272000&period2=1577808000&interval=1d&filter=history&frequency=1d
Processing: SBUX
https://finance.yahoo.com/quote/SBUX/history?period1=1546272000&period2=1577808000&interval=1d&filter=history&frequency=1d
Processing: STT
https://finance.yahoo.com/quote/STT/history?period1=1546272000&period2=1577808000&interval=1d&filter=history&frequency=1d
Processing: STE
https://finance.yahoo.com/quote/STE/history?period1=1546272000&period2=1577808000&interval=1d&filter=history&frequency=1d
Processing: SYK
https://finance.yahoo.com/quote/SYK/history?period1=1546272000&period2=1577808000&interval=1d&filter=history&frequency=1d
Processing: SIVB
https://finance.yahoo.com/quote/SIVB/history?period1=1546272000&period2=1577808000&interval=1d&filter=history&frequency=1d
Processing: SYF
https://financ

Processing: DIS
https://finance.yahoo.com/quote/DIS/history?period1=1546272000&period2=1577808000&interval=1d&filter=history&frequency=1d
Processing: WM
https://finance.yahoo.com/quote/WM/history?period1=1546272000&period2=1577808000&interval=1d&filter=history&frequency=1d
Processing: WAT
https://finance.yahoo.com/quote/WAT/history?period1=1546272000&period2=1577808000&interval=1d&filter=history&frequency=1d
Processing: WEC
https://finance.yahoo.com/quote/WEC/history?period1=1546272000&period2=1577808000&interval=1d&filter=history&frequency=1d
Processing: WFC
https://finance.yahoo.com/quote/WFC/history?period1=1546272000&period2=1577808000&interval=1d&filter=history&frequency=1d
Processing: WELL
https://finance.yahoo.com/quote/WELL/history?period1=1546272000&period2=1577808000&interval=1d&filter=history&frequency=1d
Processing: WST
https://finance.yahoo.com/quote/WST/history?period1=1546272000&period2=1577808000&interval=1d&filter=history&frequency=1d
Processing: WDC
https://finance.ya

ValueError: arrays must all be same length

In [16]:
filter_prices_dict = {k:v for k,v in all_prices_dict.items() if len(v)==253}
ticker_price_df2 = pd.DataFrame(filter_prices_dict)
ticker_price_df2.to_csv('data_out/ticker_price_df2.csv')
display(ticker_price_df2)

Unnamed: 0,MMM,ABT,ABBV,ABMD,ACN,ATVI,ADBE,AMD,AAP,AES,...,WYNN,XEL,XRX,XLNX,XYL,YUM,ZBRA,ZBH,ZION,ZTS
0,179.25,70.28,82.59,325.04,137.59,45.90,226.24,18.46,156.23,13.63,...,94.98,47.34,18.62,82.96,65.21,89.10,159.23,102.49,38.52,84.62
1,179.64,67.53,79.94,309.96,137.18,46.35,224.57,18.83,156.69,13.37,...,100.25,46.33,18.97,84.60,65.00,88.64,156.24,101.07,39.32,83.57
2,172.87,64.34,77.30,302.29,132.50,44.70,215.70,17.05,161.58,13.36,...,99.28,46.15,18.55,81.41,62.78,86.41,146.88,99.24,39.10,80.68
3,179.98,66.18,79.79,313.44,137.65,46.49,226.19,19.00,157.57,13.80,...,103.28,46.60,19.27,85.18,65.42,88.66,152.97,102.63,40.30,83.85
4,179.57,67.17,80.96,314.80,138.13,47.80,229.26,20.57,159.89,13.87,...,105.12,46.40,19.68,87.43,64.46,88.56,155.29,102.67,40.46,84.35
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
248,171.81,86.21,86.30,172.30,209.08,58.43,329.64,46.54,159.56,19.24,...,138.28,62.04,36.04,97.28,77.83,98.88,254.33,149.94,50.08,132.33
249,171.72,86.21,86.28,171.01,209.52,58.51,331.20,46.63,158.27,19.41,...,139.77,61.97,36.15,97.47,77.79,100.32,254.42,149.62,50.21,132.44
250,172.37,86.33,85.67,169.27,209.68,58.80,330.79,46.18,157.36,19.46,...,138.65,62.30,35.98,97.21,78.03,100.43,256.00,149.33,50.05,132.66
251,170.98,85.74,85.02,167.29,208.12,58.50,328.34,45.52,158.12,19.52,...,138.22,62.47,35.83,96.48,77.93,99.18,254.11,148.07,50.32,131.62


In [19]:
# Tickers with errors (incomplete data)
{k:len(v) for k,v in all_prices_dict.items() if len(v)!=253}

{'BRK.B': 0,
 'BF.B': 0,
 'CARR': 0,
 'CTVA': 153,
 'DOW': 199,
 'FOXA': 205,
 'FOX': 204,
 'HWM': 62,
 'NLOK': 207,
 'OTIS': 0,
 'TT': 206,
 'VIAC': 18}

- Runtime
    - 2:45:42 (505/505 [2:45:42<00:00, 19.69s/it]

In [None]:
from playsound import playsound
def ALARM():
    for i in range(10):
        playsound('data_in/bell.mp3')

ALARM()

##### Ratio Scraping 

In [None]:
ticker_list = snp_ticker_df.Symbol.head(2)
ticker_ratios_df = pd.DataFrame(columns = ['Ticker', 'mkt_cap', 'pb_ratio', 'beta', 'profit_margin', 'roa', 'roe'])

options = webdriver.ChromeOptions()
options.add_argument('--ignore-certificate-errors')
options.add_argument('--headless')
options.add_argument('--no-sandbox') # Bypass OS security model

# If you print, tqdm bar will not work
for ticker in tqdm(ticker_list):
    url = 'https://finance.yahoo.com/quote/'+ticker+'/key-statistics?p='+ticker
    print(url)    
    
    driver = webdriver.Chrome(options=options)
    driver.get(url)
    time.sleep(2) # wait for webpage to render all required info or to emulate human user .. not sure
    
    mkt_cap = driver.find_elements_by_xpath('//*[@id="Col1-0-KeyStatistics-Proxy"]/section/div[3]/div[1]/div[2]/div/div[1]/div[1]/table/tbody/tr[1]/td[3]')[0].text
    pb_ratio = driver.find_elements_by_xpath('//*[@id="Col1-0-KeyStatistics-Proxy"]/section/div[3]/div[1]/div[2]/div/div[1]/div[1]/table/tbody/tr[7]/td[3]')[0].text
    beta = driver.find_elements_by_xpath('//*[@id="Col1-0-KeyStatistics-Proxy"]/section/div[3]/div[2]/div/div[1]/div/div/table/tbody/tr[1]/td[2]')[0].text
    profit_margin = driver.find_elements_by_xpath('//*[@id="Col1-0-KeyStatistics-Proxy"]/section/div[3]/div[3]/div/div[2]/div/div/table/tbody/tr[1]/td[2]')[0].text
    roa = driver.find_elements_by_xpath('//*[@id="Col1-0-KeyStatistics-Proxy"]/section/div[3]/div[3]/div/div[3]/div/div/table/tbody/tr[1]/td[2]')[0].text
    roe = driver.find_elements_by_xpath('//*[@id="Col1-0-KeyStatistics-Proxy"]/section/div[3]/div[3]/div/div[3]/div/div/table/tbody/tr[2]/td[2]')[0].text

    ticker_ratios_df.loc[len(ticker_ratios_df)] = [ticker, mkt_cap, pb_ratio, beta, profit_margin, roa, roe]
    driver.close()
ticker_ratios_df.to_csv('data_out/ticker_ratios_df.csv')
display(ticker_ratios_df)

# Extra

##### Implicit wait, selenium (google.com)

In [None]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

driver = webdriver.Chrome()
# driver.implicitly_wait(10)

driver.get("http://google.com")
driver.maximize_window()

print("Implicit Wait Example")

inputElement = driver.find_element_by_xpath('//*[@id="tsf"]/div[2]/div[1]/div[1]/div/div[2]/input')  # .find_element_by_id("lst-ib")
inputElement.send_keys("Techbeamers")
inputElement.submit()

driver.close()

##### Multiprocessing test

- for pool to work on jupyter notebook need to import function
- with muti processing cannot change global variables (there is a fix for this)
- printing is not trivial in multiprocessing

- still need to figure out if multiprocessing or multithreading is more suitable for scraping

In [None]:
%%timeit
p=Pool(processes = 4)
output = p.map(multithreading_test.worker,range(3000000))
print(output[-3:])

In [None]:
%%timeit
p = Pool(processes=4)
results = p.map_async(multithreading_test.worker, range(3000000))
output = results.get()
print(output[-3:])

In [None]:
%%timeit
output = list(map(multithreading_test.worker, range(3000000)))
print(output[-3:])

- I think in this case using ordinary functions is faster than using Pool due to the high over head
- only use pool when there is high CPU requirement, high iterations

In [None]:
# Number of cores for multiprocessing
import multiprocessing
multiprocessing.cpu_count()

##### Check Chrome Driver Version

In [None]:
driver = webdriver.Chrome()
str1 = driver.capabilities['browserVersion']
str2 = driver.capabilities['chrome']['chromedriverVersion'].split(' ')[0]
print(str1)
print(str2)
print(str1[0:2])
print(str2[0:2])
if str1[0:2] != str2[0:2]: 
    print("please download correct chromedriver version")

##### Running other scripts

In [None]:
%run 

##### Yield

In [None]:
def f():
    yield 1
    yield 2
    yield 3
[i for i in f()]

In [None]:
print(f())

##### Scrapy Tutorial

In [None]:
# Settings for notebook
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
import scrapy
from scrapy.crawler import CrawlerProcess

In [None]:
import json

class JsonWriterPipeline(object):
    def open_spider(self, spider):
        self.file = open('quoteresult.jl', 'w')

    def close_spider(self, spider):
        self.file.close()

    def process_item(self, item, spider):
        line = json.dumps(dict(item)) + "\n"
        self.file.write(line)
        return item

In [None]:
import logging

class QuotesSpider(scrapy.Spider):
    name = "quotes"
    start_urls = [
        'http://quotes.toscrape.com/page/1/',
        'http://quotes.toscrape.com/page/2/',
    ]
    custom_settings = {
        'LOG_LEVEL': logging.WARNING,
        'ITEM_PIPELINES': {'__main__.JsonWriterPipeline': 1}, # Used for pipeline 1
        'FEED_FORMAT':'json',                                 # Used for pipeline 2
        'FEED_URI': 'quoteresult.json'                        # Used for pipeline 2
    }
    
    def parse(self, response):
        for quote in response.css('div.quote'):
            yield {
                'text': quote.css('span.text::text').extract_first(),
                'author': quote.css('span small::text').extract_first(),
                'tags': quote.css('div.tags a.tag::text').extract(),
            }

In [None]:
process = CrawlerProcess({
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
})

process.crawl(QuotesSpider)
process.start()

In [None]:
import pandas as pd
dfjson = pd.read_json('quoteresult.json')
dfjson

In [None]:
dfjl = pd.read_json('quoteresult.jl', lines=True)
dfjl

In [None]:
dfjson.to_pickle('quotejson.pickle')
dfjl.to_pickle('quotejl.pickle')

##### tqdm

In [None]:
from tqdm import tqdm

j=0
for i in tqdm(range(1000000), ):
    j+=i
    
print(j)

In [None]:
import time
import sys
from tqdm import trange


def do_something():
    time.sleep(1)

def do_another_something():
    time.sleep(1)


for i in trange(10):
    do_something()

#     for j in trange(2):
#         do_another_something()

In [None]:
from tqdm.notebook import trange, tqdm
import time

for i in trange(6):
    do_something()

    for j in trange(2):
        do_another_something()


##### Test Scraping 10K reports

In [None]:

for file in os.listdir("data_in"):
    if file.endswith(".txt") and "Item1_excerpt" in file:
        print(file)
        with open("data_in/" + file, "rt",encoding='utf-8') as f:
            line_num = 1
            for line in f:
                line = line.lower()
                if ("part i" in line) or ("item 1" in line) or ("business" in line and len(line) < 20) or ("introduction" in line):
                    print(line_num, line)                    
                if ("item 1a" in line) or ("risk factors" in line):
                    print(line_num, line)                    
                line_num += 1
        break

In [None]:
import glob

path = 'data_in'

files = [f for f in glob.glob(path + "*/*.txt", recursive=True)]

for f in files:
    print(f)

In [None]:
glob.glob(path + "*/*.txt", recursive=True)