In [8]:
from lxml import etree
import bs4 as bs
import requests
import time
import sys
from selenium import webdriver
import pandas as pd

# Scrape Tickers

- This section is just to get a sets of tickers (the other info collected are not that necessary)

### SnP500

In [3]:
snp_ticker_df = pd.read_html('http://en.wikipedia.org/wiki/List_of_S%26P_500_companies', match= 'GICS')[0]

In [9]:
snp_ticker_df = snp_ticker_df[['Symbol', 'Security', 'GICS Sector', 'GICS Sub Industry', 'CIK']]
snp_ticker_df.head()

Unnamed: 0,Symbol,Security,GICS Sector,GICS Sub Industry,CIK
0,MMM,3M Company,Industrials,Industrial Conglomerates,66740
1,ABT,Abbott Laboratories,Health Care,Health Care Equipment,1800
2,ABBV,AbbVie Inc.,Health Care,Pharmaceuticals,1551152
3,ABMD,ABIOMED Inc,Health Care,Health Care Equipment,815094
4,ACN,Accenture plc,Information Technology,IT Consulting & Other Services,1467373


In [16]:
# snp_ticker_df.to_csv('data_out/snp_ticker_df.csv')

In [108]:
snp_ticker_df = pd.read_csv('data_out/snp_ticker_df.csv', index_col=0)

In [4]:
'''
Long winded method using BS4

def get_sp500_tickers():
    resp = requests.get('http://en.wikipedia.org/wiki/List_of_S%26P_500_companies')
    soup = bs.BeautifulSoup(resp.text, 'lxml')
    table = soup.find('table', {'class': 'wikitable sortable'})
    tickers = []
    for row in table.findAll('tr')[1:]:
        ticker = row.findAll('td')[0].text
        if ticker[-1:]=='\n':
            ticker=ticker[:-1]
        ticker=ticker.replace('.','')
        tickers.append(ticker)
    return tickers
'''
pass

In [None]:
resp = requests.get('http://en.wikipedia.org/wiki/List_of_S%26P_500_companies')
soup = bs.BeautifulSoup(resp.text, 'lxml')
table = soup.find('table', {'class': 'wikitable sortable'})
tickers = []
for row in table.findAll('tr')[1:]:
    ticker = row.findAll('td')[0].text
    if ticker[-1:]=='\n':
        ticker=ticker[:-1]
    ticker=ticker.replace('.','')
    tickers.append(ticker)
return tickers

### Russell 3000

- could not find reliable source with tickers and companies

##### Source 1

In [None]:
import tabula

In [None]:
all_ticker_tables_df_list = tabula.read_pdf('russell3000.pdf', pages="all")

In [None]:
russell3000_tickers_df = pd.concat(all_ticker_tables_df_list).reset_index(drop=True)
russell3000_tickers_df.head()

- problem with this method is need to get tickers

##### Source 2

In [None]:
russell_ticker_df = pd.read_html('http://www.kibot.com/Historical_Data/Russell_3000_Historical_Intraday_Data.aspx')[1]

In [None]:
russell_ticker_df.head()

In [None]:
russell_ticker_df = russell_ticker_df.rename(columns=russell_ticker_df.iloc[0]).drop(0)[
                        ['Symbol', 'Description', 'Industry', 'Sector']].reset_index(drop=True)
russell_ticker_df.head()

### STI

In [None]:
sti_ticker_df = pd.read_html('https://en.wikipedia.org/wiki/Straits_Times_Index', match= 'Stock Symbol')[0]

In [None]:
sti_ticker_df.head()

# Description, Sector, Industry scraping

- can try selenium grid for multiprocessing

### Selenium

In [56]:
import time
from selenium import webdriver
from tqdm import tqdm # If ur using Jupyter Lab
from tqdm.notebook import tqdm # If you are using Jupyter Notebook
from multiprocessing import Pool
from IPython.display import display, HTML

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

##### Desc Scraping

In [173]:
# %%time
ticker_list = snp_ticker_df.Symbol.head(2)
ticker_desc = pd.DataFrame(columns = ['Ticker', 'Description', 'Sector', 'Industry'])

options = webdriver.ChromeOptions()
options.add_argument('--ignore-certificate-errors')
options.add_argument('--headless')
options.add_argument('--no-sandbox') # Bypass OS security model

# If you print, tqdm bar will not work
for ticker in tqdm(ticker_list):
    url = 'https://finance.yahoo.com/quote/'+ticker+'/profile?p='+ticker
    print(url)    
    
    driver = webdriver.Chrome(options=options)
    driver.get(url)
    time.sleep(2) # wait for webpage to render all required info or to emulate human user .. not sure
    
    desc = driver.find_elements_by_xpath('//*[@id="Col1-0-Profile-Proxy"]/section/section[2]/p')[0].text
    sector = driver.find_elements_by_xpath('//*[@id="Col1-0-Profile-Proxy"]/section/div[1]/div/div/p[2]/span[2]')[0].text
    industry = driver.find_elements_by_xpath('//*[@id="Col1-0-Profile-Proxy"]/section/div[1]/div/div/p[2]/span[4]')[0].text
    
    ticker_desc.loc[len(ticker_desc)] = [ticker, desc, sector, industry]
    driver.close() #need?
display(ticker_desc)

HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))

https://finance.yahoo.com/quote/MMM/profile?p=MMM
https://finance.yahoo.com/quote/ABT/profile?p=ABT



Unnamed: 0,Ticker,Description,Sector,Industry
0,MMM,"3M Company develops, manufactures, and markets...",Industrials,Specialty Industrial Machinery
1,ABT,"Abbott Laboratories discovers, develops, manuf...",Healthcare,Medical Devices


- Running headless makes it about 30% faster
- This runs faster than the original code as well
- This was the average speed from 7 runs
    - 1min 45s ± 16.1 s per loop (mean ± std. dev. of 7 runs, 1 loop each)
- running on scrapy took 5s!!

##### Try using multithreading

In [None]:
%%time

import multithreading_test

ticker_list = snp_ticker_df.Symbol.head(5)
ticker_desc = pd.DataFrame(columns = ['Ticker', 'Description', 'Sector', 'Industry'])

p = Pool(processes = 4)
results = p.map_async(multithreading_test.get_ticker_desc_3, ticker_list)
output = results.get()

ticker_desc_df = pd.DataFrame(output, columns = ['Ticker', 'Description', 'Sector', 'Industry'])
display(ticker_desc_df)

##### Price Scraping (Method 1 : by clicking Downloads)

- Have to use selenium (and not scrapy) for price scraping as you need to interact with the webpage to extract the prices (i.e. clicking button or scrolling)

In [166]:
ticker_list = snp_ticker_df.Symbol.head(3)
DOWNLOAD_FOLDER = os.getcwd() + os.path.sep + 'data_out\\price_csv_files\\'

date1 = datetime.strptime('20190101', "%Y%m%d")
date2 = datetime.strptime('20200101', "%Y%m%d")

time_str1 = str(int(datetime.timestamp(date1)))
time_str2 = str(int(datetime.timestamp(date2)))

options = webdriver.ChromeOptions()
options.add_argument('--ignore-certificate-errors')
prefs = {
            "profile.default_content_settings.popups": 0,
            "download.default_directory": DOWNLOAD_FOLDER,
            "directory_upgrade": True
        }
options.add_experimental_option('prefs', prefs)
options.add_argument('--headless')

for ticker in tqdm(ticker_list):
    print('Processing: %s' %ticker)
    
    url= 'https://finance.yahoo.com/quote/%s/history?' \
         'period1=%s&period2=%s&interval=1d&filter=history&frequency=1d' %(ticker, time_str1, time_str2)
    print(url)
    
    driver = webdriver.Chrome(chrome_options=options)
    driver.get(url)
    
    element_present = EC.presence_of_element_located(
                (By.XPATH, '//*[@id="Col1-1-HistoricalDataTable-Proxy"]/section/div[1]/div[2]/span[2]/a'))
    WebDriverWait(driver, 10).until(element_present)
    
    dload_button = driver.find_element_by_xpath(
                    '//*[@id="Col1-1-HistoricalDataTable-Proxy"]/section/div[1]/div[2]/span[2]/a')
    dload_button.click()
    
    # Wait for download to complete by checking for csv file locally
    # Note that repeated files not downloaded
    while ('%s.csv' %ticker not in os.listdir(DOWNLOAD_FOLDER)):
        time.sleep(0.1)
    driver.close()

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))

Processing: MMM
https://finance.yahoo.com/quote/MMM/history?period1=1546272000&period2=1577808000&interval=1d&filter=history&frequency=1d




Processing: ABT
https://finance.yahoo.com/quote/ABT/history?period1=1546272000&period2=1577808000&interval=1d&filter=history&frequency=1d
Processing: ABBV
https://finance.yahoo.com/quote/ABBV/history?period1=1546272000&period2=1577808000&interval=1d&filter=history&frequency=1d



- Sometime might get this error, just retry and it should be okay
    - WebDriverException: Message: unknown error: unable to discover open pages

In [132]:
# Concat all dfs to form list of all prices
list_of_dfs = []
for filename in os.listdir(DOWNLOAD_FOLDER):
    df = pd.read_csv(DOWNLOAD_FOLDER + filename, index_col=0)[['Adj Close']]
    ticker = filename.split('.')[0]
    df.rename({'Adj Close':ticker}, axis=1, inplace=True)
    list_of_dfs.append(df)
all_prices_df = pd.concat(list_of_dfs, axis=1)[ticker_list]
all_prices_df.to_csv('data_out/all_prices_df.csv')
all_prices_df

Unnamed: 0_level_0,ABBV,ABMD,ABT,ACN,MMM
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018-12-31,82.589752,325.040009,70.279106,137.589874,179.249359
2019-01-02,79.937981,309.959991,67.529343,137.180054,179.635086
2019-01-03,77.304138,302.290009,64.342346,132.496475,172.871124
2019-01-04,79.794632,313.440002,66.178749,137.648422,179.983185
2019-01-07,80.959251,314.799988,67.169830,138.126541,179.569214
...,...,...,...,...,...
2019-12-24,86.296272,172.300003,86.211723,209.082184,171.810944
2019-12-26,86.277069,171.009995,86.211723,209.516922,171.723434
2019-12-27,85.671982,169.270004,86.330261,209.684891,172.374985
2019-12-30,85.018875,167.289993,85.737602,208.123764,170.984390


In [135]:
# To remove all downloaded files (optional)
for filename in os.listdir(DOWNLOAD_FOLDER):
    os.remove(DOWNLOAD_FOLDER + filename)

##### Price Scraping (Method 2: By scrolling down the page and scraping all values)

- will take longer for longer pages 
- difficult to scrape dates, as some dates are for dividends

In [165]:
ticker_list = snp_ticker_df.Symbol.head(5)
all_prices_dict = {}

date1 = datetime.strptime('20190101', "%Y%m%d")
date2 = datetime.strptime('20200101', "%Y%m%d")

# Yahoo webpage only shows 100 prices at a time
NUM_TIMES_TO_SCROLL = (date2 - date1).days//100 + 1

time_str1 = str(int(datetime.timestamp(date1)))
time_str2 = str(int(datetime.timestamp(date2)))

options = webdriver.ChromeOptions()
options.add_argument('--ignore-certificate-errors')
options.add_argument('--headless')

for ticker in ticker_list:
    print('Processing: %s' %ticker)
    
    url= 'https://finance.yahoo.com/quote/%s/history?' \
         'period1=%s&period2=%s&interval=1d&filter=history&frequency=1d' %(ticker, time_str1, time_str2)
    print(url)
    
    driver = webdriver.Chrome(chrome_options=options)
    driver.get(url)

    SCROLL_PAUSE_TIME = 0.5

    # getting scrollHeight using javascript dosent work for YahooFinance pg
    for _ in range(NUM_TIMES_TO_SCROLL):
        driver.execute_script("window.scrollTo(0, 100000);")
        time.sleep(SCROLL_PAUSE_TIME)
    
    items = driver.find_elements_by_xpath(
        '//*[@id="Col1-1-HistoricalDataTable-Proxy"]/section/div[2]/table/tbody/tr/td[6]/span')
    prices = list(map(lambda x: x.text, items))
    all_prices_dict[ticker] = prices
    driver.close()

all_prices_df2 = pd.DataFrame(all_prices_dict)
all_prices_df2.to_csv('data_out/all_prices_df2.csv')

Processing: MMM
https://finance.yahoo.com/quote/MMM/history?period1=1546272000&period2=1577808000&interval=1d&filter=history&frequency=1d




Processing: ABT
https://finance.yahoo.com/quote/ABT/history?period1=1546272000&period2=1577808000&interval=1d&filter=history&frequency=1d
Processing: ABBV
https://finance.yahoo.com/quote/ABBV/history?period1=1546272000&period2=1577808000&interval=1d&filter=history&frequency=1d
Processing: ABMD
https://finance.yahoo.com/quote/ABMD/history?period1=1546272000&period2=1577808000&interval=1d&filter=history&frequency=1d
Processing: ACN
https://finance.yahoo.com/quote/ACN/history?period1=1546272000&period2=1577808000&interval=1d&filter=history&frequency=1d


##### Ratio Scraping 

In [172]:
ticker_list = snp_ticker_df.Symbol.head(5)
ticker_ratios = pd.DataFrame(columns = ['Ticker', 'mkt_cap', 'pb_ratio', 'beta', 'profit_margin', 'roa', 'roe'])

options = webdriver.ChromeOptions()
options.add_argument('--ignore-certificate-errors')
options.add_argument('--headless')
options.add_argument('--no-sandbox') # Bypass OS security model

# If you print, tqdm bar will not work
for ticker in tqdm(ticker_list):
    url = 'https://finance.yahoo.com/quote/'+ticker+'/key-statistics?p='+ticker
    print(url)    
    
    driver = webdriver.Chrome(options=options)
    driver.get(url)
    time.sleep(2) # wait for webpage to render all required info or to emulate human user .. not sure
    
    mkt_cap = driver.find_elements_by_xpath('//*[@id="Col1-0-KeyStatistics-Proxy"]/section/div[3]/div[1]/div[2]/div/div[1]/div[1]/table/tbody/tr[1]/td[3]')[0].text
    pb_ratio = driver.find_elements_by_xpath('//*[@id="Col1-0-KeyStatistics-Proxy"]/section/div[3]/div[1]/div[2]/div/div[1]/div[1]/table/tbody/tr[7]/td[3]')[0].text
    beta = driver.find_elements_by_xpath('//*[@id="Col1-0-KeyStatistics-Proxy"]/section/div[3]/div[2]/div/div[1]/div/div/table/tbody/tr[1]/td[2]')[0].text
    profit_margin = driver.find_elements_by_xpath('//*[@id="Col1-0-KeyStatistics-Proxy"]/section/div[3]/div[3]/div/div[2]/div/div/table/tbody/tr[1]/td[2]')[0].text
    roa = driver.find_elements_by_xpath('//*[@id="Col1-0-KeyStatistics-Proxy"]/section/div[3]/div[3]/div/div[3]/div/div/table/tbody/tr[1]/td[2]')[0].text
    roe = driver.find_elements_by_xpath('//*[@id="Col1-0-KeyStatistics-Proxy"]/section/div[3]/div[3]/div/div[3]/div/div/table/tbody/tr[2]/td[2]')[0].text

    ticker_ratios.loc[len(ticker_list)] = [ticker, mkt_cap, pb_ratio, beta, profit_margin, roa, roe]
    driver.close()
display(ticker_ratios)

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

https://finance.yahoo.com/quote/MMM/key-statistics?p=MMM
https://finance.yahoo.com/quote/ABT/key-statistics?p=ABT
https://finance.yahoo.com/quote/ABBV/key-statistics?p=ABBV
https://finance.yahoo.com/quote/ABMD/key-statistics?p=ABMD
https://finance.yahoo.com/quote/ACN/key-statistics?p=ACN



Unnamed: 0,Ticker,mkt_cap,pb_ratio,beta,profit_margin,roa,roe
5,ACN,128.62B,8.29,1.04,11.11%,12.93%,32.82%


##### Scrapy

In [18]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import scrapy
from scrapy.crawler import CrawlerProcess
import json
import logging

In [19]:
class JsonWriterPipeline(object):
    def open_spider(self, spider):
        self.file = open('yahoo.csv', 'w')

    def close_spider(self, spider):
        self.file.close()

    def process_item(self, item, spider):
        line = json.dumps(dict(item)) + "\n"
        self.file.write(line)
        return item

In [20]:
['https://finance.yahoo.com/quote/'+ticker+'/profile?p='+ticker for ticker in snp_ticker_df.Symbol.head(2)]

['https://finance.yahoo.com/quote/MMM/profile?p=MMM',
 'https://finance.yahoo.com/quote/ABT/profile?p=ABT']

In [21]:
class YahooSpider(scrapy.Spider):
    name = "yahoo"
    
    # start_url is scrapy naming convention, dont change (dont need to implement start_requests with this)
    start_urls = ['https://finance.yahoo.com/quote/'+ticker+'/profile?p='+ticker
                      for ticker in snp_ticker_df.Symbol.head(2)]
    
    custom_settings = {
        'LOG_LEVEL': logging.WARNING, # Scrapy logs alot of stuff at a lower setting
        'ITEM_PIPELINES': {'__main__.JsonWriterPipeline': 1}, # Used for pipeline 1
#         'FEED_FORMAT':'json',                                 # Used for pipeline 2
#         'FEED_URI': 'quoteresult.json',                       # Used for pipeline 2
        'FEED_FORMAT':'csv',
        'FEED_URI': 'yahoo.csv'
    }
    
    def parse(self, response):
        yield {
            'Ticker': 'temp2',
            'desc': response.xpath('//*[@id="Col1-0-Profile-Proxy"]/section/section[2]/p/text()').extract(),
            'Sector': response.xpath('//*[@id="Col1-0-Profile-Proxy"]/section/div[1]/div/div/p[2]/span[2]/text()').extract(),
            'Industry': response.xpath('//*[@id="Col1-0-Profile-Proxy"]/section/div[1]/div/div/p[2]/span[4]/text()').extract()
        }

In [25]:
process = CrawlerProcess({
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
})
# Note: Scrapy does not use Browser, it is parser for static html

process.crawl(YahooSpider)
process.start()

2020-08-19 16:11:43 [scrapy.utils.log] INFO: Scrapy 2.3.0 started (bot: scrapybot)
2020-08-19 16:11:43 [scrapy.utils.log] INFO: Versions: lxml 4.2.1.0, libxml2 2.9.5, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 20.3.0, Python 3.6.2 (v3.6.2:5fd33b5, Jul  8 2017, 04:57:36) [MSC v.1900 64 bit (AMD64)], pyOpenSSL 19.0.0 (OpenSSL 1.1.1b  26 Feb 2019), cryptography 2.6.1, Platform Windows-10-10.0.18362-SP0
2020-08-19 16:11:43 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2020-08-19 16:11:43 [scrapy.crawler] INFO: Overridden settings:
{'LOG_LEVEL': 30,
 'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'}
  exporter = cls(crawler)



<Deferred at 0x1ec61ac7cf8>

ReactorNotRestartable: 

In [35]:
dfjson = pd.read_json('quoteresult.json')
dfjson

Unnamed: 0,text,author,tags
0,“This life is what you make it. No matter what...,Marilyn Monroe,"[friends, heartbreak, inspirational, life, lov..."
1,“The world as we have created it is a process ...,Albert Einstein,"[change, deep-thoughts, thinking, world]"
2,"“It is our choices, Harry, that show what we t...",J.K. Rowling,"[abilities, choices]"
3,“There are only two ways to live your life. On...,Albert Einstein,"[inspirational, life, live, miracle, miracles]"
4,“It takes a great deal of bravery to stand up ...,J.K. Rowling,"[courage, friends]"
5,"“If you can't explain it to a six year old, yo...",Albert Einstein,"[simplicity, understand]"
6,"“The person, be it gentleman or lady, who has ...",Jane Austen,"[aliteracy, books, classic, humor]"
7,"“Imperfection is beauty, madness is genius and...",Marilyn Monroe,"[be-yourself, inspirational]"
8,“Try not to become a man of success. Rather be...,Albert Einstein,"[adulthood, success, value]"
9,"“You may not be her first, her last, or her on...",Bob Marley,[love]


In [36]:
dfjl = pd.read_json('quoteresult.jl', lines=True)
dfjl

Unnamed: 0,text,author,tags
0,“This life is what you make it. No matter what...,Marilyn Monroe,"[friends, heartbreak, inspirational, life, lov..."
1,“The world as we have created it is a process ...,Albert Einstein,"[change, deep-thoughts, thinking, world]"
2,"“It is our choices, Harry, that show what we t...",J.K. Rowling,"[abilities, choices]"
3,“There are only two ways to live your life. On...,Albert Einstein,"[inspirational, life, live, miracle, miracles]"
4,“It takes a great deal of bravery to stand up ...,J.K. Rowling,"[courage, friends]"
5,"“If you can't explain it to a six year old, yo...",Albert Einstein,"[simplicity, understand]"
6,"“The person, be it gentleman or lady, who has ...",Jane Austen,"[aliteracy, books, classic, humor]"
7,"“Imperfection is beauty, madness is genius and...",Marilyn Monroe,"[be-yourself, inspirational]"
8,“Try not to become a man of success. Rather be...,Albert Einstein,"[adulthood, success, value]"
9,"“You may not be her first, her last, or her on...",Bob Marley,[love]


In [37]:
dfjson.to_pickle('quotejson.pickle')
dfjl.to_pickle('quotejl.pickle')

# Price Scraping

In [None]:
from lxml import etree
import requests
import time
import sys
from selenium import webdriver
from datetime import date,datetime

In [None]:
list_of_names = ['JPM','AMZN']
d1 = datetime.strptime('20120101', "%Y%m%d")
d2 = datetime.strptime('20120401', "%Y%m%d")

time_str1 = str(int(datetime.timestamp(d1)))
time_str2 = str(int(datetime.timestamp(d2)))
print(time_str1, time_str2)

options = webdriver.ChromeOptions()
options.add_argument('--ignore-certificate-errors')

for n in range(len(list_of_names)):
    ticker = list_of_names[n]
    print('processing '+ticker)
    url='https://finance.yahoo.com/quote/'+list_of_names[n]+'/history?period1='+time_str1+'&period2='+time_str2+'&interval=1d&filter=history&frequency=1d'

    driver = webdriver.Chrome(chrome_options=options)
    driver.get(url)

    html = driver.page_source
    html = etree.HTML(html)
    
    items = html.xpath('//*[@id="Col1-1-HistoricalDataTable-Proxy"]/section/div[2]/table/tbody//text()')
    print(items)
    driver.close()

In [53]:
import pandas as pd
x = pd.read_html('https://sg.finance.yahoo.com/quote/JPM/history?p=JPM&.tsrc=fin-srch')

In [55]:
x[0]

Unnamed: 0,Date,Open,High,Low,Close*,Adj. close**,Volume
0,28 Aug 2020,103.14,103.25,101.81,102.77,102.77,13754600
1,27 Aug 2020,99.00,102.65,99.00,102.35,102.35,22163900
2,26 Aug 2020,99.94,100.27,99.05,99.09,99.09,12052400
3,25 Aug 2020,101.27,101.98,99.80,100.50,100.50,13648900
4,24 Aug 2020,98.45,100.21,97.42,100.06,100.06,17898300
...,...,...,...,...,...,...,...
96,15 Apr 2020,91.15,92.03,90.13,90.79,89.91,30118600
97,14 Apr 2020,101.02,102.00,93.64,95.50,94.58,46151300
98,13 Apr 2020,103.11,103.11,97.30,98.19,97.24,26911900
99,09 Apr 2020,96.92,104.39,96.89,102.76,101.77,42084700


# Extra

##### Implicit wait, selenium (google.com)

In [None]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

driver = webdriver.Chrome()
# driver.implicitly_wait(10)

driver.get("http://google.com")
driver.maximize_window()

print("Implicit Wait Example")

inputElement = driver.find_element_by_xpath('//*[@id="tsf"]/div[2]/div[1]/div[1]/div/div[2]/input')  # .find_element_by_id("lst-ib")
inputElement.send_keys("Techbeamers")
inputElement.submit()

driver.close()

##### Multiprocessing test

- for pool to work on jupyter notebook need to import function
- with muti processing cannot change global variables (there is a fix for this)
- printing is not trivial in multiprocessing

- still need to figure out if multiprocessing or multithreading is more suitable for scraping

In [None]:
%%timeit
p=Pool(processes = 4)
output = p.map(multithreading_test.worker,range(3000000))
print(output[-3:])

In [None]:
%%timeit
p = Pool(processes=4)
results = p.map_async(multithreading_test.worker, range(3000000))
output = results.get()
print(output[-3:])

In [None]:
%%timeit
output = list(map(multithreading_test.worker, range(3000000)))
print(output[-3:])

- I think in this case using ordinary functions is faster than using Pool due to the high over head
- only use pool when there is high CPU requirement, high iterations

In [None]:
# Number of cores for multiprocessing
import multiprocessing
multiprocessing.cpu_count()

##### Check Chrome Driver Version

In [None]:
driver = webdriver.Chrome()
str1 = driver.capabilities['browserVersion']
str2 = driver.capabilities['chrome']['chromedriverVersion'].split(' ')[0]
print(str1)
print(str2)
print(str1[0:2])
print(str2[0:2])
if str1[0:2] != str2[0:2]: 
    print("please download correct chromedriver version")

##### Running other scripts

In [None]:
%run 

##### Yield

In [22]:
def f():
    yield 1
    yield 2
    yield 3
[i for i in f()]

[1, 2, 3]

In [26]:
print(f())

<generator object f at 0x000002D2DEF26678>


##### Scrapy Tutorial

In [27]:
# Settings for notebook
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

'3.6.2'

In [33]:
import scrapy
from scrapy.crawler import CrawlerProcess

In [34]:
import json

class JsonWriterPipeline(object):
    def open_spider(self, spider):
        self.file = open('quoteresult.jl', 'w')

    def close_spider(self, spider):
        self.file.close()

    def process_item(self, item, spider):
        line = json.dumps(dict(item)) + "\n"
        self.file.write(line)
        return item

In [35]:
import logging

class QuotesSpider(scrapy.Spider):
    name = "quotes"
    start_urls = [
        'http://quotes.toscrape.com/page/1/',
        'http://quotes.toscrape.com/page/2/',
    ]
    custom_settings = {
        'LOG_LEVEL': logging.WARNING,
        'ITEM_PIPELINES': {'__main__.JsonWriterPipeline': 1}, # Used for pipeline 1
        'FEED_FORMAT':'json',                                 # Used for pipeline 2
        'FEED_URI': 'quoteresult.json'                        # Used for pipeline 2
    }
    
    def parse(self, response):
        for quote in response.css('div.quote'):
            yield {
                'text': quote.css('span.text::text').extract_first(),
                'author': quote.css('span small::text').extract_first(),
                'tags': quote.css('div.tags a.tag::text').extract(),
            }

In [36]:
process = CrawlerProcess({
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
})

process.crawl(QuotesSpider)
process.start()

2020-08-17 11:44:18 [scrapy.utils.log] INFO: Scrapy 2.3.0 started (bot: scrapybot)
2020-08-17 11:44:18 [scrapy.utils.log] INFO: Versions: lxml 4.2.1.0, libxml2 2.9.5, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 20.3.0, Python 3.6.2 (v3.6.2:5fd33b5, Jul  8 2017, 04:57:36) [MSC v.1900 64 bit (AMD64)], pyOpenSSL 19.0.0 (OpenSSL 1.1.1b  26 Feb 2019), cryptography 2.6.1, Platform Windows-10-10.0.18362-SP0
2020-08-17 11:44:18 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2020-08-17 11:44:18 [scrapy.crawler] INFO: Overridden settings:
{'LOG_LEVEL': 30,
 'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'}
  exporter = cls(crawler)



<Deferred at 0x22ca0da6ef0>

ReactorNotRestartable: 

In [35]:
import pandas as pd
dfjson = pd.read_json('quoteresult.json')
dfjson

Unnamed: 0,text,author,tags
0,“This life is what you make it. No matter what...,Marilyn Monroe,"[friends, heartbreak, inspirational, life, lov..."
1,“The world as we have created it is a process ...,Albert Einstein,"[change, deep-thoughts, thinking, world]"
2,"“It is our choices, Harry, that show what we t...",J.K. Rowling,"[abilities, choices]"
3,“There are only two ways to live your life. On...,Albert Einstein,"[inspirational, life, live, miracle, miracles]"
4,“It takes a great deal of bravery to stand up ...,J.K. Rowling,"[courage, friends]"
5,"“If you can't explain it to a six year old, yo...",Albert Einstein,"[simplicity, understand]"
6,"“The person, be it gentleman or lady, who has ...",Jane Austen,"[aliteracy, books, classic, humor]"
7,"“Imperfection is beauty, madness is genius and...",Marilyn Monroe,"[be-yourself, inspirational]"
8,“Try not to become a man of success. Rather be...,Albert Einstein,"[adulthood, success, value]"
9,"“You may not be her first, her last, or her on...",Bob Marley,[love]


In [36]:
dfjl = pd.read_json('quoteresult.jl', lines=True)
dfjl

Unnamed: 0,text,author,tags
0,“This life is what you make it. No matter what...,Marilyn Monroe,"[friends, heartbreak, inspirational, life, lov..."
1,“The world as we have created it is a process ...,Albert Einstein,"[change, deep-thoughts, thinking, world]"
2,"“It is our choices, Harry, that show what we t...",J.K. Rowling,"[abilities, choices]"
3,“There are only two ways to live your life. On...,Albert Einstein,"[inspirational, life, live, miracle, miracles]"
4,“It takes a great deal of bravery to stand up ...,J.K. Rowling,"[courage, friends]"
5,"“If you can't explain it to a six year old, yo...",Albert Einstein,"[simplicity, understand]"
6,"“The person, be it gentleman or lady, who has ...",Jane Austen,"[aliteracy, books, classic, humor]"
7,"“Imperfection is beauty, madness is genius and...",Marilyn Monroe,"[be-yourself, inspirational]"
8,“Try not to become a man of success. Rather be...,Albert Einstein,"[adulthood, success, value]"
9,"“You may not be her first, her last, or her on...",Bob Marley,[love]


In [37]:
dfjson.to_pickle('quotejson.pickle')
dfjl.to_pickle('quotejl.pickle')

##### tqdm

In [10]:
from tqdm import tqdm

j=0
for i in tqdm(range(1000000), ):
    j+=i
    
print(j)

100%|███████████████████████████████████████████████████████████████████| 1000000/1000000 [00:00<00:00, 1562553.30it/s]

499999500000





In [12]:
import time
import sys
from tqdm import trange


def do_something():
    time.sleep(1)

def do_another_something():
    time.sleep(1)


for i in trange(10):
    do_something()

#     for j in trange(2):
#         do_another_something()

100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:10<00:00,  1.01s/it]


In [13]:
from tqdm.notebook import trange, tqdm
import time

for i in trange(6):
    do_something()

    for j in trange(2):
        do_another_something()


HBox(children=(FloatProgress(value=0.0, max=6.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))





KeyboardInterrupt: 

##### Test Scraping 10K reports

In [49]:
import os
for file in os.listdir("data_in"):
    if file.endswith(".txt") and "Item1_excerpt" in file:
        print(file)
        with open("data_in/" + file, "rt",encoding='utf-8') as f:
            line_num = 1
            for line in f:
                line = line.lower()
                if ("part i" in line) or ("item 1" in line) or ("business" in line and len(line) < 20) or ("introduction" in line):
                    print(line_num, line)                    
                if ("item 1a" in line) or ("risk factors" in line):
                    print(line_num, line)                    
                line_num += 1
        break

HD_0000354950_10K_20180128_Item1_excerpt.txt
1 part i

3 item 1. business.

5 introduction

11 our business

163 item 1a. risk factors.
163 item 1a. risk factors.


In [20]:
import glob

path = 'data_in'

files = [f for f in glob.glob(path + "*/*.txt", recursive=True)]

for f in files:
    print(f)

data_in\HD_0000354950_10K_20180128_Item1_excerpt.txt
data_in\HD_0000354950_10K_20190203_Item1_excerpt.txt
data_in\TSO_0000050104_10K_20171231_Item1_excerpt.txt


In [21]:
glob.glob(path + "*/*.txt", recursive=True)

['data_in\\HD_0000354950_10K_20180128_Item1_excerpt.txt',
 'data_in\\HD_0000354950_10K_20190203_Item1_excerpt.txt',
 'data_in\\TSO_0000050104_10K_20171231_Item1_excerpt.txt']