# Scrape Tickers, 10K reports merging, Clean GICS classification, List of CIKs for relevant tickers, Scrape Wikipedia data, Clean Russell Ratios

In [2]:
import os
from lxml import etree
import bs4 as bs
import requests
import time
import sys
from selenium import webdriver
import numpy as np
import pandas as pd
from datetime import datetime
from tqdm import tqdm # If ur using Jupyter Lab
from tqdm.notebook import tqdm # If you are using Jupyter Notebook
import wikipedia as wiki
import re
import yfinance as yf

In [3]:
pd.set_option('display.max_rows', 100)

# Scrape Tickers

- All non-scrapy scrappers are here
- This section is just to get a sets of tickers (the other info collected are not that necessary)

### SnP500

In [6]:
snp_tickers_df = pd.read_html('http://en.wikipedia.org/wiki/List_of_S%26P_500_companies', match= 'GICS')[0]
snp_tickers_df = snp_tickers_df[['Symbol', 'Security', 'GICS Sector', 'GICS Sub-Industry', 'CIK']]

In [7]:
snp_tickers_df.columns = ['Ticker', 'Name', 'Sector', 'Sub Industry', 'CIK']
snp_tickers_df.Ticker = snp_tickers_df.Ticker.str.replace('.', '-')
snp_tickers_df.head()

Unnamed: 0,Ticker,Name,Sector,Sub Industry,CIK
0,MMM,3M Company,Industrials,Industrial Conglomerates,66740
1,ABT,Abbott Laboratories,Health Care,Health Care Equipment,1800
2,ABBV,AbbVie Inc.,Health Care,Pharmaceuticals,1551152
3,ABMD,ABIOMED Inc,Health Care,Health Care Equipment,815094
4,ACN,Accenture plc,Information Technology,IT Consulting & Other Services,1467373


In [None]:
# if not df not saved yet, do this
# snp_tickers_df.to_csv('data_out/snp_tickers_df.csv', index=False)
# snp_tickers_df.to_csv('scrapy_spiders/data_in/snp_tickers_df.csv', index=False)

In [23]:
snp_tickers_df = pd.read_csv('data_out/snp_tickers_df.csv')
snp_tickers_df.head()

Unnamed: 0,Ticker,Name,Sector,Sub Industry,CIK
0,MMM,3M Company,Industrials,Industrial Conglomerates,66740
1,ABT,Abbott Laboratories,Health Care,Health Care Equipment,1800
2,ABBV,AbbVie Inc.,Health Care,Pharmaceuticals,1551152
3,ABMD,ABIOMED Inc,Health Care,Health Care Equipment,815094
4,ACN,Accenture plc,Information Technology,IT Consulting & Other Services,1467373


In [171]:
snp_tickers_df[snp_tickers_df.Ticker == 'ETFC']

Unnamed: 0,Ticker,Name,Sector,Sub Industry,CIK
160,ETFC,E*Trade,Financials,Investment Banking & Brokerage,1015780


In [8]:
snp_tickers_df.shape

(505, 5)

In [7]:
'''
Long winded method using BS4

def get_sp500_tickers():
    resp = requests.get('http://en.wikipedia.org/wiki/List_of_S%26P_500_companies')
    soup = bs.BeautifulSoup(resp.text, 'lxml')
    table = soup.find('table', {'class': 'wikitable sortable'})
    tickers = []
    for row in table.findAll('tr')[1:]:
        ticker = row.findAll('td')[0].text
        if ticker[-1:]=='\n':
            ticker=ticker[:-1]
        ticker=ticker.replace('.','')
        tickers.append(ticker)
    return tickers
'''
pass

### Russell 3000

- could not find reliable source with tickers and companies

##### Source 1

 - This is the best source for this project as it has 3000 tickers unlike the source below and all these tickers have GICS mappings

In [9]:
russell_tickers_df = pd.read_csv('data_in/ticker_to_gics.csv', names=['Ticker', 'GICS'])
russell_tickers_df.Ticker = russell_tickers_df.Ticker.str.replace('/','-')
russell_tickers_df.drop('GICS', axis=1, inplace=True)
print(russell_tickers_df.shape)
russell_tickers_df.head()

(2964, 1)


Unnamed: 0,Ticker
0,A
1,AA
2,AAL
3,AAN
4,AAOI


In [10]:
def get_ticker_fullname(ticker):
    try:
        cpy = yf.Ticker(ticker)
        name = cpy.info['longName']
        return name
    except:
        return None

In [11]:
russell_tickers_df['Name'] = [get_ticker_fullname(ticker) for ticker in tqdm(russell_tickers_df.Ticker)]
russell_tickers_df

HBox(children=(FloatProgress(value=0.0, max=2964.0), HTML(value='')))




Unnamed: 0,Ticker,Name
0,A,"Agilent Technologies, Inc."
1,AA,Alcoa Corporation
2,AAL,American Airlines Group Inc.
3,AAN,"Aaron's Holdings Company, Inc."
4,AAOI,"Applied Optoelectronics, Inc."
...,...,...
2959,ZTS,Zoetis Inc.
2960,ZUMZ,Zumiez Inc.
2961,ZUO,"Zuora, Inc."
2962,ZYNE,"Zynerba Pharmaceuticals, Inc."


In [12]:
# if not df not saved yet, do this
russell_tickers_df.to_csv('data_out/russell_tickers_df.csv', index=False)
russell_tickers_df.to_csv('scrapy_spiders/data_in/russell_tickers_df.csv', index=False)

In [56]:
russell_tickers_df = pd.read_csv('data_out/russell_tickers_df.csv')
russell_tickers_df.head()

Unnamed: 0,Ticker
0,A
1,AA
2,AAL
3,AAN
4,AAOI


##### Source 1

 - this shd be better, but it is from a 3rd party source
 - only source which contains industry and sector

In [6]:
russell_tickers_df = pd.read_html('http://www.kibot.com/Historical_Data/Russell_3000_Historical_Intraday_Data.aspx')[1]
russell_tickers_df = russell_tickers_df.rename(columns=russell_tickers_df.iloc[0]).drop(0)[
                        ['Symbol', 'Description', 'Industry', 'Sector']].reset_index(drop=True)

In [7]:
russell_tickers_df.columns = ['Ticker', 'Name', 'Industry', 'Sector']
russell_tickers_df.Ticker = russell_tickers_df.Ticker.str.replace('.', '-')
russell_tickers_df.head()

Unnamed: 0,Ticker,Name,Industry,Sector
0,A,"AGILENT TECHNOLOGIES, INC.",Biotechnology: Laboratory Analytical Instruments,Capital Goods
1,AA,ALCOA CORPORATION,Aluminum,Basic Industries
2,AAL,"AMERICAN AIRLINES GROUP, INC.",Air Freight/Delivery Services,Transportation
3,AAN,"AARON'S, INC.",Diversified Commercial Services,Technology
4,AAOI,"APPLIED OPTOELECTRONICS, INC.",Semiconductors,Technology


In [43]:
russell_tickers_df.shape

(2721, 4)

In [25]:
len([i for i in snp_tickers_df.Ticker if i not in russell_tickers_df.Ticker.values])

24

=> SnP is not a subset of Russell

##### Source 3
- problem with this method is need to get tickers

In [1]:
import tabula

In [5]:
all_ticker_tables_df_list = tabula.read_pdf('data_in/russell3000.pdf', pages="all")

Got stderr: Sep 23, 2020 6:55:33 PM org.apache.pdfbox.pdmodel.font.PDCIDFontType2 <init>
INFO: OpenType Layout tables used in font ABCDEE+Arial,Bold are not implemented in PDFBox and will be ignored
Sep 23, 2020 6:55:39 PM org.apache.pdfbox.pdmodel.font.PDCIDFontType2 <init>
INFO: OpenType Layout tables used in font ABCDEE+Arial,Bold are not implemented in PDFBox and will be ignored
Sep 23, 2020 6:55:40 PM org.apache.pdfbox.pdmodel.font.PDCIDFontType2 <init>
INFO: OpenType Layout tables used in font ABCDEE+Arial,Bold are not implemented in PDFBox and will be ignored
Sep 23, 2020 6:55:40 PM org.apache.pdfbox.pdmodel.font.PDCIDFontType2 <init>
INFO: OpenType Layout tables used in font ABCDEE+Arial are not implemented in PDFBox and will be ignored
Sep 23, 2020 6:55:40 PM org.apache.pdfbox.pdmodel.font.PDCIDFontType2 <init>
INFO: OpenType Layout tables used in font ABCDEE+Arial are not implemented in PDFBox and will be ignored
Sep 23, 2020 6:55:41 PM org.apache.pdfbox.pdmodel.fon

In [6]:
russell_tickers_df2 = pd.concat(all_ticker_tables_df_list).reset_index(drop=True)
russell_tickers_df2.head()

Unnamed: 0,Russell 3000®,Weight(%),Country
0,1-800 Flowers Com,0.001,United States
1,1life Healthcare,0.001,United States
2,1st Constitution Bancorp,0.0,United States
3,1st Source Corp,0.002,United States
4,22nd Century Group Inc,0.0,United States


In [None]:
russell_tickers_df2[russell_tickers_df2.Ticker.str.contains(".", regex=False)]

### STI

In [7]:
sti_ticker_df = pd.read_html('https://en.wikipedia.org/wiki/Straits_Times_Index', match= 'Stock Symbol')[0]

In [8]:
sti_ticker_df.head()

Unnamed: 0,Stock Symbol,Company
0,SGX: A17U,Ascendas Real Estate Investment Trust
1,SGX: C61U,CapitaCom Trust
2,SGX: C31,CapitaLand
3,SGX: C38U,CapitaMall Trust
4,SGX: C09,City Developments Limited


In [9]:
sti_ticker_df.to_csv('data_out/sti_ticker_df.csv')

# Yahoo Description, Price, Ratios scraping

- can try selenium grid for multiprocessing

In [14]:
tickers_df = russell_tickers_df

### Selenium

In [41]:
import time
from selenium import webdriver
from multiprocessing import Pool
from IPython.display import display, HTML

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

##### Desc Scraping

In [42]:
# %%time
ticker_list = tickers_df.Ticker[:2]
# ticker_list = ['GOOG', 'GOOGL']
ticker_desc_df = pd.DataFrame(columns = ['Ticker', 'Description', 'Sector', 'Industry'])
wrong_ticker_list = []

options = webdriver.ChromeOptions()
options.add_argument('--ignore-certificate-errors')
options.add_argument('--headless')
options.add_argument('--no-sandbox') # Bypass OS security model

# If you print, tqdm bar will not work
for ticker in tqdm(ticker_list):
    url = 'https://finance.yahoo.com/quote/'+ticker+'/profile?p='+ticker
    print(url)    
    
    driver = webdriver.Chrome(options=options)
    driver.get(url)
    
    desc_xpath = '//*[@id="Col1-0-Profile-Proxy"]/section/section[2]/p'
    element_present = EC.presence_of_element_located(
                (By.XPATH, desc_xpath))
    try:
        WebDriverWait(driver, 10).until(element_present)
    except:
        print(ticker)
        wrong_ticker_list.append(ticker)
        continue
    
    desc = driver.find_element_by_xpath(desc_xpath).text
    sector = driver.find_element_by_xpath('//*[@id="Col1-0-Profile-Proxy"]/section/div[1]/div/div/p[2]/span[2]').text
    industry = driver.find_element_by_xpath('//*[@id="Col1-0-Profile-Proxy"]/section/div[1]/div/div/p[2]/span[4]').text
    
    ticker_desc_df.loc[len(ticker_desc_df)] = [ticker, desc, sector, industry]
    driver.close() #need?
display(ticker_desc_df)
wrong_ticker_list

HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))

https://finance.yahoo.com/quote/A/profile?p=A



SessionNotCreatedException: Message: session not created: This version of ChromeDriver only supports Chrome version 84


In [None]:
ticker_desc_df.to_csv('data_out/russell_desc_yahoo_df.csv', index=False, encoding='utf-8-sig')

Comparison to Scrapy
- Sometime when running scrapy spider, it causes some links to stop be able to be accessed by this computer, which is a odd phenomenon
- after a while it is okay

Timing
- Scrapy: 78s
- Selenium: 1hr ++
    - Running headless makes it about 30% faster
    - Rruns faster than the original code (profs code) as well

##### Desc Scraping (using yfinance, faster)

In [13]:
def get_ticker_desc(ticker):
    try:
        cpy = yf.Ticker(ticker)
        name = cpy.info['longBusinessSummary']
        return name
    except:
        return None

In [None]:
ticker_desc_df = tickers_df.copy()
ticker_desc_df['Description'] = [get_ticker_desc(ticker) for ticker in tqdm(tickers_df.Ticker)]

HBox(children=(FloatProgress(value=0.0, max=2964.0), HTML(value='')))

In [None]:
ticker_desc_df.to_csv('data_out/russell_desc_yahoo_df.csv', index=False, encoding='utf-8-sig')

##### Try using multithreading with Selenium

In [None]:
'''%%time

import multithreading_test

ticker_list = tickers_df.Ticker.head(5)
ticker_desc = pd.DataFrame(columns = ['Ticker', 'Description', 'Sector', 'Industry'])

p = Pool(processes = 4)
results = p.map_async(multithreading_test.get_ticker_desc_3, ticker_list)
output = results.get()

ticker_desc_df = pd.DataFrame(output, columns = ['Ticker', 'Description', 'Sector', 'Industry'])
display(ticker_desc_df)
'''
pass

##### Price Scraping (Method 1 : by clicking Downloads)

- Have to use selenium (and not scrapy) for price scraping as you need to interact with the webpage to extract the prices (i.e. clicking button or scrolling)
    - even if use pd.read_HTML(), it only outputs 100 elements (no interactivity)

In [11]:
ticker_list = tickers_df.Ticker
DOWNLOAD_FOLDER = os.getcwd() + os.path.sep + 'data_out\\russell_price_csv_files\\'
wrong_ticker_list = []

date1 = datetime.strptime('20190101', "%Y%m%d")
date2 = datetime.strptime('20200101', "%Y%m%d")

time_str1 = str(int(datetime.timestamp(date1)))
time_str2 = str(int(datetime.timestamp(date2)))

options = webdriver.ChromeOptions()
options.add_argument('--ignore-certificate-errors')
prefs = {
            "profile.default_content_settings.popups": 0,
            "download.default_directory": DOWNLOAD_FOLDER,
            "directory_upgrade": True
        }
options.add_experimental_option('prefs', prefs)
options.add_argument('--headless')
options.add_argument('--no-sandbox') # Bypass OS security model

for ticker in tqdm(ticker_list):
    print('Processing: %s' %ticker)
    
    # Skip counters which have already been downloaded
    if '%s.csv' %ticker in os.listdir(DOWNLOAD_FOLDER):
        continue 
    
    url= 'https://finance.yahoo.com/quote/%s/history?' \
         'period1=%s&period2=%s&interval=1d&filter=history&frequency=1d' %(ticker, time_str1, time_str2)
    print(url)
    
    driver = webdriver.Chrome(chrome_options=options)
    driver.get(url)
    
    dload_button_xpath = '//*[@id="Col1-1-HistoricalDataTable-Proxy"]/section/div[1]/div[2]/span[2]/a'
    element_present = EC.presence_of_element_located(
                (By.XPATH, dload_button_xpath))
    try:
        WebDriverWait(driver, 10).until(element_present)
    except:
        print(ticker)
        wrong_ticker_list.append([ticker, "Driver Wait too long"])
        continue
        
    dload_button = driver.find_element_by_xpath(dload_button_xpath)
    dload_button.click()
    
    try:
        error_msg = driver.find_element_by_xpath('/html/body/pre').text
        print('Ticker Error: %s, %s' %(ticker, error_msg))
        wrong_ticker_list.append([ticker, error_msg])
        continue
    except Exception as e:
        pass
        
    # Wait for download to complete by checking for csv file locally
    # Note that repeated files not downloaded
    while ('%s.csv' %ticker not in os.listdir(DOWNLOAD_FOLDER)):
        time.sleep(0.1)
    driver.close()
wrong_ticker_list

HBox(children=(FloatProgress(value=0.0, max=2720.0), HTML(value='')))

Processing: A
Processing: AA
Processing: AAL
Processing: AAN
Processing: AAOI
Processing: AAON
Processing: AAP
Processing: AAPL
Processing: AAT
Processing: AAWW
Processing: AAXN
Processing: ABBV
Processing: ABC
Processing: ABCB
Processing: ABEO
Processing: ABG
Processing: ABM
Processing: ABMD
Processing: ABR
Processing: ABT
Processing: ABTX
Processing: AC
Processing: ACAD
Processing: ACBI
Processing: ACC
Processing: ACCO
Processing: ACGL
Processing: ACHC
Processing: ACI
https://finance.yahoo.com/quote/ACI/history?period1=1546272000&period2=1577808000&interval=1d&filter=history&frequency=1d




Ticker Error: ACI, {
    "finance": {
        "error": {
            "code": "Unauthorized",
            "description": "Invalid cookie"
        }
    }
}
Processing: ACIA
Processing: ACIW
Processing: ACLS
Processing: ACM
Processing: ACN
Processing: ACNB
Processing: ACOR
Processing: ACRE
Processing: ACRS
Processing: ACRX
Processing: ACT
Processing: ACTG
Processing: ACV
Processing: ADBE
Processing: ADC
Processing: ADCT
https://finance.yahoo.com/quote/ADCT/history?period1=1546272000&period2=1577808000&interval=1d&filter=history&frequency=1d
ADCT
Processing: ADES
Processing: ADI
Processing: ADM
Processing: ADMS
Processing: ADNT
Processing: ADP
Processing: ADPT
Processing: ADRO
Processing: ADS
Processing: ADSK
Processing: ADSW
Processing: ADT
Processing: ADTN
Processing: ADUS
Processing: ADXS
Processing: AE
Processing: AEE
Processing: AEGN
Processing: AEIS
Processing: AEL
Processing: AEO
Processing: AEP
Processing: AERI
Processing: AES
Processing: AFG
Processing: AFH
Processing: AFI
Proces

Ticker Error: BWFG, {
    "finance": {
        "error": {
            "code": "Unauthorized",
            "description": "Invalid cookie"
        }
    }
}
Processing: BWXT
https://finance.yahoo.com/quote/BWXT/history?period1=1546272000&period2=1577808000&interval=1d&filter=history&frequency=1d
Ticker Error: BWXT, {
    "finance": {
        "error": {
            "code": "Unauthorized",
            "description": "Invalid cookie"
        }
    }
}
Processing: BXC
https://finance.yahoo.com/quote/BXC/history?period1=1546272000&period2=1577808000&interval=1d&filter=history&frequency=1d
Ticker Error: BXC, {
    "finance": {
        "error": {
            "code": "Unauthorized",
            "description": "Invalid cookie"
        }
    }
}
Processing: BXG
https://finance.yahoo.com/quote/BXG/history?period1=1546272000&period2=1577808000&interval=1d&filter=history&frequency=1d
Ticker Error: BXG, {
    "finance": {
        "error": {
            "code": "Unauthorized",
            "description

KeyboardInterrupt: 

In [9]:
wrong_ticker_list

[['ACI',
  "400 Bad Request: Data doesn't exist for startDate = 1546272000, endDate = 1577808000"],
 ['ADCT',
  "400 Bad Request: Data doesn't exist for startDate = 1546272000, endDate = 1577808000"],
 ['ARB',
  "400 Bad Request: Data doesn't exist for startDate = 1546272000, endDate = 1577808000"],
 ['ARNC',
  "400 Bad Request: Data doesn't exist for startDate = 1546272000, endDate = 1577808000"],
 ['AWAY',
  "400 Bad Request: Data doesn't exist for startDate = 1546272000, endDate = 1577808000"],
 ['BEAM',
  "400 Bad Request: Data doesn't exist for startDate = 1546272000, endDate = 1577808000"],
 ['BRLI',
  "400 Bad Request: Data doesn't exist for startDate = 1546272000, endDate = 1577808000"],
 ['BUFF', '404 Not Found: Timestamp data missing.'],
 ['BWA',
  '{\n    "finance": {\n        "error": {\n            "code": "Unauthorized",\n            "description": "Invalid cookie"\n        }\n    }\n}'],
 ['BWEN', 'Driver Wait too long'],
 ['BWFG', 'Driver Wait too long'],
 ['BWXT', 'Dri

- To run faster, restart the kernel and run again
- Sometime might get this error, just retry and it should be okay
    - WebDriverException: Message: unknown error: unable to discover open pages (FIXED)
    - SessionNotCreatedException: Message: session not created from tab crashed (Session info: headless chrome=84.0.4147.135)

- Error examples for wrong tickers
    - 404 Not Found: No data found, symbol may be delisted
    - 404 Not Found: Timestamp data missing
    - 400 Bad Request: Data doesn't exist for startDate = 1546272000, endDate = 1577808000

- Sometimes a valid symbol may not retrieve the data (if is in the bad_ticker list, with label "Driver Wait too long", but just rerun the code above and it should dload the data properly

- Bad Ticker Data (from above)
    - [['BRK.B', '404 Not Found: No data found, symbol may be delisted'],
    - ['BF.B', '404 Not Found: Timestamp data missing.'],
    - ['CARR', "400 Bad Request: Data doesn't exist for startDate = 1546272000, endDate = 1577808000"],
    - ['OTIS', "400 Bad Request: Data doesn't exist for startDate = 1546272000, endDate = 1577808000"]]

In [10]:
# Concat all dfs to form list of all prices
list_of_dfs = []
for filename in os.listdir(DOWNLOAD_FOLDER):
    df = pd.read_csv(DOWNLOAD_FOLDER + filename, index_col=0)[['Adj Close']]
    ticker = filename.split('.')[0]
    df.rename({'Adj Close':ticker}, axis=1, inplace=True)
    list_of_dfs.append(df)
ticker_price_df = pd.concat(list_of_dfs, axis=1)
ticker_price_df.to_csv('data_out/russell_price_df.csv')
ticker_price_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,A,AA,AAL,AAN,AAOI,AAON,AAP,AAPL,AAT,AAWW,...,BRY,BSRR,BSTC,BSX,BTU,BUD,BURL,BUSE,BV,BW
2018-12-31,66.579430,26.580000,31.599045,41.788548,15.43,34.704407,156.234573,38.585068,38.284000,42.189999,...,8.039623,22.729055,60.599998,35.340000,27.906427,63.634602,162.669998,22.945808,10.210000,3.90
2019-01-02,64.832542,26.240000,31.963160,43.209652,15.88,34.714302,156.691010,38.629097,37.454842,41.709999,...,8.260138,23.050650,59.990002,34.459999,28.519855,63.731300,160.160004,23.282421,10.670000,4.61
2019-01-03,62.444126,26.240000,29.581665,42.494129,15.06,33.427486,161.582657,34.781353,37.464375,40.610001,...,8.205010,22.956062,60.160000,32.910000,27.860649,64.224442,160.199997,23.329176,10.630000,4.60
2019-01-04,64.605537,28.340000,31.530161,43.716484,15.54,35.248825,157.574066,36.266144,37.769356,41.900002,...,8.857368,23.220903,61.439999,34.209999,28.821993,66.806183,163.369995,23.936949,11.500000,4.76
2019-01-07,65.977386,28.420000,32.425678,44.591007,15.68,35.783348,159.885956,36.185429,38.360241,42.810001,...,9.399467,23.419535,62.209999,34.419998,27.906427,67.105942,166.880005,23.964998,11.670000,6.80
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-12-24,84.726265,21.580000,29.072563,57.942677,11.12,49.625648,159.563675,70.580566,43.691719,27.420000,...,9.078856,28.783663,58.910000,45.310001,9.160000,80.833633,227.759995,26.675636,16.980000,3.30
2019-12-26,84.875252,21.450001,29.560759,56.737198,11.61,49.725300,158.271820,71.980911,43.896843,27.680000,...,9.233225,28.433590,58.080002,45.259998,9.340000,81.586403,225.929993,26.733604,16.820000,3.39
2019-12-27,84.845451,21.580000,28.335287,56.119514,11.52,49.635616,157.357590,71.953598,44.199650,26.870001,...,8.866597,28.365522,57.000000,45.220001,9.540000,82.329269,226.190002,26.627327,16.820000,3.59
2019-12-30,84.507034,21.340000,28.195801,56.657494,11.61,49.207123,158.122742,72.380653,44.443848,26.580000,...,8.731524,28.278004,56.630001,44.990002,9.150000,81.596306,229.580002,26.579020,16.820000,3.45


In [None]:
# OPTIONAL: Remove all downloaded files

# for filename in os.listdir(DOWNLOAD_FOLDER):
#     os.remove(DOWNLOAD_FOLDER + filename)

##### Price Scraping (Method 2: By scrolling down the page and scraping all values)

- Method 1 is better
- will take longer for longer pages 
- difficult to scrape dates, as some dates are for dividends (not scraped in this case)
- other than dates, output should be the same as Method 1


In [None]:
ticker_list = tickers_df.Ticker[:2]
all_prices_dict = {}

date1 = datetime.strptime('20190101', "%Y%m%d")
date2 = datetime.strptime('20200101', "%Y%m%d")

# Yahoo webpage only shows 100 prices at a time
NUM_TIMES_TO_SCROLL = (date2 - date1).days//100 + 1

time_str1 = str(int(datetime.timestamp(date1)))
time_str2 = str(int(datetime.timestamp(date2)))

options = webdriver.ChromeOptions()
options.add_argument('--ignore-certificate-errors')
options.add_argument('--no-sandbox') # Bypass OS security model
options.add_argument('--headless')

for ticker in tqdm(ticker_list):
    print('Processing: %s' %ticker)
    
    url= 'https://finance.yahoo.com/quote/%s/history?' \
         'period1=%s&period2=%s&interval=1d&filter=history&frequency=1d' %(ticker, time_str1, time_str2)
    print(url)
    
    driver = webdriver.Chrome(chrome_options=options)
    driver.get(url)

    SCROLL_PAUSE_TIME = 0.1

    # getting scrollHeight using javascript dosent work for YahooFinance pg
    for _ in range(NUM_TIMES_TO_SCROLL):
        driver.execute_script("window.scrollTo(0, 100000);")
        time.sleep(SCROLL_PAUSE_TIME)
    
    items = driver.find_elements_by_xpath(
        '//*[@id="Col1-1-HistoricalDataTable-Proxy"]/section/div[2]/table/tbody/tr/td[6]/span')
    prices = list(reversed(list(map(lambda x: x.text, items))))
    all_prices_dict[ticker] = prices
    driver.close()
print(pd.DataFrame(all_prices_dict))
ticker_price_df2 = pd.DataFrame(all_prices_dict)
ticker_price_df2.to_csv('data_out/ticker_price_df2.csv')
display(ticker_price_df2)

In [None]:
filter_prices_dict = {k:v for k,v in all_prices_dict.items() if len(v)==253}
ticker_price_df2 = pd.DataFrame(filter_prices_dict)
ticker_price_df2.to_csv('data_out/snp_price_scrolling_df.csv')
display(ticker_price_df2)

In [None]:
# Tickers with errors (incomplete data)
{k:len(v) for k,v in all_prices_dict.items() if len(v)!=253}

- Runtime
    - 2:45:42 (505/505 [2:45:42<00:00, 19.69s/it]

In [None]:
from playsound import playsound
def ALARM():
    for i in range(10):
        playsound('data_in/bell.mp3')

ALARM()

##### Ratio Scraping 

In [None]:
ticker_list = tickers_df.Ticker[:2]
# ticker_list = ['BRK.B']
ticker_ratios_df = pd.DataFrame(columns = ['Ticker', 'mkt_cap', 'pb_ratio', 'beta', 'profit_margin', 'ROA', 'ROE'])
wrong_ticker_ratio_list = []

options = webdriver.ChromeOptions()
options.add_argument('--ignore-certificate-errors')
options.add_argument('--headless')
options.add_argument('--no-sandbox') # Bypass OS security model

for ticker in tqdm(ticker_list):
    url = 'https://finance.yahoo.com/quote/'+ticker+'/key-statistics?p='+ticker
    print(url)    
    
    driver = webdriver.Chrome(options=options)
    driver.get(url)
    
    mkt_cap_xpath = '//*[@id="Col1-0-KeyStatistics-Proxy"]/section/div[3]/div[1]/div[2]/div/div[1]/div[1]/table/tbody/tr[1]/td[3]'
    element_present = EC.presence_of_element_located(
                (By.XPATH, mkt_cap_xpath))
    try:
        WebDriverWait(driver, 10).until(element_present)
    except:
        print(ticker)
        wrong_ticker_ratio_list.append(ticker)
        continue
        
    mkt_cap = driver.find_element_by_xpath('//*[@id="Col1-0-KeyStatistics-Proxy"]/section/div[3]/div[1]/div[2]/div/div[1]/div[1]/table/tbody/tr[1]/td[3]').text
    pb_ratio = driver.find_element_by_xpath('//*[@id="Col1-0-KeyStatistics-Proxy"]/section/div[3]/div[1]/div[2]/div/div[1]/div[1]/table/tbody/tr[7]/td[3]').text
    beta = driver.find_element_by_xpath('//*[@id="Col1-0-KeyStatistics-Proxy"]/section/div[3]/div[2]/div/div[1]/div/div/table/tbody/tr[1]/td[2]').text
    profit_margin = driver.find_element_by_xpath('//*[@id="Col1-0-KeyStatistics-Proxy"]/section/div[3]/div[3]/div/div[2]/div/div/table/tbody/tr[1]/td[2]').text
    roa = driver.find_element_by_xpath('//*[@id="Col1-0-KeyStatistics-Proxy"]/section/div[3]/div[3]/div/div[3]/div/div/table/tbody/tr[1]/td[2]').text
    roe = driver.find_element_by_xpath('//*[@id="Col1-0-KeyStatistics-Proxy"]/section/div[3]/div[3]/div/div[3]/div/div/table/tbody/tr[2]/td[2]').text

    ticker_ratios_df.loc[len(ticker_ratios_df)] = [ticker, mkt_cap, pb_ratio, beta, profit_margin, roa, roe]
    driver.close()
display(ticker_ratios_df)
wrong_ticker_ratio_list

In [None]:
def percentage_to_float(s):
    if s =='N/A':
        return np.NaN
    return float(s.strip('%').replace(',',''))/100

def mkt_cap_converter(s):
    multiplier = s[-1]
    if multiplier == 'B':
        return float(s[:-1]) # * 1000000000
    if multiplier == 'T':
        return float(s[:-1]) * 1000
    else:
        raise Exception(s)
        
def pb_ratio_converter(s):
    if s == 'N/A':
        return np.NaN
    multiplier = s[-1]
    if multiplier == 'k':
        return float(s[:-1]) * 1000
    else:
        return float(s)

In [None]:
ticker_ratios_clean_df = ticker_ratios_df.copy()
ticker_ratios_clean_df.ROA = ticker_ratios_df.ROA.apply(percentage_to_float)
ticker_ratios_clean_df.ROE = ticker_ratios_df.ROE.apply(percentage_to_float)
ticker_ratios_clean_df.profit_margin = ticker_ratios_df.profit_margin.apply(percentage_to_float)
ticker_ratios_clean_df.mkt_cap = ticker_ratios_df.mkt_cap.apply(mkt_cap_converter)
ticker_ratios_clean_df.pb_ratio = ticker_ratios_df.pb_ratio.apply(pb_ratio_converter)
ticker_ratios_clean_df.beta = ticker_ratios_clean_df.beta.replace('N/A', np.NaN).astype(float)

ticker_ratios_clean_df

In [None]:
ticker_ratios_clean_df.to_csv('data_out/russell_ratios_df.csv', index=False)

# 10K reports merging

- merge all 10K report .txt files into a dataframe

In [21]:
DOWNLOAD_FOLDER_10K = os.getcwd() + os.path.sep + 'data_in\\10K intros\\russell\\'
desc_list = []

for filename in tqdm(os.listdir(DOWNLOAD_FOLDER_10K)):
    if filename == 'desktop.ini': # not sure why its there, but cant seem to be able to take it out
        continue
    ticker,cik,report,date,_,_ = filename.split('_')
    with open(DOWNLOAD_FOLDER_10K + filename,  'r', encoding='utf-8') as f:
        desc = f.read()
        desc = desc.strip().replace('\n', ' ').replace(',', ' ')
    desc_list.append([ticker,cik,desc,report,date])

desc_10K_df = pd.DataFrame(desc_list, columns=['Ticker','CIK', 'Description', 'Report', 'Date'])

desc_10K_df.sort_values(by=['Ticker', 'Date'], inplace=True)
desc_10K_df.drop_duplicates('Ticker', keep='last', inplace=True)
desc_10K_df = desc_10K_df[desc_10K_df.Report == '10K'][['Ticker','CIK', 'Description']]
desc_10K_df.reset_index(drop=True, inplace=True)

desc_10K_df

HBox(children=(FloatProgress(value=0.0, max=2829.0), HTML(value='')))




Unnamed: 0,Ticker,CIK,Description
0,A,0001090872,PART I Item 1. Business Overview Agilent T...
1,AA,0001675149,PART I Item 1. Business. (dollars in milli...
2,AAL,0000004515,PART I ITEM 1. BUSINESS Overview American ...
3,AAN,0000706688,PART I ITEM 1. BUSINESS Unless otherwise ind...
4,AAOI,0001158114,PART I Item 1. Business BUSINESS Overv...
...,...,...,...
2802,ZTS,0001555280,PART I Item 1. Business. Overview Zoetis I...
2803,ZUMZ,0001318008,Item 1. BUSINESS Zumiez Inc. including i...
2804,ZUO,0001423774,PART I Item 1. Business Overview Zuora is a...
2805,ZYNE,0001621443,PART I Item 1. Business Unless the con...


In [22]:
desc_10K_df.to_pickle('data_out/russell_desc_10K.pkl')

# GICS code to Industry mapping scraping

In [33]:
gics_code_to_industry_df = pd.read_html('https://en.wikipedia.org/wiki/Global_Industry_Classification_Standard')[0][['Industry', 'Industry.1']].drop_duplicates()
gics_code_to_industry_df.columns = ['GICS', 'GICS Industry']
gics_code_to_industry_df.reset_index(inplace=True, drop=True)
gics_code_to_industry_df

Unnamed: 0,GICS,GICS Industry
0,101010,Energy Equipment & Services
1,101020,"Oil, Gas & Consumable Fuels"
2,151010,Chemicals
3,151020,Construction Materials
4,151030,Containers & Packaging
5,151040,Metals & Mining
6,151050,Paper & Forest Products
7,201010,Aerospace & Defense
8,201020,Building Products
9,201030,Construction & Engineering


In [34]:
gics_code_to_industry_df.shape

(69, 2)

In [35]:
gics_code_to_industry_df.to_csv('data_out/gics_code_to_industry.csv', index=False)

# Clean GICS classification

- Get Ticker to GICS mapping
- The GICS hierarchy begins with 11 sectors and is followed by 24 industry groups, 68 industries, and 157 sub-industries.
- **Not important bcos found better data source in data_in**

In [109]:
gics_df = pd.read_csv('data_in/snp_GICS_classification_bloomberg.csv', names=['Ticker', 'GICS', 'NAICS'], dtype=str)

In [110]:
gics_filter_df = gics_df[~gics_df.GICS.isna()].reset_index(drop=True).iloc[:,:-1]

In [111]:
len(gics_filter_df.GICS.value_counts())

138

In [112]:
gics_filter_df['GICS'] = gics_filter_df.GICS.apply(lambda s: s[:6])
gics_filter_df

Unnamed: 0,Ticker,GICS
0,0910150D,253010
1,1284849D,352020
2,1288453D,302010
3,1317355D,451020
4,1448062D,502010
...,...,...
714,YUM,253010
715,ZBH,351010
716,ZBRA,452030
717,ZION,401010


In [113]:
len(gics_filter_df.GICS.value_counts())

66

In [114]:
gics_filter_df.to_csv('data_out/ticker_to_gics_bloomberg.csv', index=False)

# List of CIKs for relevant tickers

- currently just collect all CIKS for all the tickers from output from the above section (data_out/ticker_to_gics.csv)
- website contains 10102 tickers-cik mappings
- need this to scrape 10K reports

In [69]:
import urllib.request

url = "https://www.sec.gov/include/ticker.txt"
html = urllib.request.urlopen(url).read()
soup = bs.BeautifulSoup(html)
html_text = soup.find('p').text
ticker_to_cik_list = [cpy.upper().split('\t') for cpy in html_text.split('\n')]
ticker_to_cik_df = pd.DataFrame(ticker_to_cik_list, columns=['Ticker','CIK'])
ticker_to_cik_df

Unnamed: 0,Ticker,CIK
0,AAPL,320193
1,AMZN,1018724
2,MSFT,789019
3,GOOG,1652044
4,FB,1326801
...,...,...
10098,FSNN,1071411
10099,FSRVW,1785424
10100,FTD,1575360
10101,FTAI-PA,1590364


In [27]:
ticker_to_gics_df = pd.read_csv('data_in/ticker_to_gics.csv', names=['Ticker', 'GICS'])
ticker_to_gics_df.Ticker = ticker_to_gics_df.Ticker.str.replace('/','-')
ticker_to_gics_df

Unnamed: 0,Ticker,GICS
0,A,35203010
1,AA,15104010
2,AAL,20302010
3,AAN,25504060
4,AAOI,45201020
...,...,...
2959,ZTS,35202010
2960,ZUMZ,25504010
2961,ZUO,45103020
2962,ZYNE,35202010


In [33]:
#Test
display(snp_tickers_df[~snp_tickers_df.Ticker.isin(ticker_to_gics_df.Ticker)])
display(russell_tickers_df[~russell_tickers_df.Ticker.isin(ticker_to_gics_df.Ticker)])

Unnamed: 0,Ticker,Name,Sector,Sub Industry,CIK
26,AMCR,Amcor plc,Materials,Paper Packaging,1748790
296,LUMN,Lumen Technologies,Communication Services,Alternative Carriers,18926
410,STX,Seagate Technology,Information Technology,"Technology Hardware, Storage & Peripherals",1137789
435,TEL,TE Connectivity Ltd.,Information Technology,Electronic Manufacturing Services,1385157
436,FTI,TechnipFMC,Energy,Oil & Gas Equipment & Services,1681459


Unnamed: 0,Ticker,Name,Industry,Sector
18,ABR,ARBOR REALTY TRUST,Real Estate Investment Trusts,Consumer Services
26,ACET,,,
29,ACI,"ALBERTSONS COMPANIES, INC.",Food Chains,Consumer Services
40,ACT,ADVISORSHARES VICE ETF,,
42,ACV,ALLIANZGI DIVERSIFIED INCOME & CONVERTIBLE FUND,,
...,...,...,...,...
2651,WPP,WPP PLC,Advertising,Technology
2666,WSTL,"WESTELL TECHNOLOGIES, INC.",Telecommunications Equipment,Public Utilities
2691,XOMA,XOMA CORPORATION,Major Pharmaceuticals,Health Care
2692,XONE,THE EXONE COMPANY,Industrial Machinery/Components,Capital Goods


In [96]:
scrapping_ticker_ciks_df = pd.merge(ticker_to_gics_df, ticker_to_cik_df, on='Ticker', how='inner')[['CIK', 'Ticker']]
scrapping_ticker_ciks_df

Unnamed: 0,CIK,Ticker
0,1090872,A
1,1675149,AA
2,6201,AAL
3,706688,AAN
4,1158114,AAOI
...,...,...
2949,1555280,ZTS
2950,1318008,ZUMZ
2951,1423774,ZUO
2952,1621443,ZYNE


In [98]:
scrapping_ticker_ciks_df.to_csv('data_out/scrapping_ticker_ciks.txt', sep=" ", index=False, header=False)

# Scrape Wikipedia data

In [3]:
def get_wiki_details(cpy):
    return_for_invalid_entry = [None]
    if pd.isna(cpy):
        return return_for_invalid_entry
    try:
        p = wiki.page('%s company'%cpy) # Can also do wiki.summary('APPL company')
        title = p.title
        title_list = re.sub('[^a-zA-Z0-9]', ' ', title).lower().split()
        cpy_list = re.sub('[^a-zA-Z0-9]', ' ', cpy).lower().split()
        if not [word for word in cpy_list if word in title_list]:
            summary = p.summary
            summary_list = re.sub('[^a-zA-Z0-9]', ' ', summary).lower().split()
            if not [word for word in cpy_list if word in summary_list]:
                print('wrong search: %s (search title: %s)'%(cpy, title))
                return return_for_invalid_entry
    except:
        print('no wiki result: %s'%cpy)
        return return_for_invalid_entry
    return [p.summary] 

In [4]:
# Test
p = wiki.page('Equinix company')
p.summary

'Equinix, Inc. is an American multinational company headquartered in Redwood City, California. It specializes in Internet connection and data centers. The company owns 205 colocation data centers in 25 countries.It is listed on the NASDAQ stock exchange under the ticker symbol EQIX. As of 2019, it had approximately 7,800 employees globally. The company converted to a real estate investment trust (REIT) in January 2015.'

In [69]:
# For russell tickers, need to add company names to russell_tickers_df
russell_tickernames_df = pd.read_csv('data_in/russell_desc_bloomberg.csv', names=['Ticker', 'Company', 'Description'])
russell_tickernames_df = russell_tickernames_df[['Ticker', 'Company']]
russell_tickernames_df = pd.merge(russell_tickernames_df, russell_tickers_df, on='Ticker')
russell_tickernames_df

Unnamed: 0,Ticker,Company
0,A,AGILENT TECHNOLOGIES INC
1,AA,ALCOA CORP
2,AAL,AMERICAN AIRLINES GROUP INC
3,AAN,AARON'S INC
4,AAOI,APPLIED OPTOELECTRONICS INC
...,...,...
2752,ZS,ZSCALER INC
2753,ZTS,ZOETIS INC
2754,ZUMZ,ZUMIEZ INC
2755,ZUO,ZUORA INC - CLASS A


In [10]:
ticker_df = russell_tickers_df.reset_index(drop=True)
wiki_desc_list = [get_wiki_details(cpy) for cpy in tqdm(ticker_df.Name)]

HBox(children=(FloatProgress(value=0.0, max=2721.0), HTML(value='')))

wrong search: APPLIED OPTOELECTRONICS, INC. (search title: List of flat panel display manufacturers)
wrong search: AMERICAN ASSETS TRUST, INC. (search title: List of asset management firms)
no wiki result: ABEONA THERAPEUTICS INC.
no wiki result: ALLEGIANCE BANCSHARES, INC.
wrong search: ATLANTIC CAPITAL BANCSHARES, INC. (search title: Truist Financial)
wrong search: ACACIA COMMUNICATIONS, INC. (search title: List of acquisitions by Cisco Systems)
no wiki result: ACNB CORPORATION
no wiki result: ARES COMMERCIAL REAL ESTATE CORPORATION
no wiki result: ACLARIS THERAPEUTICS, INC.
no wiki result: ACELRX PHARMACEUTICALS, INC.
no wiki result: ALLIANZGI DIVERSIFIED INCOME & CONVERTIBLE FUND
no wiki result: AGREE REALTY CORPORATION
no wiki result: ADAMAS PHARMACEUTICALS, INC.
no wiki result: ADURO BIOTECH, INC.
no wiki result: ADDUS HOMECARE CORPORATION
no wiki result: AEGION CORP
no wiki result: AGNC INVESTMENT CORP.
no wiki result: ARGAN, INC.
wrong search: ARMADA HOFFLER PROPERTIES, INC. (s



  lis = BeautifulSoup(html).find_all('li')


no wiki result: CADIZ, INC.
wrong search: CLEARBRIDGE MLP AND MIDSTREAM FUND INC. (search title: Companies listed on the New York Stock Exchange (C))
no wiki result: CERUS CORPORATION
wrong search: CONFORMIS, INC. (search title: 2020 in Japanese music)
no wiki result: CHEMED CORP.
no wiki result: CHEMUNG FINANCIAL CORP
no wiki result: COHERUS BIOSCIENCES, INC.
no wiki result: CHUY'S HOLDINGS, INC.
wrong search: CHIMERA INVESTMENT CORPORATION (search title: Russell 1000 Index)
no wiki result: CIVISTA BANCSHARES, INC.
wrong search: COMPX INTERNATIONAL INC. (search title: NL Industries)
no wiki result: CITIZENS HOLDING COMPANY
no wiki result: CHATHAM LODGING TRUST (REIT)
no wiki result: CELLDEX THERAPEUTICS, INC.
wrong search: CLIPPER REALTY INC. (search title: List of Jewish American businesspeople in real estate)
no wiki result: CLEARSIDE BIOMEDICAL, INC.
wrong search: COLUMBUS MCKINNON CORPORATION (search title: 2018 in American television)
wrong search: CAPSTEAD MORTGAGE CORPORATION (

no wiki result: HAMILTON LANE INCORPORATED
no wiki result: HALLADOR ENERGY COMPANY
wrong search: HOOKER FURNITURE CORPORATION (search title: Hicksville, New York)
no wiki result: HARBORONE BANCORP, INC.
no wiki result: HOOKIPA PHARMA INC.
wrong search: HEALTHCARE REALTY TRUST INCORPORATED (search title: List of S&P 400 companies)
no wiki result: HERC HOLDINGS INC.
wrong search: HERON THERAPEUTICS, INC. (search title: List of companies based in London)
wrong search: HEIDRICK & STRUGGLES INTERNATIONAL, INC. (search title: List of companies in the Chicago metropolitan area)
no wiki result: HESKA CORPORATION
wrong search: HEALTHSTREAM, INC. (search title: List of S&P 600 companies)
wrong search: HERITAGE COMMERCE CORP (search title: Norfolk Southern Railway)
no wiki result: HERCULES CAPITAL, INC.
wrong search: HEARTLAND EXPRESS, INC. (search title: Standard Carrier Alpha Code)
no wiki result: HURCO COMPANIES, INC.
no wiki result: HORIZON GLOBAL CORPORATION
wrong search: MARINEMAX, INC. (se

no wiki result: PERFORMANT FINANCIAL CORPORATION
no wiki result: PIMCO GLOBAL STOCKSPLUS & INCOME FUND
wrong search: PGT INNOVATIONS, INC. (search title: List of S&P 600 companies)
wrong search: PICO HOLDINGS INC. (search title: SM Investments)
wrong search: PIERIS PHARMACEUTICALS, INC. (search title: Bangladesh)
no wiki result: PARKE BANCORP, INC.
no wiki result: PREFORMED LINE PRODUCTS COMPANY
wrong search: PLEXUS CORP. (search title: List of S&P 600 companies)
wrong search: PNM RESOURCES, INC. (HOLDING CO.) (search title: List of companies of the United States by state)
no wiki result: PRIMEENERGY RESOURCES CORPORATION
no wiki result: INSULET CORPORATION
no wiki result: POWER INTEGRATIONS, INC.
no wiki result: PRGX GLOBAL, INC.
wrong search: PROTO LABS, INC. (search title: Protolabs)
no wiki result: PROS HOLDINGS, INC.
no wiki result: PROTHENA CORPORATION PLC
no wiki result: PARATEK PHARMACEUTICALS, INC.
wrong search: PROTAGONIST THERAPEUTICS, INC. (search title: Strychnine)
no wiki

no wiki result: XENCOR, INC.
no wiki result: XOMA CORPORATION
no wiki result: THE EXONE COMPANY
wrong search: ZOGENIX, INC. (search title: Fenfluramine)
no wiki result: ZYNERBA PHARMACEUTICALS, INC.



snp: 

    wrong search: E*Trade (search title: Expotrade Arena)
    wrong search: Equinix (search title: Equinox)
    no wiki result: Laboratory Corp. of America Holding
    no wiki result: Linde plc
    no wiki result: Zoetis

russell:

    wrong search: APPLIED OPTOELECTRONICS, INC. (search title: List of flat panel display manufacturers)
    wrong search: AMERICAN ASSETS TRUST, INC. (search title: List of asset management firms)
    no wiki result: ABEONA THERAPEUTICS INC.
    no wiki result: ALLEGIANCE BANCSHARES, INC.
    wrong search: ATLANTIC CAPITAL BANCSHARES, INC. (search title: Truist Financial)
    wrong search: ACACIA COMMUNICATIONS, INC. (search title: List of acquisitions by Cisco Systems)
    no wiki result: ACNB CORPORATION
    no wiki result: ARES COMMERCIAL REAL ESTATE CORPORATION
    ...

In [163]:
snp_tickers_df[snp_tickers_df.Name =='E*Trade']

Unnamed: 0,Ticker,Name,Sector,Sub Industry,CIK
160,ETFC,E*Trade,Financials,Investment Banking & Brokerage,1015780


In [59]:
desc_wiki_df = pd.concat([ticker_df[['Ticker', 'Name']], pd.DataFrame(wiki_desc_list, columns=['Description'])], axis=1)
# desc_wiki_df = desc_wiki_df.dropna().reset_index(drop=True)
desc_wiki_df.head(100)

Unnamed: 0,Ticker,Name,Description
0,A,"AGILENT TECHNOLOGIES, INC.","Agilent Technologies, Inc. is a global analyti..."
1,AA,ALCOA CORPORATION,Alcoa Corporation (a portmanteau of Aluminum C...
2,AAL,"AMERICAN AIRLINES GROUP, INC.",American Airlines Group Inc. is an American pu...
3,AAN,"AARON'S, INC.","Aaron's, Inc. is a lease-to-own retailer. The ..."
4,AAOI,"APPLIED OPTOELECTRONICS, INC.",
5,AAON,"AAON, INC.","AAON Inc. (NASDAQ: AAON) designs, manufactures..."
6,AAP,ADVANCE AUTO PARTS INC,"Advance Auto Parts, Inc. (Advance) is an Ameri..."
7,AAPL,APPLE INC.,Apple Inc. is an American multinational techno...
8,AAT,"AMERICAN ASSETS TRUST, INC.",
9,AAWW,ATLAS AIR WORLDWIDE HOLDINGS,"Atlas Air, Inc., a wholly owned subsidiary of ..."


In [14]:
desc_wiki_df.to_csv('data_out/russell_desc_wiki_df.csv', index=False)

# Clean Russell Ratios

In [37]:
def percentage_to_float(s):
    if type(s) == float:
        return s
    return float(s.strip('%').replace(',',''))/100

def mkt_cap_converter(s):
    if type(s) == float:
        return s
    multiplier = s[-1]
    if multiplier == 'M':
        return float(s[:-1])* 1000
    if multiplier == 'B':
        return float(s[:-1]) * 1000
    if multiplier == 'T':
        return float(s[:-1]) * 1000000
    else:
        raise Exception(s)
        
def pb_ratio_converter(s):
    if type(s) == float:
        return s
    multiplier = s[-1]
    if multiplier == 'k':
        return float(s[:-1]) * 1000
    else:
        return float(s)
    
def remove_commas(s):
    if type(s) == float:
        return s
    return float(s.replace(',',''))

In [38]:
ticker_ratios_df = pd.read_csv('data_in/russell_ratios.csv')
ticker_ratios_df.columns = ['Ticker', 'mkt_cap', 'pb_ratio', 'beta', 'profit_margin', 'ROA', 'ROE']

ticker_ratios_df.mkt_cap = ticker_ratios_df.mkt_cap.apply(mkt_cap_converter)
ticker_ratios_df.pb_ratio = ticker_ratios_df.pb_ratio.apply(pb_ratio_converter)
ticker_ratios_df.beta = ticker_ratios_df.beta.apply(remove_commas)
ticker_ratios_df.profit_margin = ticker_ratios_df.profit_margin.apply(percentage_to_float)
ticker_ratios_df.ROA = ticker_ratios_df.ROA.apply(percentage_to_float)
ticker_ratios_df.ROE = ticker_ratios_df.ROE.apply(percentage_to_float)

ticker_ratios_df

Unnamed: 0,Ticker,mkt_cap,pb_ratio,beta,profit_margin,ROA,ROE
0,MMM,78520.0,7.80,1.00,0.1537,0.1040,0.4992
1,ABT,139580.0,4.49,0.97,0.1115,0.0459,0.1156
2,ABBV,112510.0,,0.78,0.2477,0.1180,
3,ABMD,6530.0,6.24,0.86,0.2414,0.1372,0.2028
4,ACN,114850.0,7.59,1.03,0.1113,0.1344,0.3401
...,...,...,...,...,...,...,...
2886,PMT,1060.0,0.49,1.09,0.0000,-0.0433,-0.2376
2887,COLL,560220.0,6.41,1.07,-0.0421,-0.0117,-0.1106
2888,FLIC,412810.0,1.06,0.52,0.3688,0.0096,0.1045
2889,GWR,6300.0,1.81,1.31,0.0928,0.0308,0.0580


In [39]:
ticker_ratios_df.to_csv('data_out/russell_ratios_clean.csv', index=False)

# Extra

##### Implicit wait, selenium (google.com)

In [None]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

driver = webdriver.Chrome()
# driver.implicitly_wait(10)

driver.get("http://google.com")
driver.maximize_window()

print("Implicit Wait Example")

inputElement = driver.find_element_by_xpath('//*[@id="tsf"]/div[2]/div[1]/div[1]/div/div[2]/input')  # .find_element_by_id("lst-ib")
inputElement.send_keys("Techbeamers")
inputElement.submit()

driver.close()

##### Multiprocessing test

- for pool to work on jupyter notebook need to import function
- with muti processing cannot change global variables (there is a fix for this)
- printing is not trivial in multiprocessing

- still need to figure out if multiprocessing or multithreading is more suitable for scraping

In [None]:
%%timeit
p=Pool(processes = 4)
output = p.map(multithreading_test.worker,range(3000000))
print(output[-3:])

In [None]:
%%timeit
p = Pool(processes=4)
results = p.map_async(multithreading_test.worker, range(3000000))
output = results.get()
print(output[-3:])

In [None]:
%%timeit
output = list(map(multithreading_test.worker, range(3000000)))
print(output[-3:])

- I think in this case using ordinary functions is faster than using Pool due to the high over head
- only use pool when there is high CPU requirement, high iterations

In [None]:
# Number of cores for multiprocessing
import multiprocessing
multiprocessing.cpu_count()

##### Check Chrome Driver Version

In [None]:
driver = webdriver.Chrome()
str1 = driver.capabilities['browserVersion']
str2 = driver.capabilities['chrome']['chromedriverVersion'].split(' ')[0]
print(str1)
print(str2)
print(str1[0:2])
print(str2[0:2])
if str1[0:2] != str2[0:2]: 
    print("please download correct chromedriver version")

##### Running other scripts

In [None]:
%run 

##### Yield

In [None]:
def f():
    yield 1
    yield 2
    yield 3
[i for i in f()]

In [None]:
print(f())

##### Scrapy Tutorial

In [None]:
# Settings for notebook
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
import scrapy
from scrapy.crawler import CrawlerProcess

In [None]:
import json

class JsonWriterPipeline(object):
    def open_spider(self, spider):
        self.file = open('quoteresult.jl', 'w')

    def close_spider(self, spider):
        self.file.close()

    def process_item(self, item, spider):
        line = json.dumps(dict(item)) + "\n"
        self.file.write(line)
        return item

In [None]:
import logging

class QuotesSpider(scrapy.Spider):
    name = "quotes"
    start_urls = [
        'http://quotes.toscrape.com/page/1/',
        'http://quotes.toscrape.com/page/2/',
    ]
    custom_settings = {
        'LOG_LEVEL': logging.WARNING,
        'ITEM_PIPELINES': {'__main__.JsonWriterPipeline': 1}, # Used for pipeline 1
        'FEED_FORMAT':'json',                                 # Used for pipeline 2
        'FEED_URI': 'quoteresult.json'                        # Used for pipeline 2
    }
    
    def parse(self, response):
        for quote in response.css('div.quote'):
            yield {
                'text': quote.css('span.text::text').extract_first(),
                'author': quote.css('span small::text').extract_first(),
                'tags': quote.css('div.tags a.tag::text').extract(),
            }

In [None]:
process = CrawlerProcess({
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
})

process.crawl(QuotesSpider)
process.start()

In [None]:
import pandas as pd
dfjson = pd.read_json('quoteresult.json')
dfjson

In [None]:
dfjl = pd.read_json('quoteresult.jl', lines=True)
dfjl

In [None]:
dfjson.to_pickle('quotejson.pickle')
dfjl.to_pickle('quotejl.pickle')

##### tqdm

In [None]:
from tqdm import tqdm

j=0
for i in tqdm(range(1000000), ):
    j+=i
    
print(j)

In [None]:
import time
import sys
from tqdm import trange


def do_something():
    time.sleep(1)

def do_another_something():
    time.sleep(1)


for i in trange(10):
    do_something()

#     for j in trange(2):
#         do_another_something()

In [None]:
from tqdm.notebook import trange, tqdm
import time

for i in trange(6):
    do_something()

    for j in trange(2):
        do_another_something()


##### Test Scraping 10K reports

In [None]:

for file in os.listdir("data_in"):
    if file.endswith(".txt") and "Item1_excerpt" in file:
        print(file)
        with open("data_in/" + file, "rt",encoding='utf-8') as f:
            line_num = 1
            for line in f:
                line = line.lower()
                if ("part i" in line) or ("item 1" in line) or ("business" in line and len(line) < 20) or ("introduction" in line):
                    print(line_num, line)                    
                if ("item 1a" in line) or ("risk factors" in line):
                    print(line_num, line)                    
                line_num += 1
        break

In [None]:
import glob

path = 'data_in'

files = [f for f in glob.glob(path + "*/*.txt", recursive=True)]

for f in files:
    print(f)

In [None]:
glob.glob(path + "*/*.txt", recursive=True)

###### Test if df contains .

In [None]:
snp_tickers_df[snp_tickers_df.Ticker.str.contains(".", regex=False)]