# Webscraper Notebook

##### Order
- Scrape Nasdaq IPO Calendar
- Follow links from the initial scrape and scrape company data
- Search company stock tickers and scrape the first 180 days of stock data



#### Start By Importing what we will need

In [2]:
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
import pickle
import time
from bs4 import BeautifulSoup
import random
from requests_html import HTMLSession
import json

#### Define Functions that may be helpful later

In [3]:
def pickle_file(obj, path):
    """Serialize ``obj`` to ``path`` with pickle, overwriting any existing file.

    Args:
        obj: Any picklable Python object.
        path: Destination file path; opened in binary write mode.
    """
    # The context manager closes the handle even when pickle.dump raises,
    # so a failed checkpoint cannot leak an open file descriptor.
    with open(path, 'wb') as file:
        pickle.dump(obj, file)

def load_pickle(path):
    """Load and return the object pickled at ``path``.

    Args:
        path: Path of a pickle file; opened in binary read mode.

    Returns:
        The unpickled Python object.
    """
    # SECURITY: unpickling executes arbitrary code embedded in the file.
    # Only load checkpoints that this notebook (or a trusted party) wrote.
    with open(path, 'rb') as file:
        return pickle.load(file)

def start_driver(driver_path = './WebDriver/chromedriver'):
    """Start a Chrome WebDriver session configured for scraping.

    Args:
        driver_path: Path to the chromedriver executable handed to the
            Selenium ``Service``.

    Returns:
        The started ``webdriver.Chrome`` instance. The caller owns it and is
        responsible for calling ``quit()``.
    """
    service = Service(driver_path)

    options = webdriver.ChromeOptions()
    # Content-setting value 2 for notifications, set through Chrome prefs.
    options.add_experimental_option(
        "prefs", {"profile.default_content_setting_values.notifications" : 2})
    for flag in ('--no-sandbox', '--disable-dev-shm-usage', "--incognito"):
        options.add_argument(flag)
    options.page_load_strategy = 'eager'

    # The original reassigned `driver_path` here, after the Service had already
    # been built from it; the assignment had no effect and has been removed.
    return webdriver.Chrome(service = service, options=options)

#Loading a webpage
# driver.get('https://www.nasdaq.com/market-activity/ipos')

## Scrape Nasdaq IPO Calendar

- Define Scrape Specific Functions
  - Navigate To Starting Date
  - Generate Dictionary from rows in table
  - Navigate to next page
- Define Main Loop
  - Load Driver
  - Iterate Through Months from starting date to end date
- Run

In [3]:
def get_starting_page(driver, starting_date = '01/1995'):
    """Set the calendar's date picker to ``starting_date`` and apply it.

    Args:
        driver: Selenium WebDriver with the IPO calendar page loaded.
        starting_date: Value written into the date-picker input
            (the default and the caller use ``MM/YYYY`` strings).

    Raises:
        selenium TimeoutException: if any of the three controls is not
            clickable within 10 seconds.
    """
    # All three controls live under the same absolute XPath prefix.
    picker_root = '/html/body/div[3]/div/main/div[2]/div[2]/div[2]/div/div[2]/div/div[2]/div[2]'

    def wait_clickable(xpath):
        return WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.XPATH, xpath)))

    # Open the date picker
    wait_clickable(picker_root + '/button').click()

    # Set the date-picker value. The date is passed as a script argument
    # rather than interpolated into the JavaScript source, so a value that
    # contains quotes cannot break (or inject into) the script.
    date_picker = wait_clickable(picker_root + '/div/input')
    driver.execute_script("arguments[0].value = arguments[1];", date_picker, starting_date)

    # Apply the date, which navigates the calendar to that month
    wait_clickable(picker_root + '/div/button[2]').click()

def generate_data(driver):
    """Parse the calendar table on the current page into a list of dicts.

    Each returned dict maps a cell's ``data-column`` attribute to the text of
    the cell's first child tag, plus a ``'link'`` key holding the absolute URL
    of the row's first anchor (``None`` when the row has no usable anchor).
    Rows whose header cell is missing or has an empty ticker are skipped.

    Args:
        driver: Selenium WebDriver positioned on a calendar page.

    Returns:
        list[dict]: one dict per populated table row.
    """
    # find_element accepts only (by, value). The original passed a
    # BeautifulSoup-style attrs dict as a third positional argument; the class
    # filter it intended is expressed here as a CSS selector instead.
    table = driver.find_element(By.CSS_SELECTOR, 'table.market-calendar-table__table')
    soup = BeautifulSoup(table.get_attribute('innerHTML'), 'html.parser')

    url_base = 'https://www.nasdaq.com'
    table_data = []

    for row in soup.find_all('tr', {'class': 'market-calendar-table__row'}):
        th = row.find('th', {'role': 'cell'})
        symbol_node = th.findChild() if th is not None else None
        ticker = symbol_node.text if symbol_node is not None else ''
        if not ticker:
            # Placeholder / empty row: nothing to record
            continue

        anchor = row.find('a')
        href = anchor.get('href') if anchor is not None else None

        row_data = {
            th['data-column']: ticker,
            # Downstream code tests `if url:` before using the link, so a
            # missing anchor is recorded as None instead of raising here.
            'link': url_base + href if href else None,
        }

        for cell in row.find_all('td', {'role': 'cell'}):
            child = cell.findChild()
            # Fall back to the cell's own text when it has no child tag
            row_data[cell['data-column']] = child.text if child is not None else cell.text
        table_data.append(row_data)
    return table_data

# generate_data(driver)

def get_next_calendar_page(driver):
    """Advance the calendar one step: click the time-belt "next" arrow, then
    the first clickable time-belt item.

    Each click waits up to 10 seconds for its button to become clickable.
    """
    for css_class in ('time-belt__next', 'time-belt__item'):
        locator = (By.XPATH, f".//button[@class='{css_class}']")
        button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable(locator))
        button.click()

# get_next_calendar_page(driver)


In [None]:
def main_loop(pickup_file = False, starting_date= '01/1995', delay = 10, end_year = 2022):
    """Scrape the Nasdaq IPO calendar month by month, checkpointing as it goes.

    Args:
        pickup_file: Path of a pickled checkpoint to resume from; falsy to
            start with an empty list.
        starting_date: First month to scrape, passed to ``get_starting_page``.
        delay: Unused; kept so existing callers keep working (its only use,
            a sleep after the initial page load, is commented out).
        end_year: Last calendar year (inclusive) to scrape.

    Returns:
        list[dict]: every row collected, including rows loaded from
        ``pickup_file``.
    """
    data = load_pickle(pickup_file) if pickup_file else []

    driver = start_driver('./WebDriver/chromedriver')
    print('Started Driver')

    try:
        time.sleep(3)

        driver.get('https://www.nasdaq.com/market-activity/ipos')
        print('Loaded Initial Page')

        # time.sleep(delay)

        get_starting_page(driver, starting_date=starting_date)
        print('Loaded Starting Page')

        count = 0
        while True:
            date_year = int(driver.find_element(
                By.XPATH, ".//button[@class='time-belt__item']").get_attribute('data-year'))
            # Check the year BEFORE scraping. The original tested it only at
            # the top of the next iteration, so the first page after the final
            # year was scraped and checkpointed as well.
            if date_year > end_year:
                break
            count += 1

            data += generate_data(driver)
            pickle_file(data, f'Data/nasdaq_checkpoint_{date_year}')
            print(f'Checkpoint-{date_year}\t(Navigations: {count};\t Rows of Data: {len(data)})',end='\r', flush=True)
            # Randomized pause between page navigations
            time.sleep(random.randint(10,30))

            get_next_calendar_page(driver)
    finally:
        # Always release the browser, including when a wait times out
        driver.quit()

    print('\nAll Done')
    return data

data = main_loop(starting_date='02/1998', pickup_file='Data/nasdaq_checkpoint_1998')



## Follow Links from Initial Scrape and scrape company data

- Define Scrape Specific Functions
  - Load Data From Initial Scrape
  - Get Page from link in data
  - Generate Dictionary for page data
  - Navigate to next page (Row in data)
- Define Main Loop
- Run

In [121]:
def make_request(url):
    """GET ``url`` and return its body decoded as JSON.

    Args:
        url: Endpoint to request.

    Returns:
        The decoded JSON payload when the status code is 200, otherwise
        ``False``.
    """
    # A session is created per call; the context manager closes it so its
    # connection pool is released instead of accumulating over a long run.
    with HTMLSession() as session:
        r = session.get(url)
        if r.status_code == 200:
            return json.loads(r.text)
    return False
    
def process_overview_response(response, data = None):
    """Copy the IPO overview fields of ``response`` into ``data``.

    Args:
        response: Decoded overview payload. When ``response['data']`` is not
            ``None`` it must contain ``'poOverview'`` (a mapping whose values
            have ``'label'``/``'value'``) and
            ``'companyInformation'['companyDescription']``.
        data: Dict to update in place; a fresh dict is created when omitted.

    Returns:
        dict: ``data`` with one key per overview label plus
        ``'company_description'``.
    """
    # A `{}` default would be one shared dict reused by every call that omits
    # `data`, leaking fields from one company into the next.
    if data is None:
        data = {}

    payload = response['data']
    if payload is not None:
        for field in payload['poOverview'].values():
            data[field['label']] = field['value']
        data['company_description'] = payload['companyInformation']['companyDescription']
    return data

def process_financials_response(response, data = None):
    """Copy financial figures and the first filing of ``response`` into ``data``.

    Args:
        response: Decoded financials/filings payload. ``response['data']`` may
            be ``None``; otherwise ``'financials'`` is read as a list whose
            first element maps to ``'label'``/``'value'`` dicts, and
            ``'filings'`` as a list of filing dicts.
        data: Dict to update in place; a fresh dict is created when omitted.

    Returns:
        dict: ``data`` with one key per financial label and, when at least one
        filing is present, ``first_filing_type`` / ``first_filing_date`` /
        ``first_filing_link``.
    """
    # Avoid a shared mutable default (state would leak between calls)
    if data is None:
        data = {}

    payload = response['data']
    if payload is None:
        return data

    # Guard against a missing or empty financials list instead of raising
    financials = payload.get('financials') or []
    if financials:
        for field in financials[0].values():
            data[field['label']] = field['value']

    # Done once, outside the loop above (the original re-ran it per financial
    # field and skipped it entirely when there were none). A single filing is
    # enough to read index 0, so the check is "non-empty", not "more than one".
    filings = payload.get('filings')
    if isinstance(filings, list) and filings:
        first_filing = filings[0]
        data['first_filing_type'] = first_filing['FormType']
        data['first_filing_date'] = first_filing['DateReceived']
        data['first_filing_link'] = first_filing['FilingLink']['value']
    return data

def process_experts_response(response, data = None):
    """Copy the experts table of ``response`` into ``data`` as role -> name.

    Args:
        response: Decoded experts payload. When ``response['data']`` is not
            ``None`` it must contain ``'tableModel'['rows']``, each row having
            ``'role'`` and ``'expertName'``.
        data: Dict to update in place; a fresh dict is created when omitted.

    Returns:
        dict: ``data`` with one key per expert role. When a role appears more
        than once, the last row wins.
    """
    # Avoid a shared mutable default (state would leak between calls)
    if data is None:
        data = {}

    payload = response['data']
    if payload is not None:
        for row in payload['tableModel']['rows']:
            data[row['role']] = row['expertName']
    return data

In [122]:
def main_loop(data = False):
    """Fetch overview, financials and experts data for every calendar entry.

    Args:
        data: Rows collected by a previous run (used to resume); falsy to
            start from scratch.

    Returns:
        list[dict]: one dict per processed entry, also pickled to
        ``Data/nasdaq_company_checkpoint_complete``.
    """
    ipo_calendar_data = load_pickle('Data/nasdaq_ipos')
    if not data:
        data = []

    # Resume by skipping as many calendar entries as there are collected rows.
    # NOTE(review): skipped entries add no row, so after a run with skips this
    # offset under-counts and some entries are fetched again on resume.
    ipo_calendar_data = ipo_calendar_data[len(data):]
    skipped_count = 0
    count = 0

    api_base = 'https://api.nasdaq.com/api/ipo/'
    # (endpoint name, function that folds its response into the row)
    endpoints = (
        ('overview', process_overview_response),
        ('financials-filings', process_financials_response),
        ('experts', process_experts_response),
    )

    for entry in ipo_calendar_data:

        count += 1
        ticker = entry['proposedTickerSymbol']
        url = entry['link']

        # The API is addressed with the query string of the calendar link.
        # partition() never raises, unlike split('?')[1], which crashed on a
        # link without a query string.
        query = url.partition('?')[2] if url else ''

        if query:
            row_data = {'proposedTickerSymbol': ticker}

            for endpoint, process in endpoints:
                response = make_request(f'{api_base}{endpoint}/?{query}')
                if response:
                    row_data = process(response, row_data)

            data.append(row_data)

        else:
            skipped_count += 1
            print(f'Skipped-{ticker}',end='\r', flush=True)

        # Checkpoint after every entry so an interrupted run can be resumed
        pickle_file(data, 'Data/nasdaq_company_checkpoint')
        print(f'Checkpoint-{ticker}\t(Progress: {count}/{len(ipo_calendar_data)};\tSkipped: {skipped_count};\t Rows of Data: {len(data)})',end='\r', flush=True)
        time.sleep(random.randint(1, 25)/100)

    print('\nAll Done')
    pickle_file(data, 'Data/nasdaq_company_checkpoint_complete')
    return data


# NOTE(review): this path ends with an underscore, while main_loop writes its
# running checkpoint to 'Data/nasdaq_company_checkpoint'. Confirm the file.
data = load_pickle('Data/nasdaq_company_checkpoint_')
data = main_loop(data)


Checkpoint-SVRE	(Progress: 3367/3367;	Skipped: 0;	 Rows of Data: 7630))
All Done


## Search Company Stocks and scrape first 180 days of stock data

- Define Scrape Specific Functions
  - Load Data From Initial Scrape
  - Search ticker in Yahoo_fin
  - Check if results valid
  - Navigate to next
- Define Main Loop
- Run

In [4]:
# Import Needed Libraries
import datetime as dt
import pandas as pd
import yahoo_fin.stock_info as si

In [7]:
# Load the IPO calendar rows saved by the first scrape (tickers, links, dates)
# and display the first entry to show which fields a row carries.
ipo_data = load_pickle('Data/nasdaq_ipos')
ipo_data[0]

{'proposedTickerSymbol': 'SVRE',
 'link': 'https://www.nasdaq.com/market-activity/ipos/overview?dealId=1205111-102356',
 'companyName': 'SaverOne 2014 Ltd.',
 'proposedExchange': 'NASDAQ Capital',
 'proposedSharePrice': '5.80',
 'sharesOffered': '2,241,379',
 'expectedPriceDate': '06/03/2022',
 'dollarValueOfSharesOffered': '$14,949,998.80'}

In [5]:
def get_start_end_dates(date_string):
    """Return the price-history window around a date as two strings.

    The window opens 90 days before ``date_string`` (parsed as ``MM/DD/YYYY``)
    and closes 370 days after it opens.

    Returns:
        tuple[str, str]: ``str()`` of the opening and closing datetimes.
    """
    anchor_day = dt.datetime.strptime(date_string, "%m/%d/%Y")
    window_open = anchor_day - dt.timedelta(days=90)
    window_close = window_open + dt.timedelta(days=370)
    return tuple(str(moment) for moment in (window_open, window_close))

# get_start_end_dates(ipo_data[1000]['expectedPriceDate'])

In [6]:
# keys = []
# for dct in ipo_data:
#     if type(dct) == dict:
#         keys += list(dct.keys())
# set(keys)

def check_for_date_entry(entry):
    """Return the first date field present in ``entry``, or ``False``.

    Keys are tried in order: ``pricedDate``, ``expectedPriceDate``,
    ``filedDate``. The value is returned as stored, even when it is empty.
    """
    preferred_keys = ('pricedDate', 'expectedPriceDate', 'filedDate')
    return next((entry[name] for name in preferred_keys if name in entry), False)
# check_for_date_entry(ipo_data[1000])

In [9]:
def main_loop(checkpoint_interval = 200):
    """Download a price-history window for every unique IPO ticker.

    For each calendar entry a date is picked with ``check_for_date_entry``,
    widened to a window with ``get_start_end_dates``, and passed to
    ``si.get_data``. Results are concatenated and pickled to
    ``Data/stock_info`` periodically and at the end.

    Args:
        checkpoint_interval: Checkpoint when the entry index is a multiple of
            this value (and something new has been fetched).

    Returns:
        pandas.DataFrame: all retrieved price rows; empty if nothing could be
        retrieved.
    """
    ipo_data = load_pickle('Data/nasdaq_ipos')
    total = len(ipo_data)
    skipped = 0
    list_of_dataframes = []
    completed_tickers = set()  # set: constant-time "already fetched?" checks

    i = 0  # keeps the final progress message valid when there is nothing to loop over
    # NOTE(review): the range stops one short, so the last entry is never
    # visited. Kept as in the original; confirm whether that is intentional.
    for i in range(total - 1):
        entry = ipo_data[i]

        if not isinstance(entry, dict):
            skipped += 1
            print(f'Skipping Index-{i} (Not a valid dictionary)',end='\r', flush=True)

        elif entry["proposedTickerSymbol"] in completed_tickers:
            skipped += 1
            print(f'Repeated Ticker Skipping-{entry["proposedTickerSymbol"]}\t(Skipped: {skipped})',end='\r', flush=True)

        else:
            ticker = entry["proposedTickerSymbol"]
            print(f'Processing-{ticker}\t(Progress: {i}/{total};\tSkipped: {skipped})',end='\r', flush=True)

            date_of_ipo = check_for_date_entry(entry)
            if not date_of_ipo:
                # No `continue` here: the checkpoint test below must still run
                skipped += 1
                print(f'Skipping-{ticker}\t(Skipped: {skipped})',end='\r', flush=True)
            else:
                start_date, end_date = get_start_end_dates(date_of_ipo)
                try:
                    list_of_dataframes.append(
                        si.get_data(ticker,
                        start_date = start_date, end_date = end_date, index_as_date = False))
                    completed_tickers.add(ticker)
                except Exception:
                    # Best effort: a ticker that cannot be retrieved is counted
                    # and skipped. KeyboardInterrupt is not an Exception
                    # subclass, so Ctrl-C still propagates and stops the run.
                    skipped += 1
                    print(f'Error Retrieving Data (Skipping-{ticker};\tSkipped: {skipped})',end='\r', flush=True)

        if i%checkpoint_interval ==0 and len(list_of_dataframes)>1:
            print(f'Checkpointing\t(Unique Tickers: {len(completed_tickers)};\tProgress: {i}/{total};\tSkipped: {skipped})',end='\r', flush=True)
            data = pd.concat(list_of_dataframes)
            # Collapse what has been fetched so far into a single frame
            list_of_dataframes = [data]
            pickle_file(data, 'Data/stock_info')

    print(f'Saving Final\t(Progress: {i}/{total};\tSkipped: {skipped})',end='\r', flush=True)
    # pd.concat raises on an empty list; return an empty frame instead
    data = pd.concat(list_of_dataframes) if list_of_dataframes else pd.DataFrame()
    pickle_file(data, 'Data/stock_info')
    print('\nAll Done',end='\r', flush=True)
    return data

data = main_loop(checkpoint_interval=250)
# print(ipo_data[-1])


Saving Final	(Progress: 7628/7630;	Skipped: 4759)53))630;	Skipped: 4736)
All Done

In [None]:
# Add a `days_post_ipo` column: whole days between each price row and the
# date recorded for that row's ticker in the IPO calendar scrape.
ipo_data = load_pickle('Data/nasdaq_ipos')

# Lower-cased ticker -> date string. Built in this cell so the cell does not
# depend on a lookup defined further down the notebook. When a ticker appears
# in several calendar entries, the last entry that has a date wins.
ipo_dates = {}
for entry in ipo_data[:-1]:  # last entry skipped, as in the other cells of this notebook
    if isinstance(entry, dict):
        ipo_date = check_for_date_entry(entry)
        if ipo_date:
            ipo_dates[entry["proposedTickerSymbol"].lower()] = ipo_date

# One vectorized assignment. The original wrote through chained indexing
# (data[mask]['col'] = ...), which assigns to a temporary copy and leaves
# `data` unchanged, and it read `.days` from a Series instead of `.dt.days`.
ipo_date_per_row = pd.to_datetime(data['ticker'].str.lower().map(ipo_dates))
data['days_post_ipo'] = (data['date'] - ipo_date_per_row).dt.days

data.tail()
            
    

In [36]:
# Row-wise computation of days elapsed since the ticker's recorded IPO date.
# Requires `date_keys` (lower-cased ticker -> date string), which is built in
# a cell that appears further down; run that cell first.
def _days_since_ipo(row):
    ipo_day = pd.to_datetime(date_keys[row.ticker.lower()])
    elapsed = row.date - ipo_day
    return elapsed.days

data['days_post_ipo'] = data.apply(_days_since_ipo, axis = 1)

data.tail(50)

Unnamed: 0,date,open,high,low,close,adjclose,volume,ticker,days_post_ipo
20,2022-05-27,14.604,14.9,14.06,14.53,14.53,44000.0,BLTE,28
21,2022-05-31,14.65,15.313,14.6,15.12,15.12,11900.0,BLTE,32
22,2022-06-01,15.15,15.2,14.95,14.95,14.95,14400.0,BLTE,33
23,2022-06-02,15.0,15.05,14.55,14.85,14.85,9700.0,BLTE,34
24,2022-06-03,14.73,15.0,14.6,14.95,14.95,4200.0,BLTE,35


In [34]:
# Lookup of lower-cased ticker -> date chosen by check_for_date_entry.
# Non-dict entries are ignored, the final entry is not visited, and a later
# entry for the same ticker replaces an earlier one.
date_keys = {
    entry["proposedTickerSymbol"].lower(): check_for_date_entry(entry)
    for entry in ipo_data[:-1]
    if type(entry) is dict
}

In [39]:
# Price relative to each ticker's first recorded adjusted close.
# NOTE(review): `stock_info` is not created anywhere in this notebook, and the
# CSV below is written from `data`, not from `stock_info`. Confirm that both
# names are meant to refer to the same frame.

# The first row per ticker (in frame order) supplies the baseline price. One
# pass replaces the original full-frame boolean scan per ticker.
first_rows = stock_info.dropna(subset=['ticker']).drop_duplicates(subset='ticker', keep='first')
ipo_open_keys = dict(zip(first_rows['ticker'], first_rows['adjclose']))
unique_tickers = list(ipo_open_keys)

# Vectorized division. Rows whose ticker has no baseline come out as NaN,
# where the original per-row lambda produced None.
stock_info['price_change'] = stock_info['adjclose'] / stock_info['ticker'].map(ipo_open_keys)

data.to_csv('Data/stock_info.csv')

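### Index Data
- Gather daily price info for:
    - S&P 500 (`^GSPC`)
    - Dow Jones Industrial Average (`^DJI`)
    - Russell 2000 (`^RUT`)
    - Nasdaq Composite (`^IXIC`)
    - NYSE Composite (`^NYA`)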

In [77]:
# Index symbols to download, one price frame per symbol, in this order.
tickers = ['^DJI', '^GSPC', '^RUT', '^IXIC', '^NYA']

# Same fixed date range for every index; `date` is kept as a regular column.
market_info = [
    si.get_data(symbol, start_date = '01/01/1995', end_date = '06/06/2022', index_as_date = False)
    for symbol in tickers
]

In [78]:
# Prefix every column except the first (the date) with its index symbol so
# the frames can be merged side by side without name clashes.
#
# New frames are built and stored back into `market_info`, and both steps are
# idempotent. The original dropped and renamed in place, so re-running the
# cell raised KeyError on the missing 'ticker' column and would have stacked
# the prefix a second time.
for i, symbol in enumerate(tickers):
    frame = market_info[i].drop(columns='ticker', errors='ignore')
    prefix = symbol + '_'
    frame.columns = [
        name if position == 0 or str(name).startswith(prefix) else prefix + name
        for position, name in enumerate(frame.columns)
    ]
    market_info[i] = frame
    

In [79]:
# Preview the first index frame ('^DJI', the first symbol in `tickers`) to
# check the prefixed column names.
market_info[0].head()

Unnamed: 0,date,^DJI_open,^DJI_high,^DJI_low,^DJI_close,^DJI_adjclose,^DJI_volume
0,1995-01-03,3834.399902,3845.199951,3827.709961,3838.47998,3838.47998,24440000
1,1995-01-04,3838.5,3857.98999,3831.070068,3857.649902,3857.649902,27220000
2,1995-01-05,3857.600098,3860.679932,3843.189941,3850.919922,3850.919922,25810000
3,1995-01-06,3850.899902,3887.26001,3841.840088,3867.409912,3867.409912,30240000
4,1995-01-09,3867.399902,3874.47998,3853.280029,3861.350098,3861.350098,20820000


In [80]:
# Fold all index frames into one wide frame keyed on `date`. merge() joins
# inner by default, so only dates present in every frame are kept.
df = market_info[0]
for index_frame in market_info[1:]:
    df = df.merge(index_frame, on='date')
df.head()

Unnamed: 0,date,^DJI_open,^DJI_high,^DJI_low,^DJI_close,^DJI_adjclose,^DJI_volume,^GSPC_open,^GSPC_high,^GSPC_low,...,^IXIC_low,^IXIC_close,^IXIC_adjclose,^IXIC_volume,^NYA_open,^NYA_high,^NYA_low,^NYA_close,^NYA_adjclose,^NYA_volume
0,1995-01-03,3834.399902,3845.199951,3827.709961,3838.47998,3838.47998,24440000,459.209991,459.269989,457.200012,...,743.530029,743.580017,743.580017,248750000,2651.149902,2651.149902,2651.149902,2651.149902,2651.149902,0
1,1995-01-04,3838.5,3857.98999,3831.070068,3857.649902,3857.649902,27220000,459.130005,460.720001,457.559998,...,740.469971,745.840027,745.840027,290350000,2658.870117,2658.870117,2658.870117,2658.870117,2658.870117,0
2,1995-01-05,3857.600098,3860.679932,3843.189941,3850.919922,3850.919922,25810000,460.730011,461.299988,459.75,...,745.130005,745.659973,745.659973,297510000,2658.129883,2658.129883,2658.129883,2658.129883,2658.129883,0
3,1995-01-06,3850.899902,3887.26001,3841.840088,3867.409912,3867.409912,30240000,460.380005,462.48999,459.470001,...,745.76001,749.690002,749.690002,312920000,2660.23999,2660.23999,2660.23999,2660.23999,2660.23999,0
4,1995-01-09,3867.399902,3874.47998,3853.280029,3861.350098,3861.350098,20820000,460.670013,461.769989,459.73999,...,750.099976,752.090027,752.090027,267090000,2660.139893,2660.139893,2660.139893,2660.139893,2660.139893,0


In [81]:
# Persist the merged index frame built in the previous cell.
df.to_csv('Data/market_indicators.csv')

In [94]:
# Single-ticker trial of the company-info lookup before running it in bulk.
ticker = 'amzn'

# Transposed result with the ticker attached as an extra column.
info = si.get_company_info(ticker).transpose()
info['ticker'] = ticker
# print(infolongBusinessSummary)
# print(info.phonenum)

In [100]:
def get_tickers():
    """Return the distinct values of the ``ticker`` column of
    ``Data/stock_info_complete.csv``.

    NOTE(review): no cell in this notebook writes that file (only
    ``Data/stock_info.csv`` is written); confirm where it comes from.
    """
    return pd.read_csv('Data/stock_info_complete.csv')['ticker'].unique()

# Rebinds `tickers`, which previously held the list of index symbols.
tickers = get_tickers()

In [101]:
def PullCompanyDetails(symbols = None):
    """Fetch company info for each ticker, skipping the ones that fail.

    Args:
        symbols: Iterable of ticker symbols. Defaults to the notebook-level
            ``tickers`` so existing no-argument calls behave as before.

    Returns:
        list: one transposed company-info frame per retrieved ticker, each
        with an added ``'ticker'`` column.
    """
    if symbols is None:
        symbols = tickers
    symbols = list(symbols)

    company_info = []
    skipped = 0
    for i, ticker in enumerate(symbols):
        try:
            info = si.get_company_info(ticker).transpose()
            info['ticker'] = ticker
            company_info.append(info)
            print(f'Retrieved {ticker}(Progress: {i}/{len(symbols)};\tSkipped: {skipped})',end='\r', flush=True)
        except Exception:
            # Best effort: count and skip tickers whose lookup fails.
            # `except Exception` (not a bare `except`) lets Ctrl-C interrupt
            # the run. The counter is bumped before printing so the message
            # shows the up-to-date total.
            skipped += 1
            print(f'Skipping {ticker}(Progress: {i}/{len(symbols)};\tSkipped: {skipped})',end='\r', flush=True)
    return company_info

company_info = PullCompanyDetails()

Retrieved BLTE(Progress: 2869/2870;	Skipped: 205))

In [102]:
# Checkpoint the raw list of per-ticker frames before combining them.
pickle_file(company_info,'Data/company_info')

# Stack the per-ticker frames into one table. This rebinds `df`, which held
# the merged index frame in earlier cells.
df = pd.concat(company_info)

In [103]:
# Preview the combined company-info table.
df.head()

Breakdown,sector,fullTimeEmployees,longBusinessSummary,city,phone,country,website,maxAge,address1,industry,...,auditRisk,state,shareHolderRightsRisk,compensationAsOfEpochDate,governanceEpochDate,boardRisk,overallRisk,address2,fax,industrySymbol
Value,Technology,41,"SaverOne 2014 Ltd, a technology company, engag...",Petah Tikva,972 3 909 4177,Israel,https://saver.one,86400,Em Hamoshavot Road 94,Scientific & Technical Instruments,...,,,,,,,,,,
Value,Healthcare,631,Cerus Corporation operates as a biomedical pro...,Concord,925 288 6000,United States,https://www.cerus.com,86400,1220 Concord Avenue,Medical Devices,...,3.0,CA,7.0,1640908800.0,1654041600.0,4.0,4.0,Suite 600,,
Value,Consumer Cyclical,32647,"CarMax, Inc., together with its subsidiaries, ...",Richmond,804 747 0422,United States,https://www.carmax.com,86400,12800 Tuckahoe Creek Parkway,Auto & Truck Dealerships,...,1.0,VA,7.0,1672444800.0,1654041600.0,6.0,7.0,,,
Value,Technology,7536,"Ciena Corporation provides network hardware, s...",Hanover,410 694 5700,United States,https://www.ciena.com,86400,7035 Ridge Road,Communication Equipment,...,3.0,MD,2.0,1640908800.0,1654041600.0,3.0,1.0,,,
Value,Technology,8800,"Euronet Worldwide, Inc. provides payment and t...",Leawood,913 327 4200,United States,https://www.euronetworldwide.com,86400,11400 Tomahawk Creek Parkway,Software—Infrastructure,...,3.0,KS,4.0,1640908800.0,1654041600.0,7.0,5.0,Suite 300,913 327 4120,


In [104]:
# Persist the combined company-info table.
df.to_csv('Data/addition_company_data.csv')