# Webscraper Notebook

##### Order
- Scrape Nasdaq IPO Calendar
- Follow Links from Ininital Scrape and scrape company data
- Search Company Stocks scrape first 180 days stock data



#### Start By Importing what we will need

In [2]:
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
import pickle
import time
from bs4 import BeautifulSoup
import random
from requests_html import HTMLSession
import json

#### Define Functions that may be helpful later

In [3]:
def pickle_file(obj, path):
    file = open(path, 'wb')
    pickle.dump(obj, file)
    file.close()

def load_pickle(path):
    file = open(path, 'rb')
    obj = pickle.load(file)
    file.close()
    return obj

def start_driver(driver_path = './WebDriver/chromedriver'):
    ser = Service(driver_path)
    chrome_options = webdriver.ChromeOptions()
    prefs = {"profile.default_content_setting_values.notifications" : 2}
    chrome_options.add_experimental_option("prefs",prefs)
    chrome_options.add_argument('--no-sandbox')   
    chrome_options.add_argument('--disable-dev-shm-usage')   
    chrome_options.add_argument("--incognito")
    chrome_options.page_load_strategy = 'eager'
    
    driver_path = './WebDriver/chromedriver'
    driver = webdriver.Chrome(service = ser, options=chrome_options)
    return driver

#Loading a webpage
# driver.get('https://www.nasdaq.com/market-activity/ipos')

## Scrape Nasdaq IPO Calendar

- Define Scrape Specific Functions
  - Navigate To Starting Date
  - Generate Dictionary from rows in table
  - Navigate to next page
- Define Main Loop
  - Load Driver
  - Iterate Through Months from starting date to end date
- Run

In [3]:
def get_starting_page(driver, starting_date = '01/1995'):
    #Open Date 
    date_button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((
        By.XPATH, '/html/body/div[3]/div/main/div[2]/div[2]/div[2]/div/div[2]/div/div[2]/div[2]/button'))
        ).click()

    #Set Date Picker Date
    date_picker = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((
        By.XPATH, '/html/body/div[3]/div/main/div[2]/div[2]/div[2]/div/div[2]/div/div[2]/div[2]/div/input'))
        )
    driver.execute_script(f"arguments[0].value = '{starting_date}';", date_picker)     
        # .setAttribute('value', '01/1999')

    #Navigate to New Page
    apply_date_button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((
        By.XPATH, '/html/body/div[3]/div/main/div[2]/div[2]/div[2]/div/div[2]/div/div[2]/div[2]/div/button[2]'))
        ).click()

def generate_data(driver):
    table = driver.find_element(By.XPATH,
        './/table',{'class':'market-calendar-table__table'})
    html = table.get_attribute('innerHTML')
    soup = BeautifulSoup(html, 'html.parser')
    
    url_base = 'https://www.nasdaq.com'
    table_data = []
    # print(soup)
    # if soup.find('th').text == '':
    #     return []

    rows = soup.find_all('tr',{'class':'market-calendar-table__row'})
    for row in rows:
        row_data = {}

        th = row.find('th', {'role':'cell'})
        ticker = th.findChild().text
        if not ticker:
            continue

        row_data[th['data-column']] = ticker
        row_data['link'] = url_base + row.find('a')['href']

        cells = row.find_all('td',{'role':'cell'})        
        for cell in cells:
            row_data[cell['data-column']] = cell.findChild().text
        table_data.append(row_data)
    return table_data

# generate_data(driver)

def get_next_calendar_page(driver):
    next_scroll = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((
        By.XPATH, ".//button[@class='time-belt__next']"))
        ).click()

    next_button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((
        By.XPATH, ".//button[@class='time-belt__item']"))
        ).click()

# get_next_calendar_page(driver)


In [None]:
def main_loop(pickup_file = False, starting_date= '01/1995', delay = 10):
    if pickup_file:
        data = load_pickle(pickup_file)
    else:
        data = []

    driver = start_driver('./WebDriver/chromedriver')
    print('Started Driver')

    time.sleep(3)

    driver.get('https://www.nasdaq.com/market-activity/ipos')
    print('Loaded Initial Page')

    # time.sleep(delay)

    get_starting_page(driver, starting_date=starting_date)
    print('Loaded Starting Page')

    date_year = 0
    count = 0
    while date_year <= 2022:
        count+=1
        date_year = int(driver.find_element(By.XPATH, ".//button[@class='time-belt__item']").get_attribute('data-year'))

        data += generate_data(driver)
        pickle_file(data, f'Data/nasdaq_checkpoint_{date_year}')
        print(f'Checkpoint-{date_year}\t(Navigations: {count};\t Rows of Data: {len(data)})',end='\r', flush=True)
        time.sleep(random.randint(10,30))

        get_next_calendar_page(driver)
    
    print('\nAll Done')
    return data

data = main_loop(starting_date='02/1998', pickup_file='Data/nasdaq_checkpoint_1998')



## Follow Links from Ininital Scrape and scrape company data

- Define Scrape Specific Functions
  - Load Data From Initial Scrape
  - Get Page from link in data
  - Generate Dictionary for page data
  - Navigate to next page (Row in data)
- Define Main Loop
- Run

In [121]:
def make_request(url):
    session = HTMLSession()
    r = session.get(url)
    if r.status_code == 200:
        response = json.loads(r.text)
        return response
    return False
    
def process_overview_response(response, data = {}):
    if response['data'] !=None:
        for d in response['data']['poOverview'].values():
            data[d['label']] = d['value']
        data['company_description'] = response['data']['companyInformation']['companyDescription']
    return data

def process_financials_response(response, data = {}):
    if response['data'] !=None:
        for d in list(response['data']['financials'][0].values()):
            data[d['label']] = d['value']
            if type(response['data']['filings']) == list and len(response['data']['filings']) > 1:
                data['first_filing_type'] = response['data']['filings'][0]['FormType']
                data['first_filing_date'] = response['data']['filings'][0]['DateReceived']
                data['first_filing_link'] = response['data']['filings'][0]['FilingLink']['value']
    return data

def process_experts_response(response, data = {}):
    if response['data'] !=None:
        for d in response['data']['tableModel']['rows']:
            data[d['role']] = d['expertName']
    return data

In [122]:
def main_loop(data = False):
    ipo_calendar_data = load_pickle('Data/nasdaq_ipos')
    if not data:
        data = []

    ipo_calendar_data = ipo_calendar_data[len(data):]
    skipped_count = 0
    count = 0

    for entry in ipo_calendar_data:
        
        count +=1
        ticker = entry['proposedTickerSymbol']   
        url = entry['link']
        
        if url:
            row_data = {}
            row_data ['proposedTickerSymbol']= ticker
            
            response = make_request('https://api.nasdaq.com/api/ipo/overview/?' + url.split('?')[1])
            if response:
                row_data =  process_overview_response(response, row_data)

            response = make_request('https://api.nasdaq.com/api/ipo/financials-filings/?' + url.split('?')[1])
            if response:
                row_data =  process_financials_response(response, row_data)

            response = make_request('https://api.nasdaq.com/api/ipo/experts/?' + url.split('?')[1])
            if response:
                row_data =  process_experts_response(response, row_data)

            data.append(row_data)
        
        else:
            skipped_count +=1
            print(f'Skipped-{ticker}',end='\r', flush=True)

        pickle_file(data, f'Data/nasdaq_company_checkpoint')
        print(f'Checkpoint-{ticker}\t(Progress: {count}/{len(ipo_calendar_data)};\tSkipped: {skipped_count};\t Rows of Data: {len(data)})',end='\r', flush=True)
        time.sleep(random.randint(1, 25)/100)

    print('\nAll Done')
    pickle_file(data, f'Data/nasdaq_company_checkpoint_complete')
    return data


data = load_pickle('Data/nasdaq_company_checkpoint_')
data = main_loop(data)


Checkpoint-SVRE	(Progress: 3367/3367;	Skipped: 0;	 Rows of Data: 7630))
All Done


## Search Company Stocks scrape first 180 days stock data

- Define Scrape Specific Functions
  - Load Data From Initial Scrape
  - Search ticker in Yahoo_fin
  - Check if results valid
  - Navigate to next
- Define Main Loop
- Run

In [4]:
# Import Needed Libraries
import datetime as dt
import pandas as pd
import yahoo_fin.stock_info as si

In [7]:
#Load our initial data with ipos and dates
ipo_data = load_pickle('Data/nasdaq_ipos')
ipo_data[0]

{'proposedTickerSymbol': 'SVRE',
 'link': 'https://www.nasdaq.com/market-activity/ipos/overview?dealId=1205111-102356',
 'companyName': 'SaverOne 2014 Ltd.',
 'proposedExchange': 'NASDAQ Capital',
 'proposedSharePrice': '5.80',
 'sharesOffered': '2,241,379',
 'expectedPriceDate': '06/03/2022',
 'dollarValueOfSharesOffered': '$14,949,998.80'}

In [5]:
def get_start_end_dates(date_string):
    start_date = dt.datetime.strptime(date_string, "%m/%d/%Y") -dt.timedelta(days=90)
    end_date = start_date + dt.timedelta(days=370)
    return str(start_date), str(end_date)

# get_start_end_dates(ipo_data[1000]['expectedPriceDate'])

In [22]:
ipo_data[1000]

{'proposedTickerSymbol': 'ZD',
 'link': 'https://www.nasdaq.com/market-activity/ipos/overview?dealId=406-4754',
 'companyName': 'ZIFF DAVIS INC',
 'proposedExchange': 'New York Stock Exchange',
 'proposedSharePrice': '15.50',
 'sharesOffered': '25,800,000',
 'pricedDate': '04/28/1998',
 'dollarValueOfSharesOffered': '$399,900,000',
 'dealStatus': 'Priced'}

In [6]:
# keys = []
# for dct in ipo_data:
#     if type(dct) == dict:
#         keys += list(dct.keys())
# set(keys)

def check_for_date_entry(entry):
    for key in ['pricedDate','expectedPriceDate','filedDate']:
        if key in entry.keys():
            return entry[key]
    return False
# check_for_date_entry(ipo_data[1000])

In [69]:
temp = -3
print(ipo_data[temp])
date = check_for_date_entry(ipo_data[temp])
if date:
    start_date, end_date = get_start_end_dates(date)
si.get_data(ipo_data[temp]['proposedTickerSymbol'] , start_date = str(start_date) , end_date = str(end_date), index_as_date = False)

{'proposedTickerSymbol': 'SVRE', 'link': 'https://www.nasdaq.com/market-activity/ipos/overview?dealId=1205111-102356', 'companyName': 'SaverOne 2014 Ltd.', 'proposedExchange': 'NASDAQ Capital', 'proposedSharePrice': '5.80', 'sharesOffered': '2,241,379', 'expectedPriceDate': '06/03/2022', 'dollarValueOfSharesOffered': '$14,949,998.80'}


Unnamed: 0,date,open,high,low,close,adjclose,volume,ticker
0,2022-06-03,3.5,3.5,2.22,2.56,2.56,1629900,SVRE


In [9]:
def main_loop(checkpoint_interval = 200):
    ipo_data = load_pickle('Data/nasdaq_ipos')
    skipped = 0
    list_of_dataframes = []
    completed_tickers = []

    for i in range(len(ipo_data)-1):
        
        if type(ipo_data[i]) != dict:
            skipped += 1
            print(f'Skipping Index-{i} (Not a valid dictionary)',end='\r', flush=True)

        elif ipo_data[i]["proposedTickerSymbol"] in completed_tickers:
            skipped += 1
            print(f'Repeated Ticker Skipping-{ipo_data[i]["proposedTickerSymbol"]}\t(Skipped: {skipped})',end='\r', flush=True)

        else:
            print(f'Processing-{ipo_data[i]["proposedTickerSymbol"]}\t(Progress: {i}/{len(ipo_data)};\tSkipped: {skipped})',end='\r', flush=True)

            date_of_ipo = check_for_date_entry(ipo_data[i])
            if not date_of_ipo:
                skipped +=1
                print(f'Skipping-{ipo_data[i]["proposedTickerSymbol"]}\t(Skipped: {skipped})',end='\r', flush=True)
                continue
            start_date, end_date = get_start_end_dates(date_of_ipo)
            try:
                list_of_dataframes.append(
                    si.get_data(ipo_data[i]['proposedTickerSymbol'] ,
                    start_date = start_date, end_date = end_date, index_as_date = False))
                completed_tickers.append(ipo_data[i]["proposedTickerSymbol"])
            except Exception as e:
                if e == KeyboardInterrupt():
                    raise e
                skipped+= 1
                print(f'Error Retrieving Data (Skipping-{ipo_data[i]["proposedTickerSymbol"]};\tSkipped: {skipped})',end='\r', flush=True)

        if i%checkpoint_interval ==0 and len(list_of_dataframes)>1:
            print(f'Checkpointing\t(Unique Tickers: {len(completed_tickers)};\tProgress: {i}/{len(ipo_data)};\tSkipped: {skipped})',end='\r', flush=True)
            data = pd.concat(list_of_dataframes)
            list_of_dataframes = [data]
            pickle_file(data, 'Data/stock_info')

    print(f'Saving Final\t(Progress: {i}/{len(ipo_data)};\tSkipped: {skipped})',end='\r', flush=True)
    data = pd.concat(list_of_dataframes)
    pickle_file(data, 'Data/stock_info')
    print(f'\nAll Done',end='\r', flush=True)
    return data

data = main_loop(checkpoint_interval=250)
# print(ipo_data[-1])


Saving Final	(Progress: 7628/7630;	Skipped: 4759)53))630;	Skipped: 4736)
All Done

In [12]:
len(data.ticker.unique())

2870

In [13]:
data

Unnamed: 0,date,open,high,low,close,adjclose,volume,ticker
0,2022-06-03,3.500,3.500,2.220,2.560,2.560,1629900.0,SVRE
0,1997-01-31,12.250,12.375,12.000,12.125,12.125,1241000.0,CERS
1,1997-02-03,12.375,12.375,12.125,12.250,12.250,179700.0,CERS
2,1997-02-04,12.250,12.375,12.000,12.000,12.000,113200.0,CERS
3,1997-02-05,12.000,12.375,12.000,12.000,12.000,236100.0,CERS
...,...,...,...,...,...,...,...,...
20,2022-05-27,14.604,14.900,14.060,14.530,14.530,44000.0,BLTE
21,2022-05-31,14.650,15.313,14.600,15.120,15.120,11900.0,BLTE
22,2022-06-01,15.150,15.200,14.950,14.950,14.950,14400.0,BLTE
23,2022-06-02,15.000,15.050,14.550,14.850,14.850,9700.0,BLTE


In [25]:
data['days_post_ipo'] = data.date
ipo_data = load_pickle('Data/nasdaq_ipos')
for i in range(len(ipo_data)-1):
        
    if type(ipo_data[i]) == dict:
        if ipo_data[i]["proposedTickerSymbol"] in list(data.ticker):
            start_date = check_for_date_entry(ipo_data[i])
            data[data.ticker == ipo_data[i]["proposedTickerSymbol"]]['days_post_ipo'] = ((
             data[data.ticker == ipo_data[i]["proposedTickerSymbol"]]['date'] -
               pd.to_datetime(start_date)).days
            )
data.head()

data['days_post_ipo'] = data.apply(lambda row: (row.date -
                    pd.to_datetime(date_keys[row.ticker])).days)

data.tail()
            
    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[data.ticker == ipo_data[i]["proposedTickerSymbol"]]['days_post_ipo'] = i


KeyboardInterrupt: 

In [36]:
data['days_post_ipo'] = data.apply(lambda row: (row.date -
                    pd.to_datetime(date_keys[row.ticker.lower()])).days, axis = 1)

data.tail(50)

Unnamed: 0,date,open,high,low,close,adjclose,volume,ticker,days_post_ipo
20,2022-05-27,14.604,14.9,14.06,14.53,14.53,44000.0,BLTE,28
21,2022-05-31,14.65,15.313,14.6,15.12,15.12,11900.0,BLTE,32
22,2022-06-01,15.15,15.2,14.95,14.95,14.95,14400.0,BLTE,33
23,2022-06-02,15.0,15.05,14.55,14.85,14.85,9700.0,BLTE,34
24,2022-06-03,14.73,15.0,14.6,14.95,14.95,4200.0,BLTE,35


In [34]:
date_keys= {}
for i in range(len(ipo_data)-1):
        
    if type(ipo_data[i]) == dict:
        ticker = ipo_data[i]["proposedTickerSymbol"].lower()
        date_keys[ticker] = check_for_date_entry(ipo_data[i])

In [35]:
date_keys

{'svre': '06/03/2022',
 'vdim': '02/19/1997',
 'dtmc': '05/02/1997',
 'asdg': '05/13/1997',
 'adin': '07/12/1996',
 'digl': '02/06/1997',
 'aici': '08/05/1996',
 'tgh': '10/10/2007',
 'spsuu': '02/27/1997',
 'skp': '01/30/1997',
 'loanu': '04/28/1997',
 'eegl': '03/12/1999',
 'mrkf': '08/16/1996',
 'rsln': '01/13/1997',
 'pmco': '08/21/1996',
 'btic': '02/05/1997',
 'nacqu': '08/27/1997',
 'ampd': '01/22/1997',
 'cers': '01/31/1997',
 'ndhi': '11/10/1997',
 'globu': '02/12/1997',
 'ttrf': '02/14/1997',
 'cxot': '04/03/1997',
 'jlny': '09/13/1996',
 'bman': '02/12/1997',
 'blci': '05/02/1997',
 'hcom': '05/07/1997',
 'mdmd': '08/11/2010',
 'egasu': '01/24/1997',
 'oggi': '02/18/1997',
 'wni': '05/01/1997',
 'afbc': '01/02/1997',
 'ibco': '02/11/1997',
 'mmgr': '01/30/1997',
 'omqp': '10/01/1996',
 'judg': '02/14/1997',
 'erth': '10/02/1996',
 'rdo': '01/24/1997',
 'rwbd': '08/29/1997',
 'gntl': '02/13/1997',
 'cden': '02/11/1997',
 'mmac': '02/04/1997',
 'four': '06/05/2020',
 'nafi': '

In [39]:
data.to_csv('Data/stock_info.csv')