# Webscraper Notebook

##### Order
- Scrape Nasdaq IPO Calendar
- Follow links from the initial scrape and scrape company data
- Search company stocks and scrape the first 180 days of stock data



#### Start By Importing what we will need

In [18]:
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
import pickle
import time
from bs4 import BeautifulSoup
import random
import json
import request

#### Define Functions that may be helpful later

In [2]:
def pickle_file(obj, path):
    """Serialize ``obj`` to ``path`` using :mod:`pickle`.

    Args:
        obj: Any picklable Python object.
        path: Destination file path; it is opened in ``'wb'`` mode, so an
            existing file is overwritten.

    Raises:
        OSError: If ``path`` cannot be opened for writing.
        Exception: Whatever ``pickle.dump`` raises for an unpicklable object.
    """
    # The context manager closes the handle even when pickle.dump raises,
    # which the previous open()/close() pair did not guarantee.
    with open(path, 'wb') as file:
        pickle.dump(obj, file)

def load_pickle(path):
    """Load and return the object stored in the pickle file at ``path``.

    Args:
        path: Path of a file previously written with ``pickle.dump``.

    Returns:
        The unpickled object.

    Raises:
        OSError: If ``path`` cannot be opened for reading.
    """
    # SECURITY: pickle.load can execute arbitrary code; only use this on
    # checkpoint files produced by this notebook, never on untrusted files.
    with open(path, 'rb') as file:
        return pickle.load(file)

def start_driver(driver_path = './WebDriver/chromedriver'):
    """Start a Chrome WebDriver session configured for scraping.

    Args:
        driver_path: Path to the chromedriver executable handed to ``Service``.

    Returns:
        A ``webdriver.Chrome`` instance. The caller owns it and is
        responsible for shutting it down.
    """
    ser = Service(driver_path)

    chrome_options = webdriver.ChromeOptions()
    # Notification content setting 2 -- presumably "block"; confirm against
    # the Chrome preferences documentation.
    prefs = {"profile.default_content_setting_values.notifications" : 2}
    chrome_options.add_experimental_option("prefs",prefs)
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    chrome_options.add_argument("--incognito")
    # 'eager' page-load strategy: driver.get() returns without waiting for
    # every sub-resource to finish loading.
    chrome_options.page_load_strategy = 'eager'

    # The previous version re-assigned ``driver_path`` here, after the
    # Service had already been built from it; that dead store is removed.
    driver = webdriver.Chrome(service = ser, options=chrome_options)
    return driver

#Loading a webpage
# driver.get('https://www.nasdaq.com/market-activity/ipos')

## Scrape Nasdaq IPO Calendar

- Define Scrape Specific Functions
  - Navigate To Starting Date
  - Generate Dictionary from rows in table
  - Navigate to next page
- Define Main Loop
  - Load Driver
  - Iterate Through Months from starting date to end date
- Run

In [3]:
def get_starting_page(driver, starting_date = '01/1995'):
    """Move the IPO calendar to ``starting_date`` through its date picker.

    The function opens the date picker, writes ``starting_date`` into the
    picker's input element via JavaScript, then clicks the picker's second
    button to apply it. Each element is awaited for up to 10 seconds.

    Args:
        driver: WebDriver already showing the IPO calendar page.
        starting_date: Value written into the date input (the default is
            ``'01/1995'``; presumably ``MM/YYYY`` -- confirm against the page).

    Raises:
        selenium.common.exceptions.TimeoutException: If one of the controls
            does not become clickable in time.
    """
    wait = WebDriverWait(driver, 10)
    # Absolute XPath of the date-picker container; all three controls live
    # underneath it. NOTE(review): absolute paths are brittle and will break
    # if the page layout changes.
    picker_root = '/html/body/div[3]/div/main/div[2]/div[2]/div[2]/div/div[2]/div/div[2]/div[2]'

    # Open the date picker.
    wait.until(EC.element_to_be_clickable((By.XPATH, picker_root + '/button'))).click()

    # Set the picker's value. The date is passed as a script argument rather
    # than interpolated into the JavaScript source, so a value containing a
    # quote can no longer break (or inject into) the script.
    date_picker = wait.until(EC.element_to_be_clickable((By.XPATH, picker_root + '/div/input')))
    driver.execute_script("arguments[0].value = arguments[1];", date_picker, starting_date)

    # Apply the date, which navigates the calendar.
    wait.until(EC.element_to_be_clickable((By.XPATH, picker_root + '/div/button[2]'))).click()

def generate_data(driver):
    """Parse the calendar table currently shown by ``driver`` into dicts.

    Every ``<tr class="market-calendar-table__row">`` becomes one dict. The
    row's ``<th role="cell">`` and each ``<td role="cell">`` contribute an
    entry keyed by the cell's ``data-column`` attribute, and ``'link'`` holds
    the absolute URL of the first anchor in the row.

    Args:
        driver: WebDriver showing a page that contains the calendar table.

    Returns:
        list[dict]: One dict per row that has a non-empty ticker. ``'link'``
        is ``None`` when the row contains no anchor with an ``href``.
    """
    # find_element accepts only (by, value); the class filter that used to be
    # passed as a third positional argument (a TypeError) is now part of the
    # XPath itself.
    table = driver.find_element(
        By.XPATH, './/table[contains(@class, "market-calendar-table__table")]')
    soup = BeautifulSoup(table.get_attribute('innerHTML'), 'html.parser')

    url_base = 'https://www.nasdaq.com'
    table_data = []

    for row in soup.find_all('tr',{'class':'market-calendar-table__row'}):
        th = row.find('th', {'role':'cell'})
        if th is None:
            # Row without a ticker cell: nothing to record.
            continue

        ticker_node = th.findChild()
        ticker = ticker_node.text if ticker_node is not None else ''
        if not ticker:
            # Rows with an empty ticker are skipped.
            continue

        row_data = {th['data-column']: ticker}

        anchor = row.find('a')
        if anchor is not None and anchor.get('href'):
            row_data['link'] = url_base + anchor['href']
        else:
            row_data['link'] = None

        for cell in row.find_all('td',{'role':'cell'}):
            child = cell.findChild()
            # Fall back to the cell's own text when it has no child element.
            row_data[cell['data-column']] = child.text if child is not None else cell.text

        table_data.append(row_data)
    return table_data

# generate_data(driver)

def get_next_calendar_page(driver):
    """Advance the calendar by clicking the time-belt controls.

    Clicks the ``time-belt__next`` button, then the first button whose class
    is exactly ``time-belt__item``. Each button is awaited for up to 10
    seconds until it is clickable.

    Args:
        driver: WebDriver showing the IPO calendar page.
    """
    click_order = (
        ".//button[@class='time-belt__next']",
        ".//button[@class='time-belt__item']",
    )
    for xpath in click_order:
        button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.XPATH, xpath)))
        button.click()

# get_next_calendar_page(driver)


In [6]:
def main_loop(pickup_file = False, starting_date= '01/1995', delay = 10, end_year = 2022):
    """Scrape the Nasdaq IPO calendar page by page, checkpointing as it goes.

    NOTE: a later cell in this notebook defines another function that is also
    named ``main_loop``; whichever cell ran last wins.

    Args:
        pickup_file: Path of a checkpoint pickle whose rows seed the result,
            or a falsy value to start with an empty list.
        starting_date: Date handed to ``get_starting_page``.
        delay: Currently unused; kept so existing calls keep working.
        end_year: The loop continues while the last year read from the page
            is ``<= end_year``. Because the year is read inside the loop body,
            the first page whose year exceeds ``end_year`` is still scraped
            and checkpointed before the loop ends.

    Returns:
        list[dict]: All rows collected, including those loaded from
        ``pickup_file``.
    """
    data = load_pickle(pickup_file) if pickup_file else []

    driver = start_driver('./WebDriver/chromedriver')
    print('Started Driver')

    # try/finally guarantees the browser is shut down even when scraping
    # fails or is interrupted; previously the session was never released.
    try:
        time.sleep(3)

        driver.get('https://www.nasdaq.com/market-activity/ipos')
        print('Loaded Initial Page')

        get_starting_page(driver, starting_date=starting_date)
        print('Loaded Starting Page')

        date_year = 0
        count = 0
        while date_year <= end_year:
            count+=1
            date_year = int(driver.find_element(By.XPATH, ".//button[@class='time-belt__item']").get_attribute('data-year'))

            data += generate_data(driver)
            # One checkpoint file per year; it is rewritten after every page.
            pickle_file(data, f'Data/nasdaq_checkpoint_{date_year}')
            print(f'Checkpoint-{date_year}\t(Navigations: {count};\t Rows of Data: {len(data)})',end='\r', flush=True)
            # Random pause between page navigations.
            time.sleep(random.randint(10,30))

            get_next_calendar_page(driver)
    finally:
        driver.quit()

    print('\nAll Done')
    return data

# Resume the calendar scrape from 02/1998, seeding the results with the rows
# stored in the 1998 checkpoint pickle.
data = main_loop(starting_date='02/1998', pickup_file='Data/nasdaq_checkpoint_1998')



  driver = webdriver.Chrome(driver_path, options=chrome_options)


Started Driver
Loaded Initial Page
Loaded Starting Page
Checkpoint-2023	(Total: 302;	 Len: 7630)
All Done


## Follow Links from Initial Scrape and scrape company data

- Define Scrape Specific Functions
  - Load Data From Initial Scrape
  - Get Page from link in data
  - Generate Dictionary for page data
  - Navigate to next page (Row in data)
- Define Main Loop
- Run

In [5]:
def generate_company_overview_data(driver, data=None):
    """Read the first table on the current page into ``data``.

    For each ``<tr>`` of the first ``<table>`` in the page source, the text of
    its ``<th>`` becomes the key and the text of its ``<td>`` the value.

    Args:
        driver: WebDriver whose ``page_source`` is parsed.
        data: Dict to update in place. A new dict is created when omitted;
            the previous ``data={}`` default was a single dict shared (and
            accumulated into) across every call that relied on the default.

    Returns:
        dict: ``data`` with the table entries added.

    Raises:
        AttributeError: If the page has no table, or a row lacks a ``<th>`` or
            ``<td>``. Callers in this notebook rely on an exception here to
            trigger a retry.
    """
    if data is None:
        data = {}
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    table_soup = soup.find('table')
    for row in table_soup.find_all('tr'):
        data[row.find('th').text] = row.find('td').text
    return data

def generate_company_financials_data(driver, data=None):
    """Read the financials table and the first filing row into ``data``.

    The first ``<table>`` in the page source is read as ``<th>`` text ->
    ``<td>`` text. Then the first row of the second ``<tbody>`` on the page
    supplies ``'form_type'`` (cell 1), ``'filing_date_received'`` (cell 2)
    and ``'filing_link'`` (the ``href`` of the anchor in cell 3).

    Args:
        driver: WebDriver whose ``page_source`` is parsed.
        data: Dict to update in place. A new dict is created when omitted;
            the previous ``data={}`` default was a single dict shared across
            every call that relied on the default.

    Returns:
        dict: ``data`` with the scraped entries added.

    Raises:
        AttributeError, IndexError, TypeError: If the expected tables, rows,
            cells or anchor are missing from the page.
    """
    if data is None:
        data = {}
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    table_soup = soup.find('table')
    for row in table_soup.find_all('tr'):
        data[row.find('th').text] = row.find('td').text

    # Only the first row of the second <tbody> is recorded.
    filing_table = soup.find_all('tbody')[1]
    filing_cells = filing_table.find('tr').find_all('td')
    data['form_type'] = filing_cells[1].text
    data['filing_date_received'] = filing_cells[2].text
    data['filing_link'] = filing_cells[3].find('a')['href']
    return data

def generate_company_experts_data(driver, data=None):
    """Read the first table on the current page into ``data``.

    Unlike the overview and financials scrapers, the ``<td>`` text is the key
    and the ``<th>`` text is the value.
    NOTE(review): presumably intentional for the experts page layout --
    confirm against the live page.

    Args:
        driver: WebDriver whose ``page_source`` is parsed.
        data: Dict to update in place. A new dict is created when omitted;
            the previous ``data={}`` default was a single dict shared across
            every call that relied on the default.

    Returns:
        dict: ``data`` with the table entries added.

    Raises:
        AttributeError: If the page has no table, or a row lacks a ``<th>`` or
            ``<td>``.
    """
    if data is None:
        data = {}
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    table_soup = soup.find('table')
    for row in table_soup.find_all('tr'):
        data[row.find('td').text] = row.find('th').text
    return data

def driver_load_url(driver, url, delay = 5):
    """Open ``url``, pause for a random 1-5 seconds, then stop page loading.

    Args:
        driver: Object exposing ``get`` and ``execute_script``.
        url: Address to open.
        delay: Not used by the function body; the pause length is random.
    """
    driver.get(url)

    pause_seconds = random.randint(1,5)
    time.sleep(pause_seconds)

    # Abort whatever is still loading after the pause.
    driver.execute_script("window.stop();")


def navigate_to_financials(driver, url):
    """Load the financial-filings page that corresponds to ``url``.

    The target address is ``url`` with every occurrence of ``'overview'``
    replaced by ``'financial-filings'``.

    Returns:
        str: The address that was loaded.
    """
    filings_url = url.replace('overview','financial-filings')
    driver_load_url(driver, url=filings_url)
    return filings_url

def navigate_to_experts(driver, url):
    """Load the experts page that corresponds to ``url``.

    The target address is ``url`` with every occurrence of ``'overview'``
    replaced by ``'experts'``.

    Returns:
        str: The address that was loaded.
    """
    experts_url = url.replace('overview','experts')
    driver_load_url(driver, url=experts_url)
    return experts_url

def is_loading_error(driver):
    """Report whether the page shows a visible ``error-state`` element.

    Returns:
        bool: ``True`` as soon as one ``<div class="error-state">`` reports
        ``is_displayed()``; ``False`` when none exist or none are displayed.
    """
    candidates = driver.find_elements(By.XPATH, './/div[@class="error-state"]')
    return any(candidate.is_displayed() for candidate in candidates)

def retry(func, driver, row_data, max_tries = 10):
    """Call ``func(driver, row_data)`` until it succeeds or tries run out.

    The previous version closed ``driver`` after the first failure and then
    retried with ``driver=False``, so no retry could ever succeed. The same
    driver is now reused for every attempt and is left open for the caller.

    Args:
        func: Callable taking ``(driver, row_data)`` and returning the
            updated row data.
        driver: Passed through to ``func`` unchanged on every attempt.
        row_data: Passed through to ``func``; also the fallback return value.
        max_tries: Maximum number of attempts (default 10, matching the
            ``/10`` shown in the progress message).

    Returns:
        The value returned by the first successful ``func`` call, or
        ``row_data`` as given when every attempt fails.
    """
    for attempt in range(1, max_tries + 1):
        try:
            return func(driver, row_data)
        except Exception:
            # ``Exception`` rather than a bare except, so KeyboardInterrupt
            # can still stop a long scrape.
            print(f'Error\t(Try: {attempt}/{max_tries})',end='\r', flush=True)
            if attempt < max_tries:
                time.sleep(random.randint(1,3))
    return row_data

def retry_load(func, driver, url, max_tries = 10):
    """Call ``func(fresh_driver, url)`` with a new browser for each attempt.

    Any driver passed in is shut down first. Every attempt starts a new
    browser via ``start_driver``; a browser whose attempt failed is now shut
    down instead of being abandoned.

    NOTE(review): the browser used by the successful attempt is not returned,
    so the caller has no handle to it -- confirm how callers are expected to
    keep using the loaded page.

    Args:
        func: Callable taking ``(driver, url)``, e.g. one of the
            ``navigate_to_*`` helpers.
        driver: Existing driver to discard, or a falsy value.
        url: Passed through to ``func``.
        max_tries: Maximum number of attempts (default 10, matching the
            ``/10`` shown in the progress message).

    Returns:
        Whatever ``func`` returns on the first successful attempt, or
        ``False`` when every attempt fails.
    """
    for attempt in range(1, max_tries + 1):
        fresh_driver = None
        try:
            if driver:
                # quit() ends the whole session, not just the current window.
                stale_driver, driver = driver, None
                stale_driver.quit()
            fresh_driver = start_driver('./WebDriver/chromedriver')
            return func(fresh_driver, url)
        except Exception:
            # ``Exception`` rather than a bare except, so KeyboardInterrupt
            # can still stop a long scrape.
            print(f'Error Loading\t(Try: {attempt}/{max_tries})',end='\r', flush=True)
            if fresh_driver is not None:
                try:
                    fresh_driver.quit()
                except Exception:
                    pass  # browser already gone; nothing left to release
            time.sleep(random.randint(1,3))
    return False


In [8]:
def main_loop(start_index = 19, max_tries = 10):
    """Visit every IPO overview link and collect the overview table per company.

    NOTE: this redefines the ``main_loop`` from the calendar-scrape section
    of the notebook; whichever cell ran last wins.

    Fixes relative to the previous version:
      * ``driver`` is initialised before first use (it used to raise
        ``UnboundLocalError`` on the first entry, which was swallowed and
        counted as a failed try).
      * ``~is_loading_error(driver)`` was always truthy (``~True == -2``,
        ``~False == -1``); it is now ``not is_loading_error(driver)``, so
        error pages are no longer parsed.
      * ``except Exception`` replaces the bare ``except`` so Ctrl-C works
        during retries.
      * Browsers are shut down with ``quit()``, including on exit.

    Only the overview page is scraped; ``navigate_to_financials`` and
    ``navigate_to_experts`` are defined in this notebook but not used here.

    Args:
        start_index: Number of leading calendar entries to skip (the
            previous hard-coded ``[19:]`` slice).
        max_tries: Maximum load attempts per entry (default 10, matching the
            ``/10`` shown in the progress message).

    Returns:
        list[dict]: One dict per entry that has a link. Each dict always
        contains ``'proposedTickerSymbol'``, plus the overview fields when
        the page loaded.
    """
    ipo_calendar_data = load_pickle('Data/nasdaq_ipos')[start_index:]

    data = []
    skipped_count = 0
    driver = None

    try:
        for count, entry in enumerate(ipo_calendar_data, start=1):
            ticker = entry['proposedTickerSymbol']
            url = entry['link']

            if url:
                row_data = {'proposedTickerSymbol': ticker}

                for attempt in range(1, max_tries + 1):
                    try:
                        # A fresh browser is used for every page load.
                        if driver is not None:
                            driver.quit()
                            driver = None
                        driver = start_driver('./WebDriver/chromedriver')
                        driver_load_url(driver, url)
                        if not is_loading_error(driver):
                            row_data = generate_company_overview_data(driver, row_data)
                        break
                    except Exception:
                        print(f'Error Loading\t(Try: {attempt}/{max_tries})',end='\r', flush=True)
                        if driver is not None:
                            try:
                                driver.quit()
                            except Exception:
                                pass  # browser already gone; nothing left to release
                            driver = None
                        time.sleep(random.randint(1,3))

                # The row is kept even when every attempt failed, so the
                # ticker is still present in the output.
                data.append(row_data)

            else:
                skipped_count +=1
                print(f'Skipped-{ticker}',end='\r', flush=True)

            # Rewrite the checkpoint after every entry.
            pickle_file(data, 'Data/nasdaq_company_overview_checkpoint')
            print(f'Checkpoint-{ticker}\t(Progress: {count}/{len(ipo_calendar_data)};\tSkipped: {skipped_count};\t Rows of Data: {len(data)})',end='\r', flush=True)
            time.sleep(random.randint(1,8))
    finally:
        # Release the last browser on success, on errors and on Ctrl-C.
        if driver is not None:
            driver.quit()

    print('\nAll Done')
    pickle_file(data, 'Data/nasdaq_company_overview_checkpoint_complete')
    return data

# Run the company-overview scrape; checkpoints are written after every entry.
data = main_loop()



Checkpoint-SHP	(Progress: 961/7611;	Skipped: 0;	 Rows of Data: 961)))

KeyboardInterrupt: 

In [16]:
# Open the Nasdaq IPO overview API endpoint for a single deal in a browser
# session, to inspect the JSON it returns.
driver = start_driver()
driver.get('https://api.nasdaq.com/api/ipo/overview/?dealId=1205111-102356')

In [21]:
# Alias JSON's `null` to None so the JSON payload pasted below evaluates as a
# Python dict literal.
null = None

data = {"data":{"poOverview":{"Symbol":{"label":"Proposed Symbol","value":"SVRE"},"CompanyName":{"label":"Company Name","value":"SaverOne 2014 Ltd."},"Exchange":{"label":"Exchange","value":"NASDAQ Capital"},"ProposedSharePrice":{"label":"Share Price","value":"$4.13"},"NumberOfEmployees":{"label":"Employees","value":"41 (as of 05/12/2022)"},"DealStatus":{"label":"Status","value":"Filed"},"SharesOffered":{"label":"Shares Offered","value":2241379.0},"DollarValueOfSharesOffered":{"label":"Offer amount","value":"$12,150,121.34"},"SharesOverAllotment":{"label":"Shares Over Alloted","value":"--"},"Address":{"label":"Company Address","value":"EM HAMOSHAVOT RD. 94  PETAH TIKVAH  49130"},"PhoneNumber":{"label":"Company Phone","value":"972-3-9094177"},"CompanyWebsite":{"label":"Company Website","value":"<a href='http://www.saver.one' target='_blank'>www.saver.one</a>"},"CEO":{"label":"CEO","value":"Ori Gilboa"},"StateOfIncorp":{"label":"State of Inc","value":"--"},"FiscalYearEnd":{"label":"Fiscal Year End","value":"12/31"},"TotalExpenseOfTheOffering":{"label":"Total Offering Expense","value":"$600,000.00"},"ShareholderSharesOffered":{"label":"Shareholder Shares Offered","value":"--"},"SharesOutstanding":{"label":"Shares Outstanding","value":"3,838,320"},"LockupPeriodNumberofDays":{"label":"Lockup Period (days)","value":"180"},"LockupPeriodExpirationDate":{"label":"Lockup Expiration","value":"--"},"QuietPeriodExpirationDate":{"label":"Quiet Period Expiration","value":"--"},"SECCIK":{"label":"CIK","value":"0001894693"},"DealId":{"label":"DealId","value":"1205111-102356"}},"companyInformation":{"companyName":"SaverOne 2014 Ltd.","symbol":"SVRE","companyDescription":"We are a technology company engaged in the design, development and\ncommercialization of transportation and safety solutions designed to save lives\nby preventing car accidents resulting from the use of mobile phones while\ndriving. 
Our SaverOne system provides an advanced driver safety solution that\ncan identify and monitor mobile phones located in the driver’s vicinity and\nselectively block use of life-threatening applications. We have three\ngenerations of systems, the first two of which target the automobile aftermarket\nand the third which is intended to target vehicle manufacturers. We have\ncompleted the development of our Generation 1.0 system and are in the\npre-commercialization/early use phase while the development of our Generation\n2.0 system, which will replace Generation 1.0, is nearing completion. The\nGeneration 3.0 system is in the early stages of development.\n\nOur SaverOne system can be utilized in vehicles owned or leased by companies\nthat are provided to employees, or private vehicles, commercial trucks, buses\nand other forms of transportation. Our technology is based on our proprietary\nhardware, software and algorithms, and, because it meets the National Highway\nTraffic Safety Administration, or NHTSA’s, guidelines for a complete solution\nfor distracted driving as well as offers certain advantages that our competitors\ndo not, we believe we have significant advantages over our competitors in the\nmarket.\n\nIn the past several years, we believe that public awareness and demand for\ndriver safety technologies has grown substantially. While there are currently\nmany driver assistant products on the market, we believe that the safety of\ndrivers will be substantially improved with our technology. Our mission is to\nenhance driver safety by providing a solution that is highly reliable and able\nto prevent certain driver distractions related to mobile phone usage while\ndriving, which we believe is a major cause for driver distraction related\nautomobile accidents. Mobile phone distracted driving is a leading cause of\ntraffic accidents in the United States. 
According to a survey done by the NHTSA,\n660,000 drivers in the United States attempt to use their mobile phones while\ndriving at any given moment. The National Safety Council, or NSC reports that\nmobile phone use during driving causes approximately 1.6 million traffic\naccidents annually in the United States alone, leading to the death of\napproximately 4,600 people and injuring an additional 391,000 people. Moreover,\nthe Federal Motor Carrier Safety Administration, or FMCSA, reported that 71% of\ncommercially driven large-truck crashes occurred because of driver distraction.\nThe number of accidents caused by distracted driving increased each year from\n2015 – 2019 according to the NHTSA.\n\nDistracted driving due to mobile phone usage is not just a problem in the United\nStates. A number of surveys conducted across Europe and Oceania have revealed\ntroubling statistics about its prevalence across nations. In the Czech Republic,\n36% of drivers admitted to using their phone almost every time they get behind\nthe wheel. In both Spain and Ireland, 25% of drivers admitted to using their\nphone while driving. In Germany, at any given moment an average of 7% of all\ndrivers are distracted while driving. This problem of distracted driving\nextends to Australia as well, where one-quarter of drivers admitted to using\ntheir phone while driving.\n\nCurrently there are 277 million cars and trucks on the road in the US and 339\nmillion cars and trucks on the road in Europe and 32 million new cars and trucks\nare added each year.\n\nThe ramifications of mobile phone distracted driving exceed the bounds of just\nphysical damage, as they can be exceedingly costly for drivers as well. For\nexample, expressed on a per death basis, the cost of all motor-vehicle crashes\n(fatal, nonfatal injury, and property damage) was $11,880,000 according to the\nNSC. 
In addition, the total societal and economic costs of distracted driving\ncrashes in the United States was estimated at $871 billion according to the\nNHTSA. Specifically with regard to commercial vehicle crashes, the average total\ncosts of commercial motor vehicle crashes for the years of 2009-2011 was over\n$83 billion per year according to the FMCSA. Accordingly, we believe that there\nis a tremendous financial incentive for a solution to this grave problem.\n\nIn response to the need for a solution to distracted driving resulting from the\nuse of mobile phones, the NHTSA has published a comprehensive study suggesting\nthat a complete solution must contain the following features: (i) the ability to\ndistinguish between the driver’s area of the vehicle and the rest of the\nvehicle, (ii) does not depend on the cooperation of the driver, and (iii)\nselective blocking of cell phone applications. Our SaverOne system has been\ndesigned with these features in mind.\n\nThe NHTSA’s driving guidelines do not constitute U.S. law and compliance does\nnot result in compliance with U.S. driving safety regulations. In order to\nmarket our products to vehicle manufacturers we may be required to meet\ndifferent types of regulations requirements such as International Organization\nfor Standardization (ISO) 26262 Functional Safety Regulations (ASIL), the\nInternational Standard for Automotive Quality Management Systems (IAFT) 16949,\nAutomotive Software Process Improvement and Capability Determination (SPICE) or\nother common quality management standards. In order to meet the quality\nrequirements, we will have to cooperate with vehicle manufacturers, to receive\ntheir customers’ quality requirements that meet the requisite regulation of such\ncustomers and implement tools, processes and methodologies. Such implementation\nwill require significant resources and funds and is expected to consume\nsignificant time and effort. 
We expect that only our Generation 3.0 solution,\nwhich is a solution designed for the original equipment manufacturers, or OEM\nmarket, may require compliance with the foregoing regulations, whereas our\nGeneration 1.0 and 2.0 solutions, both after-market solutions, are not required\nto comply with the foregoing regulations.\n\nThe SaverOne system currently has achieved safety and radiation certifications\nfrom Hermon Laboratories, an internationally approved testing and certification\nlab. SaverOne’s solution is certified for operating in Israel, the United\nStates, Europe and Japan. These certifications assure that SaverOne product\ncomplies with the regulations/legislations in these countries/regions.\n\nGeneration 1.0 is our first-generation solution and is intended for private\nvehicles, trucks and buses as an aftermarket product. Our Generation 1.0 was\nlaunched in late 2019 for private cars, and thereafter made commercially\navailable to trucks and buses. It is currently marketed in Israel as part of our\npre-commercialization/early user campaign. To date, over 1,470 systems have been\nordered (which includes over 300 systems ordered as part of our ongoing\nGeneration 1.0 pilot program and over 1,150 systems purchased in commercial\norders by our Generation 1.0 pilot program customers) and nearly 800 of these\nsystems have been installed.\n\nGeneration 2.0 is our second-generation solution that will replace Generation\n1.0 and is intended as a solution for the automobile aftermarket. It includes\nvarious improvements to our Generation 1.0 solution for maximal performance,\ncompatibility with automobiles and cellular networks, market penetration and\nprofitability. We expect to launch the Generation 2.0 solution for sale in the\nsecond quarter of 2022 and we expect to target the global aftermarket automobile\nmarket starting with the U.S., Europe and Asia Pacific, or APAC. 
In Europe and\nAPAC, we are working on pilot programs with various fleet and system\nintegrators. With respect to the U.S., we are in the process of developing a\nstrategic marketing plan for the rollout of our Generation 2.0 solution in the\nU.S.\n\nOur Generation 3.0 solution is being designed as a solution for the OEM market\nand we plan on it being directly integrated into the vehicle manufacturing\nprocess for seamless integration in the driving experience. We are currently\nworking with one of the leading global OEMs in order to make the installation of\nthe SaverOne System into vehicles system an essential part of the vehicle\nmanufacturing process. The Generation 3.0 solution is in the early stage of\ndevelopment and we expect to launch the Generation 3.0 solution in 2025.\n---\n\nWe were incorporated in Israel on November 16, 2014. Our principal executive\noffice is located at Em Hamoshavot Rd. 94, Petah Tikvah, Israel and our phone\nnumber is +972-3909-4177. We maintain a corporate website at www.saver.one."}},"message":null,"status":{"rCode":200,"bCodeMessage":null,"developerMessage":null}}

In [37]:
# Top-level field names of the overview section.
print(data['data']['poOverview'].keys())

# Bare expression in the middle of the cell: its value is not displayed.
data['data']['companyInformation'].keys()
# Flatten the overview section: each entry is a {"label": ..., "value": ...}
# pair, which becomes one label -> value item in `row`.
row = {}
for d in data['data']['poOverview'].values():
    row[d['label']] = d['value']
row

# data['data']['poOverview'].values()


dict_keys(['Symbol', 'CompanyName', 'Exchange', 'ProposedSharePrice', 'NumberOfEmployees', 'DealStatus', 'SharesOffered', 'DollarValueOfSharesOffered', 'SharesOverAllotment', 'Address', 'PhoneNumber', 'CompanyWebsite', 'CEO', 'StateOfIncorp', 'FiscalYearEnd', 'TotalExpenseOfTheOffering', 'ShareholderSharesOffered', 'SharesOutstanding', 'LockupPeriodNumberofDays', 'LockupPeriodExpirationDate', 'QuietPeriodExpirationDate', 'SECCIK', 'DealId'])


{'Proposed Symbol': 'SVRE',
 'Company Name': 'SaverOne 2014 Ltd.',
 'Exchange': 'NASDAQ Capital',
 'Share Price': '$4.13',
 'Employees': '41 (as of 05/12/2022)',
 'Status': 'Filed',
 'Shares Offered': 2241379.0,
 'Offer amount': '$12,150,121.34',
 'Shares Over Alloted': '--',
 'Company Address': 'EM HAMOSHAVOT RD. 94  PETAH TIKVAH  49130',
 'Company Phone': '972-3-9094177',
 'Company Website': "<a href='http://www.saver.one' target='_blank'>www.saver.one</a>",
 'CEO': 'Ori Gilboa',
 'State of Inc': '--',
 'Fiscal Year End': '12/31',
 'Total Offering Expense': '$600,000.00',
 'Shareholder Shares Offered': '--',
 'Shares Outstanding': '3,838,320',
 'Lockup Period (days)': '180',
 'Lockup Expiration': '--',
 'Quiet Period Expiration': '--',
 'CIK': '0001894693',
 'DealId': '1205111-102356'}

In [31]:
# Display the current contents of `row`.
row

{}

## Search Company Stocks and scrape the first 180 days of stock data

- Define Scrape Specific Functions
  - Load Data From Initial Scrape
  - Generate Search Url for ticker
  - Check if search has valid results
  - Generate Dictionary for page data
  - Navigate to next
- Define Main Loop
- Run