# Webscraper Notebook

##### Order
- Scrape Nasdaq IPO Calendar
- Follow links from the initial scrape and scrape company data
- Search company stocks and scrape the first 180 days of stock data



#### Start By Importing what we will need

In [101]:
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
import pickle
import time
from bs4 import BeautifulSoup
import random
from requests_html import HTMLSession
import json

#### Define Functions that may be helpful later

In [102]:
def pickle_file(obj, path):
    """Serialize ``obj`` to the file at ``path`` using pickle.

    Args:
        obj: Any picklable Python object.
        path: Destination file path; opened in binary write mode, so an
            existing file is overwritten.
    """
    # The context manager closes the handle even if pickle.dump raises.
    with open(path, 'wb') as file:
        pickle.dump(obj, file)

def load_pickle(path):
    """Load and return the object pickled in the file at ``path``.

    Args:
        path: Path of a file previously written with pickle.

    Returns:
        The unpickled Python object.
    """
    # SECURITY: pickle.load can execute arbitrary code; only use this on
    # files produced by this notebook, never on untrusted input.
    # The context manager closes the handle even if unpickling fails.
    with open(path, 'rb') as file:
        return pickle.load(file)

def start_driver(driver_path = './WebDriver/chromedriver'):
    """Create and return a Chrome WebDriver for the scraping cells.

    Args:
        driver_path: Path to the chromedriver executable handed to the
            Selenium ``Service``.

    Returns:
        A ``webdriver.Chrome`` instance. The caller is responsible for
        closing it.
    """
    chrome_options = webdriver.ChromeOptions()

    # Presumably suppresses browser notification prompts -- confirm against
    # the Chrome preference documentation.
    prefs = {"profile.default_content_setting_values.notifications" : 2}
    chrome_options.add_experimental_option("prefs", prefs)

    for flag in ('--no-sandbox', '--disable-dev-shm-usage', '--incognito'):
        chrome_options.add_argument(flag)
    chrome_options.page_load_strategy = 'eager'

    # The service is built from the caller-supplied path; the original code
    # reassigned driver_path after this point, which had no effect.
    ser = Service(driver_path)
    return webdriver.Chrome(service = ser, options=chrome_options)

#Loading a webpage
# driver.get('https://www.nasdaq.com/market-activity/ipos')

## Scrape Nasdaq IPO Calendar

- Define Scrape Specific Functions
  - Navigate To Starting Date
  - Generate Dictionary from rows in table
  - Navigate to next page
- Define Main Loop
  - Load Driver
  - Iterate Through Months from starting date to end date
- Run

In [3]:
def get_starting_page(driver, starting_date = '01/1995'):
    """Use the page's date picker to jump the IPO calendar to ``starting_date``.

    Args:
        driver: Selenium WebDriver with the Nasdaq IPO calendar page loaded.
        starting_date: Value written into the date picker input. The default
            and the caller's usage suggest an 'MM/YYYY' format -- confirm.

    Raises:
        selenium TimeoutException: if any of the three controls is not
            clickable within 10 seconds.
    """
    # All three controls live under the same absolute container path.
    # NOTE(review): absolute XPaths break on any layout change.
    picker_root = '/html/body/div[3]/div/main/div[2]/div[2]/div[2]/div/div[2]/div/div[2]/div[2]'
    wait = WebDriverWait(driver, 10)

    # Open the date picker.
    wait.until(EC.element_to_be_clickable((By.XPATH, picker_root + '/button'))).click()

    # Set the picker's value. The date is passed as a script argument rather
    # than spliced into the JavaScript source, so quotes or backslashes in
    # ``starting_date`` cannot break or alter the script.
    date_picker = wait.until(EC.element_to_be_clickable((By.XPATH, picker_root + '/div/input')))
    driver.execute_script("arguments[0].value = arguments[1];", date_picker, starting_date)

    # Apply the date, which navigates to the requested page.
    wait.until(EC.element_to_be_clickable((By.XPATH, picker_root + '/div/button[2]'))).click()

def generate_data(driver):
    """Scrape the calendar table currently shown into a list of row dicts.

    Each dict maps a cell's ``data-column`` attribute to the text of the
    cell's first child tag, plus a ``'link'`` key holding the absolute URL of
    the row's first anchor (``None`` when the row has no anchor).

    Args:
        driver: Selenium WebDriver positioned on an IPO calendar page.

    Returns:
        list[dict]: One dict per table row that has a non-empty ticker.
    """
    # find_element takes only (by, value); the original passed a
    # BeautifulSoup-style attrs dict as a third positional argument, which
    # Selenium does not accept. The class filter now lives in the selector.
    table = driver.find_element(By.CSS_SELECTOR, 'table.market-calendar-table__table')
    soup = BeautifulSoup(table.get_attribute('innerHTML'), 'html.parser')

    url_base = 'https://www.nasdaq.com'
    table_data = []

    for row in soup.find_all('tr', {'class': 'market-calendar-table__row'}):
        th = row.find('th', {'role': 'cell'})
        ticker_tag = th.find(True) if th is not None else None
        ticker = ticker_tag.text if ticker_tag is not None else ''
        if not ticker:
            # Rows with no ticker cell or an empty ticker carry no data.
            continue

        anchor = row.find('a')
        has_href = anchor is not None and anchor.get('href')
        row_data = {
            th['data-column']: ticker,
            'link': url_base + anchor['href'] if has_href else None,
        }

        for cell in row.find_all('td', {'role': 'cell'}):
            child = cell.find(True)
            # Fall back to the cell's own text when it has no child tag.
            row_data[cell['data-column']] = child.text if child is not None else cell.get_text()

        table_data.append(row_data)
    return table_data

# generate_data(driver)

def get_next_calendar_page(driver):
    """Advance the calendar by clicking two time-belt buttons in sequence.

    First clicks the button whose class is exactly ``time-belt__next``, then
    a button whose class is exactly ``time-belt__item``. Each click waits up
    to 10 seconds for its button to become clickable.

    Args:
        driver: Selenium WebDriver positioned on an IPO calendar page.
    """
    button_xpaths = (
        ".//button[@class='time-belt__next']",
        ".//button[@class='time-belt__item']",
    )
    for xpath in button_xpaths:
        clickable = EC.element_to_be_clickable((By.XPATH, xpath))
        WebDriverWait(driver, 10).until(clickable).click()

# get_next_calendar_page(driver)


In [None]:
def main_loop(pickup_file = False, starting_date= '01/1995', delay = 10, end_year = 2022):
    """Scrape the Nasdaq IPO calendar page by page, checkpointing as it goes.

    Note: a later cell in this notebook defines another function with the
    same name, which shadows this one once that cell has run.

    Args:
        pickup_file: Path of a pickled checkpoint to resume from, or a falsy
            value to start with an empty list.
        starting_date: Date handed to ``get_starting_page``.
        delay: Currently unused; kept so existing calls keep working.
        end_year: Last calendar year to scrape (inclusive).

    Returns:
        list[dict]: All rows collected, including any loaded from
        ``pickup_file``.
    """
    data = load_pickle(pickup_file) if pickup_file else []

    driver = start_driver('./WebDriver/chromedriver')
    print('Started Driver')

    try:
        time.sleep(3)

        driver.get('https://www.nasdaq.com/market-activity/ipos')
        print('Loaded Initial Page')

        get_starting_page(driver, starting_date=starting_date)
        print('Loaded Starting Page')

        count = 0
        while True:
            # Read the year of the page being shown *before* scraping it, so
            # no page beyond end_year is collected (the original checked the
            # previous page's year and overshot by one page).
            date_year = int(driver.find_element(By.XPATH, ".//button[@class='time-belt__item']").get_attribute('data-year'))
            if date_year > end_year:
                break
            count += 1

            data += generate_data(driver)
            pickle_file(data, f'Data/nasdaq_checkpoint_{date_year}')
            print(f'Checkpoint-{date_year}\t(Navigations: {count};\t Rows of Data: {len(data)})',end='\r', flush=True)
            # Randomised pause between page navigations.
            time.sleep(random.randint(10,30))

            get_next_calendar_page(driver)
    finally:
        # Always shut the browser down, even when a scrape step raises.
        driver.quit()

    print('\nAll Done')
    return data

# Resume the calendar scrape from the saved 1998 checkpoint, starting at 02/1998.
data = main_loop(
    pickup_file='Data/nasdaq_checkpoint_1998',
    starting_date='02/1998',
)



## Follow Links from Initial Scrape and scrape company data

- Define Scrape Specific Functions
  - Load Data From Initial Scrape
  - Get Page from link in data
  - Generate Dictionary for page data
  - Navigate to next page (Row in data)
- Define Main Loop
- Run

In [None]:
def make_request(url):
    """GET ``url`` and return its body parsed as JSON.

    Args:
        url: Endpoint to request.

    Returns:
        The decoded JSON payload on an HTTP 200 response, or ``False`` when
        the status is not 200 or the body is not valid JSON.
    """
    # The context manager closes the session after the request instead of
    # leaving one open session per call.
    with HTMLSession() as session:
        r = session.get(url)
        if r.status_code != 200:
            return False
        try:
            return json.loads(r.text)
        except json.JSONDecodeError:
            # A 200 with a non-JSON body is treated like any other failed
            # request, so callers' ``if response:`` check handles it.
            return False
    
def process_overview_response(response, data = None):
    """Copy the IPO overview fields from ``response`` into ``data``.

    Every entry of ``response['data']['poOverview']`` is stored under its
    ``'label'`` with its ``'value'``, and the company description is stored
    under ``'company_description'``. Nothing is added when
    ``response['data']`` is ``None``.

    Args:
        response: Decoded JSON from the overview endpoint.
        data: Dict to update in place. A fresh dict is created when omitted
            (the previous ``{}`` default was shared between calls).

    Returns:
        dict: ``data`` with the overview fields added.
    """
    if data is None:
        data = {}

    payload = response['data']
    if payload is not None:
        for field in payload['poOverview'].values():
            data[field['label']] = field['value']
        data['company_description'] = payload['companyInformation']['companyDescription']
    return data

def process_financials_response(response, data = None):
    """Copy financial figures and first-filing details from ``response`` into ``data``.

    Every entry of the first ``financials`` record is stored under its
    ``'label'`` with its ``'value'``. The first ``filings`` record supplies
    ``'first_filing_type'``, ``'first_filing_date'`` and
    ``'first_filing_link'``. Nothing is added when ``response['data']`` is
    ``None``; a missing or empty ``financials`` / ``filings`` list is skipped.

    Args:
        response: Decoded JSON from the financials-filings endpoint.
        data: Dict to update in place. A fresh dict is created when omitted
            (the previous ``{}`` default was shared between calls).

    Returns:
        dict: ``data`` with the financial and filing fields added.
    """
    if data is None:
        data = {}

    payload = response['data']
    if payload is None:
        return data

    financials = payload.get('financials') or []
    if financials:
        for field in financials[0].values():
            data[field['label']] = field['value']

    # Filing details do not depend on the financial entries, so they are
    # read once here rather than inside the loop above.
    filings = payload.get('filings') or []
    if filings:
        first_filing = filings[0]
        # NOTE(review): saved output shows FormType / DateReceived stored as
        # {'label', 'value'} dicts while the link takes ['value'] -- confirm
        # whether ['value'] was intended for these two as well.
        data['first_filing_type'] = first_filing['FormType']
        data['first_filing_date'] = first_filing['DateReceived']
        data['first_filing_link'] = first_filing['FilingLink']['value']
    return data

def process_experts_response(response, data = None):
    """Copy the expert rows from ``response`` into ``data``.

    Each row of ``response['data']['tableModel']['rows']`` is stored as
    ``data[row['role']] = row['expertName']``. Nothing is added when
    ``response['data']`` is ``None``.

    Args:
        response: Decoded JSON from the experts endpoint.
        data: Dict to update in place. A fresh dict is created when omitted
            (the previous ``{}`` default was shared between calls).

    Returns:
        dict: ``data`` with one entry per expert role.
    """
    if data is None:
        data = {}

    payload = response['data']
    if payload is not None:
        for row in payload['tableModel']['rows']:
            data[row['role']] = row['expertName']
    return data

In [98]:
def main_loop():
    """Fetch overview, financials and experts data for every scraped IPO.

    Loads the calendar rows from ``Data/nasdaq_ipos``, queries three Nasdaq
    API endpoints with the query string taken from each row's link, merges
    the results into one dict per IPO and checkpoints after every row.
    Rows whose link is empty or has no query string are counted as skipped.

    Note: this definition reuses the name of the calendar-scrape
    ``main_loop`` defined in an earlier cell and shadows it.

    Returns:
        list[dict]: One merged dict per IPO that had a usable link.
    """
    ipo_calendar_data = load_pickle('Data/nasdaq_ipos')

    # Endpoint path segment paired with the function that merges its payload.
    endpoints = (
        ('overview', process_overview_response),
        ('financials-filings', process_financials_response),
        ('experts', process_experts_response),
    )

    data = []
    skipped_count = 0

    for count, entry in enumerate(ipo_calendar_data, start=1):
        ticker = entry['proposedTickerSymbol']
        url = entry['link']

        # Extract the query string once; a link without '?' yields '' here
        # instead of raising IndexError as url.split('?')[1] did.
        url_parts = url.split('?') if url else []
        query = url_parts[1] if len(url_parts) > 1 else ''

        if query:
            row_data = {'proposedTickerSymbol': ticker}
            for endpoint, process_response in endpoints:
                response = make_request(f'https://api.nasdaq.com/api/ipo/{endpoint}/?{query}')
                if response:
                    row_data = process_response(response, row_data)
            data.append(row_data)
        else:
            skipped_count += 1
            print(f'Skipped-{ticker}',end='\r', flush=True)

        # Checkpoint after every row so an interrupted run loses little work.
        pickle_file(data, 'Data/nasdaq_company_overview_checkpoint')
        print(f'Checkpoint-{ticker}\t(Progress: {count}/{len(ipo_calendar_data)};\tSkipped: {skipped_count};\t Rows of Data: {len(data)})',end='\r', flush=True)
        # Short randomised pause (0.01-0.25 s) between rows.
        time.sleep(random.randint(1, 25)/100)

    print('\nAll Done')
    pickle_file(data, 'Data/nasdaq_company_overview_checkpoint_complete')
    return data

# Run the company-detail scrape and keep the collected rows in `data`.
data = main_loop()


Checkpoint-DIGL	(Progress: 6/6;	Skipped: 0;	 Rows of Data: 6)
All Done


[{'proposedTickerSymbol': 'SVRE',
  'Proposed Symbol': 'SVRE',
  'Company Name': 'SaverOne 2014 Ltd.',
  'Exchange': 'NASDAQ Capital',
  'Share Price': '$4.13',
  'Employees': '41 (as of 05/12/2022)',
  'Status': 'Filed',
  'Shares Offered': 2241379.0,
  'Offer amount': '$12,150,121.34',
  'Shares Over Alloted': '--',
  'Company Address': 'EM HAMOSHAVOT RD. 94  PETAH TIKVAH  49130',
  'Company Phone': '972-3-9094177',
  'Company Website': "<a href='http://www.saver.one' target='_blank'>www.saver.one</a>",
  'CEO': 'Ori Gilboa',
  'State of Inc': '--',
  'Fiscal Year End': '12/31',
  'Total Offering Expense': '$600,000.00',
  'Shareholder Shares Offered': '--',
  'Shares Outstanding': '3,838,320',
  'Lockup Period (days)': '180',
  'Lockup Expiration': '--',
  'Quiet Period Expiration': '--',
  'CIK': '0001894693',
  'DealId': '1205111-102356',
  'Revenue': '$145,000.00',
  'first_filing_type': {'label': 'Form Type', 'value': 'F-1'},
  'first_filing_date': {'label': 'Date Received', 'va

## Search Company Stocks and scrape the first 180 days of stock data

- Define Scrape Specific Functions
  - Load Data From Initial Scrape
  - Generate Search Url for ticker
  - Check if search has valid results
  - Generate Dictionary for page data
  - Navigate to next
- Define Main Loop
- Run