<a href="https://colab.research.google.com/github/aknip/Coding-Cheatsheets/blob/main/Python-Playwright-Webscraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Playwright Basics

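- In a Jupyter Notebook you must use Playwright's async API with `await`: the notebook already runs an asyncio event loop, and the sync API refuses to run inside one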

# How to record
- In Terminal:
```
playwright codegen "http://scrapfly.io/"
```
- This starts Chromium and records all your interactions as Python code
- see https://playwright.dev/docs/codegen#running-codegen

In [None]:
%%bash
# Start the Playwright code generator against the demo site.
# The CLI executable is all-lowercase; the spelling "playWright" is not found
# on case-sensitive filesystems.
playwright codegen "http://scrapfly.io/"

# How to work interactively, exploring the website

Overview: Start a REPL (e.g. IPython in a terminal) and enter Python code line by line. See https://scrapfly.io/blog/web-scraping-with-playwright-and-python/#tip-playwright-in-repl

**Step 1:**

In Terminal:
```
ipython
```

**Step 2:**

In ipython:
```
import nest_asyncio; nest_asyncio.apply()  # This is needed to use sync API in repl
from playwright.sync_api import sync_playwright
pw = sync_playwright().start()
chrome = pw.chromium.launch(headless=False)
page = chrome.new_page()
page.goto("http://scrapfly.io/")
```

**Step 3:**

In ipython:
```
temp = page.get_by_text('Web Scraping API')
temp.highlight()
# or:
page.get_by_text('Web Scraping API').highlight()
page.locator('.btn-primary').highlight()
page.get_by_text('Try for free').highlight()
```

# Cheatsheet

# Locators
page.get_by_label("Username").fill("username")
page.get_by_label("Username").press("Tab")
page.get_by_label("Password").fill("passwd")
page.get_by_label("Password").press("Enter")

page.get_by_label("Username").click()
page.get_by_label("Username").fill("username")
page.get_by_label("Username").press("Tab")
page.get_by_label("Password").fill("passwd")
page.get_by_label("Password").press("Enter")

page.locator('.classname').click()

# wait time
page.wait_for_timeout(500)

# wait for (non-)visibility / availability in DOM
page.wait_for_selector(".spinner", state="visible")
page.wait_for_selector(".spinner", state="hidden")

# Parsing the DOM, with and without BeautifulSoup
https://scrapfly.io/blog/web-scraping-with-playwright-and-python/#parsing-data

# Installation

In [None]:
%%capture --no-stderr
import psutil
IN_NOTEBOOK = any(["jupyter-notebook" in i for i in psutil.Process().parent().cmdline()])
if IN_NOTEBOOK:
  try:
      from playwright.async_api import async_playwright
      from playwright.sync_api import Page, expect
      from loguru import logger
  except ImportError:
      !pip install pytest-playwright loguru --quiet
      !playwright install
      # same imports as above...
      from playwright.async_api import async_playwright
      from playwright.sync_api import Page, expect
      from loguru import logger

import json
import sys, os, shutil
import asyncio
import atexit
from getpass import getpass
import warnings
from datetime import datetime

warnings.filterwarnings('ignore')

In [None]:
# Load the credentials dict once per kernel session; re-running this cell
# keeps an already loaded CREDS instead of prompting again.
if 'CREDS' not in globals():
    if IN_NOTEBOOK:
        # Interactive session: ask for the secrets as one JSON string
        # (getpass does not echo the input).
        CREDS = json.loads(getpass("Secrets (JSON string): "))
        # Mirror the secrets into the process environment as JSON text.
        os.environ['CREDS'] = json.dumps(CREDS)
    elif 'CREDS' in os.environ:
        # Not running in the detected notebook environment: fall back to the
        # CREDS environment variable (same JSON format) so that the following
        # cells still find their credentials.
        CREDS = json.loads(os.environ['CREDS'])

# Project configuration, setup folder, logging

In [None]:
# Start URLs of the two systems that are scraped further down.
time_tracking_url, tickets_url = (
    CREDS[system]['URL']['URL'] for system in ('Timebook-M1', 'Tickets-M1')
)
# The same login is used for both systems.
login_username, login_password = (
    CREDS['myCred']['credential'][field] for field in ('username', 'password')
)
# Folder (relative to the working directory) for logs, screenshots and downloads.
proj_folder = 'project'

In [None]:
# Start every run with an empty project folder:
# create it when it is missing, otherwise wipe it and create it again.
if not os.path.exists(proj_folder):
    os.mkdir(proj_folder)
else:
    shutil.rmtree(proj_folder)
    os.mkdir(proj_folder)

In [None]:
# Drop every handler registered so far (including loguru's default one),
# then register the sinks used by this notebook.
logger.remove()

# Console: everything from TRACE upwards, with the level name colorized.
logger.add(
    sys.stderr,
    level="TRACE",
    format="{time:HH:mm:ss} | <level>{level: <10}</level> | {message}",
)

# File with every message from TRACE upwards.
logger.add(
    proj_folder + "/log_all.log",
    level="TRACE",
    format="{time:HH:mm:ss} | {level: <10} | {message}",
)

# File with messages of level SUCCESS and above.
logger.add(
    proj_folder + "/log_success.log",
    level="SUCCESS",
    format="{time:HH:mm:ss} | {level: <10} | {message}",
)

# File with messages of level ERROR and above.
logger.add(
    proj_folder + "/log_error.log",
    level="ERROR",
    format="{time:HH:mm:ss} | {level: <10} | {message}",
)

In [None]:
# Emit one sample message per level, from least to most severe,
# to check that every configured sink receives what it should.
demo_messages = (
    (logger.trace, "A trace message."),
    (logger.debug, "A debug message."),
    (logger.info, "An info message."),
    (logger.success, "A success message."),
    (logger.warning, "A warning message."),
    (logger.error, "An error message."),
    (logger.critical, "A critical message."),
)
for emit, text in demo_messages:
    emit(text)

# Run local browser (Headless=false)

Does not work in Colab (a headed browser needs a display, which Colab does not provide).

In [None]:
try:
  async with async_playwright() as p:
    browser = await p.chromium.launch(headless=False)
    context = await browser.new_context(viewport={"width": 1920, "height": 1080})
    page = await context.new_page()

    # note all methods are async (use the "await" keyword)
    await page.goto("http://scrapfly.io/")

    # to stop browser on notebook close we can add a shutdown hook:
    #def shutdown_playwright():
    #    await browser.close()
    #    await pw.stop()
    #import atexit
    #atexit.register(shutdown_playwright())
except:
  logger.error('Error: Local browser (headless=false) not working.')

# Helper

In [None]:
# Create screenshot and log ("INFO")
async def my_screenshot(my_page, message):
  """Save a screenshot of ``my_page`` in the project folder and log its file name.

  The file name is built as "<YYYYmmdd-HHMMSS>-<message>.png". Spaces are
  turned into underscores and every character that is neither alphanumeric
  nor one of "._-" is dropped.

  Args:
    my_page: object offering an awaitable ``screenshot(path=...)`` method.
    message: text used as the descriptive part of the file name.
  """
  timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
  raw_name = (timestamp + '-' + message + ".png").replace(' ', '_')
  # keep only characters that are safe in a file name
  kept_chars = [ch for ch in raw_name if ch.isalnum() or ch in "._-"]
  shotfile = "".join(kept_chars)
  await my_page.screenshot(path=proj_folder + "/" + shotfile)
  logger.info("Screenshot done: " + shotfile)

# Run headless browser

In [None]:
async with async_playwright() as p:
    browser = await p.chromium.launch()
    context = await browser.new_context(viewport={"width": 1920, "height": 1080})
    page = await context.new_page()
    #launch browserstack demo
    await page.goto("https://bstackdemo.com")
    logger.info('Title of page is: ' + await page.title())
    #await page.screenshot(path=proj_folder + "/screenshot.png")
    await my_screenshot(page, 'Screenshot Demopage')
    await browser.close()

In [None]:
async with async_playwright() as p:
    browser = await p.chromium.launch()
    context = await browser.new_context(viewport={"width": 1920, "height": 1080})
    page = await context.new_page()
    #launch browserstack demo
    await page.goto("https://bstackdemo.com")
    logger.info('bstackdemo.com: ' + await page.title())
    #click on sign button
    await page.click('#signin')
    #select Username
    await page.get_by_text("Select Username").click()
    await page.locator("#react-select-2-option-0-0").click()
    #select Password
    await page.get_by_text("Select Password").click()
    await page.locator("#react-select-3-option-0-0").click()
    await my_screenshot(page, 'Screenshot Login 1')
    #click login
    await page.get_by_role("button", name="Log In").click()
    await page.wait_for_timeout(500)
    #verify user have logged in
    await my_screenshot(page, 'Screenshot Login 2 Successful')
    logger.info('bstackdemo.com: Finished.')
    await browser.close()

# Download Time Tracking data

In [None]:
async with async_playwright() as p:
    browser = await p.chromium.launch()
    context = await browser.new_context(viewport={"width": 1920, "height": 1080})
    page = await context.new_page()
    #launch timetracking system
    await page.goto(time_tracking_url)
    title = await page.title()
    #login
    await page.type("#username", login_username)
    await page.type("#password", login_password)
    await my_screenshot(page, 'Timetracker: Startpage - ' + title)
    await page.get_by_role("button", name="Login").click()
    await page.wait_for_timeout(500)
    await page.wait_for_selector(".loading__circle-layer", state="visible")
    await page.wait_for_selector(".loading__circle-layer", state="hidden")
    await my_screenshot(page, 'Timetracker: Report loaded')
    logger.info('Timetracker: Download clicked')
    async with page.expect_download() as download_info:
      # Perform the action that initiates download
      await page.get_by_role("button", name="Export").click()
    download = await download_info.value
    # Wait for the download process to complete and save the downloaded file somewhere
    await download.save_as("./" + proj_folder + "/" + download.suggested_filename)
    logger.info('Timetracker: Download done.')
    logger.info('Timetracker: Finished.')
    await browser.close()

# Download Ticket data

In [None]:
async with async_playwright() as p:
    browser = await p.chromium.launch()
    context = await browser.new_context(viewport={"width": 1920, "height": 1080})
    page = await context.new_page()
    #launch ticket system
    await page.goto(tickets_url)
    title = await page.title()
    #login
    await page.get_by_label("Username").click()
    await page.get_by_label("Username").fill(login_username)
    await page.get_by_label("Username").press("Tab")
    await page.get_by_label("Password").fill(login_password)
    await my_screenshot(page, 'Ticketsystem: Startpage - ' + title)
    await page.get_by_label("Password").press("Enter")
    await page.wait_for_selector(".search-button", state="visible")
    await my_screenshot(page, 'Ticketsystem: Report loaded')
    await page.get_by_role("button", name="Export").click()
    await page.wait_for_timeout(1000)
    await my_screenshot(page, 'Ticketsystem: Select CSV')
    await page.get_by_role("menuitem", name="CSV (Current fields)").click()
    await page.wait_for_timeout(1000)
    logger.info('Ticketsystem: Download clicked')
    async with page.expect_download() as download_info:
      # Perform the action that initiates download
      await page.get_by_role("button", name="Export").click()
    download = await download_info.value
    # Wait for the download process to complete and save the downloaded file somewhere
    await download.save_as("./" + proj_folder + "/" + download.suggested_filename)
    logger.info('Ticketsystem: Download done.')
    logger.info('Ticketsystem: Finished.')
    await browser.close()