In [178]:
import pandas as pd
import threading
import time
import re
import os

#import selenium libraries
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver import ActionChains
from webdriver_manager.chrome import ChromeDriverManager

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service

DATA = pd.DataFrame()
def save_data(data, save=False):
    global DATA
    if DATA.empty:
        DATA = data
    else:
        DATA = pd.concat([DATA, data])
    
    if save:
        #Clean the data before saving it to the file
        DATA["Officer"] = DATA["Citation"].apply(lambda x: re.search(r'P(\d)', x).group(1))
        DATA["Fine"] = DATA["Fine"].apply(lambda x: float(x.replace('$', '').replace(',', '')))
        DATA["Residence"] = DATA["License Plate/Vin"].str.split().str[0]
        DATA["IssuedDate"]=pd.to_datetime(data["Issued"].apply(lambda x: " ".join(x.split()[:3]))) 
        DATA["IssuedTime"]=pd.to_datetime(data["Issued"]).dt.strftime('%I:%M %p')
        
        if os.path.exists("ParkingCitations.csv"):
            DATA.to_csv("ParkingCitations.csv", mode='a', index=False, header=False)
        else:
            DATA.to_csv("ParkingCitations.csv", index=False)
            
    return(DATA)

In [169]:
PATH = "Driver\chromedriver.exe"

def selenium_driver():
    
    service = Service(PATH)

    options = Options()
    driver = webdriver.Chrome(service=service, options=options)
    return driver

#Create the chrome driver that can be referred to globally
driver = selenium_driver()

In [32]:
def measure_time(start_time, Scraper: s, threshold=10.1):
    counting = True
    while counting and not s.citation_found:
        current = time.perf_count() - start_time
        if current >= threshold:
            counting = False
            s.flag = True
        time.sleep(0.1)

In [179]:
class Scraper():
    
    def __init__(self, url):
        #Define the url to scrape
        self._url = url
        driver.get(url)
        
        #Some properties that will help us see if the citation has been found
        self._citation_found = False
        self._flag = False
        
        #Define parameters of loop
        self.officers = range(1, 10+1)
        self.index = range(1, 99999+1)
        
    @property
    def url(self):
        return self._url
    
    @property
    def citation_found(self):
        return self._citation_found
    
    @property
    def flag(self):
        return self._flag
    
    @flag.setter
    def flag(self, change):
        self._flag = change
    
    @staticmethod
    def find_citation():
        WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.CLASS_NAME, "v-btn__content"))
        )
        find_citation_btn = driver.find_elements(By.CLASS_NAME, "v-btn__content")[0]
        find_citation_btn.click()
    
    @staticmethod
    def send_keys(data={"officer": 1, "index": 0}):
        index = "0"*(5-len(str(data["index"])))+str(data["index"])
        key = f"P{data['officer']}-{index}"
        print(key)
        
        #Search the key into the search bar
        
        #Sometimes dynamically loaded websites will spoof the identifiers of their web elements
        #This makes it harder to consistently identify.
        
        #One solution is to use XPATH
        #Another solution is to find the unique combination of nested HTML tags and target that.
        
        #i.e. if I know the link (<a> tag) I want to target is always the first link in the p tags of every
        #div that can be identified with a positional or classical identifier, then I can identify the <a> tag
        #even if the <a> isn't explicitly identifiable through other identifiers.
        
        WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, ".v-text-field__slot input"))
        )
        input_field = driver.find_elements(By.CSS_SELECTOR, ".v-text-field__slot input")[0]
        driver.execute_script("arguments[0].value = '';", input_field)
        input_field.send_keys(key)
        
        #Search the citation
        WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, ".v-input__append-inner button"))
        )
        search_btn = driver.find_elements(By.CSS_SELECTOR, ".v-input__append-inner button")[0]
        search_btn.click()
    
    def get_data(self):
        citation_data = driver.find_elements(By.CSS_SELECTOR, ".v-card__text .col")
        no_data_text = driver.find_elements(By.CSS_SELECTOR, ".v-card__text .text-center h4")
        while len(citation_data) < 1 and len(no_data_text) < 1:
            citation_data = driver.find_elements(By.CSS_SELECTOR, ".v-card__text .col")
            no_data_text = driver.find_elements(By.CSS_SELECTOR, ".v-card__text .text-center h4")
            time.sleep(0.1)
        
        if len(no_data_text) >= 1:
            return "No data"
        else:
            #Now we need to parse the data into a pandas data frame
            return self.format_text([el.text for el in citation_data])
    
    @staticmethod
    def format_text(text_arr):
        #Takes the text data and returns a formatted pandas frame
        cols = list(map(lambda x: x.split(":\n")[0], text_arr))
        data = list(map(lambda x: x.split(":\n")[1], text_arr))
        row = pd.DataFrame(data, cols).transpose()
        return row
        
    def main_loop(self):
        self.find_citation()
        for i in range(1,2): #self.officers:
            for j in range(0, 10+1): #self.index:
                self.send_keys({"officer": i, "index": j})
                
                #Store the data in the RAM
                #Save it to a file when it is the last index
                #i.e. save it to a file when we are done scraping each officer's citations
                scraped_data = self.get_data()
                save_data(scraped_data, save = j == 10) #len(self.index)-1))
        
    
scraper = Scraper("https://cars.byu.edu/citations/")
scraper.main_loop()

P1-00000
P1-00001
P1-00002
P1-00003
P1-00004
P1-00005
P1-00006
P1-00007
P1-00008
P1-00009
P1-00010


In [103]:
data={"officer": 1, "index": 
[el.text for el in scraper.get_data()]

['Citation:\nP1-00010',
 'License Plate/Vin:\nUT 182MXZ',
 'Fine:\n$0',
 'Issued:\nJun 01, 2010 02:37 PM']

['Citation:\nP1-00010',
 'License Plate/Vin:\nUT 182MXZ',
 'Fine:\n$0',
 'Issued:\nJun 01, 2010 02:37 PM']

In [159]:
cols = list(map(lambda x: x.split(":\n")[0], test_data))
data = list(map(lambda x: x.split(":\n")[1], test_data))
row = pd.DataFrame(data, cols).transpose()
row["Officer"] = row["Citation"].apply(lambda x: re.search(r'P(\d)', x).group(1))
row["Fine"] = row["Fine"].apply(lambda x: float(x.replace('$', '').replace(',', '')))
row["Residence"] = row["License Plate/Vin"].str.split().str[0]
row["IssuedDate"]=pd.to_datetime(row["Issued"].apply(lambda x: " ".join(x.split()[:3]))) 
row["IssuedTime"]=pd.to_datetime(row["Issued"]).dt.strftime('%I:%M %p')


TypeError: 'bool' object is not callable

In [147]:
' '.join("Jun 01, 2020 02:37 PM".split()[3:])

'02:37 PM'