In [1]:
# Import Dependencies
from splinter import Browser
import time
import re
from dataclasses import dataclass
from cryptography.fernet import Fernet
from config import user, passw
from secure.run_encrypter import get_key
import pymongo

In [2]:
# Create connection to database.
class Database:
    """Handle on the MongoDB collection that stores scraped job listings."""

    def __init__(self, db_addr):
        """Remember the MongoDB connection string; no connection is made yet."""
        self.db_addr = db_addr

    def connect(self):
        """Create the client, report server reachability, and return the collection.

        Prints the server version when ``server_info()`` succeeds, or a
        failure notice when it raises. The collection handle is returned in
        both cases.

        Returns:
            The ``job_listings`` collection of the ``Job_Boards`` database.
        """
        self.conn = self.db_addr
        self.client = pymongo.MongoClient(self.conn)

        # server_info() is the call that actually contacts the server here.
        try:
            print(f"Connection to MongoDB Version {self.client.server_info()['version']} successful.")
        except Exception:
            print("Unable to connect to the server.")

        # Create/connect to the database and collection.
        job_boards = self.client["Job_Boards"]
        return job_boards["job_listings"]
    

In [3]:
class Website:
    """Browser session opened on a website's landing page.

    Creating an instance launches a browser for ``driver`` and immediately
    visits ``url``. Call :meth:`close` when finished, or use the instance
    as a context manager, so the browser process is not left running.

    Args:
        site_name: Label for the site; stored as ``self.site_name``.
        url: Address visited as soon as the browser starts.
        driver: Driver name handed to ``Browser`` (default ``"chrome"``).
    """

    def __init__(self, site_name, url, driver="chrome"):
        self.site_name = site_name
        self.url = url
        self.driver = driver
        self.browser = Browser(self.driver)
        try:
            self.browser.visit(self.url)
        except BaseException:
            # Do not leave an orphaned browser behind when the first page load fails.
            self.close()
            raise

    def close(self):
        """Quit the browser if it is still open; safe to call more than once."""
        browser = getattr(self, "browser", None)
        if browser is not None:
            # Drop the reference first so a failing quit() is not retried.
            self.browser = None
            browser.quit()

    def __enter__(self):
        """Return this instance so it can be bound in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the browser on leaving the ``with`` block; never suppresses errors."""
        self.close()
        return False


In [4]:
class WebLogin(Website):
    """Website that can sign in with the configured credentials.

    Some desired data requires login while other data is available without
    it. To keep the design polymorphic, login is kept out of the parent
    Website class, and web navigation does not initially go to a login
    page/section/window.
    """

    @property
    def login_selector(self):
        """Hook for sites whose landing page does not directly offer login.

        Not implemented here; evaluates to None.
        """
        return None

    def login(self):
        """Fill the username and decrypted password fields, then click the button."""
        username_field = self.browser.find_by_id('username')
        username_field.fill(user)

        # Locate the field before decrypting, then type the decoded plaintext.
        password_field = self.browser.find_by_id('password')
        decrypted = Fernet(get_key()).decrypt(passw)
        password_field.fill(decrypted.decode('UTF-8'))

        self.browser.find_by_tag('button').click()


In [5]:
@dataclass
class JobData:
    """One scraped job posting in the shape stored in the database."""
    job_board_name: str
    job_title: str
    company_name: str
    job_description: str

    def render(self):
        """Return the posting as a plain dict keyed by field name."""
        keys = ("job_board_name", "job_title", "company_name", "job_description")
        return {key: getattr(self, key) for key in keys}
    

In [6]:
class DataCollector:
    """Walk a job board's search results and store every posting.

    This is a template: the selector strings ('{website_landing_page}',
    '{search_field_container}', 'element_tag', 'element_xpath',
    'text_string'), the regex patterns, and every ``[int]`` subscript are
    placeholders that must be replaced with real values for a given site.

    Attributes:
        browser: Browser object used for all page interaction.
        jobs_board_name: Stored on every record as ``job_board_name``.
        db: Object whose ``insert_one`` receives each rendered record.
        pn: Current results page number, starting at 1.
        job_nav: Counter of the posting being processed; reset to 1 at the
            start of each page and incremented after each stored posting.
        search_term: Keyword typed into the search field; also stored on
            every record as ``job_title``.
        job_post: Element of the posting currently selected, or None until
            ``traverse_data`` selects one.
    """

    def __init__(self, browser, jobs_board_name, db, search_term):
        self.browser = browser
        self.jobs_board_name = jobs_board_name
        self.db = db
        self.pn = 1
        self.job_nav = 1
        self.search_term = search_term
        # Assigned by traverse_data() before each etl_data() call.
        self.job_post = None

    def nav_to_data_page(self):
        """Visit the page that hosts the job search."""
        self.browser.visit('{website_landing_page}')
        # Reasons you might want or need to navigate differently:
        # If you are trying to look more like a human.
        # Sometimes a website's href is a reference to the page as opposed to the actual url extension
        # and the url, upon loading, may show a uniquely generated reference as opposed to the hard url.
        # Also, sometimes this reference is uniquely generated each time the page loads.
        # Additionally, not following the path a website sets the user on potentially causes,
        # depending upon scraper navigation implementation, Cross Site Request Forgery (CSRF) blocking.

        # Here is an example of how one might deal in script with these scenarios
        # and avoid unnecessary website research:
            # data_page = self.browser.html
            # time.sleep(1)
            # job_page_link = (re.search(r'href="/jobsrefpattern/" id="idpattern', data_page)).group()
            # job_page_link_id=job_page_link.split('"')
            # job_page_link_id = job_page_link_id[3]
            # self.browser.find_by_id(job_page_link_id).click()

    def query_data(self):
        """Type the search term into the keyword field and submit the search.

        The field's id is extracted by regex from the HTML of the search
        container, then the element with the submit text is clicked.
        """
        jobs_search_cont = self.browser.find_by_css("{search_field_container}")
        jobs_search_cont = jobs_search_cont.html
        time.sleep(1)
        job_keyword_search_field = (re.search(r"element_id_regex", jobs_search_cont)).group()
        self.browser.find_by_id(job_keyword_search_field).fill(self.search_term)
        time.sleep(1)
        self.browser.find_by_text('text_string').click()

    # Why not just make nav_data_pgs_obj an attribute of the class or simply a variable?
    # Without giving a detailed explanation with examples:
    # The attributes of this object can change (depending on website implementation)
    # as we navigate through the pages.
    @property
    def nav_data_pgs_obj(self):
        """Look up the page-navigation ``ul`` element afresh on every access."""
        return self.browser.find_by_tag('element_tag')[int].find_by_tag('ul')[int]

    def traverse_data(self):
        """Process every posting on every results page.

        Loops while ``self.pn`` does not exceed the integer text of the last
        ``li`` in the page-navigation element. On each page it collects the
        posting ids, opens each posting, hands it to ``etl_data``, and then
        calls ``nav_data_pages`` to advance.
        """
        while self.pn <= int(self.nav_data_pgs_obj.find_by_tag('li')[-1].text):
            print(f'pn number is now: {self.pn}')
            print("RUNNING PAGE JOB DESCRIPTION NAVIGATION")
            # Create splinter WebDriverElement object of ul element containing jobs.
            jobs_list_obj = self.browser.find_by_tag('element_tag')[2].find_by_tag('ul')[int]
            print("Ran part One")
            time.sleep(3)
            # Create string of html for regex search for job ids.
            jobs_list_obj_html_string = jobs_list_obj.html
            print("Ran part Two")
            time.sleep(2)

            # Derive the list of job li element ids.
            jobs_ids_list_not_cleaned = re.findall('<element_id_regex', jobs_list_obj_html_string)
            print(len(jobs_ids_list_not_cleaned))
            print("Ran part Three")
            time.sleep(2)

            # Create and fill list with cleaned job ids (the text between the first pair of quotes).
            job_ids = [job.split('"')[1] for job in jobs_ids_list_not_cleaned]

            # Navigate through jobs list for current page.
            self.job_nav = 1
            for job_id in job_ids:
                self.job_post = jobs_list_obj.find_by_id(job_id)
                # NOTE: a parity check on self.job_nav used to choose between two
                # branches here to break up the click pattern, but both branches ran
                # exactly these steps. Vary the steps here if a real alternation is needed.
                # The element is looked up again for the click, after the mouse-over.
                self.job_post.find_by_tag('element_tag')[int].mouse_over()
                self.job_post.find_by_tag('element_tag')[int].click()
                time.sleep(2)
                self.etl_data()
            self.nav_data_pages()

    def etl_data(self):
        """Extract, clean, and store the posting held in ``self.job_post``.

        Reads the company name from the selected posting and the description
        HTML from the page, strips markup from the description, inserts the
        rendered ``JobData`` record through ``self.db``, and increments
        ``self.job_nav``.
        """
        # EXTRACT JOB DATA: (May be a good idea to split etl into separate methods to enhance polymorphism.)
        print(f"Job number: {self.job_nav} selected.")
        # Assign the company name for currently selected job posting to a variable.
        company_name = self.job_post.find_by_tag("element_tag")[int].text
        print(company_name)
        # Create splinter ElementList of all elements that make up single job complete description.
        description_inner_span = self.browser.find_by_xpath('element_xpath')
        # Query HTML (returns as string) from ElementList object of complete job description.
        desc_html_span = description_inner_span.html

        # TRANSFORM JOB DATA:
        # Normalise line breaks and entities, then replace every run of element tags with a space.
        # NOTE: the '<br>\n' replacement cannot match because '\n' is removed first;
        # the chain is kept as-is so the cleaned output stays unchanged.
        desc_html_span = desc_html_span.replace('\n', "").replace('<br>\n', "").replace('<br>', " ").replace('&nbsp;', "").replace('/', " ").replace('&amp;', " ")
        job_description = re.sub("(<.+?>)+", " ", desc_html_span)
        print(job_description)
        print("Ran part Five")
        time.sleep(1)

        # print(JobData(self.jobs_board_name, self.search_term, company_name, job_description).render())
        # LOAD JOB DATA:
        # Write through the collection handed to the constructor, not a notebook-level global.
        self.db.insert_one(JobData(self.jobs_board_name, self.search_term, company_name, job_description).render())
        self.job_nav += 1

    def nav_data_pages(self):
        """Pause, advance ``self.pn``, and click through to the next results page.

        Prints a completion message instead of clicking once ``self.pn`` has
        moved one past the number shown on the last navigation element.
        """
        # Break up the pattern with if/elif/else so the pause length varies.
        if self.job_nav % 2 == 0:
            time.sleep(3)
        elif self.job_nav % 3 == 0:
            time.sleep(4)
        elif self.job_nav % 7 == 0:
            time.sleep(5)
        else:
            time.sleep(2)

        self.pn += 1
        lis = self.nav_data_pgs_obj.find_by_tag('element_tag')
        # Account for page navigation button with alternate navigation attributes.
        if self.pn == 9:
            lis[8].click()
            time.sleep(3)
        # Account for typical page navigation button navigation attributes.
        elif self.pn != (int(lis[-1].text)+1):
            for li in lis:
                if li.text == str(self.pn):
                    li.click()
                    time.sleep(3)
                    break
        # Message declaring jobs navigation completion. WooHoo!!!
        else:
            print(f"You have reached the end of job postings for {self.search_term}.")


In [3]:
# Connect to the local MongoDB server; `db` is the Job_Boards.job_listings
# collection returned by connect(), which also prints the connection status.
db = Database("mongodb://localhost:27017").connect()

Connection to MongoDB Version 5.0.3 successful.


In [5]:
# Connection smoke test, left disabled after it was run once: db.insert_one({"test_key": "test_value"})

<pymongo.results.InsertOneResult at 0x10b8efa40>

In [8]:
# Both arguments are placeholders: replace them with the job board's name and
# its URL. Constructing the object opens a browser and visits that URL.
site = WebLogin('{name_of_job_board}', '{url}')

In [9]:
# Fill in the username and decrypted password from config, then click the button.
site.login()

In [10]:
# Reuse the browser opened by `site`; "Chocolatier" is the search term and is
# also stored as the job_title of every saved record.
jobs = DataCollector(site.browser, site.site_name, db, "Chocolatier")

In [11]:
# Go to the page that hosts the job search.
jobs.nav_to_data_page()

In [12]:
# Enter the search term in the keyword field and submit the search.
jobs.query_data()

In [1]:
# Walk every results page, storing each posting in the database as it is opened.
jobs.traverse_data()