# Import the libraries needed for web scraping

In [1]:
import requests
from bs4 import BeautifulSoup as bs
from datetime import timedelta, date
import re
import os

# Request the HTML of the job listing and check the response status.

In [2]:
# Fetch the job-listing page. A timeout is passed so a stalled connection
# raises an error instead of hanging the notebook indefinitely.
html = requests.get(
    'https://www.indeed.com/viewjob?cmp=Comforts-Suites-Corvallis-Oregon&t=Hotel+Front+Desk+Agent&jk=373c700661954885&q=front+desk&vjs=3',
    timeout=30,
)
# Stop here unless the server answered 200 OK; the next cell parses html.content.
assert html.status_code == 200, "HTML retrieval is not complete."

# Parse the HTML

In [3]:
# Build the parse tree from the response body using the lxml parser.
soup = bs(markup=html.content, features='lxml')

# Create the job dictionary that holds the pertinent info of the job listing

In [4]:
# Names of the fields recorded for a job listing, in the order they are stored.
job_fields = (
    "jobtitle",
    "company",
    "postdate",
    "dateapplied",
    "cityzip",
    "salary",
    "jobtype",
    "numberofhires",
    "benefits",
    "schedule",
    "education",
    "experience",
    "remote",
    "worklocation",
    "interview",
    "followupcontact",
)
# Every field starts out as an empty string; the cells below fill them in.
job = {field: '' for field in job_fields}

# Retrieve the job title and set directory/file name of job listing

In [5]:
# Characters that are replaced with '-' so the title can be used as a
# directory/file name. The pattern is a raw-string character class, so the
# backslash reaches the regex engine as a literal backslash and every listed
# character (including '\' and '<') is matched on its own.
unsafe_chars = r'[#/%&{}\\<>*?$!\'":@+`|=]'
title_tag = soup.find("h1", class_="jobsearch-JobInfoHeader-title")
job["jobtitle"] = re.sub(unsafe_chars, '-', title_tag.text)
# This can be any folder, just set this to where you want the job listings to be saved
# (raw string, so the Windows backslashes need no escaping).
basedir = r'C:\Users\Dean Akin\Documents\Professional Documents\Employment\Job Listings'
# The directory/file name starts out as the sanitised job title.
dirfile = job["jobtitle"]

# Retrieve company name

In [6]:
# The company name is the text of the first child of the header subtitle block.
company_tag = soup.find('div', class_='jobsearch-JobInfoHeader-subtitle').contents[0]
# Replace characters that are unsafe in file/folder names with '-'. The pattern
# is a raw-string character class so that a backslash and '<' are each matched
# as single characters.
job['company'] = re.sub(r'[#/%&{}\\<>*?$!\'":@+`|=]', '-', company_tag.text)
# Build the name from the two job fields rather than appending to dirfile, so
# re-running this cell does not tack the company name on a second time.
dirfile = job["jobtitle"] + ' - ' + job['company']
print(dirfile)

Hotel Front Desk Agent - Comforts Suites Corvallis Oregon


# Calculate the post date and the date applied

In [7]:
# Work out when the listing was posted and record today as the date applied.
postdate = soup.find("div", class_="jobsearch-JobMetadataFooter")
# If the footer's first child renders as a <div class=...> element, the posting
# age is read from the second child instead of the first.
reg = re.compile('div class')
match = reg.search(str(postdate.contents[0]))
postdate = postdate.contents[1].text if match else postdate.contents[0].text
today = date.today()
if postdate == "Just posted":
    # Posted today: nothing to subtract.
    postdate = today
else:
    # Otherwise the first word is expected to be the age in days, such as "3"
    # or "30+"; the trailing "+" is dropped so "30+" counts as 30 days.
    # A non-numeric first word raises ValueError.
    days_ago = int(postdate.split()[0].rstrip('+'))
    postdate = today - timedelta(days=days_ago)
# Both dates are stored in the same MM/DD/YYYY format.
job["postdate"] = postdate.strftime("%m/%d/%Y")
job["dateapplied"] = today.strftime("%m/%d/%Y")

# Scrape the city and zip

In [8]:
# The city/zip value is the text of the second child of the header subtitle block.
header_subtitle = soup.find('div', class_='jobsearch-JobInfoHeader-subtitle')
location_node = header_subtitle.contents[1]
job['cityzip'] = location_node.text

# Scrape the job info section for salary, job type, and number of hires

In [9]:
# Collect the values listed under the "Salary", "Job Type" and
# "Number of hires for this role" headings of the job details section.
salary = soup.find('div', id='jobDetailsSection')
# One bucket per heading, in the same order as detail_labels.
jobDescSection = [[], [], []]
detail_labels = ("Salary", "Job Type", "Number of hires for this role")
if salary:
    for group in salary.contents:
        for entry in group.contents:
            # The first child of the entry's parent carries the heading text.
            heading = entry.parent.contents[0].text
            if heading not in detail_labels:
                continue
            # Skip the heading element itself; keep only the values under it.
            if entry.text != heading:
                jobDescSection[detail_labels.index(heading)].append(entry.text)

# Join each bucket into one string, or mark the field as missing.
for field, values in zip(('salary', 'jobtype', 'numberofhires'), jobDescSection):
    job[field] = ", ".join(values) if values else 'Not Listed'

# Scrape the webpage for Benefits

In [10]:
# The description block is searched as raw HTML: a "benefits:" paragraph
# (any letter case) directly followed by a <ul> marks the list to collect.
jobDesc = soup.find('div', class_='jobsearch-JobComponent-description')
match = re.search('(?<=<p>benefits:</p><ul>)((<li>.+?</li>)+)', str(jobDesc), re.IGNORECASE)
if match is None:
    job['benefits'] = "Not Listed"
else:
    # Splitting on the <li> tags leaves empty strings between items; drop them.
    benefit_items = filter(None, re.split("<li>|</li>", match.group()))
    job['benefits'] = ', '.join(benefit_items)

# Scrape webpage for schedule

In [11]:
# Look for a "schedule:" paragraph (any letter case) directly followed by a
# <ul> in the description HTML and collect its <li> items.
match = re.search('(?<=<p>schedule:</p><ul>)((<li>.+?</li>)+)', str(jobDesc), re.IGNORECASE)
if match is None:
    job['schedule'] = "Not Listed"
else:
    # Splitting on the <li> tags leaves empty strings between items; drop them.
    schedule_items = filter(None, re.split("<li>|</li>", match.group()))
    job['schedule'] = ', '.join(schedule_items)

# Scrape webpage for education

In [12]:
# Look for an "education:" paragraph (any letter case) directly followed by a
# <ul> in the description HTML and collect its <li> items.
match = re.search('(?<=<p>education:</p><ul>)((<li>.+?</li>)+)', str(jobDesc), re.IGNORECASE)
if match is None:
    job['education'] = "Not Listed"
else:
    # Splitting on the <li> tags leaves empty strings between items; drop them.
    education_items = filter(None, re.split("<li>|</li>", match.group()))
    job['education'] = ', '.join(education_items)

# Scrape webpage for experience

In [13]:
# Look for an "experience:" paragraph (any letter case) directly followed by a
# <ul> in the description HTML and collect its <li> items.
match = re.search('(?<=<p>experience:</p><ul>)((<li>.+?</li>)+)', str(jobDesc), re.IGNORECASE)
if match is None:
    job['experience'] = "Not Listed"
else:
    # Splitting on the <li> tags leaves empty strings between items; drop them.
    experience_items = filter(None, re.split("<li>|</li>", match.group()))
    job['experience'] = ', '.join(experience_items)

# Scrape work location

In [14]:
# Look for a "work location:" paragraph (any letter case) directly followed by
# a <ul> in the description HTML and collect its <li> items.
match = re.search('(?<=<p>work location:</p><ul>)((<li>.+?</li>)+)', str(jobDesc), re.IGNORECASE)
if match is None:
    job['worklocation'] = "Not Listed"
else:
    # Splitting on the <li> tags leaves empty strings between items; drop them.
    location_items = filter(None, re.split("<li>|</li>", match.group()))
    job['worklocation'] = ', '.join(location_items)

# Check if it's remote

In [15]:
# Look for a "work remotely:" paragraph (any letter case) directly followed by
# a <ul> in the description HTML and collect its <li> items.
match = re.search('(?<=<p>work remotely:</p><ul>)((<li>.+?</li>)+)', str(jobDesc), re.IGNORECASE)
if match is None:
    job['remote'] = "Not Listed"
else:
    # Splitting on the <li> tags leaves empty strings between items; drop them.
    remote_items = filter(None, re.split("<li>|</li>", match.group()))
    job['remote'] = ', '.join(remote_items)

# Set interview and follow-up contact to "No"

In [16]:
# Both tracking fields start out as "No".
job.update(interview="No", followupcontact="No")

## Finally, make a directory and text file for the job listing so the full job description is kept if the posting is pulled off the internet

In [17]:
# Save the plain text of the job description as
# <basedir>\<dirfile>\<dirfile>.rtx so a copy is kept locally.
# NOTE(review): the '.rtx' extension may be meant as '.rtf' or '.txt' - confirm.
fulldir = basedir + '\\' + dirfile
print(fulldir)
try:
    os.mkdir(fulldir)
except FileExistsError:
    # Re-running the notebook for the same listing reuses the existing folder.
    # Any other failure (bad path, permissions) is allowed to surface here
    # rather than as a confusing error from open() below.
    pass
html_text = soup.find('div', class_='jobsearch-jobDescriptionText')
if html_text is None:
    # Without the description element there is nothing to save; report it
    # instead of silently leaving an empty file behind.
    print('Job description not found on the page; no file was written.')
else:
    # UTF-8 so characters outside the platform's default encoding are kept.
    with open(fulldir + '\\' + dirfile + '.rtx', "w", encoding="utf-8") as f:
        f.write(html_text.text)

C:\Users\Dean Akin\Documents\Professional Documents\Employment\Job Listings\Hotel Front Desk Agent - Comforts Suites Corvallis Oregon


In [18]:
# Print the collected fields of the job dictionary for a final visual check.
print(job)

{'jobtitle': 'Hotel Front Desk Agent', 'company': 'Comforts Suites Corvallis Oregon', 'postdate': '08/122021', 'dateapplied': '08/15/2021', 'cityzip': 'Corvallis, OR 97330', 'salary': '$13.50 an hour', 'jobtype': 'Part-time', 'numberofhires': '2', 'benefits': 'Employee discount, Paid time off', 'schedule': '8 hour shift', 'education': 'Not Listed', 'experience': 'Hotel Experience: 1 year (Preferred), Customer Service: 1 year (Preferred)', 'remote': 'No', 'worklocation': 'One location', 'interview': 'No', 'followupcontact': 'No'}
