# LinkedIn Job Listing Scraper
This script scrapes the job listings on LinkedIn for data analyst roles in Canada.

In [None]:
import pandas as pd
import warnings
warnings.filterwarnings("ignore") 
import re
from bs4 import BeautifulSoup
import time
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException

Launch the Chrome driver and set the default zoom to 40% so that all listings on a page are shown at once (this avoids having to scroll).

In [None]:
# Start a Chrome session through the `chromedriver.exe` binary.
driver_path = "chromedriver.exe"
wd = webdriver.Chrome(driver_path)

# Open Chrome's settings page and run the default-zoom call from there
# (0.4 == 40%), so that later pages render zoomed out.
zoom_script = 'chrome.settingsPrivate.setDefaultZoom(0.4);'
wd.get('chrome://settings/')
wd.execute_script(zoom_script)

# Leave the settings page for an ordinary web page.
wd.get("https://www.google.ca/")

The next cells open LinkedIn, type the username and password into the login form, and submit it.

In [None]:
import os

# LinkedIn credentials used by the login cells below.
# They are read from the LINKEDIN_USER / LINKEDIN_PASSWORD environment
# variables so that real credentials never have to be saved in the notebook;
# the original placeholder values remain as the fallback.
user = os.environ.get("LINKEDIN_USER", "email")
userpass = os.environ.get("LINKEDIN_PASSWORD", "password")

In [None]:
# Open the LinkedIn landing page; the sign-in fields are looked up on it below.
wd.get("https://www.linkedin.com")
# Type the account name into the input whose id is "session_key".
username=wd.find_element_by_id("session_key")
username.send_keys(user)
# Type the password into the input whose id is "session_password".
password=wd.find_element_by_id("session_password")
password.send_keys(userpass)

In [None]:
# Locate the sign-in form's submit button by its CSS class and click it
# to send the credentials typed in the previous cell.
login_button=wd.find_element_by_class_name("sign-in-form__submit-button")
login_button.click()

Change the `position` and `loc` variables if you want to search for a different job title or a different location.

In [None]:
from urllib.parse import quote

# Search parameters: edit these to look for another role or place.
position = 'data analyst'
loc = 'canada'
# NOTE(review): this geoId was hard-coded in the URL and presumably identifies
# Canada on LinkedIn -- confirm, and update it when `loc` is changed.
geo_id = '101174742'

# Percent-encode the free-text values so that spaces and reserved characters
# such as '&', '+' or '#' cannot corrupt the query string
# ('data analyst' still becomes 'data%20analyst').
position = quote(position, safe='')
wd.get(f"https://www.linkedin.com/jobs/search/?currentJobId=3087504630&geoId={geo_id}&keywords={position}&location={quote(loc, safe='')}&refresh=true")

The next cell retrieves the total number of jobs.

In [None]:
# Read the result-count label of the search page; the parsing below expects
# the count first, optionally with thousands separators (presumably text
# such as "1,234 results").
count_text = wd.find_element_by_css_selector('small.display-flex.t-12.t-black--light.t-normal').text

# Take the first run of digits/commas instead of splitting on a single space,
# so suffixes such as "+" or unusual whitespace do not break int().
count_match = re.search(r'\d[\d,]*', count_text)
if count_match is None:
    raise ValueError(f"Could not find a job count in {count_text!r}")
no_of_jobs = int(count_match.group().replace(',', ''))
print(no_of_jobs)
time.sleep(2)

This cell is the bread and butter of the script: it pages through the search results and retrieves the data from each listing.

In [None]:
def _scrape_job_details(driver):
    """Return the record for the listing currently open in the details pane.

    Args:
        driver: the Selenium driver whose page shows the selected job.

    Returns:
        A tuple ``(title, company, location, placetype, postdate, des)``
        where ``placetype`` is ``''`` when the listing has no workplace-type
        element and ``des`` is the description text with the markup removed.

    Raises:
        NoSuchElementException: if a mandatory element (description, top
            card, title, company, location, posted date) is not found.
    """
    # get the job description from the right panel
    job_desc = driver.find_element_by_class_name('jobs-description-content__text.t-14.t-normal')
    soup = BeautifulSoup(job_desc.get_attribute('outerHTML'), 'html.parser')
    des = soup.text.strip()

    # get the top card to extract elements like title, company name, etc.
    cards = driver.find_element_by_class_name('jobs-unified-top-card.t-14')
    title = cards.find_element_by_class_name('t-24.t-bold').text.strip()  # job title
    try:
        company = cards.find_element_by_class_name('ember-view.t-black.t-normal').text.strip()  # company name
    except NoSuchElementException:
        # Fallback selector; take the text as well so the record holds a
        # string rather than a WebElement object.
        company = cards.find_element_by_class_name('jobs-unified-top-card__company-name').text.strip()
    location = cards.find_element_by_class_name('jobs-unified-top-card__bullet').text.strip()  # location of the job
    try:
        placetype = cards.find_element_by_class_name('jobs-unified-top-card__workplace-type').text.strip()  # workplace type
    except NoSuchElementException:
        placetype = ''
    postdate = cards.find_element_by_class_name('jobs-unified-top-card__posted-date').text  # how long ago the job was posted

    return (title, company, location, placetype, postdate, des)


desc_list = []

for i in range(1, 30):
    # click to the next page and collect the listings shown on it
    try:
        wd.find_element_by_xpath(f'//button[@aria-label="Page {i}"]').click()
    except NoSuchElementException:
        # No button for this page: stop cleanly and keep what was collected
        # instead of aborting the whole run.
        print(f"No button for page {i}; stopping with {len(desc_list)} listings collected.")
        break
    jobs_lists = wd.find_element_by_class_name("jobs-search-results-list")
    jobs = jobs_lists.find_elements_by_class_name('jobs-search-results__list-item')
    # The XPath below addresses listings by 1-based position, hence the
    # explicit index range instead of iterating over `jobs` directly.
    for j in range(1, len(jobs)+1):
        # click on each job listing so its details load in the right panel
        wd.find_element_by_xpath(f"/html/body/div[5]/div[3]/div[4]/div/div/main/div/section[1]/div/ul/li[{j}]/div/div[1]/div[1]/div[2]/div[1]/a").click()
        time.sleep(1)
        # add the extracted elements to the list
        desc_list.append(_scrape_job_details(wd))

The next cell creates the data frame, removes any unnecessary words, and saves the result to `linkedin_jobs.csv`.

In [None]:
# One column per field of the records collected in `desc_list`. Naming the
# columns at construction means an empty scrape still yields a CSV with a
# header row.
column_names = ['job_title', 'company_name', 'location', 'workplace_type', 'job_posted', 'job_desc']
df = pd.DataFrame(desc_list, columns=column_names)

# deleting useless words
# Each '^.*?Word' pattern removes everything from the start of the text up to
# and including the first occurrence of that word. They are applied to the
# description only: run on every column they would also truncate values such
# as a job title containing "Requirements".
desc_patterns = ['\n',
                 '^.*?Expect',
                 '^.*?Qualifications',
                 '^.*?Required',
                 '^.*?expected',
                 '^.*?Responsibilities',
                 '^.*?Requirements',
 ]
df['job_desc'] = df['job_desc'].replace(desc_patterns, '', regex=True)

# The remaining columns only have line breaks removed.
other_columns = [name for name in column_names if name != 'job_desc']
df[other_columns] = df[other_columns].replace('\n', '', regex=True)

df.to_csv('linkedin_jobs.csv')
