# We are going to write a script that scrapes the latest data analytics job postings from LinkedIn for a location of our choice:

####  Install the required libraries if you haven't done so before:

In [None]:
!pip install --upgrade pip
!pip install selenium
!pip install webdriver_manager==4.0.2
!pip install openpyxl

####  Importing the necessary libraries

In [23]:
# this is the library that we will use to create break times in order to mimic human behaviour
import time
from getpass import getpass


# Juicy stuff- these are the Classes we will use for interaction with a webpage:
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, WebDriverException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from webdriver_manager.chrome import ChromeDriverManager


# library for interacting with the operating system
import os

# you know pandas it's your best buddy
import pandas as pd

# library for directory location:

import pathlib
import os
from os.path import join

#Ignore warning -- Some methods are going to be deprecated and I didn't change all (mainly in the function scrapper)
import warnings
warnings.filterwarnings('ignore')

#### WebDrivers allow you to use a programming language to design your test scripts

In [26]:
# Start the browser session that every following cell drives through `driver`.

## Older approach, kept for reference: point Chrome at a driver binary you downloaded yourself.
## driver_path should be the location of the folder where the driver you downloaded is ##

#driver_path = '/Users/alex/Documents/Tools/Drivers - Selenium/chromedriver'

## first we need to initiate the driver - probably the most important part of the code ##
#driver = webdriver.Chrome(executable_path = driver_path)

# Newer approach: the driver is obtained through ChromeDriverManager and handed to
# Chrome via a Service object, so no local driver path is needed.

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))


# Mac users who run into an issue with the webdriver_manager drivers.json file:

# First make a backup:
# cp ~/.wdm/drivers.json ~/.wdm/drivers_backup.json

# Confirm the existence of drivers.json:
# ls -l ~/.wdm/drivers.json

# Open the file with the command below and manually remove the last JSON key:
# #sudo nano ~/.wdm/drivers.json


In [27]:
# open the LinkedIn login page in the browser controlled by `driver`
driver.get('https://www.linkedin.com/login/pt?fromSignIn=true&trk=guest_homepage-basic_nav-header-signin')

#### First, let's log in to LinkedIn

In [28]:
# Log in to your LinkedIn account: fill in the email and password fields, then click the login area.

# ask for the email (typed in the notebook, echoed on screen)
email = input('Enter your email : ')

# find the box where the email goes (element with id "username")
# (the old driver.find_element_by_id("username") call is deprecated)
email_box = driver.find_element(By.ID, "username")

# clear the email box in case it already contains something
email_box.clear()

# with the method send_keys() you can send the information from the script to the webpage
email_box.send_keys(email)
# short pause to mimic human behaviour
time.sleep(2)

# ask for the password; getpass() is used instead of input() for the password prompt
password = getpass('Enter your password : ')

# find the box where the password goes (element with id 'password')
# (the old driver.find_element_by_id('password') call is deprecated)
pass_box = driver.find_element(By.ID, 'password')

# clear the password box in case it already contains something
pass_box.clear()

# send the password to the page
pass_box.send_keys(password)
time.sleep(1)


# click the element with class 'login__form_action_container'
# (presumably the container of the sign-in button - verify against the page)
login = driver.find_element(By.CLASS_NAME, 'login__form_action_container')
login.click()
time.sleep(2)



In [29]:
# Second attempt at clicking the login area, in case the first click did not go through.
# Only Selenium errors are handled, so KeyboardInterrupt and programming errors
# are no longer swallowed by a bare `except:`.
try:
    login = driver.find_element(By.CLASS_NAME, 'login__form_action_container')
    login.click()
    time.sleep(2.5)
except NoSuchElementException:
    # the login form is not on the page any more
    print("Login button already clicked.")
except WebDriverException as exc:
    # the element exists but could not be clicked; report it instead of hiding the reason
    print(f"Could not click the login button: {exc.msg}")

Login button already clicked.


In [30]:
# Close the messages panel in case it shows up on top of the page.
# The button's absolute XPath differs between page layouts, so each known
# candidate is tried in turn; add further candidates to the tuple if needed.
MESSAGE_BUTTON_XPATHS = (
    '/html/body/div[5]/div[4]/aside[1]/div[1]/header/div[3]/button',
    '/html/body/div[6]/div[4]/aside[1]/div[1]/header/div[3]/button',
)

for xpath in MESSAGE_BUTTON_XPATHS:
    try:
        remove_message = driver.find_element(By.XPATH, xpath)
        remove_message.click()
        break
    except WebDriverException:
        # button not found (or not clickable) at this location - try the next candidate
        continue
else:
    # no candidate could be clicked
    print("No messages inbox.")
        



In [None]:
# remember me on this web browser (if needed)
# not_now = driver.find_element_by_class_name('btn__secondary--large-muted')
# time.sleep(1)

# not_now.click()

#### Let's go to the job icon

In [31]:
# Open the Jobs section: locate the <span> whose title attribute is 'Jobs' and click it.

# locate the element with a CSS selector on the title attribute
job_icon = driver.find_element(By.CSS_SELECTOR, "span[title='Jobs']")

job_icon.click()
# give the page a moment before the next cell interacts with it
time.sleep(2)



#### What is the job position you want to search for?

In [33]:
# Type the job title into the search box.
# This is not the best approach: even with selenium some elements keep on changing
# (they update themselves automatically) because the page is written in ember.js.

# first element with class 'jobs-search-box__text-input'; raises IndexError if none is found
# (the old driver.find_elements_by_class_name(...) call is deprecated)
search_job = driver.find_elements(By.CLASS_NAME,'jobs-search-box__text-input')[0]

job = input('What job do you want to search for: ')
search_job.clear()
search_job.send_keys(job)
time.sleep(2)

# TAB moves the focus from the job box to the next element (the "cross" of the job box)
search_job.send_keys(Keys.TAB)
time.sleep(2)

# grab whatever element has the focus now and TAB once more to reach the location box
cross_box = driver.switch_to.active_element
time.sleep(1)
cross_box.send_keys(Keys.TAB)


#### What is the job location you want to search for?

In [34]:
# The Tab key presses in the previous cell moved the cursor from the job box to the location box,
# so the element that currently has the focus is the one we type into.
location_box = driver.switch_to.active_element

location = input('Where do you want to search for jobs: ')
location_box.send_keys(location)

time.sleep(1)

#### You can also mimic your keyboard and click different keys

In [35]:
# press Enter in the location box by passing Keys.ENTER to send_keys()

location_box.send_keys(Keys.ENTER)

In [36]:
# Maximize the browser window (optional)
driver.maximize_window()

## you can also make the window fullscreen instead

#driver.fullscreen_window()

In [37]:
# Go to the end of the page so that all the elements get loaded.
## to scroll a section you should find an <a> tag in that section, click/focus it and then scroll to the end

# click the element whose class starts with 'application-outlet'
page = driver.find_element(By.CSS_SELECTOR, "div[class^='application-outlet']")
page.click()
time.sleep(1)

# first link whose class starts with 'disabled ember-view jo'; sending END to it scrolls down
# (an earlier version used the shorter prefix 'disabled ember-view')
job_viewer = driver.find_element(By.CSS_SELECTOR, "a[class^='disabled ember-view jo']")
job_viewer.send_keys(Keys.END)

In [38]:
## build a function ##

# GET ALL JOB LINKS

# The string below is a plain note, not executed code. It mentions the old
# find_elements_by_css_selector spelling; the call actually used in this notebook
# is driver.find_elements(By.CSS_SELECTOR, ...), as in the last line of this cell.
''' !!!ADVANCED: talk a little about how to find with partial class name!!!

e.g.:
      driver.find_elements_by_css_selector(<tagname>"[<attribute>^='<partial text of the attribute>']")

exammple in the line bellow '''

# all <a> elements whose class attribute starts with 'disabled ember-view'
job_raw = driver.find_elements(By.CSS_SELECTOR,"a[class^='disabled ember-view']")


def scrapper(numb_j=None): ## add pages
    """Scrape job posts from the search results page currently open in `driver`.

    HOW IT WORKS: every link whose class starts with 'disabled ember-view' is
    treated as a job post. For each of the first `numb_j` links the function
    reads the 'href', clicks the post, and reads the job name, the company name
    and the job description from the details pane. The rows are collected in a
    list and converted to a DataFrame at the end.

    Parameters
    ----------
    numb_j : int, optional
        Number of job posts to scrape. When omitted, every job link found on
        the page at call time is scraped. Values larger than the number of
        links found are reduced to that number instead of raising IndexError.

    Returns
    -------
    pandas.DataFrame
        Columns "job_link", "position", "company name" and "job description",
        with duplicate rows dropped.

    Side effects
    ------------
    Changes the zoom of the page while scraping, prints one line per collected
    job, and writes the result to 'scraped_jobs.xlsx' (sheet 'Jobs') in the
    current working directory.

    Errors raised by Selenium while locating or clicking elements are not
    caught here and propagate to the caller.
    """

    # rows of [link, job name, company name, job description]
    job_list = []

    # all job links currently in the page
    job_raw = driver.find_elements(By.CSS_SELECTOR,"a[class^='disabled ember-view']")

    # Resolve the number of posts at call time (not at definition time) and never
    # ask for more posts than there are links.
    if numb_j is None:
        numb_j = len(job_raw)
    else:
        numb_j = min(numb_j, len(job_raw))

    # go to the end of the page for all the elements to be loaded
    page = driver.find_element(By.CSS_SELECTOR,"a[class^='disabled ember-view']")
    page.send_keys(Keys.END)
    # then back to the top: the combination of the two keys brings you to the top of the element
    page.send_keys(Keys.CONTROL + Keys.HOME)

    for j_idx in range(numb_j):
        # get the job link
        ref = job_raw[j_idx].get_attribute('href')
        time.sleep(2)

        # restore the page size because the selectors for the job name were inspected with the page at 100%
        driver.execute_script("document.body.style.zoom='100%'")

        ## let's click on the job post ##
        job_raw[j_idx].click()
        time.sleep(1.4)

        ## then we reduce the page size in order to be able to see the right part of the page
        # and find the element with the name of the job ##
        driver.execute_script("document.body.style.zoom='67%'")
        time.sleep(2.3)

        # get the job name with the .text attribute
        job_name = driver.find_element(By.CSS_SELECTOR,"h1[class^='t-24 t-bold inline']").text
        time.sleep(2.2)

        # get the company name with css selector + .text
        company_name = driver.find_element(By.CSS_SELECTOR,"div[class^='job-details-jobs-unified-top-card__company-name']").text

        # get the job description from the element with id "job-details"
        job_details = driver.find_element(By.ID,"job-details").text

        # restore the page size
        driver.execute_script("document.body.style.zoom='100%'")

        # pause between posts to mimic human behaviour
        time.sleep(3)

        job_list.append([ref, job_name, company_name, job_details])

        print(f"Collected job : {job_name} for company: {company_name}")

    # Create dataframe:
    Job_dataframe = pd.DataFrame(job_list,
                                 columns = ["job_link", "position", "company name", "job description"]
                                ).drop_duplicates()

    # Save dataframe in an excel file for later use
    Job_dataframe.to_excel(pathlib.Path().joinpath('scraped_jobs.xlsx'),
                           sheet_name='Jobs',
                           index= False)

    return Job_dataframe



In [39]:
# Ask how many job posts to retrieve and run the scraper.
## let's keep it a low number because it takes some time ##
# int() raises ValueError when the answer is not a whole number
jobsN = int(input('How many job posts you want to retrieve: '))

# as the last expression of the cell, the returned DataFrame is displayed below
scrapper(jobsN)


Collected job : Data Analyst / Decision Scientist, Help Experience for company: Vinted
Collected job : Data Analyst (w/m/d) for company: Billie
Collected job : Product Analyst (SME Product) for company: Vivid Money
Collected job : Senior Data Analyst (f/m/d) for company: Blacklane
Collected job : Data Analyst for company: GetYourGuide
Collected job : (Junior) Market Intelligence (m/f/d) for company: 1KOMMA5°
Collected job : Product Data Analyst for company: Deel
Collected job : Data Analyst for company: LANCH
Collected job : Customer Data Analyst (Customer Segmentation) (all genders) for company: ZEOS
Collected job : Data Scientist - Customer Data Platform (all genders) for company: Delivery Hero


Unnamed: 0,job_link,position,company name,job description
0,https://www.linkedin.com/jobs/view/4039064500/...,"Data Analyst / Decision Scientist, Help Experi...",Vinted,About the job\nBrief info about Vinted\n\nVint...
1,https://www.linkedin.com/jobs/view/4078618411/...,Data Analyst (w/m/d),Billie,"About the job\nWe are Billie, the leading prov..."
2,https://www.linkedin.com/jobs/view/4076613981/...,Product Analyst (SME Product),Vivid Money,About the job\nVivid is a dynamic fintech comp...
3,https://www.linkedin.com/jobs/view/4061546599/...,Senior Data Analyst (f/m/d),Blacklane,About the job\nAs a Senior Data Analyst at Bla...
4,https://www.linkedin.com/jobs/view/4070737917/...,Data Analyst,GetYourGuide,About the job\nAbout GetYourGuide\n\nGetYourGu...
5,https://www.linkedin.com/jobs/view/4081125268/...,(Junior) Market Intelligence (m/f/d),1KOMMA5°,"About the job\n1KOMMA5°\n\nAt 1KOMMA5°, we are..."
6,https://www.linkedin.com/jobs/view/4081039369/...,Product Data Analyst,Deel,About the job\nWho We Are Is What We Do.\n\nDe...
7,https://www.linkedin.com/jobs/view/3967151982/...,Data Analyst,LANCH,About the job\n5 Reasons to join LANCH\n\n Hug...
8,https://www.linkedin.com/jobs/view/4077089060/...,Customer Data Analyst (Customer Segmentation) ...,ZEOS,About the job\nTHE ROLE & THE TEAM\n\nYou work...
9,https://www.linkedin.com/jobs/view/4070110032/...,Data Scientist - Customer Data Platform (all g...,Delivery Hero,About the job\nAbout The Opportunity\n\nWe are...


In [None]:
# The "Extra" cells below still read from `driver`, so run this cell last.
# NOTE(review): per the Selenium docs close() closes the current window while quit()
# ends the whole session - confirm which one is wanted here.
driver.close() # closes the driver

### Extra

##### Get the html using selenium

In [40]:
# read the HTML of the page the browser is currently on through the page_source attribute

html = driver.page_source

In [41]:
# show only the beginning of the page source; the full document is very large
html[:1000]

'<html lang="en" class="theme theme--mercado app-loader--default artdeco osx"><head>\n    <script type="text/javascript" async="" charset="utf-8" id="utag_63" src="https://snap.licdn.com/li.lms-analytics/insight.min.js"></script><script type="text/javascript" async="" src="https://www.googletagmanager.com/gtag/js?id=AW-851212376&amp;l=dataLayer&amp;cx=c&amp;gtm=453e4bk0za200" nonce=""></script><script type="text/javascript" async="" src="https://www.googletagmanager.com/gtag/js?id=DC-10880300&amp;l=dataLayer&amp;cx=c&amp;gtm=453e4bk0za200" nonce=""></script><script type="text/javascript" async="" src="https://www.googletagmanager.com/gtag/js?id=DC-9261636&amp;l=dataLayer&amp;cx=c&amp;gtm=453e4bk0za200" nonce=""></script><script type="text/javascript" async="" src="https://www.googletagmanager.com/gtag/js?id=AW-867846157&amp;l=dataLayer&amp;cx=c&amp;gtm=453e4bk0za200" nonce=""></script><script type="text/javascript" async="" charset="utf-8" id="utag_234" src="https://www.googletagmanage

In [42]:
# You can now save this one and use it with Beautiful Soup

from bs4 import BeautifulSoup
import re

# Name the parser explicitly: without it Beautiful Soup picks whichever parser is
# installed (and warns about it), so results could differ between machines.
soup = BeautifulSoup(html, 'html.parser')

# We can mix regex with BeautifulSoup in order to find an attribute by its partial name.
## Here we find the names of all job positions based on part of the class attribute.
## This helps when the page source changes from time to time: the attributes can change
## but some part of their name can remain consistent throughout all the pages.
# (the pattern used to end in "title?", which made the final "e" optional by accident)
job_list_dirty = soup.find_all('a', attrs={'class': re.compile(r'job-card-list__title')})
job_list_clean = [job.text.strip() for job in job_list_dirty]

# the same for the company: <div> elements whose class starts with 'artdeco-entity-lockup__subtitle'
job_company_dirty = soup.find_all('div', attrs={'class': re.compile(r'^artdeco-entity-lockup__subtitle')})
job_company_clean = [company.text.strip() for company in job_company_dirty]

# make it into a dataset
# NOTE: zip() stops at the shorter list, so rows only line up when both lists have the same length
data = zip(job_list_clean, job_company_clean)
df = pd.DataFrame(data, columns=['Job', 'Company'])
df

Unnamed: 0,Job,Company
0,"Data Analyst / Decision Scientist, Help Experi...",Vinted
1,Data Analyst (w/m/d) \n \n \n\nData Anal...,Billie
2,Product Analyst (SME Product)Product Analyst (...,Vivid Money
3,Senior Data Analyst (f/m/d)Senior Data Analyst...,Blacklane
4,Data Analyst \n \n \n\nData Analyst with...,GetYourGuide
5,(Junior) Market Intelligence (m/f/d)(Junior) M...,1KOMMA5°
6,Product Data Analyst \n \n \n\nProduct D...,Deel
7,Data AnalystData Analyst,LANCH
8,Customer Data Analyst (Customer Segmentation) ...,ZEOS
9,Data Scientist - Customer Data Platform (all g...,Delivery Hero


In [None]:
#EXTRA
# Save cookies in a pickle file
import pickle

# create the target folder if it does not exist yet
# (os.removedirs(cookies_dir) --> to remove a directory)
cookies_dir = 'saved_cookies'
os.makedirs(cookies_dir, exist_ok=True)

save_location = cookies_dir + '/cookies.pkl'

# the with-block closes the file even if pickling fails
with open(save_location, "wb") as cookie_file:
    pickle.dump(driver.get_cookies(), cookie_file)

In [None]:
# Load cookies
# NOTE: pickle can run arbitrary code while loading - only load cookie files you created yourself.

# the with-block closes the file as soon as the cookies are read
with open(save_location, "rb") as cookie_file:
    cookies = pickle.load(cookie_file)

# add every saved cookie to the current browser session
for cookie in cookies:
    driver.add_cookie(cookie)

In [None]:
## If you want to scroll the job description

# second link whose text contains 'Retry Premium'; raises IndexError if fewer than two are found
x = driver.find_elements(By.PARTIAL_LINK_TEXT, 'Retry Premium')[1]
# sending END to that link scrolls to the end of its section
x.send_keys(Keys.END)

# OR locate the link through its absolute XPath:

# x = driver.find_element(By.XPATH, "/html/body/div[7]/div[3]/div[3]/div[2]/div/section[2]/div/div/div[1]/div/div[1]/div/div[2]/div[2]/ul/li[3]/span/a")
# x.send_keys(Keys.END)


In [None]:
# Goal of this section (not implemented yet):
# input the number of jobs you want to scrape
# and save some information about the jobs (title, url, html, ...)


# all <li> elements whose class starts with 'jobs-search-results__list-item occludable-update p'
job_cards = driver.find_elements(By.CSS_SELECTOR, "li[class^='jobs-search-results__list-item occludable-update p']")


# second link whose text contains 'Retry Premium'; END scrolls to the end of its section
job_descriptions = driver.find_elements(By.PARTIAL_LINK_TEXT, 'Retry Premium')[1]
job_descriptions.send_keys(Keys.END)


# for job_card in job_cards:
#     time.sleep(2)
#     job_card.click()


# number of job cards found on the page
len(job_cards)


In [None]:
#### TESTING ####

In [None]:
# open the first job card (needs `job_cards` from the previous cell)
job_cards[0].click()
# second link whose text contains 'Retry Premium for'; END scrolls to the end of its section
job_descriptions = driver.find_elements(By.PARTIAL_LINK_TEXT, 'Retry Premium for')[1]
job_descriptions.send_keys(Keys.END)

# text of the first <h2> whose class starts with 't-24 t-bold'
title = driver.find_element(By.CSS_SELECTOR, "h2[class^='t-24 t-bold']").text
# NOTE: this overwrites the `location` typed in for the search earlier in the notebook
location = driver.find_element(By.CSS_SELECTOR, "span[class^='jobs-unified-top-card__bullet']").text

In [None]:
# display the location text read in the previous cell
location