# LinkedIn Market Analysis


### Process:
1. Scrape data from Linkedin and Glassdoor, using Selenium.
2. EDA, cleaning, and export to CSV.
3. Compare to previous years' data (2 and 4 years ago) using tableau.

### Additional Datasets:
- 2018: 
- 2020: 
- 2022: [my submission to Kaggle]

### Reference Notebooks:
- https://www.kaggle.com/code/gawainlai/us-data-science-job-salary-regression-w-visuals (beyond my skill level)
- https://www.kaggle.com/code/discdiver/the-most-in-demand-skills-for-data-scientists (top skills)

## Import Libraries

In [1]:
# selenium imports
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.action_chains import ActionChains

# web scraping imports
from bs4 import BeautifulSoup
import requests

# database imports
import re as re
import time
import pandas as pd
import os
from pymongo import MongoClient
import numpy as np

# import and load file to login to LinkedIn
from dotenv import load_dotenv
load_dotenv()

True

## Scrape LinkedIn Using the Selenium Driver

In [2]:
# initialize the LinkedIn scrape

# Options
options = webdriver.ChromeOptions() # init for chrome
options.add_argument('--incognito') # runs chrome in a 'clean slate' window
#options.add_argument('--headless') # runs chromedriver in the background, without opening a window

# Initialize the selenium driver
driver = webdriver.Chrome(options = options, executable_path='./chromedriver')
login_url = "https://www.linkedin.com/uas/login"

# Start the page
driver.get(login_url)
time.sleep(3)

# Target the login elements
email = driver.find_element("id", "username")
password = driver.find_element("id", "password")

# Load env variables
my_email = os.getenv("linkedin_username")
my_password = os.getenv("linkedin_password")

# Input in the form
email.send_keys(my_email)
password.send_keys(my_password)
password.send_keys(Keys.RETURN)

  driver = webdriver.Chrome(options = options, executable_path='./chromedriver')


In [3]:
def scrape_links(data_role):
    """ Scrape 40 pages of a LinkedIn job search for job links, using the given data role as the search term """
    
    # SCRAPE 40 PAGES
    for i in range(2): # FIX: set 2 to 40 for final product
        print(f'running search for {i+1} of 40 pages for {data_role}.')
        
        # navigate to the correct page
        scrape_url = f"https://www.linkedin.com/jobs/search/?&keywords={data_role}&refresh=true&start={i*25}"
        if i == 0:
            scrape_url = f"https://www.linkedin.com/jobs/search/?&keywords={data_role}&refresh=true&start={1}"
        driver.get(scrape_url)
        time.sleep(5)

        # SCROLL DOWN THE JOB LIST
        #driver.execute_script("window.scrollTo(0, document.body.div.jobs-search-results-list.scrollHeight);")
        #left_rail_bottom = driver.find_element(By.CLASS_NAME, "global-footer-compact__content.t-12.t-normal.text-align-center.clear-both.compactfooter-copyright")
        #left_rail_bottom = driver.find_element(By.CSS_SELECTOR, "li.global-footer-compact__item")
        left_rail_bottom = driver.find_element(By.ID, "compactfooter-about")
        driver.execute_script("arguments[0].scrollIntoView();", left_rail_bottom)
        
        # scroll down for each job element
        #driver.execute_script("arguments[0].scrollIntoView();", job)
        time.sleep(1)
        
        # convert page text to beautiful soup
        src = driver.page_source
        soup_for_page = BeautifulSoup(src, 'lxml')
        
        # create a list of jobs on the current page, to iterate through after each scrape
        job_links = []
        jobs_on_page = soup_for_page.find_all("a", attrs={"class":"disabled ember-view job-card-container__link job-card-list__title"})
        for k in jobs_on_page: # length of jobs varies by page
            job_links.append(k["href"])
        print(f'total job links from page {i+1}:', len(job_links)) # DEBUG
        #print(f'job links from page {i+1}:',job_links) # DEBUG
        
    return job_links

In [4]:
def see_more():
    """ Load an entire LinkedIn job post page """
    
    # Click on the 'see more' button on a LinkedIn job post (there are several window types)
    try:
        driver.find_element_by_class_name("artdeco-card__actions").click()
        print('find_element_by_class_name("artdeco-card__actions") SUCCESS')
    except:
        try:
            driver.find_element(By.Class, "show-more-less-html__button show-more-less-html__button--more").click()
            print('driver.find_element(By.Class, "show-more-less-html__button ... SUCCESS')
        except:
            print('Find another solution to SEE MORE on this page')
                            
    # Scroll down the entire page for easier data collection
    driver.execute_script("window.scrollBy(0,document.body.scrollHeight)")
    
    time.sleep(1)

In [5]:
def scrape_listing(links):
    """ Returns all scraped data for each job listing from the links passed into the function """

    # VARIABLE ASSIGNMENT create lists to store all scraped data (10 criteria)
    titles, companies, locations, remote, post_dates, num_applicants, full_time, size, desc, salaries = [], \
        [], [], [], [], [], [], [], [], []
    
    # SCRAPE ALL LINKS scrape all jobs on the current page using passed in links
    #for key, value in job_links.items():
    for link in links:
    
        # GO TO PAGE Navigate to page
        #print('\n\n\nkey:', key, '\nvalue:', value[0][0]) # DEBUG TEXT
        driver.get(f'https://linkedin.com{link}')
        time.sleep(5)
        
        # SEE FULL PAGE click 'see more' and scroll down
        see_more()
        
        # BEAUTFUL SOUP EXTRACTION convert page text to beautiful soup
        src = driver.page_source
        soup = BeautifulSoup(src, 'lxml')
        
        # DATA COLLECTION return data of results on current selected sub-page to new lists
        titles.append(soup.select('h1.t-24.t-bold.jobs-unified-top-card__job-title')[0].get_text().replace('\n','').strip())
        companies.append(soup.select('span.jobs-unified-top-card__company-name')[0].get_text().replace('\n','').strip())
        locations.append(soup.select('span.jobs-unified-top-card__bullet')[0].get_text().replace('\n','').strip())
        remote.append(soup.select('span.jobs-unified-top-card__workplace-type')[0].get_text().replace('\n','').strip())
        
        # POST DATE
        try:
            post_dates.append(soup.select('span.jobs-unified-top-card__posted-date')[0].get_text().replace('\n','').strip())
        except:
            print("could not find 'posted date'")
            post_dates.append(None)
        
        # NUMBER of APPLICANTS
        try:
            num_applicants.append(soup.select('span.jobs-unified-top-card__applicant-count')[2].get_text().replace('\n','').strip())
        except:
            print("could not find 'number of applicants' in applicant count")
            try:
                num_applicants.append(soup.select('span.jobs-unified-top-card__bullet')[1].get_text().replace('\n','').strip())
            except:
                print("could not find 'number of applicants' in bullet")
                num_applicants.append(None)
        
        # FULL TIME
        try:
            full_time.append(soup.select('li.jobs-unified-top-card__job-insight.span').get_text()
                .replace('\n','').strip().rsplit(' ', 1)[-1])
        except:
            print('could not find "full time" in job insights')
            try:
                full_time.append(soup.select('li.jobs-unified-top-card__job-insight')[0].get_text()
                    .replace('\n','').strip().rsplit(' ', 1)[-1])
            except:
                print("could not find 'full time'")
                full_time.append(None)
        
        # COMPANY SIZE
        try:
            size.append(soup.select('li.jobs-unified-top-card__job-insight')[1].get_text().replace('\n','').strip())
        except:
            print("could not find 'company size'")
            size.append(None)
        
        # DESCRIPTION
        desc.append(soup.select('div.jobs-box__html-content.jobs-description-content__text.t-14.t-normal.jobs-description-content__text--stretch')[0].get_text().strip())

        # SALARY
        try:
            salaries.append(soup.select('a.app-aware-link')[6].get_text().replace('\n','').strip())
            if '$' not in salaries[-1]:
                salaries.pop()
                salaries.append(None)
        except:
            print("could not find 'salary' with #SALARY tag")
            try:
                salaries.append(soup.select('p.t-16')[0].get_text().replace('\n','').strip())
            except:
                print("could not find 'salary' on p.t-16")
                try:
                    salaries.append(re.find('($.)', desc).replace('\n','').strip())
                except:
                    print("could not find 'salary' in description")
                    try: 
                        salaries.append(soup.select('li.jobs-unified-top-card__job-insight')[0].get_text()
                            .replace('\n','').strip().rsplit(' ', 1)[0].rstrip(' ·'))
                    except:
                        print("could not find 'salary' in full-time")
                        salaries.append(None)

    # DICTIONARY ASSIGNMENT pass data from lists into a dictionary
    dict_from_scrape = {'title':titles, 'company':companies, 'location':locations, 'post_date':post_dates, 
                        'num_applicants':num_applicants, 'full_time':full_time, 'size':size, 
                        'description':desc, 'salary':salaries}

    # DATAFRAME ASSIGNMENT
    df_from_scrape = pd.DataFrame(dict_from_scrape)

    return df_from_scrape

In [6]:
def clean_list(list_):
    list_flattened = [a for y in list_ for a in y]
    return list(set(list_flattened))

In [7]:
data_roles = ['data analyst', 'data scientist', 'data architect'] # Titles taken from previous market analysis, used to capture all links to scrape with matching search term. # FIX: add all roles
job_links = [] # init list to capture all job links
#df = pd.DataFrame() # init dataframe to capture all scraped jobs # FIX: might not need

# for each role title given above, return a list of up to 1,000 jobs from LinkedIn
for title in data_roles:
    job_links.append(scrape_links(title.replace(' ','%20')))
#print(job_links) # DEBUG

job_links_cleaned = clean_list(job_links) # DUPLICATES remove dupliate links before scraping

# Scrape each new link
#df = pd.concat([df, scrape_listing(job_links_cleaned)], axis=0).reset_index(drop=True) # FIX: might not need
df = pd.DataFrame(scrape_listing(job_links_cleaned))#.reset_index(drop=True)
driver.close()

df

running search for 1 of 40 pages for data%20analyst.


NoSuchElementException: Message: no such element: Unable to locate element: {"method":"css selector","selector":"[id="compactfooter-about"]"}
  (Session info: chrome=104.0.5112.101)
Stacktrace:
0   chromedriver                        0x000000010631f149 chromedriver + 4469065
1   chromedriver                        0x00000001062a9233 chromedriver + 3985971
2   chromedriver                        0x0000000105f3ffe8 chromedriver + 409576
3   chromedriver                        0x0000000105f761d8 chromedriver + 631256
4   chromedriver                        0x0000000105f76451 chromedriver + 631889
5   chromedriver                        0x0000000105fa88f4 chromedriver + 837876
6   chromedriver                        0x0000000105f93c8d chromedriver + 752781
7   chromedriver                        0x0000000105fa6611 chromedriver + 828945
8   chromedriver                        0x0000000105f93b53 chromedriver + 752467
9   chromedriver                        0x0000000105f69905 chromedriver + 579845
10  chromedriver                        0x0000000105f6a955 chromedriver + 584021
11  chromedriver                        0x00000001062f06ad chromedriver + 4277933
12  chromedriver                        0x00000001062f4b3a chromedriver + 4295482
13  chromedriver                        0x00000001062f9cdf chromedriver + 4316383
14  chromedriver                        0x00000001062f5857 chromedriver + 4298839
15  chromedriver                        0x00000001062ce64f chromedriver + 4138575
16  chromedriver                        0x00000001063101f8 chromedriver + 4407800
17  chromedriver                        0x000000010631037f chromedriver + 4408191
18  chromedriver                        0x0000000106326cb5 chromedriver + 4500661
19  libsystem_pthread.dylib             0x00007ff8167944e1 _pthread_start + 125
20  libsystem_pthread.dylib             0x00007ff81678ff6b thread_start + 15


In [None]:
df.description[0]

## Scrape Glassdoor using the Selenium Driver

In [None]:
# search glassdoor for all job titles
# save to glass_df: job title, salary range (or salary avg? both?)
# add glassdoor_salary column to df
# fill glassdoor_salary based on role name

## EDA and Cleaning

In [None]:
"""
# replace null values
df.isnull().sum()
df.dropna(inplace=True)

# given previous data sets, check salary info
df.salary.unique()

# remove duplicates
clean.rm_dups(df)

df
"""

## Export Data to CSV for Visualization

In [None]:
# export df to csv

## Challenges & Lessons

### Text extraction & different libraries: 
I couldn't extract text from my scraped LinkedIn data because I was try to pass data from one library's format (Selenium) into another library (Beautiful Soup). I restarted my kernel, rewrote my code (a lot), and one solution I found online used a function similar to others I had found. This seems to transfer text into a different format so that it's readable by other libraries.