# LinkedIn Market Analysis


### Process:
1. Scrape data from Linkedin and Glassdoor, using Selenium.
2. EDA, cleaning, and export to CSV.
3. Compare to previous years' data (2 and 4 years ago) using tableau.

### Additional Datasets:
- 2018: 
- 2020: 
- 2022: [my submission to Kaggle]

### Reference Notebooks:
- https://www.kaggle.com/code/gawainlai/us-data-science-job-salary-regression-w-visuals (beyond my skill level)
- https://www.kaggle.com/code/discdiver/the-most-in-demand-skills-for-data-scientists (top skills)

## Import Libraries

In [1]:
# selenium imports
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium import webdriver
from selenium.webdriver.support.ui import Select

# web scraping imports
from bs4 import BeautifulSoup
import requests

# database imports
import re as re
import time
import pandas as pd
import os
from pymongo import MongoClient
import numpy as np

# import and load file to login to LinkedIn
from dotenv import load_dotenv
load_dotenv()

True

## Scrape LinkedIn Using the Selenium Driver

In [2]:
# initialize the LinkedIn scrape

# Options
options = webdriver.ChromeOptions() # init for chrome
options.add_argument('--incognito') # runs chrome in a 'clean slate' window
options.add_argument('--headless') # runs chromedriver in the background, without opening a window

# Initialize the selenium driver
driver = webdriver.Chrome(options = options, executable_path='./chromedriver')
login_url = "https://www.linkedin.com/uas/login"

# Start the page
driver.get(login_url)
time.sleep(3)

# Target the login elements
email = driver.find_element("id", "username")
password = driver.find_element("id", "password")

# Load env variables
my_email = os.getenv("linkedin_username")
my_password = os.getenv("linkedin_password")

# Input in the form
email.send_keys(my_email)
password.send_keys(my_password)
password.send_keys(Keys.RETURN)

  driver = webdriver.Chrome(options = options, executable_path='./chromedriver')


In [3]:
def scrape(data_role):
    """ Scrape 40 pages of LinkedIn job search, for the given data role """
    job_num = 1
    
    # create lists to store all scraped data (12 criteria)
    titles = []
    companies = []
    locations = []
    remote = []
    post_dates = []
    num_applicants = []
    full_time = []
    size = []
    easy_apply = []
    desc = []
    salaries = []
    
    while job_num < 25: # scrape jobs until page=40 (job_num=975)
        # navigate to the correct page
        scrape_url = f"https://www.linkedin.com/jobs/search/?&keywords={data_role}&refresh=true&start={job_num}"
        time.sleep(5)
        driver.get(scrape_url)

        # scrape 25 jobs on the current page
        for i in range(1): # FIX: 1 --> 25
            
            # convert page text to beautiful soup
            src = driver.page_source
            soup = BeautifulSoup(src, 'lxml')
            
            # Return data of results on current selected sub-page to new lists
            time.sleep(5)
            titles.append(soup.select('h2.t-24.t-bold.jobs-unified-top-card__job-title')[0].get_text().replace('\n','').strip())
            companies.append(soup.select('span.jobs-unified-top-card__company-name')[0].get_text().replace('\n','').strip())
            locations.append(soup.select('span.jobs-unified-top-card__bullet')[1].get_text().replace('\n','').strip())
            remote.append(soup.select('span.jobs-unified-top-card__workplace-type')[0].get_text().replace('\n','').strip())
            try:
                post_dates.append(soup.select('span.jobs-unified-top-card__posted-date')[0].get_text().replace('\n','').strip())
            except:
                print("could not find 'posted date'")
                post_dates.append(None)
            try:
                num_applicants.append(soup.select('span.jobs-unified-top-card__bullet')[2].get_text())
            except:
                print("could not find 'number of applicants'")
                num_applicants.append(None)
            try:
                full_time.append(soup.select('li.jobs-unified-top-card__job-insight')[0].get_text())
            except:
                print("could not find 'full time'")
                full_time.append(None)
            time.sleep(5)
            try:
                print(soup.select('li.jobs-unified-top-card__job-insight')[3].get_text())
                size.append(soup.select('li.jobs-unified-top-card__job-insight')[3].get_text())
            except:
                print("could not find 'company size'")
                size.append(None)
            try:
                print(soup.select('span.artdeco-button__text')[42].get_text())
                easy_apply.append(soup.select('span.artdeco-button__text')[42].get_text())
                #alt: div.jobs-apply-button--top-card
            except:
                print("could not find 'easy apply button'")
                easy_apply.append(None)
            desc.append(soup.select('div.jobs-box__html-content.jobs-description-content__text.t-14.t-normal.jobs-description-content__text--stretch'))
            try:
                salaries.append(soup.select('p.t-16')[0].get_text())
            except:
                print("could not find 'salary' on page")
                try: 
                    salaries.append(re.find('($.)', desc))
                except:
                    print("could not find 'salary' in description")
                    salaries.append(None)
            
            
            
            # use selenium to click on next job (in list of 25)
            
            #"div#ember606.flex-grow-1.artdeco-entity-lockup__content.ember-view"
            print(soup.select('a#ember608.disabled.ember-view.job-card-container__link.job-card-list__title'))
            
            #ul = soup.select('ul.scaffold-layout__list-container')[0].find_all('li')
            #print(ul)
            # seems like ul[0] is the current page, opened
            #print('ul index 1: ', ul[1])
            #for tag in ul:
            #    print('test', tag.text)
            
            tag = soup.select('a#ember608.disabled.ember-view.job-card-container__link.job-card-list__title')
            driver.find_element(tag).click()
            
            
            
        # increment to next page
        job_num +=25
    
    # pass data from lists into a dictionary
    dict_from_scrape = {'title':titles, 'company':companies, 'location':locations, 'post_date':post_dates, 
                        'num_applicants':num_applicants, 'full_time':full_time, 'size':size, 
                        'easy_apply':easy_apply, 'description':desc, 'salary':salaries}
    
    return dict_from_scrape

In [4]:
#scrape('data%20analyst')
#driver.close()

"""
data_roles = ['data analyst', 'data scientist', etc] # taken from previous data analyst analysis

for elem in data_roles:
    df.append(scrape(elem), axis=1)

df
"""
scrape('data%20analyst')

could not find 'posted date'
could not find 'number of applicants'
could not find 'full time'
could not find 'company size'
could not find 'easy apply button'
could not find 'salary' on page
could not find 'salary' in description
[]


{'title': ['Solutions Engineer'],
 'company': ['LeapXpert'],
 'location': ['New York, United States'],
 'post_date': [None],
 'num_applicants': [None],
 'full_time': [None],
 'size': [None],
 'easy_apply': [None],
 'description': [[<div class="jobs-box__html-content jobs-description-content__text t-14 t-normal jobs-description-content__text--stretch" id="job-details" tabindex="-1">
   <!-- -->
   <!-- -->
   <!-- --> <span>
   <!-- --><!-- --> </span>
   </div>]],
 'salary': [None]}

## Scrape Glassdoor using the Selenium Driver

In [5]:
# search glassdoor for all job titles
# save to glass_df: job title, salary range (or salary avg? both?)
# add glassdoor_salary column to df
# fill glassdoor_salary based on role name

## EDA and Cleaning

In [6]:
# replace null values
df.isnull().sum()
df.dropna(inplace=True)

# given previous data sets, check salary info
df.salary.unique()

# remove duplicates
clean.rm_dups(df)

df

NameError: name 'df' is not defined

## Export Data to CSV for Visualization

In [None]:
# export df to csv

## Challenges & Lessons

### Text extraction & different libraries: 
I couldn't extract text from my scraped LinkedIn data because I was try to pass data from one library's format (Selenium) into another library (Beautiful Soup). I restarted my kernel, rewrote my code (a lot), and one solution I found online used a function similar to others I had found. This seems to transfer text into a different format so that it's readable by other libraries.