# LinkedIn Market Analysis


### Process:
1. Scrape data from Linkedin and Glassdoor, using Selenium.
2. EDA, cleaning, and export to CSV.
3. Compare to previous years' data (2 and 4 years ago) using tableau.

### Additional Datasets:
- 2018: https://www.kaggle.com/datasets/discdiver/data-scientist-general-skills-2018-revised (skills specific)
- 2020: https://www.kaggle.com/datasets/andrewmvd/data-analyst-jobs (jobs, salary, and location)
- 2022: [my submission to Kaggle]

### Reference Notebooks:
- https://www.kaggle.com/code/gawainlai/us-data-science-job-salary-regression-w-visuals (beyond my skill level)
- https://www.kaggle.com/code/discdiver/the-most-in-demand-skills-for-data-scientists (top skills)

## Import Libraries

In [1]:
# selenium imports
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.action_chains import ActionChains

# web scraping imports
from bs4 import BeautifulSoup
import requests

# database imports
import re as re
import time
import pandas as pd
import os
import numpy as np

# import and load file to login to LinkedIn
from dotenv import load_dotenv
load_dotenv()

True

## Scrape LinkedIn Using the Selenium Driver

In [22]:
# initialize the LinkedIn scrape

# Options
options = webdriver.ChromeOptions() # init for chrome
options.add_argument('--incognito') # runs chrome in a 'clean slate' window
#options.add_argument('--headless') # runs chromedriver in the background, without opening a window

# Initialize the selenium driver
driver = webdriver.Chrome(options = options, executable_path='./chromedriver')
login_url = "https://www.linkedin.com/uas/login"

# Start the page
driver.get(login_url)
time.sleep(3)

# Target the login elements
email = driver.find_element("id", "username")
password = driver.find_element("id", "password")

# Load env variables
my_email = os.getenv("linkedin_username")
my_password = os.getenv("linkedin_password")

# Input in the form
email.send_keys(my_email)
password.send_keys(my_password)
password.send_keys(Keys.RETURN)

  driver = webdriver.Chrome(options = options, executable_path='./chromedriver')


In [3]:
def scrape_links(data_role, location):
    """ Scrape 40 pages of a LinkedIn job search for job links, using the given data role as the search term """
    
    # SCRAPE 40 PAGES
    for i in range(1): # FIX: change back to 40 for final analysis
        print(f'Scraping {i+1} of 40 pages for {data_role} in {location}.')
        
        # navigate to the correct page
        scrape_url = f"https://www.linkedin.com/jobs/search/?&keywords={data_role}&location={location}&refresh=true&start={i*25}"
        # TEST: https://www.linkedin.com/jobs/search/?&keywords=data%20analyst&location=Los%20Angeles%2C%20California%2C%20United%20States&refresh=true&start=1
        if i == 0:
            scrape_url = f"https://www.linkedin.com/jobs/search/?&keywords={data_role}&location={location}&refresh=true&start={1}"
        driver.get(scrape_url)
        time.sleep(5)

        # convert page text to beautiful soup
        src = driver.page_source
        soup_for_page = BeautifulSoup(src, 'lxml')
        
        # create a list of jobs on the current page, to iterate through after each scrape
        job_links = []
        jobs_on_page = soup_for_page.find_all("a", attrs={"class":"disabled ember-view job-card-container__link job-card-list__title"})
        for k in jobs_on_page: # length of jobs varies by page
            job_links.append(k["href"])
        print(f'Job links collected from page {i+1}:', len(job_links)) # DEBUG
        #print(f'job links from page {i+1}:',job_links) # DEBUG
        
    return job_links

In [4]:
def see_more():
    """ Load an entire LinkedIn job post page """
    
    # Click on the 'see more' button on a LinkedIn job post (there are several window types)
    try:
        driver.find_element_by_class_name("artdeco-card__actions").click()
        print('find_element_by_class_name("artdeco-card__actions") SUCCESS')
    except:
        try:
            driver.find_element(By.Class, "show-more-less-html__button show-more-less-html__button--more").click()
            print('driver.find_element(By.Class, "show-more-less-html__button ... SUCCESS')
        except:
            print('Find another solution to SEE MORE on this page')
                            
    # Scroll down the entire page for easier data collection
    driver.execute_script("window.scrollBy(0,document.body.scrollHeight)")
    
    time.sleep(1)

In [14]:
def scrape_listing(links):
    """ Returns all scraped data for each job listing from the links passed into the function """

    # VARIABLE ASSIGNMENT create lists to store all scraped data (10 criteria)
    titles, companies, locations, remote, post_dates, num_applicants, contract, size, desc, salaries = [], \
        [], [], [], [], [], [], [], [], []
    
    # SCRAPE ALL LINKS scrape all jobs on the current page using passed in links
    for idx, link in enumerate(links):
        print(f'\nScraping job {idx} of {len(links)}.') # DEBUG TEXT
        
        # GO TO PAGE Navigate to page
        #print('\n\n\nkey:', key, '\nvalue:', value[0][0]) # DEBUG TEXT
        driver.get(f'https://linkedin.com{link}')
        time.sleep(10)
        
        # SEE FULL PAGE click 'see more' and scroll down
        #see_more() # FIX
        
        # BEAUTFUL SOUP EXTRACTION convert page text to beautiful soup
        src = driver.page_source
        soup = BeautifulSoup(src, 'lxml')
        
        # DATA COLLECTION return data of results on current selected sub-page to new lists
        
        # TITLE
        try:
            titles.append(soup.select('h1.t-24.t-bold.jobs-unified-top-card__job-title')[0].get_text().replace('\n','').strip())
        except:
            print('title could not be found')
            titles.append(None)
         
        # COMPANY
        try:
            companies.append(soup.select('span.jobs-unified-top-card__company-name')[0].get_text().replace('\n','').strip())
        except:
            print('company could not be found')
            companies.append(None)
            
        # LOCATION
        try:
            locations.append(soup.select('span.jobs-unified-top-card__bullet')[0].get_text().replace('\n','').strip())
        except:
            print('location could not be found')
            locations.append(None)
        
        # REMOTE POSITION
        try: # check in header
            remote.append(soup.select('span.jobs-unified-top-card__workplace-type')[0].get_text().replace('\n','').strip())
        except:
            try: # check the description for remote term.
                desc_temp = soup.select('div.jobs-box__html-content.jobs-description-content__text.t-14.t-normal.jobs-description-content__text--stretch')[0].get_text().strip()
                print('remote could not be found in header')
                if 'remote' in desc_temp:
                    remote.append('remote?')
                elif 'hybrid' in desc:
                    remote.append('hybrid?')
                else:
                    remote.append(None)
            except:
                try: # check in title
                    title_temp = soup.select('div.jobs-box__html-content.jobs-description-content__text.t-14.t-normal.jobs-description-content__text--stretch')[0].get_text().strip()
                    if 'remote' in title_temp:
                        remote.append('remote?')
                    else:
                        remote.append(None)
                    print('remote position could not be found in header or description')
                except:
                    print('remote position could not be found in header , description, or title')
                    remote.append(None)
            
        # POST DATE
        try:
            post_dates.append(soup.select('span.jobs-unified-top-card__posted-date')[0].get_text().replace('\n','').strip())
        except:
            print("could not find 'posted date'")
            post_dates.append(None)
        
        # NUMBER of APPLICANTS
        try:
            num_applicants.append(soup.select('span.jobs-unified-top-card__applicant-count')[2].get_text().replace('\n','').strip())
        except:
            try:
                num_applicants.append(soup.select('span.jobs-unified-top-card__bullet')[1].get_text().replace('\n','').strip())
                print("could not find 'number of applicants' in applicant count")
            except:
                print("could not find 'number of applicants' in applicant count or bullet")
                num_applicants.append(None)
        
        # FULL TIME
        try:
            contract.append(soup.select('li.jobs-unified-top-card__job-insight.span').get_text()
                .replace('\n','').strip().rsplit(' ', 1)[-1])
        except:
            try:
                contract.append(soup.select('li.jobs-unified-top-card__job-insight')[0].get_text()
                    .replace('\n','').strip().rsplit(' ', 1)[-1])
                print('could not find "contract type" in job insights.span')
            except:
                print("could not find 'contract type' in job insights.span or job insights")
                contract.append(None)
        
        # COMPANY SIZE
        try:
            size.append(soup.select('li.jobs-unified-top-card__job-insight')[1].get_text().replace('\n','').strip())
        except:
            print("could not find 'company size'")
            size.append(None)
        
        # DESCRIPTION
        try:
            desc.append(soup.select('div.jobs-box__html-content.jobs-description-content__text.t-14.t-normal.jobs-description-content__text--stretch')[0].get_text().strip())
        except:
            
            print("could not find description (probably shouldn't apply!)")
            
        # SALARY
        try:
            salaries.append(soup.select('a.app-aware-link')[6].get_text().replace('\n','').strip())
            if '$' not in salaries[-1]:
                salaries.pop()
                salaries.append(None)
        except:
            try:
                salaries.append(soup.select('p.t-16')[0].get_text().replace('\n','').strip())
                print("could not find 'salary' with #SALARY tag")
            except:
                try:
                    salaries.append(re.find('($.)', desc).replace('\n','').strip())
                    print("could not find 'salary' with #SALARY tag or in p.t-16")
                except:
                    try: 
                        salaries.append(soup.select('li.jobs-unified-top-card__job-insight')[0].get_text()
                            .replace('\n','').strip().rsplit(' ', 1)[0].rstrip(' ·'))
                        print("could not find 'salary' with #SALARY tag, in p.t-16, or in description")
                    except:
                        print("could not find 'salary' with #SALARY tag, in p.t-16, in description, or in full-time")
                        salaries.append(None)

    # DICTIONARY ASSIGNMENT pass data from lists into a dictionary
    dict_from_scrape = {'title':titles, 'company':companies, 'location':locations, 'remote':remote, 
                        'post_date':post_dates, 'num_applicants':num_applicants, 'contract_type':contract, 
                        'company_size':size, 'description':desc, 'salary':salaries}

    # DATAFRAME ASSIGNMENT
    df_from_scrape = pd.DataFrame(dict_from_scrape)
    
    os.system("say -v Monica ayam don escreipin")
    return df_from_scrape

In [6]:
def flatten_2d_list(list_):
    list_flattened = [a for y in list_ for a in y]
    return list(set(list_flattened))

In [7]:
# LISTS FOR SCRAPING

# 5 titles taken from market analysis, used to capture all links to scrape with matching search term.
data_roles = ['data analyst','data scientist','data engineer','data architect','data manager']
# removed from final scrape to reduce noise and risk of account ban: 'finance analyst','data warehouse analyst','data manager','data marketing analyst'

# 32 locations chosen from top tech cities across US (excluding search results yielding the same listings on LinkedIn)
locations = ['San Francisco, California, United States','Los Angeles, California, United States','San Jose, California, United States',
             'San Diego, California, United States','Portland, Oregon, United States','Seattle, Washington, United States',
             'Denver, Colorado, United States', 'Colorado Springs, Colorado, United States','Indianapolis, Indiana, United States',
             'New York, New York, United States','Secaucus, New Jersey', 'Boston, Massachusetts, United States', 
             'Baltimore, Maryland, United States','Chicago, Illinois, United States','Philadelphia, Pennsylvania, United States'
             'Phoenix, Arizona, United States','Salt Lake City, Utah, United States','Minneapolis, Minnesota, United States',
             'Detroit, Michigan, United States','Columbus, Ohio, United States','Kansas City, Missouri, United States',
             'Austin, Texas, United States','Dallas, Texas, United States','Houston, Texas, United States', 
             'Atlanta, Georgia, United States','Jackson, Mississippi, United States','Washington, District of Columbia, United States',
             'Charlotte, North Carolina, United States','Raleigh, North Carolina, United States',
             'Jacksonville, Florida, United States','Miami, Florida, United States','Tampa, Florida, United States']

"""
HOW MUCH DATA IS ENOUGH ?
35 locations * 9 titles * 2 pages = 630 pages. At the full 40 pages, the real total of my scrape will be 
12,600 pages. Assuming I don't get banned for scraping 10,000 pages, let alone 100 pages, I will still need 
to scrape the links that come from them. That's 630 pages * 7 links = 4,410 links for the sample and 
12,600 * 25 = 315,000 links for the real scrape. Of course, most jobs will be duplicates, but that doesn't 
change that I will have to wait a long time for data and I may get banned several times before the scrape is
complete. In reality, it may be safer to limit my searches to fewer titles, locations, pages and links.

The new scrape of 5 roles * 32 cities * 40 pages = 6,400 (or 320 for the 2-page sample) is more reasonable.

state_locations = ['Washington', 'California', 'Colorado', 'Texas', 'Illinois', 'Florida', 'Atlanta', 'New York']

global_locations = [Barcelona, Madrid, Berlin, Munich, Amsterdam, London, Dublin, Stockholm, Copenhagen, Oslo,
             Luxembourg, Eindhoven, Manchester, Belfast, Bristol, Paris, Budapest, Bucharest, Warsaw, Prague, 
             Lisbon, Rome, Zurich, vancouver, ontario, montreal, toronto, 
             Melbourne, Moscow, Seoule, Jakarta, Kyiv, tokyo, rejkjavik,
             argentina, mexico city, lima, rio, buenos aires, sao paolo, panama,] 
"""

job_links = [] # init list to capture all job links

# SCRAPE search LinkedIn for each role title and location given above and return a list of up to 1,000 jobs
for title in data_roles:
    for location in locations:
        print(f'Searching for {title} jobs in {location}...')
        job_links.append(scrape_links(title.replace(' ','%20'), location.replace(',','%2C').replace(' ','%20')))
#print(job_links) # DEBUG

# DUPLICATES remove dupliate links and flatten 2d array before scraping
job_links_cleaned = flatten_2d_list(job_links)

Searching for data analyst jobs in San Francisco, California, United States...
Scraping 1 of 40 pages for data%20analyst in San%20Francisco%2C%20California%2C%20United%20States.
Job links collected from page 1: 7
Searching for data analyst jobs in Los Angeles, California, United States...
Scraping 1 of 40 pages for data%20analyst in Los%20Angeles%2C%20California%2C%20United%20States.
Job links collected from page 1: 7
Searching for data analyst jobs in San Jose, California, United States...
Scraping 1 of 40 pages for data%20analyst in San%20Jose%2C%20California%2C%20United%20States.
Job links collected from page 1: 7
Searching for data analyst jobs in San Diego, California, United States...
Scraping 1 of 40 pages for data%20analyst in San%20Diego%2C%20California%2C%20United%20States.
Job links collected from page 1: 7
Searching for data analyst jobs in Portland, Oregon, United States...
Scraping 1 of 40 pages for data%20analyst in Portland%2C%20Oregon%2C%20United%20States.
Job links co

Job links collected from page 1: 7
Searching for data scientist jobs in Secaucus, New Jersey...
Scraping 1 of 40 pages for data%20scientist in Secaucus%2C%20New%20Jersey.
Job links collected from page 1: 7
Searching for data scientist jobs in Boston, Massachusetts, United States...
Scraping 1 of 40 pages for data%20scientist in Boston%2C%20Massachusetts%2C%20United%20States.
Job links collected from page 1: 7
Searching for data scientist jobs in Baltimore, Maryland, United States...
Scraping 1 of 40 pages for data%20scientist in Baltimore%2C%20Maryland%2C%20United%20States.
Job links collected from page 1: 7
Searching for data scientist jobs in Chicago, Illinois, United States...
Scraping 1 of 40 pages for data%20scientist in Chicago%2C%20Illinois%2C%20United%20States.
Job links collected from page 1: 7
Searching for data scientist jobs in Philadelphia, Pennsylvania, United StatesPhoenix, Arizona, United States...
Scraping 1 of 40 pages for data%20scientist in Philadelphia%2C%20Pennsyl

Job links collected from page 1: 7
Searching for data engineer jobs in Kansas City, Missouri, United States...
Scraping 1 of 40 pages for data%20engineer in Kansas%20City%2C%20Missouri%2C%20United%20States.
Job links collected from page 1: 7
Searching for data engineer jobs in Austin, Texas, United States...
Scraping 1 of 40 pages for data%20engineer in Austin%2C%20Texas%2C%20United%20States.
Job links collected from page 1: 7
Searching for data engineer jobs in Dallas, Texas, United States...
Scraping 1 of 40 pages for data%20engineer in Dallas%2C%20Texas%2C%20United%20States.
Job links collected from page 1: 7
Searching for data engineer jobs in Houston, Texas, United States...
Scraping 1 of 40 pages for data%20engineer in Houston%2C%20Texas%2C%20United%20States.
Job links collected from page 1: 7
Searching for data engineer jobs in Atlanta, Georgia, United States...
Scraping 1 of 40 pages for data%20engineer in Atlanta%2C%20Georgia%2C%20United%20States.
Job links collected from page

Job links collected from page 1: 7
Searching for data architect jobs in Jacksonville, Florida, United States...
Scraping 1 of 40 pages for data%20architect in Jacksonville%2C%20Florida%2C%20United%20States.
Job links collected from page 1: 7
Searching for data architect jobs in Miami, Florida, United States...
Scraping 1 of 40 pages for data%20architect in Miami%2C%20Florida%2C%20United%20States.
Job links collected from page 1: 7
Searching for data architect jobs in Tampa, Florida, United States...
Scraping 1 of 40 pages for data%20architect in Tampa%2C%20Florida%2C%20United%20States.
Job links collected from page 1: 7
Searching for data manager jobs in San Francisco, California, United States...
Scraping 1 of 40 pages for data%20manager in San%20Francisco%2C%20California%2C%20United%20States.
Job links collected from page 1: 7
Searching for data manager jobs in Los Angeles, California, United States...
Scraping 1 of 40 pages for data%20manager in Los%20Angeles%2C%20California%2C%20Un

In [61]:
import urllib.request

def scrape_listing_manually(links):
    """ Returns all scraped data for each job listing from the links passed into the function """

    # VARIABLE ASSIGNMENT create lists to store all scraped data (10 criteria)
    titles, companies, locations, remote, post_dates, num_applicants, contract, size, desc, salaries = [], \
        [], [], [], [], [], [], [], [], []
    
    # SCRAPE ALL LINKS scrape all jobs on the current page using passed in links
    for idx, link in enumerate(links[:2]):
        print(f'\nScraping job {idx+1} of {len(links)}.\n') # DEBUG TEXT
        print(f'https://linkedin.com{link}\n') # DEBUG TEXT
        #opener = urllib.request.FancyURLopener({})
        opener = urllib.request.urlopen(f'https://linkedin.com{link}')
        #with opener.open(f'https://linkedin.com{link}') as f: 
        #    f.read().decode('utf-8')
            #content = f.read()
        #with open('data_page.html', 'r') as f:
            #contents = f.read()
        soup = BeautifulSoup(opener, 'lxml')
        #print('empty soup', soup)

        # DATA COLLECTION return data of results on current selected sub-page to new lists
        # TITLE
        try:
            titles.append(soup.select('h1.t-24.t-bold.jobs-unified-top-card__job-title')[0].get_text().replace('\n','').strip())
        except:
            try:
                titles.append(soup.select('h1.top-card-layout__title.font-sans.text-lg.papabear:text-xl.font-bold.leading-open.text-color-text.mb-0.topcard__title').get_text().strip())
            except:
                print('title could not be found')
                titles.append(None)

        # COMPANY
        try:
            companies.append(soup.select('span.jobs-unified-top-card__company-name')[0].get_text().replace('\n','').strip())
        except:
            try:
                companies.append(soup.select('a.topcard__org-name-link.topcard__flavor--black-link').get_text().strip())
            except:    
                print('company could not be found')
                companies.append(None)

        # LOCATION
        try:
            locations.append(soup.select('span.jobs-unified-top-card__bullet')[0].get_text().replace('\n','').strip())
        except:
            try:
                locations.append(soup.select('span.topcard__flavor.topcard__flavor').get_text().strip())
            except: 
                print('location could not be found')
                locations.append(None)

        # REMOTE POSITION
        try: # check in header
            remote.append(soup.select('span.jobs-unified-top-card__workplace-type')[0].get_text().replace('\n','').strip())
        except:
            try: # check the description for remote term.
                desc_temp = soup.select('div.jobs-box__html-content.jobs-description-content__text.t-14.t-normal.jobs-description-content__text--stretch')[0].get_text().strip()
                print('remote could not be found in header')
                if 'remote' in desc_temp:
                    remote.append('remote?')
                elif 'hybrid' in desc:
                    remote.append('hybrid?')
                else:
                    remote.append(None)
            except:
                try: # check in title
                    title_temp = soup.select('div.jobs-box__html-content.jobs-description-content__text.t-14.t-normal.jobs-description-content__text--stretch')[0].get_text().strip()
                    if 'remote' in title_temp:
                        remote.append('remote?')
                    else:
                        remote.append(None)
                    print('remote position could not be found in header or description')
                except:
                    print('remote position could not be found in header , description, or title')
                    remote.append(None)

        # POST DATE
        try:
            post_dates.append(soup.select('span.jobs-unified-top-card__posted-date')[0].get_text().replace('\n','').strip())
        except:
            print("❌ could not find 'posted date'")
            post_dates.append(None)

        # NUMBER of APPLICANTS
        try:
            num_applicants.append(soup.select('span.jobs-unified-top-card__applicant-count')[2].get_text().replace('\n','').strip())
        except:
            try:
                num_applicants.append(soup.select('span.jobs-unified-top-card__bullet')[1].get_text().replace('\n','').strip())
                print("could not find 'number of applicants' in applicant count")
            except:
                try:
                    num_applicants.append(soup.select('num_applicants__caption').get_text().replace('\n','').strip())
                    print("could not find 'number of applicants' in applicant count or bullet")
                except:
                    print("❌ could not find 'number of applicants' in applicant count, bullet, or caption")
                    num_applicants.append(None)

        # FULL TIME
        try:
            contract.append(soup.select('li.jobs-unified-top-card__job-insight.span').get_text()
                .replace('\n','').strip().rsplit(' ', 1)[-1])
        except:
            try:
                contract.append(soup.select('li.jobs-unified-top-card__job-insight')[0].get_text()
                    .replace('\n','').strip().rsplit(' ', 1)[-1])
                print('could not find "contract type" in job insights.span')
            except:
                print("❌ could not find 'contract type' in job insights.span or job insights")
                contract.append(None)

        # COMPANY SIZE
        try:
            size.append(soup.select('li.jobs-unified-top-card__job-insight')[1].get_text().replace('\n','').strip())
        except:
            print("❌ could not find 'company size'")
            size.append(None)

        # DESCRIPTION
        try:
            desc.append(soup.select('div.jobs-box__html-content.jobs-description-content__text.t-14.t-normal.jobs-description-content__text--stretch')[0].get_text().strip())
        except:
            try:
                desc.append(soup.select('div.jobs-box__html-content.jobs-description-content__text.t-14.t-normal.jobs-description-content__text--stretch.span').get_text().strip())
            except:
                try:
                    desc.append(soup.select('div.jobs-box__html-content.jobs-description-content__text.t-14.t-normal.jobs-description-content__text--stretch').get_text().strip())
                except:
                    try:
                        desc.append(soup.select('div.job-details.jobs-box__html-content.jobs-description-content__text.t-14.t-normal.jobs-description-content__text--stretch')[0].get_text().strip())
                    except:
                        try:
                            print('text rich\n', soup.select('div.description__text.description__text--rich')[0].get_text().strip())
                            desc.append(soup.select('div.description__text.description__text--rich')[0].get_text().strip())
                        except:
                            try:
                                desc.append(soup.select('div.show-more-less-html__markup.show-more-less-html__markup--clamp-after-5')[0])
                                print('clamp after 5\n', soup.select('div.show-more-less-html__markup.show-more-less-html__markup--clamp-after-5')[0])
                            except:
                                desc.append(None)
                                print("⚠️ could not find description.")

        # SALARY
        try:
            salaries.append(soup.select('a.app-aware-link')[6].get_text().replace('\n','').strip())
            if '$' not in salaries[-1]:
                salaries.pop()
                salaries.append(None)
        except:
            try:
                salaries.append(soup.select('p.t-16')[0].get_text().replace('\n','').strip())
                print("could not find 'salary' with #SALARY tag")
            except:
                try:
                    salaries.append(re.find('($.)', desc).replace('\n','').strip())
                    print("could not find 'salary' with #SALARY tag or in p.t-16")
                except:
                    try: 
                        salaries.append(soup.select('li.jobs-unified-top-card__job-insight')[0].get_text()
                            .replace('\n','').strip().rsplit(' ', 1)[0].rstrip(' ·'))
                        print("could not find 'salary' with #SALARY tag, in p.t-16, or in description")
                    except:
                        print("❌ could not find 'salary' with #SALARY tag, in p.t-16, in description, or in full-time")
                        salaries.append(None)

    # DICTIONARY ASSIGNMENT pass data from lists into a dictionary
    dict_from_scrape = {'title':titles, 'company':companies, 'location':locations, 'remote':remote, 
                        'post_date':post_dates, 'num_applicants':num_applicants, 'contract_type':contract, 
                        'company_size':size, 'description':desc, 'salary':salaries}

    # DATAFRAME ASSIGNMENT
    df_from_scrape = pd.DataFrame(dict_from_scrape)

    os.system("say -v Monica ayam don escreipin")
    return df_from_scrape

df = scrape_listing_manually(job_links_cleaned)
df

SyntaxError: invalid syntax (1893671553.py, line 31)

In [13]:
# DEBUG beautiful soup test

with open('data_page.html', 'r') as f:

    contents = f.read()

    soup_local = BeautifulSoup(contents, 'lxml')

    #print(soup_local)

soup_local.select('div.jobs-box__html-content.jobs-description-content__text.t-14.t-normal.jobs-description-content__text--stretch')[0].get_text().strip()

"About the job\n          \n\n \n                Do you have a passion for artificial intelligence, machine learning, and data analysis? Do you yearn to have the impact of your work recognized and valued by more than just your development team? Do you constantly wonder what you could build if only you had access to world-class data sets and computing resources?\n\nIf yes, we have just the role for you.\n\nIn Deloitte's Audit and Assurance business, we make businesses and markets better. An audit is more than an obligation; it is an opportunity to see further and deeper into businesses. In our role as independent auditors, we enhance trust in the companies we audit, helping a multitrillion dollar capital markets system function with greater confidence. As we aspire to the very highest standards of audit quality, we deliver deeper insights that can help clients become more effective organizations.\n\nDeloitte's Audit and Assurance business embraces the promise of artificial intelligence 

In [19]:
def scrape_pages_for_df(links):
    """ SCRAPE each link in the newly cleaned list """
    #df = pd.DataFrame(scrape_listing(job_links_cleaned))# FIX: maybe add: .reset_index(drop=True)
    
    # DEBUG: use this method to get a partial dataframe when you cut the code off early
    df = pd.DataFrame(scrape_listing(job_links_cleaned[0:50]))
    for i in len(job_links_cleaned)//50:
        df = pd.concat([df, scrape_listing(job_links_cleaned[i*50:i*50+50])], axis=0).reset_index(drop=True)
    
    return df

# SCRAPE each page for dataframe info
df = scrape_pages_for_df(job_links_cleaned)
driver.close()

# DISPLAY our new dataframe (uncleaned)
df


Scraping job 0 of 1085.
remote could not be found in header
could not find 'number of applicants' in applicant count or bullet
could not find "contract type" in job insights.span
could not find description (probably shouldn't apply!)

Scraping job 1 of 1085.


KeyboardInterrupt: 

## Scrape Glassdoor using the Selenium Driver

In [None]:
# search glassdoor for all job titles
# save to glass_df: job title, salary range (or salary avg? both?)
# add glassdoor_salary column to df
# fill glassdoor_salary based on role name

## Export Data to CSV for Cleaning

In [None]:
df.to_csv('/output/linkedin_jobs_uncleaned.csv', index = False, encoding='utf-8')