## Reference guides:
https://realpython.com/python-web-scraping-practical-introduction/  
https://realpython.com/beautiful-soup-web-scraper-python/

## Real-world test:
https://ai-jobs.net/

In [82]:
import re
import time
import requests
import numpy as np

from bs4 import BeautifulSoup
from collections import Counter
from selenium import webdriver

from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
import pandas as pd

In [83]:
## landing page of the job board to scrape; this is the address the webdriver loads
url = 'https://ai-jobs.net/'

In [84]:
## launch headless Edge instance and load the landing page
options = webdriver.EdgeOptions()
options.add_argument('--headless')

driver = webdriver.Edge(options = options)
driver.get(url)

## grab html from page
page = driver.page_source.encode('utf-8')
soup = BeautifulSoup(page, 'html.parser')

## find total number of jobs found: the counter element is expected to hold
## text of the form '<number> jobs'. Thousands separators are tolerated so a
## count such as '3,924 jobs' is not silently read as 924.
job_count_ele = soup.find('em', id = 'job-search-count')
job_count_match = re.search(r'[0-9][0-9,]*(?= jobs)', str(job_count_ele))
if job_count_match is None:
    ## fail with a readable message instead of AttributeError on None
    raise ValueError("could not read the job total from element "
                     "'job-search-count': " + str(job_count_ele))
job_count = int(job_count_match.group().replace(',', ''))

In [85]:
## click 'load more' button to load all jobs
## NOTE(review): the click count assumes each click reveals 100 more listings
## -- confirm against the site's page size.
from selenium.common.exceptions import NoSuchElementException

for i in range((job_count // 100)):
    try:
        button = driver.find_element('id', 'load-more-jobs')
    except NoSuchElementException:
        ## the button is no longer in the DOM (presumably everything is
        ## already loaded), so stop instead of aborting the whole cell
        print('Load-more button not found after ' + str(i) + ' clicks; stopping')
        break
    ## click through JavaScript so the click works in headless mode even if
    ## the button is outside the viewport
    driver.execute_script('arguments[0].click()', button)
    ## give the page a moment to append the next batch of listings
    time.sleep(1)
    if i % 10 == 0:
        print('Progress: ' + str(i) + ' clicks')

Progress: 0 clicks
Progress: 10 clicks
Progress: 20 clicks
Progress: 30 clicks


In [86]:
## Re-read the page source now that the extra listings have been loaded and
## parse it again; every anchor carrying the listing classes is one job row.
page = driver.page_source.encode('utf-8')
soup = BeautifulSoup(markup = page, features = 'html.parser')
job_rows = soup.find_all(name = 'a', attrs = {'class': 'col pt-2 pb-3'})

In [89]:
## Parse every job row into one record (title, location, salary range, level,
## skills) and assemble the results into `df`.
##
## Records are collected in a list and the DataFrame is built once at the end;
## growing a DataFrame one cell at a time with df.loc is very slow.

## compile each pattern once; they are applied to the HTML of every row
title_re = re.compile(r'(<h3.*>)(.*)(</h3>)')
location_re = re.compile(r'(<span class="d-block.*>)(.*)(</span>)')
salary_re = re.compile(r'(<span class="badge rounded-pill text-bg-success d-none d-md-inline-block">)(.*)(</span>)')
level_re = re.compile(r'(<span class="badge rounded-pill text-bg-info my-md-1 d-none d-md-inline-block">)(.*)(</span>)')
skill_re = re.compile(r'(<span class="badge rounded-pill text-bg-light">)(.*)(</span>)')


def _first_field(pattern, html):
    """Return capture group 2 of the first match of `pattern` in `html`, or NaN if there is no match."""
    match = pattern.search(html)
    return match.group(2) if match is not None else np.nan


## initialize variables for loop
df_columns = ['job_title','location','salary_range','level','skills']
records = []
words = []
counter = 0
start = time.time()

## loop through rows, extracting the listing fields from each row's html
for counter, row in enumerate(job_rows, start = 1):
    row_html = str(row)

    records.append({
        'job_title': _first_field(title_re, row_html),
        'location': _first_field(location_re, row_html),
        ## salary and level badges are optional; NaN marks a missing badge
        'salary_range': _first_field(salary_re, row_html),
        'level': _first_field(level_re, row_html),
        ## findall yields (open tag, text, close tag) tuples; keep the text
        'skills': ', '.join(found[1] for found in skill_re.findall(row_html)),
    })

    ## check progress
    if counter % 250 == 0:
        ## read the clock once so minutes and seconds come from the same instant
        minutes, seconds = divmod(int(time.time() - start), 60)
        print('Progress: ' + str(counter) + ' of ' +
              str(len(job_rows)) + '. Time Elapsed: ' +
              '{:0>2}:{:0>2}'.format(minutes, seconds)
             )

## passing the column list keeps the column order and yields an empty,
## correctly-labelled frame when no rows were found
df = pd.DataFrame(records, columns = df_columns)

Progress: 250 of 3924. Time Elapsed: 00:01
Progress: 500 of 3924. Time Elapsed: 00:03
Progress: 750 of 3924. Time Elapsed: 00:04
Progress: 1000 of 3924. Time Elapsed: 00:06
Progress: 1250 of 3924. Time Elapsed: 00:08
Progress: 1500 of 3924. Time Elapsed: 00:09
Progress: 1750 of 3924. Time Elapsed: 00:11
Progress: 2000 of 3924. Time Elapsed: 00:13
Progress: 2250 of 3924. Time Elapsed: 00:14
Progress: 2500 of 3924. Time Elapsed: 00:16
Progress: 2750 of 3924. Time Elapsed: 00:18
Progress: 3000 of 3924. Time Elapsed: 00:20
Progress: 3250 of 3924. Time Elapsed: 00:22
Progress: 3500 of 3924. Time Elapsed: 00:24
Progress: 3750 of 3924. Time Elapsed: 00:26


In [90]:
## display the scraped listings as the cell output
df

Unnamed: 0,job_title,location,salary_range,level,skills
0,Data Engineer,"Remote, United States",USD 100K - 130K,Mid-level,"Airflow, Architecture, AWS, Computer Science, ..."
1,Lead AI/ML Engineer,"Remote, Europe",,Senior-level,"APIs, Architecture, Computer Science, Engineer..."
2,Founding AI Engineer,"San Francisco, CA",USD 120K - 180K,,"Agile, Asana, AWS, Computer Vision, DevOps, GC..."
3,"Staff Research Scientist, AI/ML","Redwood City, CA",USD 270K - 405K,Mid-level,"Architecture, Biology, Computer Science, Data ..."
4,"Senior Machine Learning Engineer, Science","Redwood City, California",USD 190K - 285K,Mid-level,"Architecture, Biology, Computer Science, Data ..."
...,...,...,...,...,...
3919,Data Engineer,"Lagos, Nigeria",USD 59K - 135K *,Entry-level,"Engineering, MS SQL, SQL, SSIS, Testing"
3920,Machine Learning Engineer - LLM Infrastructure,San Francisco,USD 212K - 255K,Senior-level,"APIs, Architecture, AWS, CUDA, Docker, Enginee..."
3921,[PARIS] Data Engineer (H/F),"Paris, France",USD 115K - 180K *,Senior-level,"Agile, AWS, Azure, Big Data, DevOps, Engineeri..."
3922,Data Engineer,"Monterrey, Nuevo Leon, Mexico",USD 73K - 120K *,Mid-level,"APIs, Architecture, Azure, Big Data, Computer ..."


In [91]:
## save the scraped listings to CSV. `index` is left at its default, so the
## row index is written as the first, unnamed column.
## NOTE(review): the destination is an absolute, machine-specific path -- adjust before running elsewhere.
df.to_csv('C:/Users/Shoshana/Documents/CUNY SPS/cuny-sps/DATA_608/Story4/jobs_scrape.csv')