# Job Market for Data Professionals in NL - Web Scraping

This project was completed as the final project of Module 2 of the Ironhack Data Analytics bootcamp.

The goal of the project is to analyze the current job market for data professionals in the Netherlands.

This notebook is the first part of the project and focuses on scraping the current job listings from indeed.nl.

In [3]:
# Import Libraries
import re

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By

In [4]:
# Indeed search URL: the query combines "data analist", "data scientist",
# "data engineer" and "business intelligence", restricted to Nederland and
# sorted by posting date. Adjacent literals are concatenated at compile time.
url = (
    'https://www.indeed.nl/jobs'
    '?q=data+analist%2C+data+scientist%2C+data+engineer%2C+business+intelligence'
    '&l=Nederland'
    '&sort=date'
)

In [5]:
# Start a Chrome browser session driven by the chromedriver binary found at
# this path (relative to the current working directory).
chromedriver_path = "./chromedriver/chromedriver.exe"
driver = webdriver.Chrome(chromedriver_path)

In [6]:
# Scrape the search result pages and collect one record per job card.
#
# Produces `jobs`, a DataFrame with the columns listed in JOB_COLUMNS.
# "Description" is not filled by this cell and therefore stays empty (NaN).

NOT_AVAILABLE = 'N/A'          # placeholder stored when a card lacks a field
JOB_COLUMNS = ["Title", "Location", "Company", "Salary", "Since", "Description", 'Company Url']
MAX_START_OFFSET = 1000        # exclusive upper bound for the `start` query parameter
START_OFFSET_STEP = 10         # increment of `start` between consecutive result pages
IMPLICIT_WAIT_SECONDS = 4      # how long selenium keeps polling for elements


def _tag_text(card, *find_args, **find_kwargs):
    """Return the text of the first tag matching the `find` arguments, or None if there is no such tag."""
    tag = card.find(*find_args, **find_kwargs)
    return None if tag is None else tag.text


def _company_path(card):
    """Return the href of the company link in the card, or 'N/A'.

    Starts from 'N/A' for every card, so a card without a `div.sjcl` block can
    no longer inherit the link of the previously processed job. As before, the
    last block wins and any block without a usable link yields 'N/A'.
    """
    path = NOT_AVAILABLE
    for block in card.find_all('div', class_='sjcl'):
        link = block.find('a', class_='turnstileLink')
        href = None if link is None else link.get('href')
        if href is None:
            return NOT_AVAILABLE
        path = href
    return path


def _parse_job_card(card_html):
    """Convert the inner HTML of one result card into a dict keyed by `jobs` column names."""
    card = BeautifulSoup(card_html, 'html.parser')

    title = _tag_text(card, "a", class_="jobtitle")
    company = _tag_text(card, class_="company")
    location = _tag_text(card, class_="location accessible-contrast-color-location")
    salary = _tag_text(card, class_="salary")
    posted = _tag_text(card, class_="date")

    return {
        # Title only has its newlines removed; company and salary are also stripped.
        'Title': NOT_AVAILABLE if title is None else title.replace('\n', ''),
        'Salary': NOT_AVAILABLE if salary is None else salary.replace("\n", "").strip(),
        'Location': NOT_AVAILABLE if location is None else location,
        'Company': NOT_AVAILABLE if company is None else company.replace('\n', '').strip(),
        'Since': NOT_AVAILABLE if posted is None else posted,
        'Company Url': _company_path(card),
    }


# The implicit wait is a session-wide setting, so it is configured once
# instead of on every page.
driver.implicitly_wait(IMPLICIT_WAIT_SECONDS)

# Collect plain dicts and build the frame once at the end: appending to a
# DataFrame row by row copies the whole frame on every iteration, and
# DataFrame.append no longer exists in pandas 2.x.
job_records = []

# Iterate through the result pages
for start_offset in range(0, MAX_START_OFFSET, START_OFFSET_STEP):
    driver.get(url + '&start=' + str(start_offset))

    for job in driver.find_elements(By.CLASS_NAME, 'result'):
        job_records.append(_parse_job_card(job.get_attribute('innerHTML')))

jobs = pd.DataFrame(job_records, columns=JOB_COLUMNS)

In [7]:
# Get some info from the companies
#
# For every row of `jobs`, look up the company page on indeed.nl and read the
# industry ("Branche") and the number of employees ("Medewerkers") from the
# "about" metadata items. Produces two lists, `industries` and `company_size`,
# with one entry per row of `jobs`; 'N/A' marks values that are unavailable.

company_url = jobs['Company Url']

base_url = 'https://www.indeed.nl'
REQUEST_TIMEOUT_SECONDS = 10   # avoid hanging forever on an unresponsive server


def _fetch_company_metadata(company_path):
    """Return `(industry, size)` for the company page at `base_url + company_path`.

    Both values default to 'N/A'. A failed request also yields 'N/A' for both,
    so a single network error does not abort the whole loop.
    """
    industry = 'N/A'
    size = 'N/A'

    try:
        html = requests.get(base_url + company_path, timeout=REQUEST_TIMEOUT_SECONDS).content
    except requests.RequestException:
        return industry, size

    page = BeautifulSoup(html, "lxml")
    for item in page.find_all('div', class_="cmp-AboutMetadata-itemInner"):
        # Test the label against the visible text (the same string that is
        # split below), so a label that only occurs in the markup cannot
        # cause an IndexError.
        item_text = item.text
        if 'Branche' in item_text:
            industry = item_text.split('Branche')[1]
        if 'Medewerkers' in item_text:
            size = item_text.split('Medewerkers')[1]

    return industry, size


# Many listings belong to the same company: fetch every company page once.
# Rows without a company link ("N/A") map straight to the placeholders.
metadata_by_path = {"N/A": ("N/A", "N/A")}

industries = []
company_size = []

# The loop variable is deliberately not called `url`, so the search URL
# defined earlier in the notebook is not overwritten.
for company_path in company_url:
    if company_path not in metadata_by_path:
        metadata_by_path[company_path] = _fetch_company_metadata(company_path)

    industry, size = metadata_by_path[company_path]
    industries.append(industry)
    company_size.append(size)

In [8]:
# Attach the scraped company details to the job listings as two new columns.
for column_name, column_values in (('Industry', industries), ('Company size', company_size)):
    jobs[column_name] = column_values

In [9]:
# Preview the first five scraped listings.
jobs.head(5)

Unnamed: 0,Title,Location,Company,Salary,Since,Description,Company Url,Industry,Company size
0,CUSTOMS ADVISOR DATA ANALIST BREDA,Breda,DHL,€2.900 - €3.500 per maand,Net geplaatst,,/cmp/DHL,Transport en vrachtvervoer,Meer dan 10.000
1,Specialist Business Intelligence (BI),Rotterdam,Milgro,,Net geplaatst,,/cmp/Milgro,,
2,Business Intelligence Analist,Nieuwegein,Fysioholland,,Net geplaatst,,/cmp/Fysioholland,,
3,Digital Analyst,Rijswijk,TUI Nederland N.V.,,Net geplaatst,,/cmp/Tui-Group,"Restaurants, reizen en vrije tijd",Meer dan 10.000
4,Data analist parttime,Amsterdam,Work-on,€11 - €13 per uur,Net geplaatst,,/cmp/Work--on,Human resources en personeel,11 tot 50


In [10]:
# Persist the scraped listings without writing the row index as a column.
output_csv = 'jobs_all_base.csv'
jobs.to_csv(output_csv, index=False)

In [11]:
# Number of scraped listings (rows in the frame).
jobs.shape[0]

1500