In [1]:
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm_notebook as tqdm # this is a fancy progress bar!
from time import sleep
import numpy as np
from datetime import datetime
import pandas as pd

# Individual Data Challenge: Scraping Jobs.ch

(Weekend homework recap)

As a job seeker, you have to search through job portals to find the jobs most relevant to your profile.

In this challenge, your goal is to find all jobs on jobs.ch related to the keywords “Data Scientist”, “Data Analyst”, “Python Developer”, “Data Engineer”, “Data Manager”, “Data Architect”, “Big Data Analyst” and “Data Python”.

## Questions

Download all necessary information (including job text, job rank, company name, job keyword…) for all webpages.
Using the information obtained, perform a descriptive analysis of this data that answers the following questions:

1. How many jobs are shared between these categories?
2. How much do the keywords “Data Analyst” and “Big Data Analyst” overlap?
3. Are there companies that hire more than average?
4. How many jobs are there in the different cantons?
5. Does the keyword “machine learning” appear more often in data scientist or in data analyst jobs?
6. What is the distribution of most common keywords between and across categories?
7. Produce a report in the form of a clean notebook (or jupyter slides), with commented code and markdown cells for structuring and interpretations.

In [2]:
# Building blocks of the search URL. The address follows a fixed pattern,
# so it is assembled from parts instead of being hard-coded as one string.
link_first_part = 'https://www.jobs.ch'    # site root
link_mid_1_part = '/en/vacancies/?page='   # search path; the page number goes right after it
link_mid_2_part = '&term='                 # introduces the search term
link_mid_3_part = 'Data%20Scientist'       # URL-encoded keyword ("Data Scientist")

In [3]:
# Search URL with the page number left empty (nothing is inserted after "page=").
link = ''.join([link_first_part, link_mid_1_part, link_mid_2_part, link_mid_3_part])

In [4]:
# Display the assembled search URL to check it by eye.
link

'https://www.jobs.ch/en/vacancies/?page=&term=Data%20Scientist'

In [5]:
# Download the search results page (giving up after 15 seconds) and parse
# the returned HTML so that it can be queried.
response = requests.get(link, timeout=15)
soup = BeautifulSoup(response.content, 'html.parser')

In [6]:
# HTTP status code returned for the search request.
response.status_code

200

In [7]:
# Look up the first <div> carrying the vacancy-card class string to inspect its markup.
soup.find('div', attrs={'class': 'Box-sc-7ekkso-0 Position-b2pct5-0 Position__Relative-b2pct5-1 VacancySerpItem__ShadowBox-p4qu0m-0 hthPRS'})

<div class="Box-sc-7ekkso-0 Position-b2pct5-0 Position__Relative-b2pct5-1 VacancySerpItem__ShadowBox-p4qu0m-0 hthPRS"><a class="x--job-link t--job-link SearchVacancyResultsComponent__StyledVacancySerpItem-n25jij-0 dQDQbr" href="/en/vacancies/detail/9295777/?source=vacancy_search_promo"><div class="Box-sc-7ekkso-0 Paper-sc-10r6qoa-0 VacancySerpItem__StyledSerpItem-p4qu0m-2 exQanO" data-cy="vacancy-serp-item-active"><div class="Box-sc-7ekkso-0 Flex-zfb5z9-0 fKIKYn"><div class="Box-sc-7ekkso-0 Flex-zfb5z9-0 FlexItem-sc-15bj19c-0 hRmJUa"><div class="Div-v2w9ke-0 Flex-sc-4aokm-0 CompanyLogoWrapper-sc-1n5gkoi-0 VacancySerpItem___StyledCompanyLogoWrapper-p4qu0m-4 dklvqW" display="flex" height="48px" width="48px"><div class="Box-sc-7ekkso-0 Flex-zfb5z9-0 FlexItem-sc-15bj19c-0 kdXYoj" height="100%" width="100%"><img alt="Swiss Life Asset Managers" class="Box-sc-7ekkso-0 Image-ywupxk-0 jcoTeH VacancySerpItem__StyledCompanyLogo-p4qu0m-3 hxQAnS" src="https://img.jobs.ch/www/img/toplogos/6715.gif"/

In [8]:
job_links = []


def get_links(soup, job_links, base_url=None):
    """Append the absolute URL of every vacancy card in ``soup`` to ``job_links``.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed HTML of one search results page.
    job_links : list
        List that is extended in place with the URLs that were found.
    base_url : str, optional
        Prefix put in front of each relative ``href``. When omitted, the
        notebook-level ``link_first_part`` is used.

    Returns
    -------
    list
        The same ``job_links`` object, returned for convenience.
    """
    if base_url is None:
        base_url = link_first_part

    cards = soup.find_all('div', class_='Box-sc-7ekkso-0 Position-b2pct5-0 Position__Relative-b2pct5-1 VacancySerpItem__ShadowBox-p4qu0m-0 hthPRS')
    for card in cards:
        anchor = card.find('a', {'class': 'x--job-link t--job-link SearchVacancyResultsComponent__StyledVacancySerpItem-n25jij-0 dQDQbr'})
        # ``find`` returns None when a card has no matching anchor; skip such
        # cards instead of failing on ``None.get``.
        href = anchor.get('href') if anchor is not None else None
        if not href:
            continue
        job_links.append(base_url + href)
    return job_links


In [9]:
# The third whitespace-separated token of this element's text is used as the
# number of result pages (kept as a string; it is converted with int() later).
# NOTE(review): presumably the text reads like "1 / 23" - confirm against the live page.
pagination = soup.find('div', class_='Div-v2w9ke-0 Flex-sc-4aokm-0 eykbax')
max_pages = pagination.text.split()[2]

In [10]:
# Visit every results page in turn and collect the vacancy links it lists.
page_numbers = range(1, int(max_pages) + 1)
for page in tqdm(page_numbers):
    url = ''.join([link_first_part, link_mid_1_part, str(page), link_mid_2_part, link_mid_3_part])
    response = requests.get(url, timeout=15)
    soup = BeautifulSoup(response.content, 'html.parser')
    get_links(soup, job_links)
    sleep(0.6)  # short pause between consecutive requests

HBox(children=(IntProgress(value=0, max=23), HTML(value='')))




In [11]:
# Show the vacancy URLs collected so far.
job_links

['https://www.jobs.ch/en/vacancies/detail/9295777/?jobposition=1-1&source=vacancy_search_promo',
 'https://www.jobs.ch/en/vacancies/detail/9302383/?jobposition=1-2&source=vacancy_search_promo',
 'https://www.jobs.ch/en/vacancies/detail/9333628/?jobposition=1-3&source=vacancy_search',
 'https://www.jobs.ch/en/vacancies/detail/8689304/?jobposition=1-4&source=vacancy_search',
 'https://www.jobs.ch/en/vacancies/detail/9339388/?jobposition=1-5&source=vacancy_search',
 'https://www.jobs.ch/en/vacancies/detail/9303907/?jobposition=1-6&source=vacancy_search',
 'https://www.jobs.ch/en/vacancies/detail/9335860/?jobposition=1-7&source=vacancy_search',
 'https://www.jobs.ch/en/vacancies/detail/9333112/?jobposition=1-8&source=vacancy_search',
 'https://www.jobs.ch/en/vacancies/detail/9319652/?jobposition=1-9&source=vacancy_search',
 'https://www.jobs.ch/en/vacancies/detail/9339842/?jobposition=1-10&source=vacancy_search',
 'https://www.jobs.ch/en/vacancies/detail/9338941/?jobposition=1-11&source=va

In [12]:
# Hard-coded vacancy URL used to try out the detail-page scraping on a single job.
job_link = (
    'https://www.jobs.ch/en/vacancies/detail/9309672/'
    '?jobposition=1-1&source=vacancy_search_promo'
)

In [13]:
# Download the single test vacancy page (15 second timeout) and parse its HTML.
response = requests.get(job_link, timeout=15)
soup = BeautifulSoup(response.content, 'html.parser')



In [14]:
def get_content(soup):
    """Extract the main fields of a vacancy from its parsed detail page.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed HTML of one vacancy detail page.

    Returns
    -------
    tuple
        ``(content, title, company, published, location)``. A field whose
        element is not found, or whose value is missing, is ``np.nan``.
    """
    def _safe(extract):
        # A lookup that finds nothing returns None, so chaining a call onto it
        # raises AttributeError. Only that error is caught: a bare ``except``
        # would also swallow KeyboardInterrupt and hide unrelated bugs.
        try:
            value = extract()
        except AttributeError:
            return np.nan
        # ``.get('title')`` gives None when the attribute is absent; report it
        # the same way as a missing element.
        return np.nan if value is None else value

    content = _safe(lambda: soup.find('div', class_='Div-v2w9ke-0 fjQgMg').get_text())
    title = _safe(lambda: soup.find('div', {'class': 'Div-v2w9ke-0 hPuVjT'}).find('h1').get('title'))
    company = _safe(lambda: soup.find('div', class_='Div-v2w9ke-0 cvwY').find('a').get('title'))
    published = _safe(lambda: soup.find('span', class_='Span-bhy2uh-0 Badge-ndaeev-0 krRxxu').get_text())
    location = _safe(lambda: soup.find('span', class_='Span-bhy2uh-0 Text__span-sc-1vcmz87-8 YbklG Span-bhy2uh-0 Text__span-sc-1vcmz87-8 Text-sc-1vcmz87-9 gdfMMD').get_text())

    return content, title, company, published, location


In [15]:
# Try the extractor on the page currently held in `soup`.
get_content(soup)

('Job descriptionInfoCompanyMöchtest Du Teil der KLARA Erfolgsgeschichte werden und deinen Beitrag zur Digitalisierung der Schweizer KMU Landschaft beisteuern, dann bewirb dich jetzt. Wir suchen in Zug eine/n KLARA Maschine Learning Engineer 80 -100%KLARA Machine Learning Engineer | Zug        \xa0   Standort Zug            \xa0        KLARA MACHINE LEARNING ENGINEER 80 - 100% (w/m)            Möchtest Du Teil der KLARA Erfolgsgeschichte werden und deinen Beitrag zur Digitalisierung der Schweizer KMU Landschaft beisteuern, dann bewirb dich jetzt. Wir suchen in Zug eine/n KLARA Maschine Learning Engineer 80 -100%   Deine Aufgaben   Du spielst eine Schlüsselrolle bei der Entwicklung der Intelligenz in KLARA. Mit Deiner Erfahrung im Bereich des maschinellen Lernens und in der Software Entwicklung realisierst du echten Mehrwert für unsere Nutzer. Bei grossen Datenmengen behältst Du den Durchblick. Die Identifikation geeigneter Algorithmen und Ansätze zur Lösung maschineller Lernprobleme li

In [16]:
# Empty frame with the target columns; the scraped vacancies are stored in it.
cols = [
    'date',
    'company',
    'title',
    'published',
    'content',
    'location',
]
df = pd.DataFrame(columns=cols)
df

Unnamed: 0,date,company,title,published,content,location


In [None]:
# Download every vacancy page and keep one record per job.
# The rows are gathered in a plain list and turned into a DataFrame once at
# the end: DataFrame.append was removed in pandas 2.0 and copied the whole
# frame on every iteration. Rebuilding `df` from the list also makes this
# cell safe to re-run without duplicating rows.
# If the loop is interrupted, the rows fetched so far remain in `records`.
records = []
for job in tqdm(job_links):
    response = requests.get(job, timeout=15)
    soup = BeautifulSoup(response.content, 'html.parser')
    content, title, company, published, location = get_content(soup)

    records.append({
        'date': datetime.now(),  # time of scraping, not the publication date
        'company': company,
        'title': title,
        'published': published,
        'content': content,
        'location': location,
    })
    sleep(0.6)  # short pause between consecutive requests

df = pd.DataFrame(records, columns=cols)
df

HBox(children=(IntProgress(value=0, max=1195), HTML(value='')))

In [112]:
# Remove the em-dash character from the location text (plain literal replacement).
df['location'] = df['location'].str.replace('—', '', regex=False)
df

Unnamed: 0,date,company,title,published,content,location
0,2019-11-16 16:08:25.869767,Leica Geosystems AG,Machine Learning Engineer (f/m),09.11.2019,Job descriptionInfoCompanyMachine Learning Eng...,Heerbrugg
1,2019-11-16 16:08:27.078153,Banian AG,Data Engineer / Solution Architect,28.10.2019,Job descriptionInfoCompany,Deutschschweiz
2,2019-11-16 16:08:28.597206,Experis Schweiz Zürich,Senior Data Engineer,04.11.2019,Job descriptionInfoCompanySenior Data Engineer...,Zürich
3,2019-11-16 16:08:30.133922,The Stamford Group AG,DevOps Engineer,13.11.2019,Job descriptionInfoCompanyDevOps EngineerJob D...,Zürich
4,2019-11-16 16:08:31.643662,uniqFEED AG,DevOps / SRE (80-100%),05.11.2019,"Job descriptionInfoCompanyuniqFEED, a Spin-off...",Glattbrugg
