In [1]:
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm_notebook as tqdm # this is a fancy progress bar!
from time import sleep
import numpy as np
from datetime import datetime
import pandas as pd

# Individual Data Challenge: Scraping Jobs.ch

(Weekend homework recap)

As a job seeker, you have to search through job portals to find the jobs most relevant to your profile.

In this challenge, your goal is to find all jobs related to keywords: “Data Scientist”, “Data Analyst”, “Python Developer”, “Data Engineer”, “Data Manager”, “Data Architect”, “Big Data Analyst” and “Data Python” on jobs.ch.

## Questions

Download all necessary information (including job text, job rank, company name, job keyword…) for all webpages.
Using the information obtained, perform a descriptive analysis on this data including questions:

1. How many jobs are shared between these categories?
2. How much do the keywords “Data Analyst” and “Big Data Analyst” overlap?
3. Are there some companies doing more hires than average?
4. How many jobs are there in different Kantons?
5. Does the “machine learning” keyword appear more often in data scientist or data analyst jobs?
6. What is the distribution of most common keywords between and across categories?
7. Produce a report in the form of a clean notebook (or jupyter slides), with commented code and markdown cells for structuring and interpretations.

### DATA SCIENTIST

In [3]:
# DATA SCIENTIST
# Build the search URL from its parts; the results listing follows the pattern
#   <base>/en/vacancies/?page=<n>&term=<url-encoded keyword>
link_first_part = 'https://www.jobs.ch'
link_mid_1_part = '/en/vacancies/?page='
link_mid_2_part = '&term='
link_mid_3_part = "Data%20Scientist"

# Request the first results page explicitly (page=1) instead of sending an
# empty `page=` query parameter.
link = link_first_part + link_mid_1_part + '1' + link_mid_2_part + link_mid_3_part

response = requests.get(link, timeout = 15)
soup = BeautifulSoup(response.content, 'html.parser')

# Accumulates the absolute URL of every job ad found for this keyword.
job_links = []
def get_links(soup, job_links, base_url=None):
    """Append the absolute URL of every job ad on a results page to ``job_links``.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed HTML of one search-results page.
    job_links : list
        Extended in place with one URL string per job ad found.
    base_url : str, optional
        Prefix prepended to each ``href``. Defaults to the module-level
        ``link_first_part``.

    Result cards without a job anchor, or whose anchor has no ``href``, are
    skipped rather than raising ``AttributeError``/``TypeError``.
    """
    if base_url is None:
        base_url = link_first_part
    # NOTE(review): these class strings look auto-generated and may change when
    # the site is redeployed -- confirm they still match before a new run.
    card_class = 'Box-sc-7ekkso-0 Position-b2pct5-0 Position__Relative-b2pct5-1 VacancySerpItem__ShadowBox-p4qu0m-0 hthPRS'
    anchor_class = 'x--job-link t--job-link SearchVacancyResultsComponent__StyledVacancySerpItem-n25jij-0 dQDQbr'
    for job_add in soup.find_all('div', class_ = card_class):
        anchor = job_add.find('a', {'class': anchor_class})
        href = anchor.get('href') if anchor is not None else None
        if href:
            job_links.append(base_url + href)



# The third whitespace-separated token of the pagination element's text is
# used as the number of result pages (kept as a string, converted below).
max_pages = (
    soup.find('div', class_ = 'Div-v2w9ke-0 Flex-sc-4aokm-0 eykbax')
    .text
    .split()[2]
)

# Visit every results page in order and collect its job links.
for page in tqdm(range(1, int(max_pages) + 1)):
    url = f'{link_first_part}{link_mid_1_part}{page}{link_mid_2_part}{link_mid_3_part}'
    response = requests.get(url, timeout = 15)
    soup = BeautifulSoup(response.content, 'html.parser')
    get_links(soup, job_links)
    sleep(0.6)  # pause between requests

def get_content(soup):
    """Extract the fields of one job-ad page.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed HTML of a single job-detail page.

    Returns
    -------
    tuple
        ``(content, title, company, published, location)``. A field whose
        element is not found on the page is returned as ``np.nan``.

    Only ``AttributeError`` (a ``find`` that returned ``None``) is treated as
    "field missing"; any other exception, including ``KeyboardInterrupt``,
    propagates to the caller.
    """
    def _field(extract):
        # Run one lookup chain, mapping "element not found" to NaN.
        try:
            return extract()
        except AttributeError:
            return np.nan

    content = _field(lambda: soup.find('div', class_ = 'Div-v2w9ke-0 fjQgMg').get_text())
    title = _field(lambda: soup.find('div', {'class' : 'Div-v2w9ke-0 hPuVjT'}).find('h1').get('title'))
    company = _field(lambda: soup.find('div', class_ = 'Div-v2w9ke-0 cvwY').find('a').get('title'))
    published = _field(lambda: soup.find('span', class_ = 'Span-bhy2uh-0 Badge-ndaeev-0 krRxxu').get_text())
    location = _field(lambda: soup.find('span', class_ = 'Span-bhy2uh-0 Text__span-sc-1vcmz87-8 YbklG Span-bhy2uh-0 Text__span-sc-1vcmz87-8 Text-sc-1vcmz87-9 gdfMMD').get_text())

    return content, title, company, published, location

cols = ['date', 'company', 'title', 'published', 'content', 'location']

# Collect one record per job ad and build the frame once at the end:
# `DataFrame.append` was removed in pandas 2.0 and copies the whole frame on
# every call.
records = []
for job in tqdm(job_links):
    response = requests.get(job, timeout = 15)
    soup = BeautifulSoup(response.content, 'html.parser')
    content, title, company, published, location = get_content(soup)

    records.append({
        'date': datetime.now(),  # time at which this ad was scraped
        'company': company,
        'title': title,
        'published': published,
        'content': content,
        'location': location,
    })
    sleep(0.6)  # pause between requests

df_datascientist = pd.DataFrame(records, columns = cols)

# Remove the '—' character from location strings. The cast to object keeps the
# `.str` accessor usable even when every location is missing (all-NaN column).
df_datascientist['location'] = df_datascientist['location'].astype(object).str.replace('—', '', regex = False)
df_datascientist

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(IntProgress(value=0, max=23), HTML(value='')))




Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(IntProgress(value=0, max=457), HTML(value='')))




Unnamed: 0,date,company,title,published,content,location
0,2019-11-16 17:51:46.116394,Swiss Life Asset Managers,Market Data Analyst,24.10.2019,Job descriptionInfoCompanySwiss Life Asset Man...,Zürich
1,2019-11-16 17:51:48.352983,PwC,Data Scientist (Consultant),28.10.2019,Job descriptionInfoCompany Du arbeitest bei in...,Zürich
2,2019-11-16 17:51:50.351856,Die Schweizerische Post,Data Analyst Prozessentwicklung CRM Kundendien...,13.11.2019,Job descriptionInfoCompanyData Analyst Prozess...,Bern
3,2019-11-16 17:51:52.717220,EY,Senior Data Scientist - Forensics in Zurich,01.11.2019,Job descriptionInfoCompanySenior Data Scientis...,"Zurich, CH-ZH"
4,2019-11-16 17:51:54.868550,Geberit AG,Product Data Analyst (m/w),15.11.2019,Job descriptionInfoCompanyProduct Data Analyst...,Rapperswil-Jona
...,...,...,...,...,...,...
452,2019-11-16 18:07:56.499864,,Senior ETL Informatica and Oracle Developer - ...,11.07.2019,Job descriptionInfoSenior ETL Informatica and ...,
453,2019-11-16 18:07:58.868150,,Senior Business Analyst Regulatory & Complianc...,14.10.2019,Job descriptionInfoSenior Business Analyst Reg...,
454,2019-11-16 18:08:00.999260,PwC,Senior Consultant / Manager Data Analytics (Ad...,05.11.2019,Job descriptionInfoCompanySenior Consultant / ...,Zürich
455,2019-11-16 18:08:03.173268,Credit Suisse AG,"Data Sourcing, Integration & Transformation",22.10.2019,"Job descriptionInfoCompanyData Sourcing, Integ...",Zürich


### DATA ANALYST

In [4]:
# DATA ANALYST
# Build the search URL from its parts; the results listing follows the pattern
#   <base>/en/vacancies/?page=<n>&term=<url-encoded keyword>
link_first_part = 'https://www.jobs.ch'
link_mid_1_part = '/en/vacancies/?page='
link_mid_2_part = '&term='
link_mid_3_part = "Data%20Analyst"

# Request the first results page explicitly (page=1) instead of sending an
# empty `page=` query parameter.
link = link_first_part + link_mid_1_part + '1' + link_mid_2_part + link_mid_3_part

response = requests.get(link, timeout = 15)
soup = BeautifulSoup(response.content, 'html.parser')

# Accumulates the absolute URL of every job ad found for this keyword.
job_links = []
def get_links(soup, job_links, base_url=None):
    """Append the absolute URL of every job ad on a results page to ``job_links``.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed HTML of one search-results page.
    job_links : list
        Extended in place with one URL string per job ad found.
    base_url : str, optional
        Prefix prepended to each ``href``. Defaults to the module-level
        ``link_first_part``.

    Result cards without a job anchor, or whose anchor has no ``href``, are
    skipped rather than raising ``AttributeError``/``TypeError``.
    """
    if base_url is None:
        base_url = link_first_part
    # NOTE(review): these class strings look auto-generated and may change when
    # the site is redeployed -- confirm they still match before a new run.
    card_class = 'Box-sc-7ekkso-0 Position-b2pct5-0 Position__Relative-b2pct5-1 VacancySerpItem__ShadowBox-p4qu0m-0 hthPRS'
    anchor_class = 'x--job-link t--job-link SearchVacancyResultsComponent__StyledVacancySerpItem-n25jij-0 dQDQbr'
    for job_add in soup.find_all('div', class_ = card_class):
        anchor = job_add.find('a', {'class': anchor_class})
        href = anchor.get('href') if anchor is not None else None
        if href:
            job_links.append(base_url + href)



# The third whitespace-separated token of the pagination element's text is
# used as the number of result pages (kept as a string, converted below).
max_pages = (
    soup.find('div', class_ = 'Div-v2w9ke-0 Flex-sc-4aokm-0 eykbax')
    .text
    .split()[2]
)

# Visit every results page in order and collect its job links.
for page in tqdm(range(1, int(max_pages) + 1)):
    url = f'{link_first_part}{link_mid_1_part}{page}{link_mid_2_part}{link_mid_3_part}'
    response = requests.get(url, timeout = 15)
    soup = BeautifulSoup(response.content, 'html.parser')
    get_links(soup, job_links)
    sleep(0.6)  # pause between requests

def get_content(soup):
    """Extract the fields of one job-ad page.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed HTML of a single job-detail page.

    Returns
    -------
    tuple
        ``(content, title, company, published, location)``. A field whose
        element is not found on the page is returned as ``np.nan``.

    Only ``AttributeError`` (a ``find`` that returned ``None``) is treated as
    "field missing"; any other exception, including ``KeyboardInterrupt``,
    propagates to the caller.
    """
    def _field(extract):
        # Run one lookup chain, mapping "element not found" to NaN.
        try:
            return extract()
        except AttributeError:
            return np.nan

    content = _field(lambda: soup.find('div', class_ = 'Div-v2w9ke-0 fjQgMg').get_text())
    title = _field(lambda: soup.find('div', {'class' : 'Div-v2w9ke-0 hPuVjT'}).find('h1').get('title'))
    company = _field(lambda: soup.find('div', class_ = 'Div-v2w9ke-0 cvwY').find('a').get('title'))
    published = _field(lambda: soup.find('span', class_ = 'Span-bhy2uh-0 Badge-ndaeev-0 krRxxu').get_text())
    location = _field(lambda: soup.find('span', class_ = 'Span-bhy2uh-0 Text__span-sc-1vcmz87-8 YbklG Span-bhy2uh-0 Text__span-sc-1vcmz87-8 Text-sc-1vcmz87-9 gdfMMD').get_text())

    return content, title, company, published, location

cols = ['date', 'company', 'title', 'published', 'content', 'location']

# Collect one record per job ad and build the frame once at the end:
# `DataFrame.append` was removed in pandas 2.0 and copies the whole frame on
# every call.
records = []
for job in tqdm(job_links):
    response = requests.get(job, timeout = 15)
    soup = BeautifulSoup(response.content, 'html.parser')
    content, title, company, published, location = get_content(soup)

    records.append({
        'date': datetime.now(),  # time at which this ad was scraped
        'company': company,
        'title': title,
        'published': published,
        'content': content,
        'location': location,
    })
    sleep(0.6)  # pause between requests

df_dataanalyst = pd.DataFrame(records, columns = cols)

# Remove the '—' character from location strings. The cast to object keeps the
# `.str` accessor usable even when every location is missing (all-NaN column).
df_dataanalyst['location'] = df_dataanalyst['location'].astype(object).str.replace('—', '', regex = False)
df_dataanalyst

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(IntProgress(value=0, max=24), HTML(value='')))




Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(IntProgress(value=0, max=474), HTML(value='')))




Unnamed: 0,date,company,title,published,content,location
0,2019-11-16 18:43:38.288270,Swiss Life Asset Managers,Market Data Analyst,24.10.2019,Job descriptionInfoCompanySwiss Life Asset Man...,Zürich
1,2019-11-16 18:43:40.413052,RM IT Professional Resources AG,Data Engineer/Data Analyst,24.10.2019,Job descriptionInfoCompanyData Engineer / Data...,Bern
2,2019-11-16 18:43:42.678876,Geberit AG,Product Data Analyst (m/w),15.11.2019,Job descriptionInfoCompanyProduct Data Analyst...,Rapperswil-Jona
3,2019-11-16 18:43:44.859396,Vorwerk International & Co. KmG,Senior Quality Data Analyst,14.11.2019,Job descriptionInfoCompanySenior Quality Data ...,Wollerau
4,2019-11-16 18:43:47.045298,Die Schweizerische Post,Data Analyst Prozessentwicklung CRM Kundendien...,13.11.2019,Job descriptionInfoCompanyData Analyst Prozess...,Bern
...,...,...,...,...,...,...
469,2019-11-16 19:00:13.040840,Credit Suisse AG,Senior Big Data Engineer,23.10.2019,Job descriptionInfoCompanySenior Big Data Engi...,Zürich
470,2019-11-16 19:00:15.039116,Novartis AG,Biotransformation Scientist,10.10.2019,Job descriptionInfoCompanyBiotransformation Sc...,Basel
471,2019-11-16 19:00:17.189509,F. Hoffmann-La Roche AG,Postdoctoral Fellow Drug product design method...,29.10.2019,Job descriptionInfoCompanyPostdoctoral Fellow ...,Basel
472,2019-11-16 19:00:19.342773,Université de Lausanne,Bioinformatician 60%-80%,08.10.2019,Job descriptionInfoCompanyBioinformatician 60%...,Lausanne


### PYTHON DEVELOPER

In [5]:
# PYTHON DEVELOPER
# Build the search URL from its parts; the results listing follows the pattern
#   <base>/en/vacancies/?page=<n>&term=<url-encoded keyword>
link_first_part = 'https://www.jobs.ch'
link_mid_1_part = '/en/vacancies/?page='
link_mid_2_part = '&term='
link_mid_3_part = "Python%20Developer"

# Request the first results page explicitly (page=1) instead of sending an
# empty `page=` query parameter.
link = link_first_part + link_mid_1_part + '1' + link_mid_2_part + link_mid_3_part

response = requests.get(link, timeout = 15)
soup = BeautifulSoup(response.content, 'html.parser')

# Accumulates the absolute URL of every job ad found for this keyword.
job_links = []
def get_links(soup, job_links, base_url=None):
    """Append the absolute URL of every job ad on a results page to ``job_links``.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed HTML of one search-results page.
    job_links : list
        Extended in place with one URL string per job ad found.
    base_url : str, optional
        Prefix prepended to each ``href``. Defaults to the module-level
        ``link_first_part``.

    Result cards without a job anchor, or whose anchor has no ``href``, are
    skipped rather than raising ``AttributeError``/``TypeError``.
    """
    if base_url is None:
        base_url = link_first_part
    # NOTE(review): these class strings look auto-generated and may change when
    # the site is redeployed -- confirm they still match before a new run.
    card_class = 'Box-sc-7ekkso-0 Position-b2pct5-0 Position__Relative-b2pct5-1 VacancySerpItem__ShadowBox-p4qu0m-0 hthPRS'
    anchor_class = 'x--job-link t--job-link SearchVacancyResultsComponent__StyledVacancySerpItem-n25jij-0 dQDQbr'
    for job_add in soup.find_all('div', class_ = card_class):
        anchor = job_add.find('a', {'class': anchor_class})
        href = anchor.get('href') if anchor is not None else None
        if href:
            job_links.append(base_url + href)



# The third whitespace-separated token of the pagination element's text is
# used as the number of result pages (kept as a string, converted below).
max_pages = (
    soup.find('div', class_ = 'Div-v2w9ke-0 Flex-sc-4aokm-0 eykbax')
    .text
    .split()[2]
)

# Visit every results page in order and collect its job links.
for page in tqdm(range(1, int(max_pages) + 1)):
    url = f'{link_first_part}{link_mid_1_part}{page}{link_mid_2_part}{link_mid_3_part}'
    response = requests.get(url, timeout = 15)
    soup = BeautifulSoup(response.content, 'html.parser')
    get_links(soup, job_links)
    sleep(0.6)  # pause between requests

def get_content(soup):
    """Extract the fields of one job-ad page.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed HTML of a single job-detail page.

    Returns
    -------
    tuple
        ``(content, title, company, published, location)``. A field whose
        element is not found on the page is returned as ``np.nan``.

    Only ``AttributeError`` (a ``find`` that returned ``None``) is treated as
    "field missing"; any other exception, including ``KeyboardInterrupt``,
    propagates to the caller.
    """
    def _field(extract):
        # Run one lookup chain, mapping "element not found" to NaN.
        try:
            return extract()
        except AttributeError:
            return np.nan

    content = _field(lambda: soup.find('div', class_ = 'Div-v2w9ke-0 fjQgMg').get_text())
    title = _field(lambda: soup.find('div', {'class' : 'Div-v2w9ke-0 hPuVjT'}).find('h1').get('title'))
    company = _field(lambda: soup.find('div', class_ = 'Div-v2w9ke-0 cvwY').find('a').get('title'))
    published = _field(lambda: soup.find('span', class_ = 'Span-bhy2uh-0 Badge-ndaeev-0 krRxxu').get_text())
    location = _field(lambda: soup.find('span', class_ = 'Span-bhy2uh-0 Text__span-sc-1vcmz87-8 YbklG Span-bhy2uh-0 Text__span-sc-1vcmz87-8 Text-sc-1vcmz87-9 gdfMMD').get_text())

    return content, title, company, published, location

cols = ['date', 'company', 'title', 'published', 'content', 'location']

# Collect one record per job ad and build the frame once at the end:
# `DataFrame.append` was removed in pandas 2.0 and copies the whole frame on
# every call.
records = []
for job in tqdm(job_links):
    response = requests.get(job, timeout = 15)
    soup = BeautifulSoup(response.content, 'html.parser')
    content, title, company, published, location = get_content(soup)

    records.append({
        'date': datetime.now(),  # time at which this ad was scraped
        'company': company,
        'title': title,
        'published': published,
        'content': content,
        'location': location,
    })
    sleep(0.6)  # pause between requests

df_pythondeveloper = pd.DataFrame(records, columns = cols)

# Remove the '—' character from location strings. The cast to object keeps the
# `.str` accessor usable even when every location is missing (all-NaN column).
df_pythondeveloper['location'] = df_pythondeveloper['location'].astype(object).str.replace('—', '', regex = False)
df_pythondeveloper

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(IntProgress(value=0, max=187), HTML(value='')))




Unnamed: 0,date,company,title,published,content,location
0,2019-11-16 19:14:18.056235,Axpo Solutions AG,Internship Quant Developer,01.11.2019,Job descriptionInfoCompany Work closely with a...,Baden
1,2019-11-16 19:14:20.134483,Valora Schweiz AG,"Senior Software Engineer, Fullstack 80-100%",29.10.2019,"Job descriptionInfoCompanyMuttenz, ZürichSenio...",Zurich / Muttenz / Remote
2,2019-11-16 19:14:22.251018,Camptocamp SA,Python Developer,05.11.2019,Job descriptionInfoCompany,Olten
3,2019-11-16 19:14:24.273791,MBA Michael Bailey Associates GmbH,DevOps Python Developer,08.11.2019,Job descriptionInfoCompany,Zurich
4,2019-11-16 19:14:26.551666,Labour Search GmbH,Junior Python Developer 100%,31.10.2019,Job descriptionInfoCompany,Wettingen
...,...,...,...,...,...,...
182,2019-11-16 19:20:46.264190,Genedata AG,Scientific Consultant/Field Application Scient...,22.08.2019,Job descriptionInfoCompanyScientific Consultan...,Basel
183,2019-11-16 19:20:48.290253,Swisscom (Schweiz) AG,DevOps Engineer IT Security as a Service 80% b...,07.10.2019,Job descriptionInfoDevOps Engineer IT Security...,"Bern, Zürich"
184,2019-11-16 19:20:50.492494,Atos AG,Data Science Consultant,07.11.2019,Job descriptionInfoCompanyData Science Consult...,"Basel, Basel"
185,2019-11-16 19:20:52.700846,QualySense AG,System Architect,29.03.2019,Job descriptionInfoCompanySystem ArchitectSYST...,Glattbrugg


### DATA ENGINEER

In [6]:
# DATA ENGINEER
# Build the search URL from its parts; the results listing follows the pattern
#   <base>/en/vacancies/?page=<n>&term=<url-encoded keyword>
link_first_part = 'https://www.jobs.ch'
link_mid_1_part = '/en/vacancies/?page='
link_mid_2_part = '&term='
link_mid_3_part = "Data%20Engineer"

# Request the first results page explicitly (page=1) instead of sending an
# empty `page=` query parameter.
link = link_first_part + link_mid_1_part + '1' + link_mid_2_part + link_mid_3_part

response = requests.get(link, timeout = 15)
soup = BeautifulSoup(response.content, 'html.parser')

# Accumulates the absolute URL of every job ad found for this keyword.
job_links = []
def get_links(soup, job_links, base_url=None):
    """Append the absolute URL of every job ad on a results page to ``job_links``.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed HTML of one search-results page.
    job_links : list
        Extended in place with one URL string per job ad found.
    base_url : str, optional
        Prefix prepended to each ``href``. Defaults to the module-level
        ``link_first_part``.

    Result cards without a job anchor, or whose anchor has no ``href``, are
    skipped rather than raising ``AttributeError``/``TypeError``.
    """
    if base_url is None:
        base_url = link_first_part
    # NOTE(review): these class strings look auto-generated and may change when
    # the site is redeployed -- confirm they still match before a new run.
    card_class = 'Box-sc-7ekkso-0 Position-b2pct5-0 Position__Relative-b2pct5-1 VacancySerpItem__ShadowBox-p4qu0m-0 hthPRS'
    anchor_class = 'x--job-link t--job-link SearchVacancyResultsComponent__StyledVacancySerpItem-n25jij-0 dQDQbr'
    for job_add in soup.find_all('div', class_ = card_class):
        anchor = job_add.find('a', {'class': anchor_class})
        href = anchor.get('href') if anchor is not None else None
        if href:
            job_links.append(base_url + href)



# The third whitespace-separated token of the pagination element's text is
# used as the number of result pages (kept as a string, converted below).
max_pages = (
    soup.find('div', class_ = 'Div-v2w9ke-0 Flex-sc-4aokm-0 eykbax')
    .text
    .split()[2]
)

# Visit every results page in order and collect its job links.
for page in tqdm(range(1, int(max_pages) + 1)):
    url = f'{link_first_part}{link_mid_1_part}{page}{link_mid_2_part}{link_mid_3_part}'
    response = requests.get(url, timeout = 15)
    soup = BeautifulSoup(response.content, 'html.parser')
    get_links(soup, job_links)
    sleep(0.6)  # pause between requests

def get_content(soup):
    """Extract the fields of one job-ad page.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed HTML of a single job-detail page.

    Returns
    -------
    tuple
        ``(content, title, company, published, location)``. A field whose
        element is not found on the page is returned as ``np.nan``.

    Only ``AttributeError`` (a ``find`` that returned ``None``) is treated as
    "field missing"; any other exception, including ``KeyboardInterrupt``,
    propagates to the caller.
    """
    def _field(extract):
        # Run one lookup chain, mapping "element not found" to NaN.
        try:
            return extract()
        except AttributeError:
            return np.nan

    content = _field(lambda: soup.find('div', class_ = 'Div-v2w9ke-0 fjQgMg').get_text())
    title = _field(lambda: soup.find('div', {'class' : 'Div-v2w9ke-0 hPuVjT'}).find('h1').get('title'))
    company = _field(lambda: soup.find('div', class_ = 'Div-v2w9ke-0 cvwY').find('a').get('title'))
    published = _field(lambda: soup.find('span', class_ = 'Span-bhy2uh-0 Badge-ndaeev-0 krRxxu').get_text())
    location = _field(lambda: soup.find('span', class_ = 'Span-bhy2uh-0 Text__span-sc-1vcmz87-8 YbklG Span-bhy2uh-0 Text__span-sc-1vcmz87-8 Text-sc-1vcmz87-9 gdfMMD').get_text())

    return content, title, company, published, location

cols = ['date', 'company', 'title', 'published', 'content', 'location']

# Collect one record per job ad and build the frame once at the end:
# `DataFrame.append` was removed in pandas 2.0 and copies the whole frame on
# every call.
records = []
for job in tqdm(job_links):
    response = requests.get(job, timeout = 15)
    soup = BeautifulSoup(response.content, 'html.parser')
    content, title, company, published, location = get_content(soup)

    records.append({
        'date': datetime.now(),  # time at which this ad was scraped
        'company': company,
        'title': title,
        'published': published,
        'content': content,
        'location': location,
    })
    sleep(0.6)  # pause between requests

df_dataengineer = pd.DataFrame(records, columns = cols)

# Remove the '—' character from location strings. The cast to object keeps the
# `.str` accessor usable even when every location is missing (all-NaN column).
df_dataengineer['location'] = df_dataengineer['location'].astype(object).str.replace('—', '', regex = False)
df_dataengineer

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(IntProgress(value=0, max=49), HTML(value='')))




Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(IntProgress(value=0, max=981), HTML(value='')))




Unnamed: 0,date,company,title,published,content,location
0,2019-11-16 19:35:49.321755,Leica Geosystems AG,Machine Learning Engineer (f/m),09.11.2019,Job descriptionInfoCompanyMachine Learning Eng...,Heerbrugg
1,2019-11-16 19:35:51.546587,Eniwa AG,Enterprise Data Engineer 80-100% (m/w/d),01.11.2019,Job descriptionInfoCompany Ausarbeiten der unt...,Buchs AG
2,2019-11-16 19:35:53.674975,Experis Schweiz Zürich,Senior Data Engineer,04.11.2019,Job descriptionInfoCompanySenior Data Engineer...,Zürich
3,2019-11-16 19:35:55.837169,The Stamford Group AG,DevOps Engineer,13.11.2019,Job descriptionInfoCompanyDevOps EngineerJob D...,Zürich
4,2019-11-16 19:35:58.206135,Stamford Consultants AG,Cloud Site Reliability Engineer,15.11.2019,Job descriptionInfoCompany,Zürich
...,...,...,...,...,...,...
976,2019-11-16 20:10:39.629413,TE Connectivity Solutions GmbH,SUPPLIER DEVELOPMENT ANALYST IV,08.10.2019,Job descriptionInfoCompanySUPPLIER DEVELOPMENT...,Chochin
977,2019-11-16 20:10:41.559033,,Senior Account Executive - Manufacturing,05.06.2019,Job descriptionInfoSenior Account Executive - ...,
978,2019-11-16 20:10:43.784260,,CONSULTANT FÜR SYSTEMARCHITEKTUR GA / ICT,09.07.2019,Job descriptionInfoCONSULTANT FÜR SYSTEMARCHIT...,
979,2019-11-16 20:10:46.122390,ti&m AG,(Senior) Consultant Digital Transformation,24.01.2019,Job descriptionInfoCompany(Senior) Consultant ...,Zürich


### DATA MANAGER

In [7]:
# DATA MANAGER
# Build the search URL from its parts; the results listing follows the pattern
#   <base>/en/vacancies/?page=<n>&term=<url-encoded keyword>
link_first_part = 'https://www.jobs.ch'
link_mid_1_part = '/en/vacancies/?page='
link_mid_2_part = '&term='
link_mid_3_part = "Data%20Manager"

# Request the first results page explicitly (page=1) instead of sending an
# empty `page=` query parameter.
link = link_first_part + link_mid_1_part + '1' + link_mid_2_part + link_mid_3_part

response = requests.get(link, timeout = 15)
soup = BeautifulSoup(response.content, 'html.parser')

# Accumulates the absolute URL of every job ad found for this keyword.
job_links = []
def get_links(soup, job_links, base_url=None):
    """Append the absolute URL of every job ad on a results page to ``job_links``.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed HTML of one search-results page.
    job_links : list
        Extended in place with one URL string per job ad found.
    base_url : str, optional
        Prefix prepended to each ``href``. Defaults to the module-level
        ``link_first_part``.

    Result cards without a job anchor, or whose anchor has no ``href``, are
    skipped rather than raising ``AttributeError``/``TypeError``.
    """
    if base_url is None:
        base_url = link_first_part
    # NOTE(review): these class strings look auto-generated and may change when
    # the site is redeployed -- confirm they still match before a new run.
    card_class = 'Box-sc-7ekkso-0 Position-b2pct5-0 Position__Relative-b2pct5-1 VacancySerpItem__ShadowBox-p4qu0m-0 hthPRS'
    anchor_class = 'x--job-link t--job-link SearchVacancyResultsComponent__StyledVacancySerpItem-n25jij-0 dQDQbr'
    for job_add in soup.find_all('div', class_ = card_class):
        anchor = job_add.find('a', {'class': anchor_class})
        href = anchor.get('href') if anchor is not None else None
        if href:
            job_links.append(base_url + href)



# The third whitespace-separated token of the pagination element's text is
# used as the number of result pages (kept as a string, converted below).
max_pages = (
    soup.find('div', class_ = 'Div-v2w9ke-0 Flex-sc-4aokm-0 eykbax')
    .text
    .split()[2]
)

# Visit every results page in order and collect its job links.
for page in tqdm(range(1, int(max_pages) + 1)):
    url = f'{link_first_part}{link_mid_1_part}{page}{link_mid_2_part}{link_mid_3_part}'
    response = requests.get(url, timeout = 15)
    soup = BeautifulSoup(response.content, 'html.parser')
    get_links(soup, job_links)
    sleep(0.6)  # pause between requests

def get_content(soup):
    """Extract the fields of one job-ad page.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed HTML of a single job-detail page.

    Returns
    -------
    tuple
        ``(content, title, company, published, location)``. A field whose
        element is not found on the page is returned as ``np.nan``.

    Only ``AttributeError`` (a ``find`` that returned ``None``) is treated as
    "field missing"; any other exception, including ``KeyboardInterrupt``,
    propagates to the caller.
    """
    def _field(extract):
        # Run one lookup chain, mapping "element not found" to NaN.
        try:
            return extract()
        except AttributeError:
            return np.nan

    content = _field(lambda: soup.find('div', class_ = 'Div-v2w9ke-0 fjQgMg').get_text())
    title = _field(lambda: soup.find('div', {'class' : 'Div-v2w9ke-0 hPuVjT'}).find('h1').get('title'))
    company = _field(lambda: soup.find('div', class_ = 'Div-v2w9ke-0 cvwY').find('a').get('title'))
    published = _field(lambda: soup.find('span', class_ = 'Span-bhy2uh-0 Badge-ndaeev-0 krRxxu').get_text())
    location = _field(lambda: soup.find('span', class_ = 'Span-bhy2uh-0 Text__span-sc-1vcmz87-8 YbklG Span-bhy2uh-0 Text__span-sc-1vcmz87-8 Text-sc-1vcmz87-9 gdfMMD').get_text())

    return content, title, company, published, location

cols = ['date', 'company', 'title', 'published', 'content', 'location']

# Collect one record per job ad and build the frame once at the end:
# `DataFrame.append` was removed in pandas 2.0 and copies the whole frame on
# every call.
records = []
for job in tqdm(job_links):
    response = requests.get(job, timeout = 15)
    soup = BeautifulSoup(response.content, 'html.parser')
    content, title, company, published, location = get_content(soup)

    records.append({
        'date': datetime.now(),  # time at which this ad was scraped
        'company': company,
        'title': title,
        'published': published,
        'content': content,
        'location': location,
    })
    sleep(0.6)  # pause between requests

df_datamanager = pd.DataFrame(records, columns = cols)

# Remove the '—' character from location strings. The cast to object keeps the
# `.str` accessor usable even when every location is missing (all-NaN column).
df_datamanager['location'] = df_datamanager['location'].astype(object).str.replace('—', '', regex = False)
df_datamanager

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(IntProgress(value=0, max=66), HTML(value='')))




Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(IntProgress(value=0, max=1335), HTML(value='')))




Unnamed: 0,date,company,title,published,content,location
0,2019-11-16 20:33:54.926564,Luzerner Kantonsspital,Datenmanager/in 80%,28.10.2019,Job descriptionInfoCompanyDatenmanager/in 80%I...,Luzern
1,2019-11-16 20:33:57.016967,LGT,Client Data Officer (100%),01.11.2019,Job descriptionInfoCompanyClient Data Officer ...,Zürich
2,2019-11-16 20:33:59.834496,Sonova,Data Protection Manager,18.10.2019,Job descriptionInfoCompanyData Protection Mana...,Stäfa und Zug
3,2019-11-16 20:34:01.849725,maxon motor ag,Service Responsible Datacenter / Oracle 90-100...,06.11.2019,Job descriptionInfoCompanyFür unsere Abteilung...,Sachseln
4,2019-11-16 20:34:04.003303,Geberit AG,Master Data Manager (m/w),15.11.2019,Job descriptionInfoCompanyMaster Data Manager ...,Rapperswil/Jona
...,...,...,...,...,...,...
1330,2019-11-16 21:21:41.128190,,Technical Business Analyst - Swiss Social Insu...,24.09.2019,Job descriptionInfoTechnical Business Analyst ...,
1331,2019-11-16 21:21:43.432278,Jones Lang LaSalle AG,Sales & Solutions Director,17.10.2019,Job descriptionInfoCompanySales & Solutions Di...,"Frankfurt, DEU, Mehr..."
1332,2019-11-16 21:21:45.549402,,Associate Solutions Engineer - Bachelor/Master...,01.10.2019,Job descriptionInfoAssociate Solutions Enginee...,
1333,2019-11-16 21:21:47.451995,,Fachspezialist/-in Datenmanagement / Pflanzenö...,30.10.2019,Job descriptionInfoFachspezialist/-in Datenman...,


### DATA ARCHITECT

In [8]:
# DATA ARCHITECT
# Build the search URL from its parts; the results listing follows the pattern
#   <base>/en/vacancies/?page=<n>&term=<url-encoded keyword>
link_first_part = 'https://www.jobs.ch'
link_mid_1_part = '/en/vacancies/?page='
link_mid_2_part = '&term='
link_mid_3_part = "Data%20Architect"

# Request the first results page explicitly (page=1) instead of sending an
# empty `page=` query parameter.
link = link_first_part + link_mid_1_part + '1' + link_mid_2_part + link_mid_3_part

response = requests.get(link, timeout = 15)
soup = BeautifulSoup(response.content, 'html.parser')

# Accumulates the absolute URL of every job ad found for this keyword.
job_links = []
def get_links(soup, job_links, base_url=None):
    """Append the absolute URL of every job ad on a results page to ``job_links``.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed HTML of one search-results page.
    job_links : list
        Extended in place with one URL string per job ad found.
    base_url : str, optional
        Prefix prepended to each ``href``. Defaults to the module-level
        ``link_first_part``.

    Result cards without a job anchor, or whose anchor has no ``href``, are
    skipped rather than raising ``AttributeError``/``TypeError``.
    """
    if base_url is None:
        base_url = link_first_part
    # NOTE(review): these class strings look auto-generated and may change when
    # the site is redeployed -- confirm they still match before a new run.
    card_class = 'Box-sc-7ekkso-0 Position-b2pct5-0 Position__Relative-b2pct5-1 VacancySerpItem__ShadowBox-p4qu0m-0 hthPRS'
    anchor_class = 'x--job-link t--job-link SearchVacancyResultsComponent__StyledVacancySerpItem-n25jij-0 dQDQbr'
    for job_add in soup.find_all('div', class_ = card_class):
        anchor = job_add.find('a', {'class': anchor_class})
        href = anchor.get('href') if anchor is not None else None
        if href:
            job_links.append(base_url + href)



# The third whitespace-separated token of the pagination element's text is
# used as the number of result pages (kept as a string, converted below).
max_pages = (
    soup.find('div', class_ = 'Div-v2w9ke-0 Flex-sc-4aokm-0 eykbax')
    .text
    .split()[2]
)

# Visit every results page in order and collect its job links.
for page in tqdm(range(1, int(max_pages) + 1)):
    url = f'{link_first_part}{link_mid_1_part}{page}{link_mid_2_part}{link_mid_3_part}'
    response = requests.get(url, timeout = 15)
    soup = BeautifulSoup(response.content, 'html.parser')
    get_links(soup, job_links)
    sleep(0.6)  # pause between requests

def get_content(soup):
    """Extract the fields of one job-ad page.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed HTML of a single job-detail page.

    Returns
    -------
    tuple
        ``(content, title, company, published, location)``. A field whose
        element is not found on the page is returned as ``np.nan``.

    Only ``AttributeError`` (a ``find`` that returned ``None``) is treated as
    "field missing"; any other exception, including ``KeyboardInterrupt``,
    propagates to the caller.
    """
    def _field(extract):
        # Run one lookup chain, mapping "element not found" to NaN.
        try:
            return extract()
        except AttributeError:
            return np.nan

    content = _field(lambda: soup.find('div', class_ = 'Div-v2w9ke-0 fjQgMg').get_text())
    title = _field(lambda: soup.find('div', {'class' : 'Div-v2w9ke-0 hPuVjT'}).find('h1').get('title'))
    company = _field(lambda: soup.find('div', class_ = 'Div-v2w9ke-0 cvwY').find('a').get('title'))
    published = _field(lambda: soup.find('span', class_ = 'Span-bhy2uh-0 Badge-ndaeev-0 krRxxu').get_text())
    location = _field(lambda: soup.find('span', class_ = 'Span-bhy2uh-0 Text__span-sc-1vcmz87-8 YbklG Span-bhy2uh-0 Text__span-sc-1vcmz87-8 Text-sc-1vcmz87-9 gdfMMD').get_text())

    return content, title, company, published, location

cols = ['date', 'company', 'title', 'published', 'content', 'location']

# Collect one record per job ad and build the frame once at the end:
# `DataFrame.append` was removed in pandas 2.0 and copies the whole frame on
# every call.
records = []
for job in tqdm(job_links):
    response = requests.get(job, timeout = 15)
    soup = BeautifulSoup(response.content, 'html.parser')
    content, title, company, published, location = get_content(soup)

    records.append({
        'date': datetime.now(),  # time at which this ad was scraped
        'company': company,
        'title': title,
        'published': published,
        'content': content,
        'location': location,
    })
    sleep(0.6)  # pause between requests

df_dataarchitect = pd.DataFrame(records, columns = cols)

# Remove the '—' character from location strings. The cast to object keeps the
# `.str` accessor usable even when every location is missing (all-NaN column).
df_dataarchitect['location'] = df_dataarchitect['location'].astype(object).str.replace('—', '', regex = False)
df_dataarchitect

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(IntProgress(value=0, max=12), HTML(value='')))




Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(IntProgress(value=0, max=230), HTML(value='')))




Unnamed: 0,date,company,title,published,content,location
0,2019-11-16 21:24:04.572754,Banian AG,Data Engineer / Solution Architect,28.10.2019,Job descriptionInfoCompany,Deutschschweiz
1,2019-11-16 21:24:06.584656,Syngenta,Senior Solution Architect E-commerce,07.11.2019,Job descriptionInfoCompany,Basel
2,2019-11-16 21:24:08.543106,RM IT Professional Resources AG,Data Warehouse Developer - Oracle PL/SQL,15.11.2019,Job descriptionInfoCompanyData Warehouse Devel...,Bern
3,2019-11-16 21:24:10.651867,Credit Suisse AG,Agile DWH Analyst 80 - 100%,21.10.2019,Job descriptionInfoCompanyAgile DWH Analyst 80...,Zürich
4,2019-11-16 21:24:12.632350,ROSEN Swiss AG,Microsoft Data Warehouse/BI-Entwickler (m/w),08.11.2019,Job descriptionInfoCompanyJobbeschreibung Mit ...,Stans
...,...,...,...,...,...,...
225,2019-11-16 21:32:13.498389,Swisscom (Schweiz) AG,Performance Testing Engineer 80% bis 100%,30.09.2019,Job descriptionInfoPerformance Testing Enginee...,Liebefeld
226,2019-11-16 21:32:15.697258,Swisscom (Schweiz) AG,DevOps Engineer Container Platform 80% bis 100%,04.09.2019,Job descriptionInfoDevOps Engineer Container P...,Zürich oder Bern
227,2019-11-16 21:32:18.348728,Vicara Infotech Group AG,DWH EXPERT,04.12.2017,Job descriptionInfoCompanyDWH EXPERT We are l...,Zürich
228,2019-11-16 21:32:20.360377,Softcom Technologies SA,Senior Entwickler - Architekt (m/w) 80-100%,30.08.2019,Job descriptionInfoSenior Entwickler - Archite...,Bern


### BIG DATA ANALYST

In [9]:
# BIG DATA ANALYST
# Build the search URL from its parts; the results listing follows the pattern
#   <base>/en/vacancies/?page=<n>&term=<url-encoded keyword>
link_first_part = 'https://www.jobs.ch'
link_mid_1_part = '/en/vacancies/?page='
link_mid_2_part = '&term='
link_mid_3_part = "Big%20Data%20Analyst"

# Request the first results page explicitly (page=1) instead of sending an
# empty `page=` query parameter.
link = link_first_part + link_mid_1_part + '1' + link_mid_2_part + link_mid_3_part

response = requests.get(link, timeout = 15)
soup = BeautifulSoup(response.content, 'html.parser')

# Accumulates the absolute URL of every job ad found for this keyword.
job_links = []
def get_links(soup, job_links, base_url=None):
    """Append the absolute URL of every job ad on a results page to ``job_links``.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed HTML of one search-results page.
    job_links : list
        Extended in place with one URL string per job ad found.
    base_url : str, optional
        Prefix prepended to each ``href``. Defaults to the module-level
        ``link_first_part``.

    Result cards without a job anchor, or whose anchor has no ``href``, are
    skipped rather than raising ``AttributeError``/``TypeError``.
    """
    if base_url is None:
        base_url = link_first_part
    # NOTE(review): these class strings look auto-generated and may change when
    # the site is redeployed -- confirm they still match before a new run.
    card_class = 'Box-sc-7ekkso-0 Position-b2pct5-0 Position__Relative-b2pct5-1 VacancySerpItem__ShadowBox-p4qu0m-0 hthPRS'
    anchor_class = 'x--job-link t--job-link SearchVacancyResultsComponent__StyledVacancySerpItem-n25jij-0 dQDQbr'
    for job_add in soup.find_all('div', class_ = card_class):
        anchor = job_add.find('a', {'class': anchor_class})
        href = anchor.get('href') if anchor is not None else None
        if href:
            job_links.append(base_url + href)



# The third whitespace-separated token of the pagination element's text is
# used as the number of result pages (kept as a string, converted below).
max_pages = (
    soup.find('div', class_ = 'Div-v2w9ke-0 Flex-sc-4aokm-0 eykbax')
    .text
    .split()[2]
)

# Visit every results page in order and collect its job links.
for page in tqdm(range(1, int(max_pages) + 1)):
    url = f'{link_first_part}{link_mid_1_part}{page}{link_mid_2_part}{link_mid_3_part}'
    response = requests.get(url, timeout = 15)
    soup = BeautifulSoup(response.content, 'html.parser')
    get_links(soup, job_links)
    sleep(0.6)  # pause between requests

def get_content(soup):
    """Extract the fields of one job-ad page.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed HTML of a single job-detail page.

    Returns
    -------
    tuple
        ``(content, title, company, published, location)``. A field whose
        element is not found on the page is returned as ``np.nan``.

    Only ``AttributeError`` (a ``find`` that returned ``None``) is treated as
    "field missing"; any other exception, including ``KeyboardInterrupt``,
    propagates to the caller.
    """
    def _field(extract):
        # Run one lookup chain, mapping "element not found" to NaN.
        try:
            return extract()
        except AttributeError:
            return np.nan

    content = _field(lambda: soup.find('div', class_ = 'Div-v2w9ke-0 fjQgMg').get_text())
    title = _field(lambda: soup.find('div', {'class' : 'Div-v2w9ke-0 hPuVjT'}).find('h1').get('title'))
    company = _field(lambda: soup.find('div', class_ = 'Div-v2w9ke-0 cvwY').find('a').get('title'))
    published = _field(lambda: soup.find('span', class_ = 'Span-bhy2uh-0 Badge-ndaeev-0 krRxxu').get_text())
    location = _field(lambda: soup.find('span', class_ = 'Span-bhy2uh-0 Text__span-sc-1vcmz87-8 YbklG Span-bhy2uh-0 Text__span-sc-1vcmz87-8 Text-sc-1vcmz87-9 gdfMMD').get_text())

    return content, title, company, published, location

cols = ['date', 'company', 'title', 'published', 'content', 'location']

# Collect one record per job ad and build the frame once at the end:
# `DataFrame.append` was removed in pandas 2.0 and copies the whole frame on
# every call.
records = []
for job in tqdm(job_links):
    response = requests.get(job, timeout = 15)
    soup = BeautifulSoup(response.content, 'html.parser')
    content, title, company, published, location = get_content(soup)

    records.append({
        'date': datetime.now(),  # time at which this ad was scraped
        'company': company,
        'title': title,
        'published': published,
        'content': content,
        'location': location,
    })
    sleep(0.6)  # pause between requests

df_bigdataanalyst = pd.DataFrame(records, columns = cols)

# Remove the '—' character from location strings. The cast to object keeps the
# `.str` accessor usable even when every location is missing (all-NaN column).
df_bigdataanalyst['location'] = df_bigdataanalyst['location'].astype(object).str.replace('—', '', regex = False)
df_bigdataanalyst

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(IntProgress(value=0, max=11), HTML(value='')))




Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(IntProgress(value=0, max=210), HTML(value='')))




Unnamed: 0,date,company,title,published,content,location
0,2019-11-16 22:20:16.236028,Swiss Life Asset Managers,Market Data Analyst,24.10.2019,Job descriptionInfoCompanySwiss Life Asset Man...,Zürich
1,2019-11-16 22:20:18.263458,RM IT Professional Resources AG,Data Engineer/Data Analyst,24.10.2019,Job descriptionInfoCompanyData Engineer / Data...,Bern
2,2019-11-16 22:20:20.423064,Die Schweizerische Post,Data Analyst Prozessentwicklung CRM Kundendien...,13.11.2019,Job descriptionInfoCompanyData Analyst Prozess...,Bern
3,2019-11-16 22:20:23.324456,Geberit AG,Product Data Analyst (m/w),15.11.2019,Job descriptionInfoCompanyProduct Data Analyst...,Rapperswil-Jona
4,2019-11-16 22:20:25.482604,Vorwerk International & Co. KmG,Senior Quality Data Analyst,14.11.2019,Job descriptionInfoCompanySenior Quality Data ...,Wollerau
...,...,...,...,...,...,...
205,2019-11-16 22:27:40.209296,La Prairie Group AG,Global Digital Media Manager,01.10.2019,Job descriptionInfoCompanyGlobal Digital Media...,Volketswil
206,2019-11-16 22:27:42.307424,F. Hoffmann-La Roche AG,Postdoctoral Fellow Drug product design method...,29.10.2019,Job descriptionInfoCompanyPostdoctoral Fellow ...,Basel
207,2019-11-16 22:27:44.434497,Novartis AG,DSAI Advanced Visual Analytics Scientist,28.07.2019,Job descriptionInfoCompanyDSAI Advanced Visual...,Basel
208,2019-11-16 22:27:46.473890,Université de Lausanne,Bioinformatician 60%-80%,08.10.2019,Job descriptionInfoCompanyBioinformatician 60%...,Lausanne


### DATA PYTHON

In [10]:
#DATA PYTHON
# Build the search URL from its fixed parts. The same parts are reused further
# down to address each individual result page of this search term.
link_first_part = 'https://www.jobs.ch'
link_mid_1_part = '/en/vacancies/?page='
link_mid_2_part = '&term='
link_mid_3_part = "Data%20Python"

# Request page 1 explicitly instead of sending an empty `page=` parameter.
link = link_first_part + link_mid_1_part + str(1) + link_mid_2_part + link_mid_3_part

response = requests.get(link, timeout = 15)
# Fail with a clear HTTP error here rather than with an opaque parsing error
# later, when an error page is searched for result elements.
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

# Filled by get_links() with the absolute URL of every vacancy found.
job_links = []
def get_links(soup, job_links, base_url='https://www.jobs.ch'):
    """Append the absolute URL of every vacancy on a result page to ``job_links``.

    Parameters
    ----------
    soup : parsed search-result page; must offer ``find_all`` (e.g. BeautifulSoup).
    job_links : list that is extended in place with the URLs found.
    base_url : prefix put in front of each relative ``href``. The default is
        the site root, so the function no longer depends on a notebook global.

    Returns
    -------
    None. Results are delivered by mutating ``job_links``.

    Result cards without the expected anchor, or whose anchor has no ``href``,
    are skipped instead of aborting the whole crawl.
    """
    cards = soup.find_all('div', class_ = 'Box-sc-7ekkso-0 Position-b2pct5-0 Position__Relative-b2pct5-1 VacancySerpItem__ShadowBox-p4qu0m-0 hthPRS')
    for card in cards:
        anchor = card.find('a', {'class':'x--job-link t--job-link SearchVacancyResultsComponent__StyledVacancySerpItem-n25jij-0 dQDQbr'})
        # find() yields None when the card holds no matching anchor.
        href = anchor.get('href') if anchor is not None else None
        if href:
            job_links.append(base_url + href)



# Read the number of result pages from the first page: the third
# whitespace-separated token of the matched element's text is used.
# NOTE(review): presumably this element is the pager — confirm against the
# live markup.
pager = soup.find('div', class_ = 'Div-v2w9ke-0 Flex-sc-4aokm-0 eykbax')
max_pages = pager.text.split()[2]

# Walk through every result page and harvest the vacancy links it lists.
last_page = int(max_pages)
for page in tqdm(range(1, last_page + 1)):
    url = ''.join([link_first_part, link_mid_1_part, str(page), link_mid_2_part, link_mid_3_part])
    response = requests.get(url, timeout = 15)
    soup = BeautifulSoup(response.content, 'html.parser')
    get_links(soup, job_links)
    # Pause 0.6 s between consecutive requests.
    sleep(0.6)

def get_content(soup):
    """Extract the fields of interest from a parsed vacancy page.

    Parameters
    ----------
    soup : parsed vacancy page; must offer ``find`` (e.g. BeautifulSoup).

    Returns
    -------
    tuple ``(content, title, company, published, location)``. A field whose
    element is not present on the page is returned as ``np.nan``.

    Only the "element not found" case (``find`` returned ``None``, so the
    follow-up attribute access raises ``AttributeError``) is turned into
    ``np.nan``. Anything else, including ``KeyboardInterrupt``, propagates
    instead of being silently swallowed.
    """
    def _or_nan(extract):
        # Run one extraction step; a missing element becomes NaN.
        try:
            return extract()
        except AttributeError:
            return np.nan

    content = _or_nan(lambda: soup.find('div', class_ = 'Div-v2w9ke-0 fjQgMg').get_text())
    title = _or_nan(lambda: soup.find('div', {'class' : 'Div-v2w9ke-0 hPuVjT'}).find('h1').get('title'))
    company = _or_nan(lambda: soup.find('div', class_ = 'Div-v2w9ke-0 cvwY').find('a').get('title'))
    published = _or_nan(lambda: soup.find('span', class_ = 'Span-bhy2uh-0 Badge-ndaeev-0 krRxxu').get_text())
    location = _or_nan(lambda: soup.find('span', class_ = 'Span-bhy2uh-0 Text__span-sc-1vcmz87-8 YbklG Span-bhy2uh-0 Text__span-sc-1vcmz87-8 Text-sc-1vcmz87-9 gdfMMD').get_text())

    return content, title, company, published, location

cols = ['date', 'company', 'title', 'published', 'content', 'location']

# Download every vacancy page and keep one record per job. Records are
# collected in a plain list and turned into a frame once at the end:
# growing a frame row by row with DataFrame.append copies the whole frame on
# each iteration, and that method no longer exists in pandas >= 2.0.
records = []
for job in tqdm(job_links):
    response = requests.get(job, timeout = 15)
    soup = BeautifulSoup(response.content, 'html.parser')
    content, title, company, published, location = get_content(soup)

    records.append({
        'date': datetime.now(),  # time at which this vacancy was scraped
        'company': company,
        'title': title,
        'published': published,
        'content': content,
        'location': location
    })
    # Pause 0.6 s between consecutive requests.
    sleep(0.6)

# `columns = cols` fixes the column order and keeps the columns present even
# when no job link was collected.
df_datapython = pd.DataFrame(records, columns = cols)

# Remove the em dash from the location text as a literal (non-regex)
# replacement. The cast to object keeps the .str accessor usable when no
# location at all could be scraped and the column holds only NaN.
df_datapython['location'] = df_datapython['location'].astype(object).str.replace('—', '', regex = False)
df_datapython

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(IntProgress(value=0, max=29), HTML(value='')))




Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(IntProgress(value=0, max=571), HTML(value='')))




Unnamed: 0,date,company,title,published,content,location
0,2019-11-16 22:30:49.532482,gateB AG,Senior Consultant Data Science,24.10.2019,Job descriptionInfoCompanySenior Consultant Da...,Steinhausen
1,2019-11-16 22:30:51.448850,Leica Geosystems AG,Machine Learning Engineer (f/m),09.11.2019,Job descriptionInfoCompanyMachine Learning Eng...,Heerbrugg
2,2019-11-16 22:30:53.386530,Arcanite Solutions Sàrl,Développeur/-euse Python orienté Web,14.11.2019,Job descriptionInfoCompanyArcanite est une jeu...,Puidoux
3,2019-11-16 22:30:55.391208,Noser Engineering AG,Data Engineer mit Flair für Analytics,15.11.2019,Job descriptionInfoCompanyData Engineer mit Fl...,Winterthur
4,2019-11-16 22:30:58.303359,Geberit AG,Product Data Analyst (m/w),15.11.2019,Job descriptionInfoCompanyProduct Data Analyst...,Rapperswil-Jona
...,...,...,...,...,...,...
566,2019-11-16 22:51:31.597390,Swisscom (Schweiz) AG,DevOps Engineer Container Platform 80% bis 100%,04.09.2019,Job descriptionInfoDevOps Engineer Container P...,Zürich oder Bern
567,2019-11-16 22:51:34.100681,Swisscom (Schweiz) AG,Cloud System Engineer 60%,28.10.2019,Job descriptionInfoCloud System Engineer 60% ...,Bern-Ittigen or Zurich
568,2019-11-16 22:51:36.329481,Microsoft Schweiz GmbH,Developer Engagement Lead,08.11.2019,Job descriptionInfoCompanyDeveloper Engagement...,"Wallisellen, Zürich"
569,2019-11-16 22:51:38.539638,Swisscom (Schweiz) AG,Senior System Engineer 80% bis 100%,16.10.2019,Job descriptionInfoSenior System Engineer 80% ...,Ittigen oder Zürich
