Importing necessary libraries

In [53]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import plotly

## Scraping the pages

In [54]:
def scrape_job_openings(page_num,query):
    # query = "Data Scientist"
    # page_num = 1

    url = f"https://www.jobs.ch/en/vacancies/?page={page_num}&term={query.replace(' ', '%20')}"
    print(url)

    page = requests.get(url, timeout=2)
    
    soup = BeautifulSoup(page.content, "html.parser")

    job_ads = soup.find_all('article')
    job_dicts = []

    for job in job_ads:
        
        job_link_tag = job.find('a',{'data-cy' : 'job-link'})
        # title = job_link_tag.get('title')
        url = job_link_tag.get('href')
        
        try:
            location = job.find_all('p')[1].get_text()
        except AttributeError:
            location = None

        try:
            date = job.find_all('p')[0].get_text(';').split(';')[0]
            # print(date)
        except AttributeError:
            date = None
        try:
            company_elem = job.find('div', {'class': 'd_grid ai_center gap_s12 grid-tc_[auto_1fr] mt_auto pt_s16'}).find_all('p', {'class':'textStyle_p2'})
            company = company_elem[0].get_text() if company_elem else ''
        except AttributeError:
            company = None
        try:
            salary_range = job.find_all('p')[2].get_text()
            # print(salary_range)
        except AttributeError:
            salary_range = None
        try:
            title = job.find('div', {'class': 'mb_s8'}).get_text()
            title_parts = title.split()
            new_title_parts = []
            for part in title_parts:
                if not any(char.isdigit() for char in part):
                    new_title_parts.append(part)
                else:
                    break
            title = ' '.join(new_title_parts)
            if '(' in title or ')' in title:
                title = title.split('(')[0].strip()
            if not title:
                title = None
            # print(title)
        except AttributeError:
            title = None
        
        job_dict = {
                    "title": title,
                    'salary_range': salary_range,
                    "company": company,
                    "location": location,
                    "date": date,
                    "query": query,
                    "url": url
                }
        
        job_dicts.append(job_dict)
    return job_dicts


## Scrapping pages based on the number page and the keyWords

In [55]:
keywords = ["Data Scientist", "Data Analyst", "Python Developer", "Data Engineer", "Data Manager", "Data Architect", "Big Data Analyst", "Data Python"]
df_data = {}

for keyword in keywords:
    data = scrape_job_openings(1, keyword)
    df_data[keyword] = data


https://www.jobs.ch/en/vacancies/?page=1&term=Data%20Scientist
https://www.jobs.ch/en/vacancies/?page=1&term=Data%20Analyst
https://www.jobs.ch/en/vacancies/?page=1&term=Python%20Developer
https://www.jobs.ch/en/vacancies/?page=1&term=Data%20Engineer
https://www.jobs.ch/en/vacancies/?page=1&term=Data%20Manager
https://www.jobs.ch/en/vacancies/?page=1&term=Data%20Architect
https://www.jobs.ch/en/vacancies/?page=1&term=Big%20Data%20Analyst
https://www.jobs.ch/en/vacancies/?page=1&term=Data%20Python


## Creating a Dataframe out of the data returned in the dictionary based on keywords

In [56]:
df = pd.concat([pd.DataFrame(df_data[keyword]) for keyword in keywords])
df.head(3)

Unnamed: 0,title,salary_range,company,location,date,query,url
0,Junior Data Scientist,80 – 100%,7Days Media Services GmbH,Egerkingen,Published: 11 September 2024,Data Scientist,/en/vacancies/detail/f3f91a19-5d06-4b20-be15-c...
1,IT MDM Analyst Data Management,100%,Franke Group,Aarburg,Published: 21 October 2024,Data Scientist,/en/vacancies/detail/374d8592-2266-4440-9dad-d...
2,Intern - Technology Consulting - AI & Data,100%,EY (Ernst & Young AG),Zurich,Published: 05 October 2024,Data Scientist,/en/vacancies/detail/f135255f-9e64-417c-8584-6...


* How many jobs are shared between these categories?

In [67]:
duplicates = df[df.duplicated(subset='title', keep=False)]
grouped_data = duplicates.groupby('title').count()
grouped_data.head()

Unnamed: 0_level_0,salary_range,company,location,date,query,url
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Analytics Specialist / Data Scientist,3,3,3,3,3,3
Central Data Agent - CDD,3,3,3,3,3,3
Cloud Data Platform Expert/Architect,3,3,3,3,3,3
Data Architect,2,2,2,2,2,2
Data Engineer,6,6,6,6,6,6


In [85]:
import plotly.graph_objects as go
import plotly.express as px

fig = px.scatter(grouped_data,
                #  color='salary_range',
                 size="query",#Shape of a Marker
                 log_x=False # Sets the scale of the x-axis into linear instead of logarithmic scael
                )
fig.show()

* How much the keywords: “Data Analyst” and “Big Data Analyst” overlap?


In [None]:
overlap_count = df[df['query'].str.contains('Data Analyst') & df['query'].str.contains('Big Data Analyst')]
print(len(overlap_count))
overlap_count['title']

* Are there some companies doing more hires than average?


In [60]:

# Group the DataFrame by the 'company' column and count the number of rows for each group
company_counts = df.groupby('company').count()
# Calculate the average number of hires per company
avg_count = company_counts['title'].mean()
high_hires = company_counts[(company_counts['title'] > avg_count) & (company_counts.index != '')]
highest_company_hiring = high_hires.max()
# print(high_hires)
print(avg_count)
highest_company_hiring.head


1.696629213483146


<bound method NDFrame.head of title           6
salary_range    6
location        6
date            6
query           6
url             6
dtype: int64>

* How many jobs are there in different Kantons?


In [61]:
duplicates = df[df.duplicated(subset='location', keep=False)]
grouped_data = duplicates.groupby('location').count()
grouped_data.head()

Unnamed: 0_level_0,title,salary_range,company,date,query,url
location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1007 Lausanne,2,2,2,2,2,2
1296 Coppet,2,2,2,2,2,2
Aarau,3,3,3,3,3,3
Aarburg,3,3,3,3,3,3
Adliswil,4,4,4,4,4,4


In [80]:

fig = px.scatter(grouped_data,
                 color='location',
                 size="query",#Shape of a Marker
                 log_x=False # Sets the scale of the x-axis into linear instead of logarithmic scael
                )

# Set the area of the bubbles proportional to the petal's length
fig.update_traces(marker=dict(sizemode='area', sizeref=0.1))

# Update layout
fig.update_layout(title='Jobs in different Cantons')

# Show plot
fig.show()


* Is “machine learning” keyword more often in data scientist or data analyst jobs?


In [63]:
# Select the rows where the job title contains 'data scientist' or 'data analyst'
#na_counts = df.isna().sum()
#na_counts
# na_counts = 
df.dropna()



Unnamed: 0,title,salary_range,company,location,date,query,url
0,Junior Data Scientist,80 – 100%,7Days Media Services GmbH,Egerkingen,Published: 11 September 2024,Data Scientist,/en/vacancies/detail/f3f91a19-5d06-4b20-be15-c...
1,IT MDM Analyst Data Management,100%,Franke Group,Aarburg,Published: 21 October 2024,Data Scientist,/en/vacancies/detail/374d8592-2266-4440-9dad-d...
2,Intern - Technology Consulting - AI & Data,100%,EY (Ernst & Young AG),Zurich,Published: 05 October 2024,Data Scientist,/en/vacancies/detail/f135255f-9e64-417c-8584-6...
3,Expert Monitoring & Data Analytics Claims Non-...,60 – 100%,Generali Personenversicherung AG,Adliswil,Published: 14 October 2024,Data Scientist,/en/vacancies/detail/790dee41-e8e5-4d6a-b825-b...
4,Fachspezialist/in internes Meldewesen,100%,Kernkraftwerk Leibstadt AG,Leibstadt,Published: 23 October 2024,Data Scientist,/en/vacancies/detail/91d8d0ce-0119-444c-bca9-1...
...,...,...,...,...,...,...,...
14,Senior Data Engineer,80 – 100%,Comet AG,Flamatt,Published: 18 September 2024,Data Python,/en/vacancies/detail/a0c65b24-8ad0-4e95-b3c9-a...
15,Data & AI Platform Engineer,80 – 100%,Migros Bank AG,Wallisellen,Published: 14 October 2024,Data Python,/en/vacancies/detail/2220c422-4a0a-4b4b-a0bb-7...
16,Senior Data Engineer for IoT Products,80 – 100%,Sensirion AG,Stäfa,Published: 14 October 2024,Data Python,/en/vacancies/detail/043ff5e2-3f32-4118-b41a-5...
17,Backend Engineer,100%,42matters AG (Similarweb company),Zürich,Published: 11 October 2024,Data Python,/en/vacancies/detail/05e6c7bf-1f0c-42e4-9017-4...
