# Simple Web Scraping in Python
### My first attempt at gathering static HTML content from the Pracuj.pl and RocketJobs job boards and presenting it in tabular format. The searches use the keywords 'junior data' (Pracuj.pl) and 'data analyst' (RocketJobs).

In [2]:
from IPython.display import display, HTML
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re

## Scraping content from RocketJobs

In [3]:
# Column-oriented accumulator: one list per output column, one entry per offer.
rocket_jobs_dict = {
    'Job':[],
    'Employer':[],
    'Localization':[],
    'Keywords':[]
}

# Leading run of letters in the location text. The class covers the full
# Polish alphabet (including Ą/ą and Ę/ę) plus 'ü', so names such as
# "Częstochowa" are not cut short at the first diacritic.
_CITY_PATTERN = re.compile('^[A-Za-zĄąĆćĘęŁłŃńÓóŚśŹźŻżü]+')


def _tag_text(tag):
    """Return ``tag.text``, or an empty string when the lookup found no tag."""
    return tag.text if tag is not None else ''


# A timeout keeps a stalled connection from hanging the cell forever;
# raise_for_status() surfaces HTTP errors instead of parsing an error page.
response = requests.get(
    'https://rocketjobs.pl/wszystkie-lokalizacje/bi-data?orderBy=DESC&sortBy=newest&keyword=data+analyst',
    timeout=10,
)
response.raise_for_status()
html_txt = response.text
soup = BeautifulSoup(html_txt, 'html.parser')
# NOTE(review): the css-* class names look auto-generated and may change when
# the site is redeployed - confirm they still match if the result is empty.
offers = soup.find_all('div', class_='css-1tksz28')

for offer in offers:
    city = _tag_text(offer.find('div', class_='css-1wao8p8'))
    city_match = _CITY_PATTERN.search(city)

    # Resolve every field before appending anything, so a problem with one
    # field can never leave the column lists with different lengths.
    row = {
        'Job': _tag_text(offer.h2),
        'Employer': _tag_text(offer.find('div', class_='css-jx23jo')),
        # Fall back to the raw text when it does not start with a letter.
        'Localization': city_match.group(0) if city_match else city,
        'Keywords': [keyword.text for keyword in offer.find_all('div', class_='css-12973y2')],
    }
    for column, value in row.items():
        rocket_jobs_dict[column].append(value)

rocket_jobs_df = pd.DataFrame(rocket_jobs_dict)
# Tag every row with its source site as the first column.
rocket_jobs_df.insert(0, 'Site', 'RocketJobs')

## Scraping content from Pracuj.pl

In [4]:
# Column-oriented accumulator for Pracuj.pl offers: each column name maps to
# its own fresh, empty list.
pracuj_pl_dict = {
    column: [] for column in ('Job', 'Employer', 'Localization', 'Keywords')
}

# Four search-result URLs: the first carries no query string, the remaining
# three add ?pn=2 .. ?pn=4 (presumably the result page number - confirm).
pp_urls = tuple(
    'https://www.pracuj.pl/praca/junior%20data;kw' + ('' if page == 1 else f'?pn={page}')
    for page in range(1, 5)
)

def pp_fill(urls, timeout=10):
    """Scrape Pracuj.pl result pages and append their offers to ``pracuj_pl_dict``.

    For every offer tile found on every page, one entry is appended to each
    of the 'Job', 'Employer', 'Localization' and 'Keywords' lists of the
    module-level ``pracuj_pl_dict``.

    Args:
        urls: Iterable of result-page URLs to download.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        None. The results are written to ``pracuj_pl_dict`` in place.
    """
    import warnings

    def _tag_text(tag):
        # A missing heading becomes '' so later string operations on the
        # column (e.g. len()) keep working.
        return tag.text if tag is not None else ''

    for url in urls:
        response = requests.get(url, timeout=timeout)
        if not response.ok:
            # An error page contains no offers; report it instead of silently
            # parsing it, and keep whatever the other pages provide.
            warnings.warn(f'Skipping {url}: HTTP {response.status_code}')
            continue

        soup = BeautifulSoup(response.text, 'html.parser')
        # NOTE(review): the tiles_* class names look auto-generated and may
        # change when the site is redeployed - confirm if nothing is found.
        offers = soup.find_all('div', class_='tiles_c1lzjarj')

        for offer in offers:
            keyword_list = offer.ul
            keywords = (keyword_list.find_all('li', class_='mobile-hidden tiles_iwlrcdk')
                        if keyword_list is not None else [])

            # Resolve every field before appending anything, so the column
            # lists always stay the same length.
            row = {
                'Job': _tag_text(offer.h2),
                'Employer': _tag_text(offer.h4),
                'Localization': _tag_text(offer.h5),
                'Keywords': [keyword.text for keyword in keywords],
            }
            for column, value in row.items():
                pracuj_pl_dict[column].append(value)

# Download every listed page; pp_fill appends the offers to pracuj_pl_dict.
pp_fill(pp_urls)

# Turn the filled column lists into a table and put the source-site label
# in the first column.
pracuj_pl_df = pd.DataFrame(data=pracuj_pl_dict)
pracuj_pl_df.insert(loc=0, column='Site', value='Pracuj.pl')

## Display of scraped data

In [5]:
# Stack the two per-site tables into one frame with a fresh 0..n-1 index.
job_offers = pd.concat([rocket_jobs_df, pracuj_pl_df], ignore_index=True)

# Location strings longer than 50 characters are replaced with 'Cała Polska'
# (presumably these are long multi-city listings - confirm).
is_overlong = [len(place) > 50 for place in job_offers['Localization']]
job_offers.loc[is_overlong, 'Localization'] = 'Cała Polska'

# Render the complete table as HTML in the cell output.
table_html = job_offers.to_html()
display(HTML(table_html))

Unnamed: 0,Site,Job,Employer,Localization,Keywords
0,RocketJobs,Junior Data Analyst 📈,PROXI.CLOUD SP. Z O.O.,Poznań,"[SQL, data-driven marketing, Analiza Danych, New]"
1,RocketJobs,Product Data Analyst,STS,Katowice,"[SQL, BigQuery, Google Tag Manager, New]"
2,RocketJobs,Data Engineer,Profitroom,Poznań,"[Zdalnie, PySpark, Python, Data modeling, New]"
3,RocketJobs,BI Analyst,Lite e-Commerce,Poznań,"[SQL, PowerBI, data analyst]"
4,RocketJobs,Lider Zespołu Analiz,Moliera2 S.A.,Warszawa,"[Snowflake, Big Query, machine learning]"
5,RocketJobs,Data Governance Specialist - EDM (Hybrid),Northvolt,Gdańsk,"[Process Improvement, process implementation, communication skills]"
6,RocketJobs,Data Analyst w marce Mohito,LPP S.A.,Kraków,"[MS Excel, Angielski, Google Analytics, New]"
7,RocketJobs,Młodszy Analityk Internetowy,CANAL+ Polska S.A.,Warszawa,"[data analyst, analiza, e-commerce, New]"
8,RocketJobs,Senior Business Analyst,Motorola Solutions,Kraków,"[Angielski, data analyst, New]"
9,RocketJobs,Business Data Analyst,RocketJobs.pl,Gdańsk,"[Angielski, SQL, PowerBI, New]"
