1. Wyszukać wszystkie pliki w katalogu data/raw
2. Dla każdego pliku:
    a. podzielić go na kawałki odpowiadające ofertom poszczególnych firm
    b. wyciągnąć z nich potrzebne informacje jako słownik
    c. dodać wyniki do jednej listy
3. Z listy zrobić dataframe
4. Zapisać dataset w katalogu data/interim z aktualną datą w nazwie pliku

In [37]:
import os
import re
from decimal import Decimal
from datetime import date

import pandas as pd
from bs4 import BeautifulSoup

In [14]:
def parse_html(offer_html, job):
    """Build the record describing a single job offer.

    Args:
        offer_html: Element of one offer; it must provide ``select`` and
            ``select_one`` (presumably a BeautifulSoup tag -- confirm
            against the caller).
        job: Job label stored unchanged under the ``'job'`` key.

    Returns:
        A dict with the keys ``name``, ``company``, ``technology`` (list of
        stripped strings), ``job``, ``location`` (result of ``parse_city``)
        and ``salary`` (dict with ``low``, ``high`` and ``currency`` taken
        from ``parse_salary``).

    Raises:
        AttributeError: If a selector used with ``select_one`` matches
            nothing, because ``.text`` is then read from ``None``.
    """

    def stripped_text(selector):
        # Text of the first node matching the selector, without padding.
        return offer_html.select_one(selector).text.strip()

    title = stripped_text('.posting-title__position')
    employer = stripped_text('.company-name')

    tech_names = []
    for node in offer_html.select('.tiles-container span:last-child'):
        tech_names.append(node.text.strip())

    raw_location = stripped_text('nfj-posting-item-city > div:first-of-type')
    low, high, currency = parse_salary(stripped_text(".salary"))
    location = parse_city(raw_location)

    offer = {}
    offer['name'] = title
    offer['company'] = employer
    offer['technology'] = tech_names
    offer['job'] = job
    offer['location'] = location
    offer['salary'] = {'low': low, 'high': high, 'currency': currency}
    return offer
    

In [4]:
def parse_city(city_raw):
    """Split a raw location label into a city and a country.

    Rules, checked in this order:

    1. Text containing ``Zdalnie`` is a remote offer:
       city ``'Zdalna'``, country ``'N/A'``.
    2. Empty or whitespace-only text: both values are ``'N/A'``.
    3. ``"<city>, <country>"``: everything before the comma is the city, so
       multi-word and hyphenated names are kept whole.
    4. Anything else: the text before the first ``+`` is the city and the
       country defaults to ``'PL'``.

    Args:
        city_raw: Location text taken from the offer.

    Returns:
        A dict with the keys ``'city'`` and ``'country'``.
    """
    city_raw = city_raw.strip()

    if 'Zdalnie' in city_raw:
        return {'city': 'Zdalna', 'country': 'N/A'}

    if not city_raw:
        return {'city': 'N/A', 'country': 'N/A'}

    # The city part may hold spaces or hyphens, so match any run of
    # characters other than ',' and '+' instead of a single \w+ word,
    # which would cut "Bielsko-Biała" down to "Biała".
    match_city_country = re.search(r'([^,+]+?)\s*,\s*(\w+)', city_raw)
    if match_city_country:
        city, country = match_city_country.groups()
        return {'city': city.strip(), 'country': country}

    # No country given: drop anything after the first "+" and assume Poland.
    return {'city': city_raw.split("+")[0].strip(), 'country': 'PL'}

In [28]:


def parse_salary(salary_raw):
    """Parse a raw salary label into ``(low, high, currency)``.

    Non-breaking spaces and en dashes are removed before the text is split
    on whitespace, so ``"15\xa0000 – 22\xa0000 PLN"`` yields the tokens
    ``15000``, ``22000`` and ``PLN``.

    Args:
        salary_raw: Salary text taken from the offer.

    Returns:
        A tuple ``(low, high, currency)`` where ``low`` and ``high`` are
        ``Decimal`` values and ``currency`` is a string. A label with a
        single amount returns that amount as both bounds. An empty label,
        or one starting with a purely alphabetic word, returns
        ``(Decimal('0'), Decimal('0'), 'N/A')``.

    Raises:
        IndexError: If the label has a single non-alphabetic token.
        decimal.InvalidOperation: If an amount token is not a valid number.
    """
    tokens = salary_raw.replace("\xa0", "").replace("–", "").split()

    # Nothing to parse, or the label starts with a word instead of a number.
    if not tokens or tokens[0].isalpha():
        return Decimal("0"), Decimal("0"), "N/A"

    if len(tokens) == 2:
        # Single amount plus currency. The currency stays a string:
        # converting it to Decimal raises InvalidOperation.
        amount, currency = tokens
        value = Decimal(amount)
        return value, value, currency

    return Decimal(tokens[0]), Decimal(tokens[1]), tokens[2]

In [5]:
def get_offers(html):
    """Return whatever ``html.select`` yields for the offer selector.

    Args:
        html: Object providing ``select(css_selector)`` (presumably a
            parsed BeautifulSoup document -- confirm against the caller).
    """
    offer_selector = '.posting-list-item'
    offers = html.select(offer_selector)
    return offers
    

In [15]:
# Smoke test: parse one known raw file and print three of its offers.
sample_path = os.path.join('..', 'data', 'raw', 'data_analyst_20240311_204547.html')

# Explicit encoding, the same as in process_data, so that decoding does not
# depend on the platform default.
with open(sample_path, encoding='UTF-8') as file:
    soup = BeautifulSoup(file, 'html.parser')

# The job label is the file name up to the "_20..." timestamp, with
# underscores turned into spaces. basename() handles both "/" and "\\".
job = os.path.basename(sample_path).split('_20')[0].replace('_', ' ')

offers_html = get_offers(soup)
print(parse_html(offers_html[0], job))
print(parse_html(offers_html[10], job))
print(parse_html(offers_html[11], job))
    

{'name': 'Senior Data Analyst - Tech Support', 'company': 'RTB House', 'technology': ['Data', 'Python', 'SQL', 'Node.js'], 'job': 'data analyst', 'location': {'city': 'Zdalna', 'country': 'N/A'}, 'salary': {'low': 15000, 'high': 22000, 'currency': 'PLN'}}
{'name': 'Medior/Senior Business Data Analyst / BI engineer', 'company': 'NIX Tech Kft.', 'technology': ['Business Analysis', 'Hungarian Language', 'DWH', 'SQL'], 'job': 'data analyst', 'location': {'city': 'Budapest', 'country': 'HU'}, 'salary': {'low': 5979, 'high': 11958, 'currency': 'PLN'}}
{'name': 'Data Scientist/Data Analyst', 'company': 'Object First', 'technology': ['Data', 'Python', 'Power BI', 'Jupyter'], 'job': 'data analyst', 'location': {'city': 'Zdalna', 'country': 'N/A'}, 'salary': {'low': 24000, 'high': 36000, 'currency': 'PLN'}}


In [31]:
def process_data(path):
    """Parse every file directly inside ``path`` into offer records.

    Each file is read as UTF-8 HTML. The job label of its offers is the
    file name up to the first ``"_20"`` (the start of the timestamp in the
    sample file name), with underscores replaced by spaces.

    Args:
        path: Directory holding the raw HTML files.

    Returns:
        A list with one ``parse_html`` result per offer found, in directory
        iteration order. Empty when the directory holds no regular files.
    """
    results = []

    # The context manager releases the directory handle even when parsing
    # one of the files raises.
    with os.scandir(path) as entries:
        for entry in entries:
            # Sub-directories cannot be opened as files; skip them instead
            # of aborting the whole run.
            if not entry.is_file():
                continue

            with open(entry.path, encoding='UTF-8') as file:
                soup = BeautifulSoup(file, 'html.parser')

            # entry.name carries no directory part, so no separator-specific
            # splitting is needed.
            job = entry.name.split('_20')[0].replace('_', ' ')

            results.extend(parse_html(offer, job) for offer in get_offers(soup))

    return results
            

In [18]:
# Relative location of the directory holding the raw HTML files.
raw_dir_parts = ('..', 'data', 'raw')
path = os.path.join(*raw_dir_parts)

In [33]:
# Parse every file under `path` into one list of offer dictionaries.
job_offers = process_data(path)

In [35]:
# Flatten the nested offer dicts into columns; nested keys become dotted
# column names such as location.city and salary.low.
df = pd.json_normalize(job_offers)
# Bare expression: shows the frame when the cell is run in a notebook.
df

Unnamed: 0,name,company,technology,job,location.city,location.country,salary.low,salary.high,salary.currency
0,Senior Data Engineer,Showpad,"[Data, SQL, Data Lake, Python]",data engineer,Wroclaw,PL,22000,30000,PLN
1,Senior Data Engineer - Remote,Link Group,"[Data, SQL, Data pipelines, Cloud computing]",data engineer,Zdalna,,26880,31920,PLN
2,Snowflake Data Engineer,GTV POLAND,"[Data, Snowflake, ETL, Python]",data engineer,Zdalna,,14000,18000,PLN
3,Data Engineer,Finture,"[Data, Bash, GCP, Storage]",data engineer,Warszawa,PL,20000,27000,PLN
4,Senior Data Engineer (Azure),GetInData | Part of Xebia,"[Data, Microsoft azure, Python, SQL]",data engineer,Zdalna,,26880,33600,PLN
...,...,...,...,...,...,...,...,...,...
114,DevOps Engineer (DataOps),Avenga,"[DevOps, Data analyst, Python, SQL]",data analyst,Zdalna,,27720,31080,PLN
115,Data Engineer,Avenga,"[Business Intelligence, Data analyst, BI, ETL]",data analyst,Warszawa,PL,25200,31080,PLN
116,Data Engineer,Profitroom,"[Data, Data analyst, Spark, Data engineering]",data analyst,Zdalna,,14900,21400,PLN
117,Inżynier / Analityk danych,Dogtronic Solutions,"[Data, Data analyst, SQL, BigQuery]",data analyst,Warszawa,PL,19950,28350,PLN


In [39]:
# Write the dataset to data/interim as a ';'-separated CSV whose name ends
# with today's date. The trailing underscore in `date_` keeps the imported
# `date` class from being rebound to a string.
date_ = date.today().strftime("%Y_%m_%d")
interim_dir = os.path.join('..', 'data', 'interim')
# Create the target directory first so the export does not fail when it is
# missing.
os.makedirs(interim_dir, exist_ok=True)
df.to_csv(os.path.join(interim_dir, f'job_offers_{date_}.csv'), sep=';', index=False)

AttributeError: 'str' object has no attribute 'today'