In [46]:
import re
import json
from bs4 import BeautifulSoup
from functools import lru_cache
from scripting.parser.base_parser import BaseParser


class PracujPLParser(BaseParser):
    JSON_PATTERN = r"window\['kansas-offerview'\]\s*=\s*(\{.*?\});"
    RESULT_TEMPLATE = {
        'url': None,
        'site': 'pracuj.pl',
        'company_name': None,
        'company_url': None,
        'company_description': None,
        'offer_title': None,
        'position_level': None,
        'technology_list': None,
        'offer_description': None,
        'requirements': None,
        'responsibilities': None,
        'language': None,
        'city': None,
        'salary': None,
        'work_type': None,
    }
    WORKING_TIME = 8
    WORKING_DAYS = 20

    JSON_PATHS = {
        'company_name': ['props', 'pageProps', 'dehydratedState', 'queries', 0, 'state', 'data', 'attributes', 'displayEmployerName'],
        'company_description': ['props', 'pageProps', 'dehydratedState', 'queries', 0, 'state', 'data', 'sections'],
        'offer_title': ['props', 'pageProps', 'dehydratedState', 'queries', 0, 'state', 'data', 'attributes', 'jobTitle'],
        'position_level': ['props', 'pageProps', 'dehydratedState', 'queries', 0, 'state', 'data', 'attributes', 'employment', 'positionLevels'],
        'technologies': ['props', 'pageProps', 'dehydratedState', 'queries', 0, 'state', 'data', 'textSections'],
        'requirements': ['props', 'pageProps', 'dehydratedState', 'queries', 0, 'state', 'data', 'sections'],
        'responsibilities': ['props', 'pageProps', 'dehydratedState', 'queries', 0, 'state', 'data', 'sections'],
        'language': ['props', 'pageProps', 'dehydratedState', 'queries', 0, 'state', 'data', 'jobOfferLanguage', 'isoCode'],
        'salary': ['props', 'pageProps', 'dehydratedState', 'queries', 0, 'state', 'data', 'attributes', 'employment', 'typesOfContracts'],
        'work_type': ['props', 'pageProps', 'dehydratedState', 'queries', 0, 'state', 'data', 'attributes', 'employment', 'workModes'],
        'location': ['props', 'pageProps', 'dehydratedState', 'queries', 0, 'state', 'data', 'attributes', 'workplaces'],
    }

    def __init__(self, parsed_site):
        super().__init__(parsed_site)

    def initialize_variables(self, page_content, url):
        self.url = url
        self.doc = BeautifulSoup(page_content, 'html.parser')
        self.page_json = self.extract_json()
        self._cached_base_section


    def parse(self, page_content, url):
        result = self.RESULT_TEMPLATE.copy()
        self.initialize_variables(page_content, url)
        self.parse_offer_data(result)
        return result

    def parse_offer_data(self, result):
        result.update({
            'url': self.url,
            'company_name': self.get_json_value(self.page_json, self.JSON_PATHS['company_name']),
            'company_description': self.parse_company_description(),
            'offer_title': self.get_json_value(self.page_json, self.JSON_PATHS['offer_title']),
            'position_level': self.get_json_value(self.page_json, self.JSON_PATHS['position_level']),
            'technology_list': self.parse_technology_list(),
            'requirements': self.parse_rr_section('requirements'),
            'responsibilities': self.parse_rr_section('responsibilities'),
            'language': self.get_json_value(self.page_json, self.JSON_PATHS['language']),
            'salary': self.parse_salary(),
            'work_type': self.parse_work_type(),
            'city': self.parse_location('city'),
            'country': self.parse_location('country'),
        })

    def parse_work_type(self):
        work_type = self.get_json_value(self.page_json, self.JSON_PATHS['work_type'])
        return [wt['code'] for wt in work_type] if work_type else None

    def parse_location(self, location_type):
        location_data = self.get_json_value(self.page_json, self.JSON_PATHS['location'])
        location_result = set()

        for location in location_data:
            match location_type:
                case 'city':
                    location_result.add(location['inlandLocation']['location']['name'])
                case 'country':
                    location_result.add(location['country']['name'])
                case _:
                    location_result.add(None)
        return location_result

    def parse_salary(self):
        salary_data = self.get_json_value(self.page_json, self.JSON_PATHS['salary'])
        process_salary = lambda salary: {
            'salary_amount': self.salary_amount(salary['salary']),
            'employment_type': salary['name'],
            'currency': salary['salary']['currency']['code']
        }
        return list(map(process_salary, salary_data)) if salary_data else None

    def salary_amount(self, salary):
        return (
            f"{salary['from'] * self.WORKING_TIME * self.WORKING_DAYS}-{salary['to'] * self.WORKING_TIME * self.WORKING_DAYS}"
            if salary['timeUnit']['longForm']['name'] != 'monthly'
            else f"{salary['from']}-{salary['to']}"
        )

    def parse_rr_section(self, section_name):
        base_section = self.get_json_value(self.page_json, self.JSON_PATHS['company_description'])
        description_section = self.find_section(base_section, section_name)
        join_bullets = lambda section: "@@@@".join(section['model']['bullets'])

        if description_section.get('subSections'):
            return "@@@@".join(
                [join_bullets(sub_section) for sub_section in description_section['subSections']]
            ) if description_section else None
        return join_bullets(description_section) if description_section else None

    def parse_technology_list(self):
        technology_list = {}
        offer_technologies = self.get_json_value(self.page_json, self.JSON_PATHS['technologies'])
        get_technology = lambda tech: next(
            (t for t in offer_technologies if t['sectionType'] == tech), {}
        )

        technology_list['required'] = get_technology('technologies-expected').get('textElements')
        technology_list['optional'] = get_technology('technologies-optional').get('textElements')
        return technology_list

    def parse_company_description(self):
        base_section = self.get_json_value(self.page_json, self.JSON_PATHS['company_description'])
        description_section = self.find_section(base_section, 'about-us-description')
        return self.squish(" ".join(description_section['model']['paragraphs'])) if description_section else None

    def find_section(self, section_list, section_name):
        return next((section for section in section_list if section['sectionType'] == section_name), None)

    @staticmethod
    def squish(text):
        return re.sub(r'\s+', ' ', text).strip()

    def extract_json(self):
        json_text = self.doc.find('script', id='__NEXT_DATA__').string
        return json.loads(json_text.replace("'", '"').replace('undefined', '"undefined"'))

    def get_json_value(self, json_obj, json_path, key_index=0):
        try:
            return self.get_json_value(json_obj[json_path[key_index]], json_path, key_index + 1)
        except (IndexError, KeyError, TypeError):
            return json_obj


In [47]:
parser = PracujPLParser('pracuj.pl')
with open('/Users/ulakruts/Personal Project/Praca_Inzynierska/MJ/test.html', 'r') as f:
    html = f.read()
    
parser.parse(html, 'https://www.pracuj.pl/praca/ml-ops-engineer-warszawa,oferta,1003733553')

{'url': 'https://www.pracuj.pl/praca/ml-ops-engineer-warszawa,oferta,1003733553',
 'site': 'pracuj.pl',
 'company_name': 'DCG',
 'company_url': None,
 'company_description': None,
 'offer_title': 'ML Ops Engineer',
 'position_level': [{'id': 4,
   'name': 'specialist (Mid / Regular)',
   'pracujPlName': 'specjalista (Mid / Regular)'}],
 'technology_list': {'required': ['Databricks', 'Azure Cosmos DB', 'ML Ops'],
  'optional': ['Dash', 'Shiny R', 'Streamlit']},
 'offer_description': None,
 'requirements': 'At least 2 years of professional experience in ML Ops or ML engineering, with a focus on deploying and scaling machine learning models in production.@@@@Proficient in English, both written and spoken, suitable for business communication.@@@@Extensive knowledge of the Azure cloud environment and Databricks, including setup and maintenance as an ML platform.@@@@Demonstrated expertise in software development best practices, such as testing, continuous integration, and the use of DevOps t

In [55]:
import json
from playwright.sync_api import sync_playwright

class PlaywrightLogin:
    def __init__(self):
        with open('/Users/ulakruts/Personal Project/Praca_Inzynierska/MJ/MORE_JOBS/scripting/login/proxy_list.json', 'r') as f:
            proxy_list = json.load(f)
        print(proxy_list)
        valid_proxy = next((proxy for proxy in proxy_list if not proxy['banned']), None)
        self.proxy = valid_proxy.get('url') if valid_proxy else None
        
    def login(self):
        with sync_playwright() as p:
            browser = p.chromium.launch(proxy={"server": self.proxy}, headless=False)
            context = browser.new_context()
            page = context.new_page()

            login_url = "https://it.pracuj.pl/praca"
            page.goto(login_url)

            page.locator("//button[@data-test=\"button-submitCookie\"]").click()
            page.wait_for_load_state("networkidle")
            
            cookies = context.cookies()
            user_agent = page.evaluate("() => navigator.userAgent")

            browser.close()

            return {
                "proxy": self.proxy,
                "cookies": cookies,
                "user_agent": user_agent
            }

# Использование
login_instance = PlaywrightLogin()
credentials = login_instance.login()
print(credentials)


[{'url': 'socks5://83.142.126.147:80', 'banned': False}, {'url': 'socks5://197.234.13.61:36902', 'banned': False}, {'url': 'socks5://51.83.133.132:22027', 'banned': False}, {'url': 'socks5://39.187.67.226:1080', 'banned': False}, {'url': 'socks4://185.56.180.14:5678', 'banned': False}, {'url': 'socks4://172.67.75.202:80', 'banned': False}, {'url': 'socks4://146.88.203.171:5678', 'banned': False}, {'url': 'socks5://54.38.242.224:31143', 'banned': False}, {'url': 'socks5://191.103.253.89:63909', 'banned': False}, {'url': 'socks4://156.34.105.58:5678', 'banned': False}, {'url': 'socks4://115.73.220.114:5678', 'banned': False}, {'url': 'socks4://121.239.249.90:8989', 'banned': False}, {'url': 'socks4://46.28.111.54:1080', 'banned': False}, {'url': 'socks5://154.19.185.66:18825', 'banned': False}, {'url': 'socks4://103.48.58.213:8080', 'banned': False}, {'url': 'socks5://45.32.131.86:3000', 'banned': False}, {'url': 'socks4://167.71.164.35:46303', 'banned': False}, {'url': 'socks4://51.89.0

Error: It looks like you are using Playwright Sync API inside the asyncio loop.
Please use the Async API instead.

In [60]:
with open('/Users/ulakruts/Personal Project/Praca_Inzynierska/MJ/Free_Proxy_List.json', 'r') as f:
    proxy_data = json.load(f)

In [77]:
def transform_proxy_list(input_file, output_file):
    with open(input_file, 'r') as f:
        proxy_data = json.load(f)
    proxy_data = sorted(proxy_data, key=lambda x: x.get('latency', float('-inf')), reverse=True)

    #  .sort(key=lambda x: x.get('latency', float('-inf')))
    # proxy_data = sorted(proxy_data, key=proxy_data['la'])
    transformed_list = []
    for proxy in proxy_data:
        transformed_list.append({
            "url": f"{proxy['protocols'][0]}://{proxy['ip']}:{proxy['port']}",
            "banned": False
        })

    with open(output_file, 'w') as f:
        json.dump(transformed_list, f, indent=4)

In [78]:
transform_proxy_list('../Free_Proxy_List.json', 'proxy_lust_transformed.json')

In [51]:
from playwright.sync_api import sync_playwright

In [13]:
import json
import re
from bs4 import BeautifulSoup
from functools import lru_cache
from scripting.parser.base_parser import BaseParser

class PracujPLParser(BaseParser):
    JSON_PATTERN = r"window\['kansas-offerview'\]\s*=\s*(\{.*?\});"
    WORKING_TIME = 8
    WORKING_DAYS = 20
    JSON_PATHS = {
        'company_name': ('props', 'pageProps', 'dehydratedState', 'queries', 0, 'state', 'data', 'attributes', 'displayEmployerName'),
        'company_description': ('props', 'pageProps', 'dehydratedState', 'queries', 0, 'state', 'data', 'sections'),
        'offer_title': ('props', 'pageProps', 'dehydratedState', 'queries', 0, 'state', 'data', 'attributes', 'jobTitle'),
        'position_level': ('props', 'pageProps', 'dehydratedState', 'queries', 0, 'state', 'data', 'attributes', 'employment', 'positionLevels'),
        'technologies': ('props', 'pageProps', 'dehydratedState', 'queries', 0, 'state', 'data', 'textSections'),
        'requirements': ('props', 'pageProps', 'dehydratedState', 'queries', 0, 'state', 'data', 'sections'),
        'responsibilities': ('props', 'pageProps', 'dehydratedState', 'queries', 0, 'state', 'data', 'sections'),
        'language': ('props', 'pageProps', 'dehydratedState', 'queries', 0, 'state', 'data', 'jobOfferLanguage', 'isoCode'),
        'salary': ('props', 'pageProps', 'dehydratedState', 'queries', 0, 'state', 'data', 'attributes', 'employment', 'typesOfContracts'),
        'work_type': ('props', 'pageProps', 'dehydratedState', 'queries', 0, 'state', 'data', 'attributes', 'employment', 'workModes'),
        'location': ('props', 'pageProps', 'dehydratedState', 'queries', 0, 'state', 'data', 'attributes', 'workplaces'),
    }

    def __init__(self, parsed_site):
        super().__init__(parsed_site)

    def initialize_variables(self, page_content, url):
        self.url = url
        self.doc = BeautifulSoup(page_content, 'html.parser')
        self.page_json = self.extract_json()

    def parse(self, page_content, url):
        result = self.RESULT_TEMPLATE.copy()
        self.initialize_variables(page_content, url)
        self.parse_offer_data(result)
        return result

    def parse_offer_data(self, result):
        result.update({
            'url': self.url,
            'company_name': self.get_json_value(self.page_json, self.JSON_PATHS['company_name']),
            'company_description': self.parse_company_description(),
            'offer_title': self.get_json_value(self.page_json, self.JSON_PATHS['offer_title']),
            'position_level': self.get_json_value(self.page_json, self.JSON_PATHS['position_level']),
            'technology_list': self.parse_technology_list(),
            'offer_description': self.parse_offer_description(),
            'requirements': self.parse_rr_section('requirements'),
            'responsibilities': self.parse_rr_section('responsibilities'),
            'language': self.get_json_value(self.page_json, self.JSON_PATHS['language']),
            'salary': self.parse_salary(),
            'work_type': self.parse_work_type(),
            'city': self.parse_location('city'),
            'country': self.parse_location('country'),
        })

    def parse_work_type(self):
        work_type = self.get_json_value(self.page_json, self.JSON_PATHS['work_type'])
        return [wt['code'] for wt in work_type] if work_type else None

    def parse_location(self, location_type):
        location_data = self.get_json_value(self.page_json, self.JSON_PATHS['location'])
        location_result = set()

        for location in location_data:
            match location_type:
                case 'city':
                    location_result.add(location['inlandLocation']['location']['name'])
                case 'country':
                    location_result.add(location['country']['name'])
                case _:
                    location_result.add(None)
        return location_result

    def parse_offer_description(self):
        desc_section = self.find_section(self.get_json_value(self.page_json, self.JSON_PATHS['company_description']), 'about-project')
        return self.squish(" ".join(desc_section['model']['paragraphs']))
    
    def parse_salary(self):
        salary_data = self.get_json_value(self.page_json, self.JSON_PATHS['salary'])
        if salary_data[0]['salary'] is None:
            return '-'

        process_salary = lambda salary: {
            'salary_amount': self.salary_amount(salary['salary']),
            'employment_type': salary['name'],
            'currency': salary['salary']['currency']['code']
        }
        return list(map(process_salary, salary_data)) if salary_data else None

    def salary_amount(self, salary):
        if salary is None:
            return '-'
        return (
            f"{salary['from'] * self.WORKING_TIME * self.WORKING_DAYS}-{salary['to'] * self.WORKING_TIME * self.WORKING_DAYS}"
            if salary['timeUnit']['longForm']['name'] != 'monthly'
            else f"{salary['from']}-{salary['to']}"
        )

    def parse_rr_section(self, section_name):
        base_section = self.get_json_value(self.page_json, self.JSON_PATHS['company_description'])
        description_section = self.find_section(base_section, section_name)
        join_bullets = lambda section: "@@@@".join(section['model']['bullets'])

        if description_section.get('subSections'):
            return "@@@@".join(
                [join_bullets(sub_section) for sub_section in description_section['subSections']]
            ) if description_section else None
        return join_bullets(description_section) if description_section else None
    
    def parse_technology_list(self):
        technology_list = {}
        offer_technologies = self.get_json_value(self.page_json, self.JSON_PATHS['technologies'])
        get_technology = lambda tech: next(
            (t for t in offer_technologies if t['sectionType'] == tech), {}
        )

        technology_list['required'] = get_technology('technologies-expected').get('textElements')
        technology_list['optional'] = get_technology('technologies-optional').get('textElements')
        return technology_list

    def parse_company_description(self):
        base_section = self.get_json_value(self.page_json, self.JSON_PATHS['company_description'])
        description_section = self.find_section(base_section, 'about-us-description')
        return self.squish(" ".join(description_section['model']['paragraphs'])) if description_section else None

    def find_section(self, section_list, section_name):
        return next((section for section in section_list if section['sectionType'] == section_name), None)

    @staticmethod
    def squish(text):
        return re.sub(r'\s+', ' ', text).strip()

    def extract_json(self):
        json_text = self.doc.find('script', id='__NEXT_DATA__').string
        return json.loads(json_text.replace("'", '"').replace('undefined', '"undefined"'))
    
    def get_json_value(self, json_obj, json_path, key_index=0):
        try:
            return self.get_json_value(json_obj[json_path[key_index]], json_path, key_index + 1)
        except (IndexError, KeyError, TypeError):
            return json_obj


In [14]:
parser = PracujPLParser('pracuj.pl')

with open('/Users/ulakruts/Personal Project/Praca_Inzynierska/MJ/test.html', 'r') as f:
    content = f.read()

In [15]:
parser.parse(content, 'huj')

{'url': 'huj',
 'site': 'pracuj.pl',
 'company_name': 'ALIOR BANK',
 'company_url': None,
 'company_description': None,
 'offer_title': 'Specjalista ds. portalu (UX i UI)',
 'position_level': [{'id': 4,
   'name': 'specjalista (Mid / Regular)',
   'pracujPlName': 'specjalista (Mid / Regular)'}],
 'technology_list': {'required': ['HTML'], 'optional': None},
 'offer_description': 'Alior Bank konsekwentnie pracuje na miano innowatora. Stały rozwój bankowości mobilnej i regularne wdrażanie najbardziej nowatorskich na rynku rozwiązań wymaga potężnego zaplecza IT - zarówno w zakresie przygotowania aplikacji i systemów, jak i oceny ryzyka oraz kontroli bezpieczeństwa danych. Aktualnie w pionie IT Alior Banku zatrudniamy ponad 700 specjalistów. Wykorzystują oni 15 różnych platform, 14 frameworków, ponad 50 różnych technologii w obszarze infrastruktury oraz 10 baz danych. Wdrażamy aktualnie nowe systemy, takie jak Kafka, Eureka, Kubernetes. Pracujemy zarówno w klasycznym modelu projektowym, jak

In [2]:
"AAAA".lower()

'aaaa'