In [None]:
import json
# from scripting.parser.pracuj_pl.pracuj_pl import PracujPLParser
import pdb
import re
import json
from bs4 import BeautifulSoup
from scripting.parser.base_parser import BaseParser

class PracujPLParser(BaseParser):
    JSON_PATTERN = r"window\['kansas-offerview'\]\s*=\s*(\{.*?\});"
    RESULT_TEMPLATE ={
            'url': None,
            'site': 'pracuj.pl',
            'company_name': None,
            'company_url': None,
            'company_description': None,
            'offer_title': None,
            'position_level': None,
            'technology_list': None,
            'offer_description': None,
            'requirements': None,
            'responsibilities': None,
            'language': None,
            'city': None,
            'salary': None,
            'work_type': None,
            }
    WORKING_TIME = 8
    WORKING_DAYS = 20
    
    def __init__(self, parsed_site):
        super().__init__(parsed_site)
        
    JSON_PATHS = {
        'company_name': ['props', 'pageProps', 'dehydratedState', 'queries', 0, 'state', 'data', 'attributes', 'displayEmployerName'],
        'company_description': ['props', 'pageProps', 'dehydratedState', 'queries', 0, 'state', 'data', 'sections'],
        'offer_title': ['props', 'pageProps', 'dehydratedState', 'queries', 0, 'state', 'data', 'attributes', 'jobTitle'],
        'position_level': ['props', 'pageProps', 'dehydratedState', 'queries', 0, 'state', 'data', 'attributes', 'employment', 'positionLevels'],
        'technologies': ['props', 'pageProps', 'dehydratedState', 'queries', 0, 'state', 'data', 'textSections'],
        'requairemtns': ['props', 'pageProps', 'dehydratedState', 'queries', 0, 'state', 'data', 'sections'],
        'responsibilities': ['props', 'pageProps', 'dehydratedState', 'queries', 0, 'state', 'data', 'sections'],
        'language': ['props', 'pageProps', 'dehydratedState', 'queries', 0, 'state', 'data', 'jobOfferLanguage', 'isoCode'],
        'salary': ['props', 'pageProps', 'dehydratedState', 'queries', 0, 'state', 'data', 'attributes', 'employment', 'typesOfContracts'],
        'work_type': ['props', 'pageProps', 'dehydratedState', 'queries', 0, 'state', 'data', 'attributes', 'employment', 'workModes'],
        'locaiton': ['props', 'pageProps', 'dehydratedState', 'queries', 0, 'state', 'data', 'attributes', 'workplaces'],
    }
    
    def initalize_variables(self, page_content, url):
        self.url = url
        self.doc = BeautifulSoup(page_content, 'html.parser')
        self.page_json = self.extract_json()
        
    def parse(self, page_content, url):
        result = self.RESULT_TEMPLATE.copy()
        self.initalize_variables(page_content, url)
        self.parse_offer_data(result)
        return result
    
    def parse_offer_data(self, result):
        result['url'] = self.url
        result['company_name'] = self.get_json_value(self.page_json, self.JSON_PATHS['company_name'])
        result['company_description'] = self.parse_company_description()
        result['offer_title'] = self.get_json_value(self.page_json, self.JSON_PATHS['offer_title'])
        result['position_level'] = self.get_json_value(self.page_json, self.JSON_PATHS['position_level'])
        result['technology_list'] = self.parse_technology_list()
        result['requirements'] = self.parse_rr_section('requirements')
        result['responsibilities'] = self.parse_rr_section('responsibilities')
        result['language'] = self.get_json_value(self.page_json, self.JSON_PATHS['language'])
        result['salary'] = self.parse_salary()
        result['work_type'] = self.parse_work_type()
        # self.get_json_value(self.page_json, self.JSON_PATHS['work_type'])
        result['city'] = self.parse_location('city')
        result['country'] = self.parse_location('country')
    
    
    def parse_work_type(self):
        work_type = self.get_json_value(self.page_json, self.JSON_PATHS['work_type'])
        return [wt['code'] for wt in work_type] if work_type else None
    
    def parse_location(self, location_type):
        location_data = self.get_json_value(self.page_json, self.JSON_PATHS['locaiton'])
        location_result = set()
        for location in location_data:
            match location_type:
                case 'city':
                    location_result.add(location['inlandLocation']['location']['name'])
                case 'country':
                    location_result.add(location['country']['name'])
                case _:
                    location_result.add(None)
        return location_result                            

    def parse_work_type(self):
        work_type = self.get_json_value(self.page_json, self.JSON_PATHS['work_type'])
        return [wt['code'] for wt in work_type] if work_type else None
    
    def parse_salary(self):
        salary_data = self.get_json_value(self.page_json, self.JSON_PATHS['salary'])
        process_salary = lambda salary: {
            'salary_amount': self.salary_amount(salary['salary']), 
            'employment_type': salary['name'], 
            'currency': salary['salary']['currency']['code']
            }
        return list(map(process_salary, salary_data)) if salary_data else None
    
    def salary_amount(self, salary):
        # pdb.set_trace()
        salary_amount = (
            f"{salary['from'] * self.WORKING_TIME * self.WORKING_DAYS}-{salary['to'] * self.WORKING_TIME * self.WORKING_DAYS}"
            if salary['timeUnit']['longForm']['name'] != 'monthly'
            else f"{salary['from']}-{salary['to']}"
        )
        return salary_amount        

    def parse_rr_section(self, section_name):
        base_section = self.get_json_value(self.page_json, self.JSON_PATHS['company_description'])
        desctiption_section = self.find_section(base_section, section_name)
        join_bullets = lambda section: "@@@@".join(section['model']['bullets'])
        if desctiption_section.get('subSections'):
            return "@@@@".join([join_bullets(sub_section) for sub_section in desctiption_section['subSections']]) if desctiption_section else None
        else:
            return join_bullets(desctiption_section) if desctiption_section else None
    
    def parse_technology_list(self):
        technology_list = {}
        offer_technologies = self.get_json_value(self.page_json, self.JSON_PATHS['technologies'])
        get_techology = lambda tesh: next((tech for tech in offer_technologies if tech['sectionType'] == tesh), {})
        technology_list['required'] = get_techology('technologies-expected').get('textElements')
        technology_list['expected'] = get_techology('technologies-optional').get('textElements')
        return technology_list
        
    def parse_company_description(self):
        base_section = self.get_json_value(self.page_json, self.JSON_PATHS['company_description'])
        desctiption_section = self.find_section(base_section, 'about-us-description')
        return self.squish(" ".join(desctiption_section['model']['paragraphs'])) if desctiption_section else None
        
    def find_section(self, seection_list, section_name):
        return next((section for section in seection_list if section['sectionType'] == section_name), None)
        
    def squish(text):
        return re.sub(r'\s+', ' ', text).strip()
        
    def extract_json(self):
        json_text = self.doc.find('script', id='__NEXT_DATA__').string
        return json.loads(json_text.replace("'", '"').replace('undefined', '"undefined"'))
    
    def get_json_value(self, json_obj, json_path, key_index=0):
        try:
            return self.get_json_value(json_obj[json_path[key_index]], json_path, key_index + 1)
        except:
            return json_obj


In [40]:
parser = PracujPLParser('pracuj.pl')
with open('/Users/ulakruts/Personal Project/Praca_Inzynierska/MJ/test.html', 'r') as f:
    html = f.read()
    
parser.parse(html, 'https://www.pracuj.pl/praca/ml-ops-engineer-warszawa,oferta,1003733553')

{'url': 'https://www.pracuj.pl/praca/ml-ops-engineer-warszawa,oferta,1003733553',
 'site': 'pracuj.pl',
 'company_name': 'DCG',
 'company_url': None,
 'company_description': None,
 'offer_title': 'ML Ops Engineer',
 'position_level': [{'id': 4,
   'name': 'specialist (Mid / Regular)',
   'pracujPlName': 'specjalista (Mid / Regular)'}],
 'technology_list': {'required': ['Databricks', 'Azure Cosmos DB', 'ML Ops'],
  'expected': ['Dash', 'Shiny R', 'Streamlit']},
 'offer_description': None,
 'requirements': 'At least 2 years of professional experience in ML Ops or ML engineering, with a focus on deploying and scaling machine learning models in production.@@@@Proficient in English, both written and spoken, suitable for business communication.@@@@Extensive knowledge of the Azure cloud environment and Databricks, including setup and maintenance as an ML platform.@@@@Demonstrated expertise in software development best practices, such as testing, continuous integration, and the use of DevOps t

NameError: name 'json' is not defined

In [4]:
import scripting.parser


scripting.parser

<module 'scripting.parser' from '/Users/ulakruts/Personal Project/Praca_Inzynierska/MJ/MORE_JOBS/scripting/parser/__init__.py'>

In [3]:
with open('proxy_lust.json', 'r') as f:
    proxy_data = json.load(f)

In [11]:
def transform_proxy_list(input_file, output_file):
    with open(input_file, 'r') as f:
        proxy_data = json.load(f)['data']

    transformed_list = []
    for proxy in proxy_data:
        transformed_list.append({
            "url": f"{proxy['protocols'][0]}://{proxy['ip']}:{proxy['port']}",
            "banned": False
        })

    with open(output_file, 'w') as f:
        json.dump(transformed_list, f, indent=4)

In [12]:
transform_proxy_list('proxy_lust.json', 'proxy_lust_transformed.json')

In [2]:
from playwright.sync_api import sync_playwright


ModuleNotFoundError: No module named 'playwright'

In [None]:
json_obj = {
    "offers": {
        "offer1": {
            "details": {
                "price": 100
            }
        },
        "offer2": {
            "details": {
                "price": 200
            }
        }
    }
}
json_path = ["offers", "offer1", "details", "price"]

def get_offer_value(json_path, key_index=0, json_test=None):
    porcess_json = json_test if json_test else json_obj
    try:
        return get_offer_value([json_path[key_index]], json_path, key_index + 1)
    except:
        return json_obj
    # if json_obj.get(json_path[key_index]):
    #     return get_offer_value(json_obj.get(json_path[key_index]), json_path, key_index + 1)
    # else:
    #     return json_obj[key_index]
    
print(get_offer_value(json_obj, json_path))

{'offers': {'offer1': {'details': {'price': 100}}, 'offer2': {'details': {'price': 200}}}}


In [3]:
from scripting.login import Chrome_base

ModuleNotFoundError: No module named 'base_login'