In [10]:
import urllib.parse
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time
from bs4 import BeautifulSoup
import re

In [11]:
def configure_driver(driver_path=r'chromedriver-win64\chromedriver.exe', headless=False):
    """Start a Chrome WebDriver session configured for the scraper.

    Args:
        driver_path: Path to the ChromeDriver executable. The default is the
            relative Windows path this notebook has always used, so calling
            ``configure_driver()`` with no arguments behaves as before.
        headless: When True, Chrome is started without a visible window.
            Defaults to False (a browser window is shown).

    Returns:
        The started ``webdriver.Chrome`` instance. The caller is responsible
        for calling ``quit()`` on it when finished.
    """
    chrome_options = Options()
    if headless:
        # Run Chrome without a graphical interface.
        chrome_options.add_argument('--headless=new')
    # Send a regular desktop Chrome user-agent string.
    chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36")
    chrome_options.add_argument('--disable-dev-shm-usage')  # Intended to prevent memory problems
    chrome_options.add_argument('--disable-gpu')  # Extra option intended to improve stability
    # NOTE(review): the debugging port is fixed, so this presumably supports
    # only one driver instance at a time -- confirm before running in parallel.
    chrome_options.add_argument('--remote-debugging-port=9222')

    # Point Selenium at the ChromeDriver executable.
    service = Service(driver_path)
    driver = webdriver.Chrome(service=service, options=chrome_options)
    return driver

In [12]:
def safe_get_text(element, tag, attr=None):
    """Return stripped text (or a stripped attribute value) of a child tag.

    Looks up the first ``tag`` under ``element`` via ``element.find(tag)``.
    When ``attr`` is truthy the value of that attribute is used, otherwise the
    tag's text. If the lookup fails with AttributeError, TypeError or KeyError
    (for example because the tag or the attribute is missing), the string
    ``'No information'`` is returned instead.
    """
    fallback = 'No information'
    try:
        node = element.find(tag)
        raw_value = node.get(attr) if attr else node.get_text()
        return raw_value.strip()
    except (AttributeError, TypeError, KeyError):
        return fallback

In [13]:
# Scrape job cards from the Kalibrr home page, clicking "Load More" to reveal
# further cards.
#
# Each pass parses the whole current page, so every card is keyed and only
# recorded the first time it is seen; this keeps the result free of repeats.
# Every field of a card always gets a value (a placeholder when the markup is
# missing), so the nine result lists stay the same length and row-aligned,
# which the DataFrame built from them requires.

BASE_URL = "https://www.kalibrr.id/"
START_URL = "https://www.kalibrr.id/id-ID/home"
LOAD_MORE_CONTAINER_XPATH = "//div[@class='k-font-dm-sans k-w-full k-flex k-justify-center k-mb-10']"
LOAD_MORE_BUTTON_XPATH = ".//button[contains(@class, 'k-btn-primary')]"
JOB_GRID_CLASS = 'k-container k-grid k-grid-cols-1 md:k-grid-cols-2 xl:k-grid-cols-3 k-gap-4 k-mt-8 k-mb-10'
JOB_CARD_CLASS = 'k-font-dm-sans k-rounded-lg k-bg-white k-border-solid k-border hover:k-border-2 hover:k-border-primary-color k-border k-group k-flex k-flex-col k-justify-between css-1otdiuc'


def _parse_job_card(job):
    """Extract the fields of one job card into a dict.

    Args:
        job: The BeautifulSoup tag of a single job card.

    Returns:
        A dict with the keys ``title``, ``company``, ``logo``, ``location``,
        ``salary``, ``job_type``, ``last_active``, ``deadline`` and ``link``.
        A field whose markup is absent holds a placeholder string, never None.
    """
    # Company name: an anchor nested inside a dedicated span.
    company_name = 'No company name'
    company_name_span = job.find('span', class_='k-inline-flex k-items-center k-mb-1')
    if company_name_span:
        company_anchor = company_name_span.find('a', class_='k-text-subdued k-font-bold')
        if company_anchor:
            company_name = company_anchor.get_text().strip()

    # Company logo: always produce a value so this column stays aligned.
    logo_url = 'No logo'
    logo_container = job.find('div', class_='k-flex k-p-4 k-gap-2 md:k-gap-4')
    logo_img = logo_container.find('img') if logo_container else None
    if logo_img and logo_img.get('src'):
        logo_url = logo_img.get('src')

    # The same span class is used for several fields: the first match is taken
    # as the location and the third as "last active".
    gray_spans = job.find_all('span', class_='k-text-gray-500')
    location = gray_spans[0].get_text().strip() if gray_spans else 'No location found'
    last_active_text = gray_spans[2].text.strip() if len(gray_spans) > 2 else 'No last active info'

    # Salary: join the non-empty nested spans and normalise non-breaking spaces.
    salary_text = 'No salary info'
    salary_span = job.find('span', class_='k-text-subdued')
    if salary_span:
        salary_parts = [s.get_text().strip() for s in salary_span.find_all('span') if s.get_text().strip()]
        salary_text = ' '.join(salary_parts).replace('\xa0', ' ')

    # Job type (e.g. Full-time).
    job_type = 'No job type'
    job_type_span = job.find('span', class_='k-flex k-gap-4 2xl:k-gap-4 k-items-center k-text-gray-300 k-pointer-events-none')
    if job_type_span:
        job_type_inner = job_type_span.find('span', class_='k-text-gray-500')
        if job_type_inner:
            job_type = job_type_inner.get_text().strip()

    # Application deadline.
    deadline = job.find('span', class_='k-text-xs')
    deadline_text = deadline.get_text().strip() if deadline else 'No deadline'

    # Job link: urljoin avoids a doubled slash when the href starts with "/".
    job_link = job.find('a', itemprop='name')
    href = job_link.get('href') if job_link else None
    link = urllib.parse.urljoin(BASE_URL, href) if href else 'No link'

    return {
        'title': safe_get_text(job, 'h2'),
        'company': company_name,
        'logo': logo_url,
        'location': location,
        'salary': salary_text,
        'job_type': job_type,
        'last_active': last_active_text,
        'deadline': deadline_text,
        'link': link,
    }


# Setup WebDriver
driver = configure_driver()

# Counter for the number of "Load More" clicks
click_count = 0
max_clicks = 150  # Stop after this many clicks

# Result columns, one entry per unique job card (kept row-aligned)
job_titles, company_names, company_logos, locations, salary, job_types, last_active, deadlines, job_links = [], [], [], [], [], [], [], [], []

# Keys of the cards already recorded, used to skip repeats on later passes
seen_jobs = set()

n = 1
try:
    # Open the listing page and give it a moment to render
    driver.get(START_URL)
    time.sleep(2)

    while True:
        # Parse whatever is on the page now and record only unseen cards
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        for job_card_container in soup.find_all('div', class_=JOB_GRID_CLASS):
            job_cards = job_card_container.find_all('div', class_=JOB_CARD_CLASS)
            if not job_cards:
                print("No job cards found in this container.")
                continue

            for job in job_cards:
                record = _parse_job_card(job)
                job_key = (record['link'], record['title'], record['company'], record['location'])
                if job_key in seen_jobs:
                    continue
                seen_jobs.add(job_key)

                job_titles.append(record['title'])
                company_names.append(record['company'])
                company_logos.append(record['logo'])
                locations.append(record['location'])
                salary.append(record['salary'])
                job_types.append(record['job_type'])
                last_active.append(record['last_active'])
                deadlines.append(record['deadline'])
                job_links.append(record['link'])

                print(f"{'-' * 30} Successful Fetching Data ke-{n} {'-' * 30}")
                n += 1

        # Stop once the click budget is used up (the page was scraped just above)
        if click_count >= max_clicks:
            print(f"Tombol 'Load More' telah diklik sebanyak {click_count} kali. Proses berhenti.")
            break

        # Only the button interaction is best-effort: a failure here is taken
        # to mean there is nothing more to load.
        try:
            load_more_container = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.XPATH, LOAD_MORE_CONTAINER_XPATH))
            )
            load_more_button = load_more_container.find_element(By.XPATH, LOAD_MORE_BUTTON_XPATH)

            # Bring the button into view, pause briefly, then click it
            driver.execute_script("arguments[0].scrollIntoView(true);", load_more_button)
            time.sleep(1)
            load_more_button.click()
        except Exception as e:
            print("Tidak ada tombol 'Load More' lagi atau error:", e)
            break

        click_count += 1

        # Give the newly requested cards time to load before the next pass
        time.sleep(2)
finally:
    # Always close the browser, even if scraping is interrupted or fails
    driver.quit()

# Report how many clicks were performed
print(f"Tombol 'Load More' telah diklik sebanyak {click_count} kali.")

------------------------------ Successful Fetching Data ke-1 ------------------------------
------------------------------ Successful Fetching Data ke-2 ------------------------------
------------------------------ Successful Fetching Data ke-3 ------------------------------
------------------------------ Successful Fetching Data ke-4 ------------------------------
------------------------------ Successful Fetching Data ke-5 ------------------------------
------------------------------ Successful Fetching Data ke-6 ------------------------------
------------------------------ Successful Fetching Data ke-7 ------------------------------
------------------------------ Successful Fetching Data ke-8 ------------------------------
------------------------------ Successful Fetching Data ke-9 ------------------------------
------------------------------ Successful Fetching Data ke-10 ------------------------------
------------------------------ Successful Fetching Data ke-11 -----------------

In [14]:
# Assemble the scraped columns into a single DataFrame, one column per list.
# The column order below is the order the columns appear in the frame.
df = pd.DataFrame(
    data=dict(
        zip(
            [
                'Job Title',
                'Company Name',
                'Company Logo URL',
                'Job Location',
                'Salary Information',
                'Job Type',
                'Last Active',
                'Application Deadline',
                'Job Link',
            ],
            [
                job_titles,
                company_names,
                company_logos,
                locations,
                salary,
                job_types,
                last_active,
                deadlines,
                job_links,
            ],
        )
    )
)

In [16]:
# Write the scraped table to a CSV file, leaving the row index out of the file.
df.to_csv(path_or_buf='kalibrr_jobs.csv', index=False)