In [61]:
import urllib.parse
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains

import pandas as pd
import time
from bs4 import BeautifulSoup
import re

In [62]:
def configure_driver():
    chrome_options = Options()
    # options.add_argument('--headless')  # Run Chrome without GUI (optional)
    chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36")
    chrome_options.add_argument('--disable-dev-shm-usage')  # Prevent memory issues
    chrome_options.add_argument('--disable-gpu')  # Additional option to improve stability
    chrome_options.add_argument('--remote-debugging-port=9222')

    # Specify the path for ChromeDriver
    service = Service(r'chromedriver-win64\chromedriver.exe')
    driver = webdriver.Chrome(service=service, options=chrome_options)
    return driver

In [63]:
# Helper function to safely get text from an element
def safe_get_text(element, tag, attr=None):
    try:
        if attr:
            return element.find(tag).get(attr).strip()
        return element.find(tag).get_text().strip()
    except (AttributeError, TypeError, KeyError):
        return 'No information'

In [70]:
# Setup WebDriver
driver = configure_driver()

# List to store data
company_names, company_logos, locations, industries, job_counts, last_active, job_links = [], [], [], [], [], [], []

# Define the number of pages you want to scrape (you can change this based on your needs)
total_pages = 5  # Adjust as needed or make it dynamic

for page in range(1, total_pages + 1):
    # Navigate to the specific page
    url = f"https://glints.com/id/companies?countries=ID&page={page}"
    driver.get(url)

    # Wait for the page to load
    time.sleep(2)

    # Get the page source after loading
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # Find the container that holds the company cards
    container = soup.find_all('div', class_='CompaniesPagesc__CompanyCardGrid-sc-wb7i3-6 jLlCKx')

    # Loop to collect information from each company card
    n = 1
    for company_card_container in container:
        # Loop through each job card inside the container
        job_cards = company_card_container.find_all('a', class_='stylessc__Anchor-sc-1edpj7p-0 jFinMF')
        
        for company_card in job_cards:
            # Extract the company name
            company_name = company_card.find('p', class_='ParagraphStyles__Paragraph-sc-1w5f8q5-0 gYennx aries-typography-paragraph stylessc__CompanyName-sc-1edpj7p-2 mcykD')
            company_names.append(company_name.get_text().strip() if company_name else 'No company name')

            # Extract the company logo
            logo_img = company_card.find('img', class_='ThumborImagesc__CustomImage-sc-1uzwahj-0 bnrHLO stylessc__CompanyLogo-sc-1edpj7p-6 cThZRm')
            company_logos.append(logo_img.get('src') if logo_img else 'No logo')

            # Extract location info
            location = company_card.find('p', class_='ParagraphStyles__Paragraph-sc-1w5f8q5-0 dksooX aries-typography-paragraph stylessc__LocationName-sc-1edpj7p-3 bswkpI')
            locations.append(location.get_text().strip() if location else 'No location')

            # Extract industry info
            industry = company_card.find('p', class_='ParagraphStyles__Paragraph-sc-1w5f8q5-0 jPmnmj aries-typography-paragraph stylessc__IndustryName-sc-1edpj7p-12 dWPZbv')
            industries.append(industry.get_text().strip() if industry else 'No industry')
            
            # Extract job count (lowongan)
            job_count_span = company_card.find('span')
            if job_count_span:
                job_count_link = job_count_span.find('a', {'aria-label': 'Company card active jobs'})
                if job_count_link:
                    job_counts.append(job_count_link.get_text().strip())  # Extracts the text like '5 lowongan'
                else:
                    job_counts.append('No jobs listed')
            else:
                job_counts.append('No jobs listed')

            # Extract last active info
            last_active_time = company_card.find('div', class_='stylessc__ActiveTimeRow-sc-1edpj7p-14')
            last_active.append(last_active_time.get_text().strip() if last_active_time else 'No active info')

            # Extract job link (href)
            job_link = company_card.find('a', class_='stylessc__Anchor-sc-1edpj7p-0 jFinMF')
            if job_link and job_link.has_attr('href'):
                job_links.append("https://glints.com" + job_link['href'])  # Append the full URL
            else:
                job_links.append('No link')

            print(f"{'-' * 30} Successful Fetching Data for Company-{n} {'-' * 30}")
            n += 1

# Ensure all lists have the same length
print(f"Total companies scraped: {len(company_names)}")
print(f"Company Names: {len(company_names)}")
print(f"Job Counts: {len(job_counts)}")
print(f"Locations: {len(locations)}")
print(f"Industries: {len(industries)}")
print(f"Job Links: {len(job_links)}")

# Print completion
print(f"Scraping completed. Total companies scraped: {len(company_names)}.")

# Close the browser after scraping
driver.quit()

------------------------------ Successful Fetching Data for Company-1 ------------------------------
------------------------------ Successful Fetching Data for Company-2 ------------------------------
------------------------------ Successful Fetching Data for Company-3 ------------------------------
------------------------------ Successful Fetching Data for Company-4 ------------------------------
------------------------------ Successful Fetching Data for Company-5 ------------------------------
------------------------------ Successful Fetching Data for Company-6 ------------------------------
------------------------------ Successful Fetching Data for Company-7 ------------------------------
------------------------------ Successful Fetching Data for Company-8 ------------------------------
------------------------------ Successful Fetching Data for Company-9 ------------------------------
------------------------------ Successful Fetching Data for Company-10 --------------------

In [71]:
# Create a DataFrame to save the data
df = pd.DataFrame({
    'Company Name': company_names,
    'Logo URL': company_logos,
    'Location': locations,
    'Industry': industries,
    'Job Count': job_counts,
    'Last Active': last_active,
    'Job Link': job_links
})

In [72]:
df

Unnamed: 0,Company Name,Logo URL,Location,Industry,Job Count,Last Active,Job Link
0,PT Info Solusindo Data Utama,https://images.glints.com/unsafe/glints-dashbo...,"Jakarta Selatan, DKI Jakarta, Indonesia",Computer Networking,No jobs listed,Terakhir aktif 4 jam yang lalu,No link
1,Widodo Group,https://images.glints.com/unsafe/glints-dashbo...,"Kab. Ponorogo, Jawa Timur, Indonesia",Telecommunications,No jobs listed,Terakhir aktif 9 jam yang lalu,No link
2,Nusantara Data Indonesia,https://images.glints.com/unsafe/glints-dashbo...,"Yogyakarta, DI Yogyakarta, Indonesia",Information Technology and Services,No jobs listed,Terakhir aktif 10 jam yang lalu,No link
3,Pt Andaru Persada Mandiri,https://images.glints.com/unsafe/glints-dashbo...,"Bogor, Jawa Barat, Indonesia",Medical Devices,No jobs listed,Terakhir aktif 14 jam yang lalu,No link
4,PT CUSTOMPEDIA CREATIVE GROUP,https://images.glints.com/unsafe/glints-dashbo...,"Semarang, Jawa Tengah, Indonesia",Events Services,No jobs listed,Terakhir aktif 14 jam yang lalu,No link
...,...,...,...,...,...,...,...
145,Padang Golf Adisutjipto,No logo,"Kab. Sleman, DI Yogyakarta, Indonesia",Sports,No jobs listed,Terakhir aktif 2 hari yang lalu,No link
146,PT. ARSENET GLOBAL SOLUSI,No logo,"Jakarta Selatan, DKI Jakarta, Indonesia",Information Technology and Services,No jobs listed,Terakhir aktif 2 hari yang lalu,No link
147,CV. Sumber Harapan Teknik,No logo,"Depok, Jawa Barat, Indonesia",Consumer Services,No jobs listed,Terakhir aktif 2 hari yang lalu,No link
148,BPR Dhanatani Cepiring cab. batang & Pekalongan,No logo,"Kab. Kendal, Jawa Tengah, Indonesia",Financial Services,No jobs listed,Terakhir aktif 2 hari yang lalu,No link


In [None]:
# Save the data to a CSV file
df.to_csv('glints_companies_data.csv', index=False)
print("Data has been saved to glints_companies_data.csv")