In [None]:
import re
import time

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [None]:
# Launch the Chrome session; the helper functions in this notebook reach the
# browser through this module-level `driver` name.
driver = webdriver.Chrome()

# Shared scrape state, mutated by extract_job_details():
#   job_data_list - one dict per listing that was scraped and recorded
#   global_job_id - value written to the next record's "Job ID" field;
#                   incremented after each record is appended
job_data_list = []
global_job_id = 1

In [None]:
def get_text(xpath):
    """Return the stripped visible text of the first element matching ``xpath``.

    The lookup runs through the module-level ``driver``, in whichever window
    currently has focus.

    Args:
        xpath: XPath expression passed to ``driver.find_element``.

    Returns:
        The element's ``.text`` with surrounding whitespace removed, or the
        string "NA" when locating or reading the element raises an
        ``Exception``.
    """
    try:
        return driver.find_element(By.XPATH, xpath).text.strip()
    except Exception:
        # Best-effort scrape: any lookup failure becomes the "NA" placeholder.
        # Catching ``Exception`` instead of using a bare ``except`` lets
        # KeyboardInterrupt / SystemExit propagate so a run can be stopped.
        return "NA"

In [None]:
def get_html(xpath):
    """Return the inner HTML of the first element matching ``xpath``.

    The lookup runs through the module-level ``driver``, in whichever window
    currently has focus.

    Args:
        xpath: XPath expression passed to ``driver.find_element``.

    Returns:
        The element's ``innerHTML`` attribute, or the string "NA" when
        locating or reading the element raises an ``Exception``.
    """
    try:
        return driver.find_element(By.XPATH, xpath).get_attribute('innerHTML')
    except Exception:
        # Best-effort scrape: any lookup failure becomes the "NA" placeholder.
        # Catching ``Exception`` instead of using a bare ``except`` lets
        # KeyboardInterrupt / SystemExit propagate so a run can be stopped.
        return "NA"

In [None]:
def extract_company_and_reviews(company_text):
    """Split a decimal figure followed by "Reviews" out of the company text.

    Args:
        company_text: Raw text of the company header element.

    Returns:
        A ``(company, reviews)`` tuple. When the text contains a decimal
        number followed by "Reviews", ``reviews`` is that number (as a string)
        and ``company`` is the text with every occurrence of the matched
        fragment removed and then stripped. Otherwise the text is returned
        unchanged together with "NA".
    """
    found = re.search(r'(\d+\.\d+)\s*Reviews', company_text)
    if found is None:
        return company_text, "NA"

    fragment, figure = found.group(0), found.group(1)
    remainder = company_text.replace(fragment, "").strip()
    return remainder, figure

In [None]:
def clean_key_skills(key_skills_html):
    """Turn the key-skills HTML fragment into a comma-separated string.

    Every ``<span>`` in the fragment is treated as one skill.

    Args:
        key_skills_html: HTML fragment to parse, or the "NA" placeholder /
            an empty value when the fragment could not be read.

    Returns:
        The non-empty span texts joined with ", ". Returns "NA" when the input
        is missing, when no span carries any text, or when parsing raises.
    """
    # A failed HTML lookup arrives as the "NA" placeholder (or nothing at
    # all). Pass the placeholder through rather than parsing it, which would
    # yield an empty string and break the "NA means missing" convention.
    if not key_skills_html or key_skills_html == "NA":
        return "NA"

    try:
        soup = BeautifulSoup(key_skills_html, 'html.parser')
        span_texts = [span.get_text(strip=True) for span in soup.find_all('span')]
        # Blank spans would otherwise show up as stray ", ," separators.
        skills = [text for text in span_texts if text]
        return ', '.join(skills) if skills else "NA"
    except Exception as e:
        print(f"Error cleaning key skills: {e}")
        return "NA"

In [None]:
def _close_tabs_opened_since(known_handles, main_handle):
    """Close every window not in ``known_handles``, then refocus ``main_handle``."""
    for handle in driver.window_handles:
        if handle not in known_handles:
            driver.switch_to.window(handle)
            driver.close()
    driver.switch_to.window(main_handle)


def extract_job_details(job_element):
    """Open one listing in a new tab, scrape it, and record the result.

    The first ``<a>`` inside ``job_element`` supplies the detail-page URL. The
    page is opened in a new tab, its header fields and key skills are read,
    and one dict is appended to the module-level ``job_data_list`` under the
    current ``global_job_id``, which is then incremented.

    Whatever happens, tabs opened during this call are closed and focus goes
    back to the window that was active on entry, so one failed listing cannot
    leave the driver pointed at the wrong window.

    Args:
        job_element: WebElement for one search-result entry.

    Returns:
        True when a record was appended to ``job_data_list``; False when any
        step failed (the error is printed and nothing is recorded).
    """
    global global_job_id
    known_handles = None
    main_handle = None
    recorded = False
    try:
        # Snapshot the windows first so cleanup can tell "ours" from the rest.
        known_handles = set(driver.window_handles)
        main_handle = driver.current_window_handle

        job_url = job_element.find_element(By.TAG_NAME, 'a').get_attribute('href')
        driver.execute_script("window.open(arguments[0], '_blank');", job_url)
        driver.switch_to.window(driver.window_handles[-1])
        WebDriverWait(driver, 15).until(EC.presence_of_element_located((By.CLASS_NAME, 'styles_job-header-container___0wLZ')))

        job_title_text = get_text("//h1[contains(@class, 'styles_jd-header-title__rZwM1')]")
        company_text_raw = get_text("//div[contains(@class, 'styles_jd-header-comp-name__MvqAI')]")
        company_text, reviews_text = extract_company_and_reviews(company_text_raw)
        location_text = get_text("//div[contains(@class, 'styles_jhc_loc__Du2H')]")
        experience_text = get_text("//div[contains(@class, 'styles_jhc_exp_k_giM')]")
        salary_text = get_text("//div[contains(@class, 'styles_jhc_salary_jdfEC')]")

        key_skills_html = get_html("//div[contains(@class, 'styles_key-skill_GIPn')]")
        key_skills_text = clean_key_skills(key_skills_html)

        job_data_list.append({
            "Job ID": global_job_id,
            "Job Title": job_title_text,
            "Company": company_text,
            "Reviews": reviews_text,
            "Location": location_text,
            "Experience": experience_text,
            "Salary": salary_text,
            "Key Skills": key_skills_text
        })

        global_job_id += 1
        recorded = True
    except Exception as e:
        print(f"Error processing job {global_job_id}: {e}")
    finally:
        # main_handle is only set once the snapshot succeeded; without a
        # snapshot there is no safe way to decide which windows to close.
        if main_handle is not None:
            try:
                _close_tabs_opened_since(known_handles, main_handle)
            except Exception as e:
                print(f"Error closing job tab: {e}")
    return recorded

In [None]:
def scrape_jobs(url, job_count):
    """Walk the paginated search results until ``job_count`` jobs are recorded.

    Page 1 is ``url`` itself; page N (N > 1) is ``url`` with ``-N`` appended.
    Each result entry is handed to ``extract_job_details``. A job counts as
    collected only if that call added a record to ``job_data_list``, so
    listings that fail to scrape do not use up the quota.

    Scraping stops early when a results page shows no entries within the
    wait timeout, or when a whole page yields no new record (which would
    otherwise page forever if the detail-page selectors no longer match).

    Args:
        url: Search-results URL for the first page.
        job_count: Number of successfully recorded jobs to collect.
    """
    page_number = 1
    total_jobs_collected = 0

    while total_jobs_collected < job_count:
        print(f"Scraping page {page_number}...")
        current_url = url if page_number == 1 else f"{url.rstrip('-')}-{page_number}"
        driver.get(current_url)
        try:
            WebDriverWait(driver, 15).until(EC.presence_of_element_located((By.CLASS_NAME, "srp-jobtuple-wrapper")))
        except TimeoutException:
            # No result entry appeared in time: treat as the end of the
            # listing instead of crashing the whole run.
            print("No more jobs found or page not loaded correctly.")
            break

        job_list = driver.find_elements(By.CLASS_NAME, "srp-jobtuple-wrapper")
        if not job_list:
            print("No more jobs found or page not loaded correctly.")
            break

        collected_before_page = total_jobs_collected
        for job_element in job_list:
            if total_jobs_collected >= job_count:
                break
            records_before = len(job_data_list)
            try:
                extract_job_details(job_element)
            except Exception as e:
                print(f"Error processing job element {total_jobs_collected + 1}: {e}")
            # Success is measured by the record list growing, which holds
            # whether or not extract_job_details reports failures itself.
            if len(job_data_list) > records_before:
                total_jobs_collected += 1
            time.sleep(2)  # pause between detail pages

        if total_jobs_collected == collected_before_page:
            print(f"No jobs could be extracted from page {page_number}; stopping.")
            break

        if total_jobs_collected < job_count:
            page_number += 1
            time.sleep(3)  # pause before loading the next results page

In [None]:
# Scrape parameters - edit these two values before running the notebook.
# job_title is interpolated verbatim into the URL path below, so it is
# presumably expected in the site's hyphenated slug form - confirm for
# multi-word titles.
job_title = 'data-scientist'  # search term, inserted as-is into job_url
job_count = 10  # number of jobs to collect, passed to scrape_jobs()
job_url = f"https://www.naukri.com/{job_title}-jobs"

In [None]:
# Run the scrape; scraped records accumulate in the module-level job_data_list.
scrape_jobs(job_url, job_count)

In [None]:
# Save the collected records to CSV. The browser is shut down in `finally`
# so a failed write (bad path, permissions, ...) cannot leave Chrome running.
try:
    df = pd.DataFrame(job_data_list)
    df.to_csv('scraped_jobs.csv', index=False, encoding='utf-8')
finally:
    driver.quit()

print("Scraping complete. Data saved to 'scraped_jobs.csv'.")