In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
import time
import pandas as pd

In [2]:
# Chrome launch options for the Selenium session
chrome_options = Options()
# chrome_options.add_argument("--headless")  # Run in headless mode (no GUI)
for chrome_flag in ("--disable-gpu", "--no-sandbox"):
    chrome_options.add_argument(chrome_flag)

# Let webdriver_manager provide the chromedriver, then start Chrome with it
chromedriver_path = ChromeDriverManager().install()
driver = webdriver.Chrome(service=Service(chromedriver_path), options=chrome_options)

# Base URL for Naukri data engineer job search
# base_url = "https://www.naukri.com/data-engineer-jobs?k=data+engineer&experience=5"
# base_url = "https://www.naukri.com/data-engineer-jobs-in-hyderabad"

In [3]:
# Function to get job data from a single page
def get_job_data(page_number, wait_seconds=3):
    """Collect the job listings shown on one Naukri search-results page.

    Opens the Hyderabad data-engineer search URL for ``page_number`` with the
    module-level ``driver``, sleeps so client-side rendering can finish, then
    treats every element with class ``srp-jobtuple-wrapper`` as one job card.

    Args:
        page_number: Value interpolated into the URL as the page suffix.
        wait_seconds: Seconds to sleep after navigation before querying the
            page. Defaults to 3, the delay that used to be hard-coded.

    Returns:
        dict: Maps each of "Job Title", "Company", "Location", "Experience",
        "Date Posted", "Salary" and "Skills" to a list of strings holding one
        entry per job card, in the order ``find_elements`` returned the cards.
        A field that cannot be read is replaced by its placeholder
        (e.g. "No Title", "NAN"). Every list is empty when no cards are found.
    """
    url = f"https://www.naukri.com/data-engineer-jobs-in-hyderabad-{page_number}?experience=5"

    driver.get(url)  # Open the webpage

    # Fixed wait for JavaScript-rendered content; raise wait_seconds if cards come back empty.
    time.sleep(wait_seconds)

    job_cards = driver.find_elements(By.CLASS_NAME, "srp-jobtuple-wrapper")

    # Column-oriented result: one list per output column, filled card by card.
    job_data = {column: [] for column, _, _ in _JOB_TEXT_FIELDS}
    job_data["Skills"] = []

    for job in job_cards:
        for column, class_name, placeholder in _JOB_TEXT_FIELDS:
            job_data[column].append(_text_or_default(job, class_name, placeholder))
        job_data["Skills"].append(_skills_or_default(job))

    return job_data


# (output column, CSS class inside a job card, placeholder when the lookup fails)
_JOB_TEXT_FIELDS = (
    ("Job Title", "title", "No Title"),
    ("Company", "comp-name", "No Company"),
    ("Location", "loc-wrap", "No Location"),
    ("Experience", "exp-wrap", "No Experience"),
    ("Date Posted", "job-post-day", "No Date"),
    ("Salary", "sal-wrap", "NAN"),
)


def _text_or_default(card, class_name, default):
    """Return the text of the first element with ``class_name`` inside ``card``, or ``default`` on failure."""
    try:
        return card.find_element(By.CLASS_NAME, class_name).text
    except Exception:
        # Best-effort scraping: any lookup error yields the placeholder.
        # ``Exception`` (not a bare except) lets KeyboardInterrupt/SystemExit
        # propagate so a long scrape can still be interrupted.
        return default


def _skills_or_default(card, default="NAN"):
    """Return the card's skill tags as one lower-cased, comma-joined string, or ``default`` on failure."""
    try:
        skill_tag = card.find_element(By.CLASS_NAME, 'tags-gt')
        return ",".join(
            item.text.strip().lower()
            for item in skill_tag.find_elements(By.TAG_NAME, 'li')
        )
    except Exception:
        # Same best-effort policy as _text_or_default.
        return default

In [4]:
# Function to scrape multiple pages
def scrape_naukri_jobs(max_pages, delay_seconds=3):
    """Scrape result pages 1..max_pages and collect the pages that contain jobs.

    Args:
        max_pages: Number of the last page to request; pages are requested in
            ascending order starting at 1.
        delay_seconds: Pause between consecutive page requests, in seconds.
            Defaults to 3, the delay that used to be hard-coded.

    Returns:
        list: The dicts returned by ``get_job_data`` for every page on which
        at least one column list is non-empty, in page order. Empty when no
        page produced any job.
    """
    all_jobs = []

    for page_number in range(1, max_pages + 1):
        print(f"Scraping page {page_number}...")
        job_data = get_job_data(page_number)

        # get_job_data always returns a dict containing every column key, so
        # the dict itself is always truthy; inspect the column lists instead
        # so pages without any job card are really skipped.
        if job_data and any(job_data.values()):
            all_jobs.append(job_data)

        # Wait a bit between requests to avoid overwhelming the server;
        # nothing follows the last page, so no pause is needed there.
        if page_number < max_pages:
            time.sleep(delay_seconds)

    return all_jobs

In [5]:
# Scrape data from multiple pages
try:
    job_data_list = scrape_naukri_jobs(max_pages=50)  # Change max_pages as needed
finally:
    # Close the Selenium WebDriver even if scraping raised part-way through,
    # so a failed run does not leave a Chrome process behind. Everything that
    # was scraped is already plain strings, so the browser is no longer needed.
    driver.quit()

# Convert the data into a DataFrame
if job_data_list:
    # Build one frame per page and concatenate once, instead of re-copying
    # the accumulated frame on every loop iteration.
    all_jobs_df = pd.concat(
        [pd.DataFrame(job_data) for job_data in job_data_list],
        ignore_index=True,
    )

    # Save the DataFrame to CSV
    all_jobs_df.to_csv('data_engineer_jobs_naukri.csv', index=False)
    print("Scraped data saved to 'data_engineer_jobs_naukri.csv'")
    print(all_jobs_df.head())  # Display the first few rows of the data
else:
    print("No job data scraped.")

Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 5...
Scraping page 6...
Scraping page 7...
Scraping page 8...
Scraping page 9...
Scraping page 10...
Scraping page 11...
Scraping page 12...
Scraping page 13...
Scraping page 14...
Scraping page 15...
Scraping page 16...
Scraping page 17...
Scraping page 18...
Scraping page 19...
Scraping page 20...
Scraping page 21...
Scraping page 22...
Scraping page 23...
Scraping page 24...
Scraping page 25...
Scraping page 26...
Scraping page 27...
Scraping page 28...
Scraping page 29...
Scraping page 30...
Scraping page 31...
Scraping page 32...
Scraping page 33...
Scraping page 34...
Scraping page 35...
Scraping page 36...
Scraping page 37...
Scraping page 38...
Scraping page 39...
Scraping page 40...
Scraping page 41...
Scraping page 42...
Scraping page 43...
Scraping page 44...
Scraping page 45...
Scraping page 46...
Scraping page 47...
Scraping page 48...
Scraping page 49...
Scraping page 50...
Scraped d