# 👜 Automated Job Data Extraction from Naukri.com

# Business Understanding



The goal is to develop a job scraping tool that automatically extracts job listings based on user-defined keywords, roles, and locations. The extracted data is then analyzed to provide insights into job trends.

> ***Objective :***
* Automate job data collection from platforms like Naukri.com.
* Allow users to input job roles, locations, and desired job counts dynamically.
* Store the scraped job data in an Excel file in the data folder for further analysis.
* Provide a Streamlit web app as a user-friendly interface.
* Perform basic data analysis to extract insights from collected job listings.

> ***Success Criteria :***
* Successfully scrapes job title, company, location, experience, salary, skills, job description, and job link from Naukri.com.
* Handles missing values (e.g., "Not Specified" for missing salary).
* Automatically creates the data/ folder if it doesn’t exist.
* Scrapes multiple pages (e.g., 1-5) and stops when no more jobs are found.
* Confirms successful file saving in the output.

> 

# Data Understanding

> ***Data Source :***
* Job listings scraped from Naukri.com using Selenium.

> ***Collected Features :***
* Job Title	
* Company Name 
* Location 
* Experience
* Salary
* Skills
* Job Description
* Job Link 

> ***Challenges in Data Collection :***
* Some jobs may lack salary or experience details.
* Pagination must be handled to fetch more job listings.

In [2]:
import os
import time
from datetime import datetime

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, StaleElementReferenceException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

# Placeholder stored when an optional field is missing from a listing.
NOT_SPECIFIED = "Not Specified"

# Characters replaced in the output filename so that a role such as "UI/UX"
# is not interpreted as a path separator (or rejected by the filesystem).
UNSAFE_FILENAME_CHARS = '\\/:*?"<>|'


def build_search_url(job_role, location):
    """Return the Naukri.com search URL for a role and location.

    Both arguments are inserted verbatim, so they must already be
    URL-encoded (the script replaces spaces with ``%20``).
    """
    return f"https://www.naukri.com/{job_role}-jobs-in-{location}?k={job_role}&l={location}"


def output_filename(job_role):
    """Return the Excel filename for a ``%20``-encoded job role.

    ``%20`` becomes ``_`` and characters that are unsafe in filenames are
    also replaced with ``_``.
    """
    readable = job_role.replace("%20", "_")
    safe = "".join("_" if ch in UNSAFE_FILENAME_CHARS else ch for ch in readable)
    return f"{safe}_naukri_jobs_with_details.xlsx"


def scrape_job_details(driver):
    """Read the optional fields from the job page currently loaded in *driver*.

    Returns a ``(job_desc, salary, skills)`` tuple. Each field falls back to a
    placeholder when its element is not on the page: ``"Not Available"`` for
    the description (otherwise its first 400 characters),
    ``"Not Specified"`` for the salary and ``["Not Specified"]`` for skills.
    """
    # Only "element is missing/gone" means "field not provided"; anything else
    # (including Ctrl+C) must propagate instead of being silently swallowed.
    missing = (NoSuchElementException, StaleElementReferenceException)

    try:
        job_desc = driver.find_element(
            By.CLASS_NAME, "styles_JDC__dang-inner-html__h0K4t"
        ).text[:400]
    except missing:
        job_desc = "Not Available"

    try:
        salary = driver.find_element(By.CLASS_NAME, "styles_jhc__salary__jdfEC").text
    except missing:
        salary = NOT_SPECIFIED

    try:
        skills_container = driver.find_element(By.CLASS_NAME, "styles_key-skill__GIPn_")
        skills = [skill.text for skill in skills_container.find_elements(By.TAG_NAME, "span")]
    except missing:
        skills = [NOT_SPECIFIED]

    return job_desc, salary, skills


def scrape_job_card(driver, job):
    """Scrape one search-result card plus its detail page.

    *job* is a result-card element on the search page. The job's own page is
    opened in a temporary tab which is always closed again, with focus
    returned to the search tab, even if loading or parsing the page fails.

    Returns ``[title, company, location, experience, salary, skills,
    job_desc, job_link]``.

    Raises whatever Selenium raises when a mandatory card field (title,
    company, location) is missing, and ``ValueError`` when the card has no
    link; the caller treats any exception as "skip this job".
    """
    title_element = job.find_element(By.CLASS_NAME, "title")
    title = title_element.text
    company = job.find_element(By.CLASS_NAME, "comp-name").text
    job_location = job.find_element(By.CLASS_NAME, "loc").text
    experience_elements = job.find_elements(By.CLASS_NAME, "exp")
    experience = experience_elements[0].text if experience_elements else NOT_SPECIFIED

    job_link = title_element.get_attribute("href")
    if not job_link:
        # Check before opening a tab so there is nothing to clean up.
        raise ValueError("job card has no link")

    # Identify the new tab by handle rather than by position, so a stray
    # extra window cannot make us scrape (or close) the wrong one.
    search_tab = driver.current_window_handle
    handles_before = set(driver.window_handles)
    driver.execute_script("window.open();")
    new_tabs = [handle for handle in driver.window_handles if handle not in handles_before]
    if not new_tabs:
        raise RuntimeError("could not open a new tab for the job page")

    driver.switch_to.window(new_tabs[0])
    try:
        driver.get(job_link)
        time.sleep(4)  # Allow the job page to load
        job_desc, salary, skills = scrape_job_details(driver)
    finally:
        driver.close()
        driver.switch_to.window(search_tab)

    return [title, company, job_location, experience, salary, skills, job_desc, job_link]


# The guard is true both for `python script.py` and inside a notebook kernel,
# so every name below is still defined at module level for later cells.
if __name__ == "__main__":
    # User inputs (spaces are URL-encoded; surrounding whitespace is dropped)
    job_role = input("Enter the Job role (e.g., Machine Learning Engineer): ").strip().replace(" ", "%20")
    location = input("Enter the Location (e.g., Bengaluru): ").strip().replace(" ", "%20")
    max_jobs = int(input("Enter the total No. of the Jobs to Scrap: "))

    job_list = []
    job_count = 0
    max_pages = 5  # Pages to scrape (original note: Naukri.com shows 21 postings per page)

    # Setting up WebDriver
    driver = webdriver.Chrome()
    try:
        base_url = build_search_url(job_role, location)
        driver.get(base_url)
        time.sleep(6)  # Allow page to load

        for page in range(1, max_pages + 1):
            print(f"Scraping Page {page}...")

            jobs = driver.find_elements(By.CLASS_NAME, "srp-jobtuple-wrapper")
            if not jobs:
                # Nothing on this page: stop instead of re-scanning the same
                # page, because the "next page" click would never be reached.
                print(f" No jobs found on Page {page}. Stopping...")
                break

            for job in jobs:
                if job_count >= max_jobs:
                    break
                try:
                    job_list.append(scrape_job_card(driver, job))
                    job_count += 1
                except Exception as e:
                    # Best effort: one broken listing must not abort the run.
                    print("Skipping job due to error:", e)

            # Done when enough jobs were collected or the last page was scraped.
            if job_count >= max_jobs or page == max_pages:
                break

            # Moving to next page
            try:
                next_page = driver.find_element(By.XPATH, f"//a[text()='{page + 1}']")
                # JavaScript click, as in the original script.
                driver.execute_script("arguments[0].click();", next_page)
                time.sleep(5)  # Wait for the next page to load
            except Exception as e:
                print("No more pages found or error:", e)
                break  # Stop scraping if no more pages are available
    finally:
        # Always close the browser, even when scraping fails part-way.
        driver.quit()

    print("✅ Scraping completed!")

    # Creating a DataFrame
    df = pd.DataFrame(
        job_list,
        columns=["Job Title", "Company", "Location", "Experience", "Salary", "Skills", "Job Description", "Job Link"],
    )

    # Data folder, relative to the working directory (original note: one level up from notebooks/)
    data_folder = "../data"
    os.makedirs(data_folder, exist_ok=True)

    # Generating the filename
    filename = output_filename(job_role)
    file_path = os.path.join(data_folder, filename)

    # Saving DataFrame to Excel using full path
    df.to_excel(file_path, index=False)

    print(f"✅ All job listings saved to {file_path}")

Enter the Job role (e.g., Machine Learning Engineer):  MongoDb
Enter the Location (e.g., Bengaluru):  Bangalore
Enter the total No. of the Jobs to Scrap:  5


Scraping Page 1...
✅ Scraping completed!
✅ All job listings saved to ../data\MongoDb_naukri_jobs_with_details.xlsx
