---

In [1]:
# Environment setup for the Selenium scraper. Every command's stdout and stderr
# are redirected to /dev/null, so a failing step in this cell produces no output.
# Refresh the apt package index quietly
!apt-get update -qq > /dev/null 2>&1
# Install system packages quietly (presumably the shared libraries the
# downloaded Chrome build needs at runtime - TODO confirm the exact list)
!apt-get install -qq \
libglib2.0-0 \
libnss3 \
libdbus-glib-1-2 \
libgconf-2-4 \
libfontconfig1 \
libvulkan1 \
gconf2-common \
libwayland-server0 \
libgbm1 \
udev \
libu2f-udev > /dev/null 2>&1
# Let apt repair any broken/half-installed dependencies left by the step above
!apt --fix-broken install -y > /dev/null 2>&1
# Download the Chrome for Testing 116.0.5845.96 (linux64) archive into /tmp
!wget -P /tmp https://edgedl.me.gvt1.com/edgedl/chrome/chrome-for-testing/116.0.5845.96/linux64/chrome-linux64.zip > /dev/null 2>&1
# Unzip Chrome under /usr/bin (get_driver() expects /usr/bin/chrome-linux64/chrome)
!unzip /tmp/chrome-linux64.zip -d /usr/bin/ > /dev/null 2>&1
# Download the ChromeDriver build with the same version number as the browser above
!wget -P /tmp https://edgedl.me.gvt1.com/edgedl/chrome/chrome-for-testing/116.0.5845.96/linux64/chromedriver-linux64.zip > /dev/null 2>&1
# Unzip ChromeDriver under /usr/bin (get_driver() expects /usr/bin/chromedriver-linux64/chromedriver)
!unzip /tmp/chromedriver-linux64.zip -d /usr/bin/ > /dev/null 2>&1
# Install the distribution's Selenium package for Python 3
!apt install -y python3-selenium > /dev/null 2>&1
# Pin Selenium 3.141.0 via pip; get_driver() uses the `executable_path=` keyword
# of webdriver.Chrome. NOTE(review): this runs after the apt install above, so
# presumably the pip version is the one that ends up imported - confirm.
!pip install selenium==3.141.0 > /dev/null 2>&1
# Install PyGithub (imported as `github` in the imports cell); version is not pinned
!pip install PyGithub > /dev/null 2>&1

---

In [2]:
# Standard library imports
import csv
import datetime
import html
import json
import os
import re
import shutil
import smtplib
import time
import urllib.parse
from email.message import EmailMessage

In [3]:
# Third-party imports
import pandas as pd
import pytz
import github
from github.GithubException import GithubException
from kaggle_secrets import UserSecretsClient
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException,NoSuchElementException,StaleElementReferenceException,InvalidSelectorException,WebDriverException,InvalidArgumentException

---

In [4]:
def add_driver_options(options):
    """Build a Chrome ``Options`` object carrying the given arguments.

    Args:
        options: Iterable of command-line argument strings. Each one is
            handed to ``Options.add_argument`` in iteration order.

    Returns:
        The populated ``Options`` instance.
    """
    configured = Options()
    for argument in options:
        # add_argument is called only for its effect on `configured`.
        configured.add_argument(argument)
    return configured

In [5]:
def get_driver()->webdriver.Chrome:
    """Start a Chrome WebDriver session using the binaries unpacked under /usr/bin.

    The browser arguments are turned into an ``Options`` object by
    ``add_driver_options``; the Chrome binary and the ChromeDriver executable
    are both given as explicit paths.

    Returns:
        The ``webdriver.Chrome`` instance. Nothing here quits it; that is left
        to the caller.
    """
    chrome_binary = "/usr/bin/chrome-linux64/chrome"
    chromedriver_binary = "/usr/bin/chromedriver-linux64/chromedriver"
    launch_arguments = [
        "--headless",
        "--no-sandbox",
        "--start-fullscreen",
        "--allow-insecure-localhost",
        "--disable-dev-shm-usage",
        # NOTE(review): unlike the entries above this one has no leading "--";
        # confirm that Chrome actually applies it as a user-agent override.
        "user-agent=Chrome/116.0.5845.96",
    ]
    chrome_options = add_driver_options(launch_arguments)
    chrome_options.binary_location = chrome_binary
    return webdriver.Chrome(executable_path=chromedriver_binary,options=chrome_options)

---

In [6]:
def add_driver_options(options):
    """Build a Chrome ``Options`` object from a list of command-line arguments.

    NOTE(review): this cell repeats, statement for statement, the
    ``add_driver_options`` definition from the earlier driver-setup cell;
    running it only rebinds the same name to an identical function.

    Args:
        options: Iterable of argument strings, each passed to
            ``Options.add_argument`` in order.

    Returns:
        The populated ``Options`` instance.
    """
    chrome_options = Options()
    for opt in options:
        chrome_options.add_argument(opt)
    return chrome_options

In [7]:
def get_repository()->github.Repository.Repository:
    """Open the GitHub repository named by the Kaggle user secrets.

    Reads the ``access_token`` and ``repo_str`` secrets, creates a
    ``github.Github`` client from the token and looks the repository up.

    Returns:
        The object returned by ``Github.get_repo`` for ``repo_str``.
    """
    token = UserSecretsClient().get_secret('access_token')
    repo_name = UserSecretsClient().get_secret('repo_str')
    return github.Github(token).get_repo(repo_name)

In [8]:
def get_new_file(folder:str,branch:str='master')->str:
    """Return the greatest entry path (by string comparison) inside ``folder``.

    Args:
        folder: Repository path whose contents are listed.
        branch: Git ref to list from.

    Returns:
        The largest ``path`` among the listed entries, or ``''`` when the
        listing is empty.
    """
    entries = get_repository().get_contents(folder,ref=branch)
    # Plain string ordering decides which entry wins.
    return max((entry.path for entry in entries),default='')

In [9]:
def get_file_path(folder:str,path_type:str,branch:str='master')->str:
    """Return a GitHub URL for the file that ``get_new_file`` selects in ``folder``.

    Args:
        folder: Repository folder passed on to ``get_new_file``.
        path_type: ``'url'`` for the ``github.com/.../blob/...`` page URL,
            ``'raw'`` for the ``raw.githubusercontent.com`` URL.
        branch: Git ref used both for the lookup and inside the URL.

    Returns:
        The requested URL as a string.

    Raises:
        Exception: If ``path_type`` is neither ``'url'`` nor ``'raw'``. This
            is checked before the secret lookup and the repository listing.
    """
    if path_type not in ('url','raw'):
        raise Exception("path_type is either 'url' or 'raw'!")
    repo_str = UserSecretsClient().get_secret('repo_str')
    file_path = get_new_file(folder=folder,branch=branch)
    if path_type == 'url':
        return f"https://github.com/{repo_str}/blob/{branch}/{file_path}"
    # Build the raw URL directly. Deriving it from the page URL with
    # str.replace would also rewrite any 'github' or 'blob/' that happens to
    # occur in the repository name, the branch or the file path.
    return f"https://raw.githubusercontent.com/{repo_str}/{branch}/{file_path}"

In [10]:
def clean_text_for_csv(text:str)->str:
    """Normalise one cell value before it is written to CSV.

    Non-string values are returned untouched. Strings are percent-decoded,
    HTML-unescaped, then backslash, newline, tab and carriage return are
    replaced by their two-character escaped spellings, and finally leading and
    trailing whitespace is stripped.

    Args:
        text: The value to clean.

    Returns:
        The cleaned string, or ``text`` itself when it is not a ``str``.
    """
    if not isinstance(text,str):
        return text
    decoded = html.unescape(urllib.parse.unquote(text))
    # One pass over the characters: every backslash is doubled and each control
    # character becomes backslash + letter.
    escapes = str.maketrans({'\\':'\\\\','\n':'\\n','\t':'\\t','\r':'\\r'})
    return decoded.translate(escapes).strip()

In [11]:
def save_lst_as_csv(data:list[dict],folder:str,filename:str,branch:str='master'):
    """Write ``data`` to ``folder/filename`` in the GitHub repo, appending if it exists.

    Every value of every row is passed through ``clean_text_for_csv``. The
    function first tries to create the file. If GitHub rejects the creation,
    the existing file is downloaded, the new rows are appended below the
    existing ones, and the file is updated in place.

    Args:
        data: Rows to store, one dict per row. An empty list is a no-op, so no
            header-less CSV is ever committed.
        folder: Repository folder that holds the CSV.
        filename: Name of the CSV inside ``folder``.
        branch: Branch to commit to.

    Raises:
        GithubException: If the file can be neither created nor fetched/updated.
    """
    if not data:
        return
    cleaned_data = [
        {k: clean_text_for_csv(v) for k,v in row.items()} for row in data
    ]
    df = pd.DataFrame(cleaned_data)
    repo = get_repository()
    file_pth = folder+'/'+filename
    try:
        repo.create_file(file_pth,"create source file",df.to_csv(index=False),branch=branch)
    except GithubException:
        # Creation was rejected (typically because the file already exists).
        # If the real cause is something else, get_contents below raises and
        # the error surfaces instead of being swallowed.
        existing_file = repo.get_contents(file_pth,ref=branch)
        # Read the very file being updated through its own download URL rather
        # than guessing a raw URL from whichever file sorts last in the folder.
        existing_df = pd.read_csv(existing_file.download_url)
        updated_df = pd.concat([existing_df,df],ignore_index=True)
        updated_csv_content = updated_df.to_csv(index=False)
        repo.update_file(file_pth,'update source file',updated_csv_content,existing_file.sha,branch=branch)
        print(f"{file_pth} updated!")
    else:
        print(f"{file_pth} created!")

In [12]:
def check_file_exists_in_folder(folder:str,filename:str,branch:str='master')->bool:
    """Tell whether ``folder`` on ``branch`` contains an entry named ``filename``.

    Args:
        folder: Repository folder to list.
        filename: Entry name to look for (compared against each entry's ``name``).
        branch: Git ref to list from.

    Returns:
        ``True`` if a matching entry is listed; ``False`` if none matches or
        the listing fails with HTTP status 404.

    Raises:
        GithubException: Re-raised when the status is anything other than 404.
    """
    try:
        entries = get_repository().get_contents(folder,ref=branch)
        return any(entry.name == filename for entry in entries)
    except GithubException as error:
        if error.status != 404:
            raise error
        return False

---

In [13]:
def get_days_filter_for_current_month(today=None):
    """Pick the widest job-age filter that still fits inside the current month.

    The candidate windows are 30, 15, 7, 3 and 1 days. The largest one that
    does not exceed the day of the month is returned, so a full load early in
    the month does not reach back into the previous month.

    Args:
        today: Optional ``date``/``datetime`` whose ``.day`` is used. When
            omitted, the current date in IST (UTC+05:30) is used, the same
            zone ('Asia/Kolkata') that names the monthly CSV file, so both
            agree on which month it is.

    Returns:
        The chosen number of days as a string, e.g. ``"7"``.
    """
    if today is None:
        # Fixed +05:30 offset; India observes no daylight-saving time.
        ist = datetime.timezone(datetime.timedelta(hours=5,minutes=30))
        today = datetime.datetime.now(ist)
    days_passed = today.day
    for days in (30,15,7,3,1):
        if days <= days_passed:
            return str(days)
    # Defensive fallback; a valid day of month is always >= 1.
    return "1"

In [14]:
def scrapeNakuri(job_role:str,driver,is_full_load:bool):
    """Scrape job postings for ``job_role`` from naukri.com.

    Steps:
      1. Open the search page for the role with a ``jobAge`` filter: 1 day for
         an incremental load, or the value from
         ``get_days_filter_for_current_month()`` for a full load.
      2. If the "no results" heading shows up within 10 seconds, print it and
         return an empty list.
      3. Collect posting links from up to 26 result pages, keeping the same
         ``jobAge`` filter on every page.
      4. Visit at most 500 of those links and read the posting details.

    Args:
        job_role: Search phrase, e.g. ``"Data Engineer"``.
        driver: Selenium WebDriver used for all navigation.
        is_full_load: ``True`` to use the month-to-date filter instead of 1 day.

    Returns:
        A list of dicts with the keys ``job_title``, ``company``,
        ``experience``, ``salary``, ``locations``, ``description``,
        ``industry``, ``department``, ``employment_type``, ``skills`` and
        ``scraped_at`` (IST timestamp string). Postings whose page could not
        be parsed are skipped.
    """
    job_lst = []
    query = "-".join(job_role.lower().split())
    # One filter value for the whole run, so paginated pages use the same
    # window as the first page (previously pagination always fell back to 1).
    job_age = get_days_filter_for_current_month() if is_full_load else "1"
    base_url = f"https://www.naukri.com/{query}-jobs?jobAge={job_age}"
    wait1,wait2 = WebDriverWait(driver,10),WebDriverWait(driver,30)
    driver.get(base_url)
    try:
        xpth = "/html/body/div/div/main/div[1]/div[1]/div[1]/div[@class='styles_nrc__heading__dLYOD']"
        wait1.until(EC.presence_of_all_elements_located((By.XPATH,xpth)))
        no_results_text = driver.find_element(By.XPATH,xpth).text
        print(f"{no_results_text} for {job_role}!")
        return []
    except (NoSuchElementException,TimeoutException):
        # No "no results" heading appeared: carry on to the listing.
        pass
    except Exception:
        return []
    job_links = []
    page_count = 0
    xpth1 = "/html/body/div/div/main/div[1]/div[2]/"
    tuple_xpath = f"{xpth1}div[2]/div/div[@class='srp-jobtuple-wrapper']"
    while page_count < 26:
        try:
            wait2.until(EC.presence_of_all_elements_located((By.XPATH,tuple_xpath)))
        except TimeoutException:
            # Keep the links gathered so far instead of aborting the whole role.
            print(f"Timed out waiting for the job listing of {job_role}; stopping pagination.")
            break
        elements = driver.find_elements(By.XPATH,tuple_xpath)
        for div_element in elements:
            link = div_element.find_element(By.CLASS_NAME,"title ").get_attribute("href")
            job_links.append(link)
        try:
            next_page = wait2.until(EC.element_to_be_clickable((By.XPATH,f"{xpth1}div[3]/div/a[2]")))
            next_href = next_page.get_attribute("href")
            if not next_href:
                # No target on the "next" control: this was the last page.
                break
            # Assumes the pagination href has no query string of its own - TODO confirm.
            driver.get(f"{next_href}?jobAge={job_age}")
            page_count += 1
        except Exception:
            # Best effort: any failure to move on ends pagination.
            break
    job_links = job_links[:500]
    xpth2 = "/html/body/div/div/main/div[@class='styles_jdc__content__EZJMQ ']/div[1]/"
    for url in job_links:
        try:
            driver.get(url)
            driver.refresh()
            wait2.until(EC.presence_of_element_located((By.XPATH,xpth2+"section[1]/div[1]/div[1]/header/h1")))
            job_title = driver.find_element(By.XPATH,xpth2+"section[1]/div[1]/div[1]/header/h1").text
            company = driver.find_element(By.XPATH,xpth2+"section[1]/div[1]/div[1]/div/a").text
            experience = driver.find_element(By.XPATH,xpth2+"section[1]/div[1]/div[2]/div[1]/div[1]/span").text
            salary = driver.find_element(By.XPATH,xpth2+"section[1]/div[1]/div[2]/div[1]/div[2]/span").text
            locations = [loc.text for loc in driver.find_elements(By.XPATH,xpth2+"section[1]/div[1]/div[2]/div[@class='styles_jhc__loc___Du2H']/span/a")]
            description = driver.find_element(By.XPATH,xpth2+"section[2]/div[2]/div[@class='styles_JDC__dang-inner-html__h0K4t']").text
            industry = driver.find_element(By.XPATH,xpth2+"section[2]/div[2]/div[2]/div[2]").text
            department = driver.find_element(By.XPATH,xpth2+"section[2]/div[2]/div[2]/div[3]").text
            employment_type = driver.find_element(By.XPATH,xpth2+"section[2]/div[2]/div[2]/div[4]").text
            skills = [skill.text for skill in driver.find_elements(By.XPATH,xpth2+"section[2]/div[3]/div/a[@target='_blank']")]
            job_dict = {
                'job_title': job_title,
                'company': company,
                'experience': experience,
                'salary': salary,
                'locations': locations,
                'description': description,
                'industry': industry,
                'department': department,
                'employment_type': employment_type,
                'skills': skills,
                'scraped_at': datetime.datetime.now(pytz.timezone('Asia/Kolkata')).strftime("%Y-%m-%d %H:%M:%S")
            }
            job_lst.append(job_dict)
        except Exception:
            # Best effort: a posting that fails to load or parse is skipped.
            # KeyboardInterrupt is no longer swallowed, so the run can be stopped.
            continue
    return job_lst

In [15]:
def scrapeFoundit(job_role:str,driver,is_full_load:bool):
    """Scrape job postings for ``job_role`` from foundit.in.

    Opens the search results with a ``jobFreshness`` filter: 1 day for an
    incremental load, or the value from
    ``get_days_filter_for_current_month()`` for a full load. It then clicks
    each result card, reads the detail pane, and follows the pagination arrow
    until something fails or 500 postings have been collected.

    Args:
        job_role: Search phrase, e.g. ``"Data Engineer"``.
        driver: Selenium WebDriver used for all navigation.
        is_full_load: ``True`` to use the month-to-date filter instead of 1 day.

    Returns:
        A list of dicts with the keys ``job_title``, ``company_name``,
        ``experience``, ``salary`` (``None`` when not shown), ``location``,
        ``industry``, ``job_description``, ``skills`` and ``scraped_at``
        (IST timestamp string).
    """
    # Collapse whitespace, then percent-encode so characters such as '&', '#',
    # '+' or non-ASCII letters survive inside the query string.
    query = urllib.parse.quote_plus(" ".join(job_role.split()))
    freshness = get_days_filter_for_current_month() if is_full_load else "1"
    url = f"https://www.foundit.in/srp/results?query={query}&jobFreshness={freshness}"
    wait = WebDriverWait(driver,30)
    driver.get(url)
    try:
        no_results_text = driver.find_element(By.XPATH,"/html/body/div[3]/div/div[6]/div/div/div[2]/div[1]/div[2]/p[1]").text
        print(f"{no_results_text} {job_role}!")
        return []
    except NoSuchElementException:
        # No "no results" message on the page: carry on to the cards.
        pass
    except Exception:
        return []
    job_list = []
    base_xpath = "/html/body/div[@id='srpThemeDefault']/div[@class='srpContainer']/div[@id='srpContent']"
    card_container_xpath = f"{base_xpath}/div[@class='srpCardContainer']/div[@class='srpResultCard']"
    cards_xpath = card_container_xpath+"/div[@class='srpCardsWrapper']"
    detail_xpath = f"{base_xpath}/div[@class='srpJdContainer']"
    while True:
        try:
            driver.refresh()
            wait.until(EC.presence_of_element_located((By.XPATH,cards_xpath)))
            job_cards = driver.find_elements(By.XPATH,cards_xpath)
            for card in job_cards:
                # A failing click falls through to the outer handler and ends
                # the scrape with what has been collected so far.
                card.click()
                try:
                    try:
                        accept_cookies = driver.find_element(By.XPATH,'/html/body/div[5]/div/div/div[3]/button')
                        accept_cookies.click()
                    except Exception:
                        # Cookie banner absent or not clickable: nothing to dismiss.
                        pass
                    detail_element = wait.until(EC.presence_of_element_located((By.XPATH,detail_xpath)))
                    job_data = {}
                    job_data['job_title'] = detail_element.find_element(By.CLASS_NAME,"jdTitle").text
                    job_data['company_name'] = detail_element.find_element(By.CLASS_NAME,"jdCompanyName").text
                    highlights = detail_element.find_elements(By.CLASS_NAME,"highlightsRow")
                    job_data['experience'] = highlights[0].find_element(By.XPATH,"./div[1]").text
                    try:
                        job_data['salary'] = highlights[0].find_element(By.XPATH,"./div[2]").text
                    except Exception:
                        job_data['salary'] = None
                    job_data['location'] = highlights[1].text
                    job_data['industry'] = highlights[2].text
                    job_data['job_description'] = detail_element.find_element(By.CLASS_NAME,"jobDescInfoNew").text
                    job_data['skills'] = [skill.text for skill in detail_element.find_elements(By.CLASS_NAME,"pillItem")]
                    job_data['scraped_at'] = datetime.datetime.now(pytz.timezone('Asia/Kolkata')).strftime("%Y-%m-%d %H:%M:%S")
                    job_list.append(job_data)
                    if len(job_list) == 500:
                        return job_list
                except Exception:
                    # Best effort: a card whose detail pane cannot be read is skipped.
                    pass
            pagination = driver.find_element(By.XPATH,card_container_xpath+"/div[@class='pagination']")
            pagination.find_element(By.CLASS_NAME,"mqfisrp-right-arrow").click()
        except Exception:
            # Any failure at page level (missing cards, missing/unclickable
            # next arrow, failed card click) ends the scrape.
            break
    return job_list

---

In [16]:
# Wall-clock start of the run; the timing cell at the end subtracts this from its own time.time().
start_time = time.time()

In [17]:
# Search phrases to scrape; each one gets its own folder in the repository.
job_roles = [
    "Data Engineer",
    "Data Analyst",
    "Data Architect",
    "Data Scientist",
    "Machine Learning Engineer"
]
# Branch of the GitHub repository that receives the CSV files.
branch = "test"
# Current year and month in IST, e.g. "2025-06"; used as the monthly CSV file name.
yrmn_str = datetime.datetime.now(pytz.timezone('Asia/Kolkata')).strftime("%Y-%m")
csv_file_str = f"{yrmn_str}.csv"
# No WebDriver is started here: the Naukri scraping cell calls get_driver()
# itself before first use, so a driver created in this cell would be
# overwritten without ever being quit (leaking a Chrome process).

In [18]:
# Scrape every configured role from Naukri and push the rows to GitHub.
print("\n--- Scraping Nakuri Started ---\n")
driver = get_driver()
for role in job_roles:
    print(f"Job-role: {role}")
    folder = "Source/Nakuri/"+role.replace(' ','')
    # No CSV for this month yet means a full (month-to-date) load is needed.
    is_full_load = not check_file_exists_in_folder(folder=folder,filename=csv_file_str,branch=branch)
    lst = scrapeNakuri(role,driver,is_full_load)
    print(f"Number of jobs extracted: {len(lst)}")
    if lst:
        save_lst_as_csv(data=lst,folder=folder,filename=csv_file_str,branch=branch)
    print()
print("\n--- Scraping Nakuri Ended ---\n")


--- Scraping Nakuri Started ---

Job-role: Data Engineer
Number of jobs extracted: 500
Source/Nakuri/DataEngineer/2025-06.csv created!

Job-role: Data Analyst
Number of jobs extracted: 475
Source/Nakuri/DataAnalyst/2025-06.csv created!

Job-role: Data Architect
Number of jobs extracted: 166
Source/Nakuri/DataArchitect/2025-06.csv created!

Job-role: Data Scientist
Number of jobs extracted: 347
Source/Nakuri/DataScientist/2025-06.csv created!

Job-role: Machine Learning Engineer
Number of jobs extracted: 350
Source/Nakuri/MachineLearningEngineer/2025-06.csv created!


--- Scraping Nakuri Ended ---



In [19]:
# Scrape every configured role from Foundit with the same browser session,
# push the rows to GitHub, then shut the browser down.
print("\n--- Scraping Foundit Started ---\n")
for role in job_roles:
    print(f"Job-role: {role}")
    folder = "Source/Foundit/"+role.replace(' ','')
    # No CSV for this month yet means a full (month-to-date) load is needed.
    is_full_load = not check_file_exists_in_folder(folder=folder,filename=csv_file_str,branch=branch)
    lst = scrapeFoundit(role,driver,is_full_load)
    print(f"Number of jobs extracted: {len(lst)}")
    if lst:
        save_lst_as_csv(data=lst,folder=folder,filename=csv_file_str,branch=branch)
    print()
print("\n--- Scraping Foundit Ended ---\n")
driver.quit()


--- Scraping Foundit Started ---

Job-role: Data Engineer
Number of jobs extracted: 31
Source/Foundit/DataEngineer/2025-06.csv created!

Job-role: Data Analyst
Number of jobs extracted: 30
Source/Foundit/DataAnalyst/2025-06.csv created!

Job-role: Data Architect
Number of jobs extracted: 32
Source/Foundit/DataArchitect/2025-06.csv created!

Job-role: Data Scientist
Number of jobs extracted: 51
Source/Foundit/DataScientist/2025-06.csv created!

Job-role: Machine Learning Engineer
Number of jobs extracted: 8
Source/Foundit/MachineLearningEngineer/2025-06.csv created!


--- Scraping Foundit Ended ---



In [20]:
# Elapsed wall-clock time since the `start_time` cell ran, in seconds, minutes and hours.
end_time = time.time()
total_time = end_time - start_time
total_time_minutes = total_time / 60
total_time_hours = total_time_minutes / 60

In [21]:
# Report the run duration in all three units, two decimals each.
print(f"Time-taken: {total_time:.2f} secs (or) {total_time_minutes:.2f} mins (or) {total_time_hours:.2f} hrs.")

Time-taken: 5399.07 secs (or) 89.98 mins (or) 1.50 hrs.


---