In [1]:
!pip install playwright beautifulsoup4 pandas openpyxl lxml -q
!playwright install chromium

In [2]:
import asyncio
import nest_asyncio
nest_asyncio.apply()

from playwright.async_api import async_playwright
from bs4 import BeautifulSoup
import pandas as pd
import time

all_jobs = []

async def scrape_all_pages():
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)  # Change to True after testing
        context = await browser.new_context(
            viewport={'width': 1920, 'height': 1080},
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
        )
        
        # Stealth script — bypasses Cloudflare & captcha
        await context.add_init_script("""
            Object.defineProperty(navigator, 'webdriver', {get: () => false});
            window.chrome = { runtime: {} };
        """)
        
        page = await context.new_page()
        
        # Start with page 1
        current_page = 1
        while True:
            url = f"https://www.naukri.com/data-analyst-data-analytics-data-analysis-data-visualization-data-cleansing-jobs?k=data%20analyst%2C%20data%20analytics%2C%20data%20analysis%2C%20data%20visualization%2C%20data%20cleansing&experience=0&ugTypeGid=12&jobPostType=1&glbl_qcrc=1018&glbl_qcrc=1020&pageNo={current_page}"
            
            print(f"Scraping page {current_page}...")
            await page.goto(url, wait_until="domcontentloaded", timeout=60000)
            
            try:
                await page.wait_for_selector('a[title]', timeout=30000)  # any job title
            except:
                print("No more jobs or blocked — stopping")
                break
                
            # Human-like scroll
            await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
            await asyncio.sleep(3)
            
            html = await page.content()
            soup = BeautifulSoup(html, 'lxml')
            
            # Multiple selector fallback (Naukri changes class names)
            jobs = soup.find_all('div', class_='srp-jobtuple-wrapper') or \
                    soup.find_all('div', class_='jobTuple') or \
                    soup.find_all('li', class_=lambda x: x and 'jobTuple' in str(x))
            
            if len(jobs) == 0:
                print("No jobs found on this page → end of results")
                break
                
            print(f"   → Found {len(jobs)} jobs on page {current_page}")
            
            for job in jobs:
                try:
                    title_tag = job.find('a', class_='title') or job.find('a', {'data-automation': 'jobTitle'})
                    title = title_tag.get('title') or title_tag.get_text(strip=True) if title_tag else 'N/A'
                    link = title_tag['href'] if title_tag and title_tag.get('href') else 'N/A'
                    if link != 'N/A' and not link.startswith('http'):
                        link = 'https://www.naukri.com' + link
                        
                    company = (job.find('a', class_='comp-name') or 
                              job.find('span', {'data-automation': 'jobCompany'}))
                    company = company.get_text(strip=True) if company else 'N/A'
                    
                    location = (job.find('span', class_='locWdth') or 
                               job.find('span', {'data-automation': 'jobLocation'}))
                    location = location.get_text(strip=True) if location else 'N/A'
                    
                    exp = (job.find('span', class_='expwdth') or 
                          job.find('span', {'data-automation': 'jobExperience'}))
                    exp = exp.get_text(strip=True) if exp else 'N/A'
                    
                    salary = (job.find('span', class_='sal') or 
                             job.find('span', {'data-automation': 'jobSalary'}))
                    salary = salary.get_text(strip=True) if salary else 'Not Disclosed'
                    
                    all_jobs.append({
                        'Job Title': title,
                        'Company': company,
                        'Location': location,
                        'Experience': exp,
                        'Salary': salary,
                        'Link': link
                    })
                except:
                    continue  # skip broken job cards
            
            current_page += 1
            await asyncio.sleep(5)  # Be respectful
            
        await browser.close()

# RUN THE SCRAPER
print("Starting full scrape — this will take 4–8 minutes for 400–500 jobs...")
asyncio.get_event_loop().run_until_complete(scrape_all_pages())

# Save to Excel
df = pd.DataFrame(all_jobs)
df.to_excel('ALL_Fresher_Data_Analyst_Jobs_Naukri_Nov2025.xlsx', index=False)
df.to_csv('ALL_Fresher_Data_Analyst_Jobs_Naukri_Nov2025.csv', index=False)

print(f"\nDONE! Scraped {len(df)} jobs from {len(set(df['Link']))} unique listings")
print("Files saved:")
print("   → ALL_Fresher_Data_Analyst_Jobs_Naukri_Nov2025.xlsx")
print("   → ALL_Fresher_Data_Analyst_Jobs_Naukri_Nov2025.csv")
df.head(10)

Starting full scrape — this will take 4–8 minutes for 400–500 jobs...
Scraping page 1...
   → Found 20 jobs on page 1
Scraping page 2...
   → Found 20 jobs on page 2
Scraping page 3...
   → Found 20 jobs on page 3
Scraping page 4...
   → Found 20 jobs on page 4
Scraping page 5...
   → Found 20 jobs on page 5
Scraping page 6...
   → Found 20 jobs on page 6
Scraping page 7...
   → Found 20 jobs on page 7
Scraping page 8...
   → Found 20 jobs on page 8
Scraping page 9...
   → Found 20 jobs on page 9
Scraping page 10...
   → Found 20 jobs on page 10
Scraping page 11...
   → Found 20 jobs on page 11
Scraping page 12...
   → Found 20 jobs on page 12
Scraping page 13...
   → Found 20 jobs on page 13
Scraping page 14...
   → Found 20 jobs on page 14
Scraping page 15...
   → Found 20 jobs on page 15
Scraping page 16...
   → Found 20 jobs on page 16
Scraping page 17...
   → Found 20 jobs on page 17
Scraping page 18...
   → Found 20 jobs on page 18
Scraping page 19...
   → Found 20 jobs on page 1

Error: Page.goto: net::ERR_NETWORK_CHANGED at https://www.naukri.com/data-analyst-data-analytics-data-analysis-data-visualization-data-cleansing-jobs?k=data%20analyst%2C%20data%20analytics%2C%20data%20analysis%2C%20data%20visualization%2C%20data%20cleansing&experience=0&ugTypeGid=12&jobPostType=1&glbl_qcrc=1018&glbl_qcrc=1020&pageNo=358
Call log:
  - navigating to "https://www.naukri.com/data-analyst-data-analytics-data-analysis-data-visualization-data-cleansing-jobs?k=data%20analyst%2C%20data%20analytics%2C%20data%20analysis%2C%20data%20visualization%2C%20data%20cleansing&experience=0&ugTypeGid=12&jobPostType=1&glbl_qcrc=1018&glbl_qcrc=1020&pageNo=358", waiting until "domcontentloaded"


In [3]:
import pandas as pd

In [4]:
df = pd.DataFrame(all_jobs)   
print(f"Raw scraped rows: {len(df)}")

Raw scraped rows: 7129


In [5]:
df

Unnamed: 0,Job Title,Company,Location,Experience,Salary,Link
0,Data Science Intern,Neozeno Talent Llp,Remote,,"15,000/month",https://www.naukri.com/job-listings-data-scien...
1,Data Analyst Intern,Voylla,Jaipur,0-1 Yrs,Not Disclosed,https://www.naukri.com/job-listings-data-analy...
2,Data Analytics Intern,Reliance Games,Pune,0-1 Yrs,Not Disclosed,https://www.naukri.com/job-listings-data-analy...
3,Data Science Intern,Cyfotok,Coimbatore,,Unpaid,https://www.naukri.com/job-listings-data-scien...
4,Data Analyst,Also Energy,"Noida, Gurugram, Delhi / NCR",0-2 Yrs,4-5 Lacs PA,https://www.naukri.com/job-listings-data-analy...
...,...,...,...,...,...,...
7124,Data engineer,Leading Client,Hyderabad,4-6 Yrs,Not Disclosed,https://www.naukri.com/job-listings-data-engin...
7125,Data Scientist,ADQ Services,Hyderabad,1-3 Yrs,Not Disclosed,https://www.naukri.com/job-listings-data-scien...
7126,Data Scientist,Syren Cloud Inc,Hyderabad,3-6 Yrs,Not Disclosed,https://www.naukri.com/job-listings-data-scien...
7127,Data Scientist,Veniso Solutions,Hyderabad,2-7 Yrs,Not Disclosed,https://www.naukri.com/job-listings-data-scien...


In [7]:
df.to_csv('naukri_jobs_codealpha.csv', index=False, encoding='utf-8')

In [8]:
df

Unnamed: 0,Job Title,Company,Location,Experience,Salary,Link
0,Data Science Intern,Neozeno Talent Llp,Remote,,"15,000/month",https://www.naukri.com/job-listings-data-scien...
1,Data Analyst Intern,Voylla,Jaipur,0-1 Yrs,Not Disclosed,https://www.naukri.com/job-listings-data-analy...
2,Data Analytics Intern,Reliance Games,Pune,0-1 Yrs,Not Disclosed,https://www.naukri.com/job-listings-data-analy...
3,Data Science Intern,Cyfotok,Coimbatore,,Unpaid,https://www.naukri.com/job-listings-data-scien...
4,Data Analyst,Also Energy,"Noida, Gurugram, Delhi / NCR",0-2 Yrs,4-5 Lacs PA,https://www.naukri.com/job-listings-data-analy...
...,...,...,...,...,...,...
7124,Data engineer,Leading Client,Hyderabad,4-6 Yrs,Not Disclosed,https://www.naukri.com/job-listings-data-engin...
7125,Data Scientist,ADQ Services,Hyderabad,1-3 Yrs,Not Disclosed,https://www.naukri.com/job-listings-data-scien...
7126,Data Scientist,Syren Cloud Inc,Hyderabad,3-6 Yrs,Not Disclosed,https://www.naukri.com/job-listings-data-scien...
7127,Data Scientist,Veniso Solutions,Hyderabad,2-7 Yrs,Not Disclosed,https://www.naukri.com/job-listings-data-scien...


In [17]:
df.duplicated().sum()

np.int64(4743)

In [20]:
df.drop_duplicates(inplace=True)

In [21]:
df

Unnamed: 0,Job Title,Company,Location,Experience,Salary,Link
0,Data Science Intern,Neozeno Talent Llp,Remote,,"15,000/month",https://www.naukri.com/job-listings-data-scien...
1,Data Analyst Intern,Voylla,Jaipur,0-1 Yrs,Not Disclosed,https://www.naukri.com/job-listings-data-analy...
2,Data Analytics Intern,Reliance Games,Pune,0-1 Yrs,Not Disclosed,https://www.naukri.com/job-listings-data-analy...
3,Data Science Intern,Cyfotok,Coimbatore,,Unpaid,https://www.naukri.com/job-listings-data-scien...
4,Data Analyst,Also Energy,"Noida, Gurugram, Delhi / NCR",0-2 Yrs,4-5 Lacs PA,https://www.naukri.com/job-listings-data-analy...
...,...,...,...,...,...,...
2384,Data engineer,Leading Client,Hyderabad,4-6 Yrs,Not Disclosed,https://www.naukri.com/job-listings-data-engin...
2385,Data Scientist,ADQ Services,Hyderabad,1-3 Yrs,Not Disclosed,https://www.naukri.com/job-listings-data-scien...
2386,Data Scientist,Syren Cloud Inc,Hyderabad,3-6 Yrs,Not Disclosed,https://www.naukri.com/job-listings-data-scien...
2387,Data Scientist,Veniso Solutions,Hyderabad,2-7 Yrs,Not Disclosed,https://www.naukri.com/job-listings-data-scien...


In [25]:
df.shape

(2386, 6)

In [27]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2386 entries, 0 to 2388
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Job Title   2386 non-null   object
 1   Company     2386 non-null   object
 2   Location    2386 non-null   object
 3   Experience  2386 non-null   object
 4   Salary      2386 non-null   object
 5   Link        2386 non-null   object
dtypes: object(6)
memory usage: 130.5+ KB


In [29]:
df['Job Title'].isnull().sum()

np.int64(0)

In [22]:
df['Company'].nunique()

1256

In [23]:
df['Salary'].nunique()

134