In [44]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import csv

# Start browser
driver = webdriver.Chrome()
driver.get("https://www.naukri.com/data-analyst-jobs")
time.sleep(5)

# ⏬ Scroll multiple times to load more job cards
for _ in range(5):  # increase range if needed
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(3)

# 🎯 Get all job cards (will be more than 50 now)
job_cards = driver.find_elements(By.CLASS_NAME, "srp-jobtuple-wrapper")
print(f"✅ Found {len(job_cards)} job cards.")

# Get links of first 60 jobs (safe buffer for duplicates/errors)
job_links = []
for card in job_cards[:60]:
    try:
        link = card.find_element(By.CSS_SELECTOR, "a.title").get_attribute("href")
        job_links.append(link)
    except:
        continue

# Data list
job_data = []

# Loop through job pages
for idx, link in enumerate(job_links):
    try:
        driver.get(link)
        wait = WebDriverWait(driver, 10)

        # Job title
        title = wait.until(EC.presence_of_element_located((By.TAG_NAME, "h1"))).text.strip()

        # Company name (smart fallbacks)
        company_selectors = [
            "div.jd-header-comp-name a",
            "div.jd-header-comp-name",
            "a.comp-name",
            "span.comp-name",
            "div.company-info a",
        ]
        company = "N/A"
        for selector in company_selectors:
            try:
                element = driver.find_element(By.CSS_SELECTOR, selector)
                if element.text.strip():
                    company = element.text.strip()
                    break
            except:
                continue

        # Key Skills
        try:
            skill_box = driver.find_element(By.XPATH, "//div[contains(@class, 'key-skill') or contains(@class,'styles_jd_keywords')]")
            skill_tags = skill_box.find_elements(By.TAG_NAME, "span")
            skills = [s.text.strip() for s in skill_tags if s.text.strip()]
        except:
            skills = []

        # Store
        job_data.append({
            "Job Title": title,
            "Company": company,
            "Key Skills": ", ".join(skills)
        })

        print(f"{idx+1}. ✅ {title} | {company}")

        # Optional delay to avoid being blocked
        time.sleep(1)

    except Exception as e:
        print(f"{idx+1}. ⚠️ Failed: {link}")
        print("   Reason:", e)
        time.sleep(1)

# Close browser
driver.quit()

# ✅ Save to CSV
with open("naukri_50plus_jobs.csv", mode='w', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, fieldnames=["Job Title", "Company", "Key Skills"])
    writer.writeheader()
    writer.writerows(job_data)

print(f"\n🎉 Scraped {len(job_data)} jobs → Saved to 'naukri_50plus_jobs.csv'")


✅ Found 20 job cards.
1. ✅ Data Analyst - Off Roll | N/A
2. ✅ PMT - Data Analyst | N/A
3. ✅ Data Analyst - Havells IP | N/A
4. ✅ Data Analyst | N/A
5. ✅ Data Analyst | Foreign IT Consulting MNC
6. ✅ Data Analyst | N/A
7. ✅ Data Analyst | B2B Firm in Analytics Domain
8. ✅ Data Analyst | N/A
9. ✅ Data Analyst - AWS | Leading IT Consulting MNC
10. ✅ Data Analyst | N/A
11. ✅ Data Analyst | Accounting & Auditing Firm
12. ✅ Data Analyst | N/A
13. ✅ Data Analyst | Large IT MNC
14. ✅ Data Analyst | N/A
15. ✅ Linguistic/Localization Data Analyst | Leading Indian MNC in BPO
16. ✅ Data Analyst | Firm in BPO Sector
17. ✅ Data Analyst | N/A
18. ✅ S&C GN - Data&AI - Resources - Analyst | Product Based Foreign MNC
19. ✅ Data Analyst | N/A
20. ✅ Data Analyst | N/A

🎉 Scraped 20 jobs → Saved to 'naukri_50plus_jobs.csv'


Task

In [107]:
my_skills = [
    "data modeling", "pyspark", "data wrangling", "data cleansing", "excel",
    "azure databricks", "azure synapse", "azure data factory", "alteryx", "power bi",
    "python", "sql", "data pipeline", "etl", "kpi", "business intelligence", "data visualization"
]


In [109]:
import pandas as pd

df = pd.read_csv("naukri_50plus_jobs.csv")


In [111]:
# Convert Key Skills column to lowercase
df['Key Skills'] = df['Key Skills'].fillna('').str.lower()

# Your CV skills to lowercase
my_skills_lower = [skill.lower() for skill in my_skills]


In [113]:
# Function to count matched skills
def match_count(job_skills):
    return sum(skill in job_skills for skill in my_skills_lower)

# Apply to all rows
df["Matched Skills Count"] = df["Key Skills"].apply(match_count)


#Skill not matched

In [117]:
import pandas as pd

# ✅ Combined skill list from your CV
my_skills = [
    "data modeling", "pyspark", "data wrangling", "data cleansing", "excel",
    "azure databricks", "azure synapse", "azure data factory", "alteryx", "power bi",
    "python", "sql", "data pipeline", "etl", "kpi", "business intelligence", "data visualization"
]

# Lowercase for consistent matching
my_skills = [skill.lower() for skill in my_skills]

# ✅ Load the job data
df = pd.read_csv("naukri_50plus_jobs.csv")
df["Key Skills"] = df["Key Skills"].fillna("").str.lower()

# ✅ Function to compute match stats
def skill_match_analysis(job_skills_raw):
    job_skills = [s.strip() for s in job_skills_raw.split(",") if s.strip()]
    
    matched_skills = [skill for skill in job_skills if skill in my_skills]
    unmatched_skills = [skill for skill in job_skills if skill not in my_skills]

    return pd.Series({
        "Matched Skill Count": len(matched_skills),
        "Total Key Skills in Job": len(job_skills),
        "Skills Not Matched": ", ".join(unmatched_skills)
    })

# ✅ Apply the function to each row
skill_stats = df["Key Skills"].apply(skill_match_analysis)

# ✅ Merge back into the original DataFrame
df = pd.concat([df, skill_stats], axis=1)

# ✅ Sort by best skill match
df_sorted = df.sort_values(by="Matched Skill Count", ascending=False)

# ✅ Save to CSV
df_sorted.to_csv("naukri_matched_skills_full.csv", index=False)

print("✅ Final output saved → 'naukri_matched_skills_full.csv'")


✅ Final output saved → 'naukri_matched_skills_full.csv'


In [119]:
df_sorted 

Unnamed: 0,Job Title,Company,Key Skills,Matched Skill Count,Total Key Skills in Job,Skills Not Matched
4,Data Analyst,Foreign IT Consulting MNC,"python, sql queries, software development, pys...",6,20,"sql queries, software development, data analys..."
6,Data Analyst,B2B Firm in Analytics Domain,"python, sql queries, pyspark, power bi, bi dev...",6,23,"sql queries, bi development, macros, data anal..."
10,Data Analyst,Accounting & Auditing Firm,"power bi, tableau, sql, python, data transform...",5,15,"tableau, data transformation, data research, d..."
18,Data Analyst,,"python, data analysis, pyspark, machine learni...",5,20,"data analysis, machine learning, hive, data an..."
17,S&C GN - Data&AI - Resources - Analyst,Product Based Foreign MNC,"python, data analysis, machine learning, artif...",5,17,"data analysis, machine learning, artificial in..."
7,Data Analyst,,"master data, metadata, python, data analysis, ...",5,20,"master data, metadata, data analysis, data man..."
11,Data Analyst,,"insurance, funding, python, data analysis, dat...",4,20,"insurance, funding, data analysis, data manage..."
12,Data Analyst,Large IT MNC,"business analytics, power bi, kpi, data analyt...",3,10,"business analytics, data analytics, business p..."
0,Data Analyst - Off Roll,,"looker., power bi, tableau, sql, data modeling...",3,7,"looker., tableau, data analysis, data analytics"
1,PMT - Data Analyst,,"health insurance, excel, data management, proc...",2,10,"health insurance, data management, process imp..."


Repeated skills

In [137]:
import pandas as pd
from collections import Counter

# Load the job CSV
df = pd.read_csv("naukri_50plus_jobs.csv")

# Clean and prepare skills
df["Key Skills"] = df["Key Skills"].fillna("").str.lower()

# Split all skills into a flat list
all_skills = []
for skills in df["Key Skills"]:
    all_skills.extend([s.strip() for s in skills.split(",") if s.strip()])

# Count occurrences
skill_counts = Counter(all_skills)

# Convert to DataFrame
skill_freq_df = pd.DataFrame(skill_counts.items(), columns=["Skill", "Frequency"])
skill_freq_df = skill_freq_df.sort_values(by="Frequency", ascending=False).reset_index(drop=True)

# Save to CSV
skill_freq_df.to_csv("most_common_skills.csv", index=False)

print("✅ Top repeated skills saved to 'most_common_skills.csv'")
print("\n🔝 Top 10 Most Frequent Skills:\n")
print(skill_freq_df)


✅ Top repeated skills saved to 'most_common_skills.csv'

🔝 Top 10 Most Frequent Skills:

              Skill  Frequency
0     data analysis         13
1    data analytics         12
2               sql         11
3          power bi          9
4            python          8
..              ...        ...
112        metadata          1
113     master data          1
114            ssis          1
115      sql server          1
116          hadoop          1

[117 rows x 2 columns]


In [157]:
df=skill_freq_df.head(50
                    )
print(df)

                          Skill  Frequency
0                 data analysis         13
1                data analytics         12
2                           sql         11
3                      power bi          9
4                        python          8
5                 data modeling          7
6            data visualization          7
7                advanced excel          6
8                       tableau          6
9                  data analyst          6
10                  data mining          6
11            change management          4
12                          hcm          4
13             business analyst          4
14               data migration          4
15                      vlookup          4
16                   oracle hcm          4
17         predictive analytics          3
18                        excel          3
19              data management          3
20               data integrity          3
21                   analytical          3
22         

#Most in-demand skills from Naukri job listings that are not in your naukri

In [135]:
import pandas as pd
from collections import Counter

# 1. Load job data
df = pd.read_csv("naukri_50plus_jobs.csv")
df["Key Skills"] = df["Key Skills"].fillna("").str.lower()

# 2. Flatten all skills
all_skills = []
for row in df["Key Skills"]:
    skills = [s.strip() for s in row.split(",") if s.strip()]
    all_skills.extend(skills)

# 3. Count frequency
skill_counts = Counter(all_skills)

# 4. Your skills list (lowercase)
my_skills = [
    "data modeling", "pyspark", "data wrangling", "data cleansing", "excel",
    "azure databricks", "azure synapse", "azure data factory", "alteryx", "power bi",
    "python", "sql", "data pipeline", "etl", "kpi", "business intelligence", "data visualization"
]


# 5. Find missing in-demand skills
missing_skills = {
    skill: count for skill, count in skill_counts.items() if skill not in my_skills
}

# 6. Create DataFrame
missing_df = pd.DataFrame(missing_skills.items(), columns=["Missing Skill", "Frequency"])
missing_df = missing_df.sort_values(by="Frequency", ascending=False).reset_index(drop=True)

# 7. Save and show top 20
missing_df.to_csv("in_demand_missing_skills_from_my_skills.csv", index=False)

print("\n📊 Top In-Demand Skills Not in your `my_skills` list:\n")
print(missing_df.head(20))



📊 Top In-Demand Skills Not in your `my_skills` list:

           Missing Skill  Frequency
0          data analysis         13
1         data analytics         12
2                tableau          6
3            data mining          6
4         advanced excel          6
5           data analyst          6
6             oracle hcm          4
7         data migration          4
8                vlookup          4
9      change management          4
10                   hcm          4
11      business analyst          4
12                 spark          3
13        data integrity          3
14            analytical          3
15  predictive analytics          3
16       data management          3
17    performance tuning          2
18                    bi          2
19                   sas          2


In [None]:
#distinct skill

In [151]:
import pandas as pd

# Load the job data
df = pd.read_csv("naukri_50plus_jobs.csv")

# Clean and standardize the 'Key Skills' column
df["Key Skills"] = df["Key Skills"].fillna("").str.lower()

# Extract and flatten all skill tags into a list
all_skills = []
for row in df["Key Skills"]:
    skills = [s.strip() for s in row.split(",") if s.strip()]
    all_skills.extend(skills)

# Use a set to get distinct skills
unique_skills = sorted(set(all_skills))

# Save to a CSV
distinct_df = pd.DataFrame(unique_skills, columns=["Distinct Skill"])
distinct_df.to_csv("distinct_skills.csv", index=False)

# Show the first 20 for preview
print("✅ Found", len(unique_skills), "distinct skills.")
print("\n🔍 Preview of distinct skills:\n")
print(distinct_df.head(20))


✅ Found 117 distinct skills.

🔍 Preview of distinct skills:

                     Distinct Skill
0                             adobe
1                    advanced excel
2                             agile
3                           alteryx
4                        analytical
5                         analytics
6           artificial intelligence
7                        automation
8                               aws
9                                bi
10                   bi development
11                business analysis
12                 business analyst
13               business analytics
14             business development
15            business intelligence
16  business performance management
17                change management
18                        cleansing
19               client interaction
