In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import os

# Column order of the scraped table; also used to give an empty scrape a usable header.
_JOB_FIELDS = ("Title", "Company", "Location", "Experience", "Summary", "Skills")


def _labelled_paragraph(job, label):
    """Return the text of the first <p> after the <span> whose string is `label`, else ""."""
    label_tag = job.find("span", string=label)
    if label_tag is None:
        return ""
    paragraph = label_tag.find_next("p")
    # A label without a following <p> yields "" instead of discarding the whole job.
    return paragraph.get_text(strip=True) if paragraph is not None else ""


def _parse_job_block(job):
    """Build one job record (dict keyed by _JOB_FIELDS) from an "ads-details" <div>."""
    heading = job.find("h4") or job.find("h2")
    title = heading.get_text(strip=True) if heading else ""

    company_tag = job.find("a", href=lambda x: x and "Employer-Profile" in x)
    if not company_tag:
        company_tag = job.find("span", class_="company-name")
    company = company_tag.get_text(strip=True) if company_tag else ""

    # The first <p> of the block is taken as the location.
    first_paragraph = job.find("p")
    location = first_paragraph.get_text(strip=True) if first_paragraph else ""

    experience_tag = job.find("p", class_="emp-exp")
    experience = experience_tag.get_text(strip=True) if experience_tag else ""

    skills = _labelled_paragraph(job, "Key Skills")
    summary = _labelled_paragraph(job, "Summary")

    # Alternative fallback for skills block
    if not skills:
        skills_block = job.find("div", class_="job-skills")
        skills = skills_block.get_text(strip=True) if skills_block else ""

    return {
        "Title": title,
        "Company": company,
        "Location": location,
        "Experience": experience,
        "Summary": summary,
        "Skills": skills
    }


def scrape_karkidi_jobs(keyword="data science", pages=1):
    """Scrape job listings from karkidi.com and save them to data/jobs_data.csv.

    Args:
        keyword: Search phrase; it is percent-encoded before being placed in the URL.
        pages: Number of result pages to request, starting at page 1.

    Returns:
        pandas.DataFrame with the columns Title, Company, Location, Experience,
        Summary and Skills. The columns are present even when no job was found,
        so the CSV always has a header row and can be read back with pd.read_csv.

    A page that cannot be fetched (network error, timeout, HTTP error status) and
    a job block that cannot be parsed are reported with print() and skipped.
    """
    from urllib.parse import quote  # local import: keeps this cell self-contained

    headers = {'User-Agent': 'Mozilla/5.0'}
    base_url = "https://www.karkidi.com/Find-Jobs/{page}/all/India?search={query}"
    # quote() encodes spaces as %20 like the previous replace(), plus &, #, ?, / etc.
    query = quote(keyword, safe="")
    jobs_list = []

    for page in range(1, pages + 1):
        url = base_url.format(page=page, query=query)
        print(f"Scraping page: {page} - {url}")
        try:
            # Without a timeout a stalled connection would block the notebook forever.
            response = requests.get(url, headers=headers, timeout=30)
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"Error fetching page {page}: {e}")
        else:
            soup = BeautifulSoup(response.content, "html.parser")
            for job in soup.find_all("div", class_="ads-details"):
                try:
                    jobs_list.append(_parse_job_block(job))
                except Exception as e:
                    print(f"Error parsing job block: {e}")

        time.sleep(1)  # Be nice to the server

    df = pd.DataFrame(jobs_list, columns=list(_JOB_FIELDS))
    os.makedirs("data", exist_ok=True)
    df.to_csv("data/jobs_data.csv", index=False)
    print(f" Saved {len(df)} jobs to data/jobs_data.csv")
    return df

if __name__ == "__main__":
    # Demo run: scrape two result pages for "data science" and show the first rows.
    df_jobs = scrape_karkidi_jobs(pages=2, keyword="data science")
    preview = df_jobs.head()
    print(preview)

Scraping page: 1 - https://www.karkidi.com/Find-Jobs/1/all/India?search=data%20science
Scraping page: 2 - https://www.karkidi.com/Find-Jobs/2/all/India?search=data%20science
 Saved 20 jobs to data/jobs_data.csv
                                               Title         Company  \
0          Machine Learning Physical Design Engineer          Google   
1  Staff Software Engineer - Monetization, Poe (R...     Quora, Inc.   
2  Staff Backend Engineer - Bot Creator Ecosystem...     Quora, Inc.   
3  Senior Backend Engineer - Bot Creator Ecosyste...     Quora, Inc.   
4                         Data Scientist Lead - AIML  JPMorgan Chase   

                      Location Experience  \
0  Bengaluru, Karnataka, India   4-6 year   
1                        India  8-10 year   
2                        India  8-10 year   
3                        India   6-8 year   
4  Bengaluru, Karnataka, India   6-8 year   

                                             Summary  \
0  Minimum qualifications:Bac

In [4]:
import os
import re

import pandas as pd
from joblib import dump
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer

# Read the job postings CSV produced by the scraping step
jobs_csv_path = "data/jobs_data.csv"
df = pd.read_csv(jobs_csv_path)

# Step 1: Preprocess Skills
def clean_skills(skills):
    """Normalise a comma-separated skills string into space-separated text.

    Missing values (as judged by pd.isna) become "". Otherwise the text is
    lower-cased, every character other than ASCII letters, digits, commas and
    spaces is deleted (not replaced by a space), the text is split on commas,
    and the non-blank, stripped pieces are joined with single spaces so the
    result can be passed to a text vectorizer.
    """
    if pd.isna(skills):
        return ""
    lowered = skills.lower()
    kept = re.sub(r"[^a-zA-Z0-9, ]", "", lowered)  # Remove special characters
    pieces = map(str.strip, kept.split(","))
    return " ".join(filter(None, pieces))  # Return as string for vectorization

df["Cleaned_Skills"] = df["Skills"].apply(clean_skills)

# Step 2: Vectorize Skills using TF-IDF
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df["Cleaned_Skills"])

# Step 3: KMeans Clustering
# You can experiment with different numbers. The requested count (5) is capped
# at the number of postings because KMeans raises a ValueError when n_clusters
# exceeds the number of samples, which happens on a small scrape.
n_clusters = min(5, X.shape[0])
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
df["Cluster"] = kmeans.fit_predict(X)

# Step 4: Save model and data
os.makedirs("models", exist_ok=True)  # Create the models directory if it doesn't exist
dump(kmeans, "models/kmeans_model.joblib")
dump(vectorizer, "models/tfidf_vectorizer.joblib")
df.to_csv("data/clustered_jobs.csv", index=False)

In [5]:
from joblib import load
import pandas as pd
import re

# Restore the KMeans model and the TF-IDF vectorizer that were dumped to models/
kmeans, vectorizer = (
    load("models/kmeans_model.joblib"),
    load("models/tfidf_vectorizer.joblib"),
)

# Preprocessing function (same as before)
def clean_skills(skills):
    """Normalise a comma-separated skills string into space-separated text.

    Missing values (as judged by pd.isna) become "". Otherwise the text is
    lower-cased, every character other than ASCII letters, digits, commas and
    spaces is deleted (not replaced by a space), the text is split on commas,
    and the non-blank, stripped pieces are joined with single spaces.
    """
    if pd.isna(skills):
        return ""
    lowered = skills.lower()
    kept = re.sub(r"[^a-zA-Z0-9, ]", "", lowered)
    pieces = map(str.strip, kept.split(","))
    return " ".join(filter(None, pieces))

def classify_new_jobs(df_new_jobs):
    """Assign each job posting to a cluster using the loaded vectorizer and KMeans model.

    Args:
        df_new_jobs: DataFrame with a "Skills" column. It is not modified; the
            work is done on a copy so re-running the cell starts from the same input.

    Returns:
        A copy of `df_new_jobs` with two added columns: "Cleaned_Skills" (output
        of clean_skills) and "Predicted_Cluster" (output of kmeans.predict).
        An empty input returns an empty frame that still has both columns.

    Uses the module-level `vectorizer`, `kmeans` and `clean_skills`.
    """
    df_classified = df_new_jobs.copy()

    if df_classified.empty:
        # Nothing to classify; skip the model calls, which are not expected to
        # accept zero samples, and keep the output schema stable for callers.
        df_classified["Cleaned_Skills"] = pd.Series(dtype="object")
        df_classified["Predicted_Cluster"] = pd.Series(dtype="int64")
        return df_classified

    df_classified["Cleaned_Skills"] = df_classified["Skills"].apply(clean_skills)
    X_new = vectorizer.transform(df_classified["Cleaned_Skills"])
    df_classified["Predicted_Cluster"] = kmeans.predict(X_new)
    return df_classified


In [8]:
def notify_user(df_new_jobs, preferred_cluster):
    """Print the postings whose "Predicted_Cluster" equals `preferred_cluster`.

    Prints a header with the match count followed by one "- Title at Company"
    line per match, or a single "no new jobs" line when nothing matches.
    Returns None.
    """
    in_cluster = df_new_jobs["Predicted_Cluster"] == preferred_cluster
    hits = df_new_jobs[in_cluster]

    if hits.empty:
        print(f"No new jobs found in Cluster {preferred_cluster} today.")
        return

    print(f" Found {len(hits)} new job(s) in your preferred category (Cluster {preferred_cluster}):\n")
    for _, posting in hits.iterrows():
        print(f"- {posting['Title']} at {posting['Company']}")


In [7]:
# Stand-in for a freshly scraped batch: the scraper's CSV output is read back in
df_new_jobs = pd.read_csv("data/jobs_data.csv")

# Assign a cluster to every posting with the trained model ...
df_new_classified = classify_new_jobs(df_new_jobs=df_new_jobs)

# ... then report the postings that land in the user's preferred cluster
notify_user(df_new_jobs=df_new_classified, preferred_cluster=1)


 Found 6 new job(s) in your preferred category (Cluster 1):

- Staff Software Engineer - Monetization, Poe (Remote) at Quora, Inc.
- Staff Backend Engineer - Bot Creator Ecosystem, Poe (Remote) at Quora, Inc.
- Senior Backend Engineer - Bot Creator Ecosystem, Poe (Remote) at Quora, Inc.
- Staff Software Engineer - Monetization, Poe (Remote) at Quora, Inc.
- Staff Backend Engineer - Bot Creator Ecosystem, Poe (Remote) at Quora, Inc.
- Senior Backend Engineer - Bot Creator Ecosystem, Poe (Remote) at Quora, Inc.


In [9]:
pip install streamlit


Collecting streamlit
  Downloading streamlit-1.45.1-py3-none-any.whl.metadata (8.9 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.45.1-py3-none-any.whl (9.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m54.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m93.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl (79 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25hInst

In [10]:
# 🛠 Create the app.py file inside Colab
%%writefile app.py
import streamlit as st
import pandas as pd
from joblib import load
import re
import requests
from bs4 import BeautifulSoup
import time

@st.cache_resource
def _load_models():
    """Load the saved KMeans model and TF-IDF vectorizer from the models/ directory."""
    return load("models/kmeans_model.joblib"), load("models/tfidf_vectorizer.joblib")


# Load models once: Streamlit re-executes this script on every interaction, so
# the loader is cached with st.cache_resource instead of re-reading the files each rerun.
kmeans, vectorizer = _load_models()

def clean_skills(skills):
    """Normalise a comma-separated skills string into space-separated text.

    Missing values (as judged by pd.isna) become "". Otherwise the text is
    lower-cased, every character other than ASCII letters, digits, commas and
    spaces is deleted (not replaced by a space), the text is split on commas,
    and the non-blank, stripped pieces are joined with single spaces.
    """
    if pd.isna(skills):
        return ""
    lowered = skills.lower()
    kept = re.sub(r"[^a-zA-Z0-9, ]", "", lowered)
    pieces = map(str.strip, kept.split(","))
    return " ".join(filter(None, pieces))

# Use your existing scraper logic (simplified here)
def scrape_karkidi_jobs(keyword="data science", pages=1):
    """Scrape job listings from karkidi.com into a DataFrame.

    Args:
        keyword: Search phrase; it is percent-encoded before being placed in the URL.
        pages: Number of result pages to request, starting at page 1.

    Returns:
        pandas.DataFrame with the columns Title, Company and Skills. The columns
        are present even when no job was found, so callers can index "Skills"
        on an empty result.

    A page that cannot be fetched is reported with st.warning and skipped; a job
    block that cannot be parsed is skipped silently (best effort).
    """
    from urllib.parse import quote  # local import: only this function needs it

    headers = {'User-Agent': 'Mozilla/5.0'}
    base_url = "https://www.karkidi.com/Find-Jobs/{page}/all/India?search={query}"
    # quote() encodes spaces as %20 like the previous replace(), plus &, #, ?, / etc.
    query = quote(keyword, safe="")
    jobs_list = []

    for page in range(1, pages + 1):
        url = base_url.format(page=page, query=query)
        try:
            # Without a timeout a stalled connection would hang the app's spinner forever.
            response = requests.get(url, headers=headers, timeout=30)
            response.raise_for_status()
        except requests.RequestException as e:
            st.warning(f"Could not fetch page {page}: {e}")
            time.sleep(1)
            continue

        soup = BeautifulSoup(response.content, "html.parser")
        job_blocks = soup.find_all("div", class_="ads-details")

        for job in job_blocks:
            try:
                heading = job.find("h4") or job.find("h2")
                title = heading.get_text(strip=True) if heading else ""
                company_tag = job.find("a", href=lambda x: x and "Employer-Profile" in x)
                company = company_tag.get_text(strip=True) if company_tag else ""
                skills = ""
                key_skills_tag = job.find("span", string="Key Skills")
                if key_skills_tag:
                    skills = key_skills_tag.find_next("p").get_text(strip=True)
                if not skills:
                    skills_block = job.find("div", class_="job-skills")
                    skills = skills_block.get_text(strip=True) if skills_block else ""

                jobs_list.append({
                    "Title": title,
                    "Company": company,
                    "Skills": skills
                })
            except Exception:
                # Was a bare `except:`, which also swallowed KeyboardInterrupt/SystemExit.
                continue
        time.sleep(1)

    return pd.DataFrame(jobs_list, columns=["Title", "Company", "Skills"])

def classify_new_jobs(df_new_jobs):
    """Assign each job posting to a cluster using the loaded vectorizer and KMeans model.

    Args:
        df_new_jobs: DataFrame with a "Skills" column. It is not modified; the
            work is done on a copy.

    Returns:
        A copy of `df_new_jobs` with two added columns: "Cleaned_Skills" (output
        of clean_skills) and "Predicted_Cluster" (output of kmeans.predict).
        An empty input (e.g. a search that found no jobs) returns an empty frame
        that still has both columns instead of raising.

    Uses the module-level `vectorizer`, `kmeans` and `clean_skills`.
    """
    df_classified = df_new_jobs.copy()

    if df_classified.empty:
        # Nothing to classify; skip the model calls, which are not expected to
        # accept zero samples, and keep the output schema stable for callers.
        df_classified["Cleaned_Skills"] = pd.Series(dtype="object")
        df_classified["Predicted_Cluster"] = pd.Series(dtype="int64")
        return df_classified

    df_classified["Cleaned_Skills"] = df_classified["Skills"].apply(clean_skills)
    X_new = vectorizer.transform(df_classified["Cleaned_Skills"])
    df_classified["Predicted_Cluster"] = kmeans.predict(X_new)
    return df_classified

def notify_user(df_new_jobs, preferred_cluster):
    """Return the "Title" and "Company" columns of postings in `preferred_cluster`.

    Rows are selected where "Predicted_Cluster" equals `preferred_cluster`.
    When no row matches, a brand-new empty DataFrame (no columns) is returned.
    """
    cluster_mask = df_new_jobs["Predicted_Cluster"] == preferred_cluster
    hits = df_new_jobs[cluster_mask]
    if hits.empty:
        return pd.DataFrame()
    return hits[["Title", "Company"]]

# --- Streamlit UI ---
st.title("Job Posting Classifier and Notifier")

keyword = st.text_input("Enter skill keyword(s) to search jobs:", "data science")
pages = st.slider("Number of pages to scrape:", 1, 5, 1)

# Streamlit reruns this script on every widget interaction, and st.button is
# only True during the rerun triggered by the click itself. The classified
# results are therefore stored in st.session_state so they are still available
# on the rerun caused by picking a cluster in the selectbox below; with the
# selectbox nested under the button, changing the selection cleared the page.
if st.button("Scrape and Classify Jobs"):
    with st.spinner("Scraping jobs..."):
        df_jobs = scrape_karkidi_jobs(keyword, pages)

    if df_jobs.empty:
        # Drop stale results from a previous search and skip classification,
        # which cannot run on an empty frame.
        if "df_classified" in st.session_state:
            del st.session_state["df_classified"]
        st.warning("No jobs found for this search.")
    else:
        with st.spinner("Classifying jobs..."):
            st.session_state["df_classified"] = classify_new_jobs(df_jobs)

if "df_classified" in st.session_state:
    df_classified = st.session_state["df_classified"]

    st.success(f"Found {len(df_classified)} jobs and classified into clusters.")

    # Sorted so the cluster ids appear in a stable order between reruns.
    cluster_options = sorted(df_classified["Predicted_Cluster"].unique().tolist())
    preferred_cluster = st.selectbox("Select your preferred cluster:", cluster_options)

    matched_jobs = notify_user(df_classified, preferred_cluster)

    if not matched_jobs.empty:
        st.markdown(f"### 🔔 Jobs in Cluster {preferred_cluster} matching your interest:")
        for idx, row in matched_jobs.iterrows():
            st.write(f"**{row['Title']}** at *{row['Company']}*")
    else:
        st.write(f"No new jobs found in Cluster {preferred_cluster}.")


Writing app.py
