In [4]:
import requests
from bs4 import BeautifulSoup # Corrected import statement
import pandas as pd
import time

def _parse_job_block(job):
    """Extract one job record (a dict of strings) from an ``ads-details`` block.

    Raises:
        AttributeError: if a required tag (title, company, location or
            experience) is missing, because ``find`` then returns ``None``.
    """
    title = job.find("h4").get_text(strip=True)
    company = job.find(
        "a", href=lambda x: x and "Employer-Profile" in x
    ).get_text(strip=True)
    location = job.find("p").get_text(strip=True)
    experience = job.find("p", class_="emp-exp").get_text(strip=True)

    # "Key Skills" and "Summary" are optional; default to an empty string.
    key_skills_tag = job.find("span", string="Key Skills")
    skills = key_skills_tag.find_next("p").get_text(strip=True) if key_skills_tag else ""
    summary_tag = job.find("span", string="Summary")
    summary = summary_tag.find_next("p").get_text(strip=True) if summary_tag else ""

    return {
        "Title": title,
        "Company": company,
        "Location": location,
        "Experience": experience,
        "Summary": summary,
        "Skills": skills,
    }


def scrape_karkidi_jobs(keyword="data science", pages=1):
    """Scrape job listings from karkidi.com into a DataFrame.

    Args:
        keyword: Search phrase; it is percent-encoded before being placed
            in the request URL.
        pages: Number of result pages to fetch, starting at page 1.

    Returns:
        A pandas DataFrame with the columns Title, Company, Location,
        Experience, Summary and Skills, one row per successfully parsed job
        block. The columns are present even when no job was scraped, so
        callers can always access e.g. ``df['Skills']``.

    Pages that cannot be fetched and job blocks that cannot be parsed are
    reported on stdout and skipped.
    """
    # Local import keeps this block self-contained.
    from urllib.parse import quote

    columns = ["Title", "Company", "Location", "Experience", "Summary", "Skills"]
    headers = {'User-Agent': 'Mozilla/5.0'}
    base_url = "https://www.karkidi.com/Find-Jobs/{page}/all/India?search={query}"
    # Encode every reserved character, not only spaces.
    query = quote(keyword, safe="")
    jobs_list = []

    for page in range(1, pages + 1):
        url = base_url.format(page=page, query=query)
        print(f"Scraping page: {page}")
        try:
            # A timeout prevents one stalled connection from hanging the run.
            response = requests.get(url, headers=headers, timeout=30)
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"Error fetching page {page}: {e}")
        else:
            soup = BeautifulSoup(response.content, "html.parser")
            for job in soup.find_all("div", class_="ads-details"):
                try:
                    jobs_list.append(_parse_job_block(job))
                except AttributeError as e:
                    print(f"Error parsing job block: {e}")

        time.sleep(1)  # Be nice to the server

    return pd.DataFrame(jobs_list, columns=columns)

# Example use: runs only when this module is executed directly. Scrapes two
# result pages for "data science" and prints the first rows of the result.
if __name__ == "__main__":
    df_jobs = scrape_karkidi_jobs(keyword="data science", pages=2)
    print(df_jobs.head())

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import AgglomerativeClustering
from sklearn.neighbors import NearestCentroid
import pandas as pd
import joblib

# Define the tokenizer globally
def split_skills_tokenizer(x):
    """Split a comma-separated skills string into individual skill tokens.

    Whitespace around each skill is removed so that "python, sql" and
    "python,sql" produce the same tokens, and empty pieces (from an empty
    string or stray commas) are dropped.

    Args:
        x: Skills string, e.g. ``"python, sql"``.

    Returns:
        A list of non-empty, stripped skill strings.
    """
    stripped = (piece.strip() for piece in x.split(','))
    return [skill for skill in stripped if skill]

def preprocess_skills(df):
    """Return a copy of ``df`` whose 'Skills' column is normalized.

    Missing values become empty strings, and every entry is lower-cased and
    stripped of surrounding whitespace. The input frame is not modified.
    """
    cleaned = df.copy()
    skills = cleaned['Skills'].fillna("")
    skills = skills.str.lower()
    cleaned['Skills'] = skills.str.strip()
    return cleaned

def vectorize_skills(df):
    """Fit a TF-IDF vectorizer on the 'Skills' column of ``df``.

    Tokens are produced by ``split_skills_tokenizer`` rather than by the
    vectorizer's default word pattern.

    Returns:
        A tuple ``(matrix, vectorizer)``: the TF-IDF matrix returned by
        ``fit_transform`` and the fitted vectorizer.
    """
    tfidf_options = {"tokenizer": split_skills_tokenizer, "lowercase": True}
    tfidf = TfidfVectorizer(**tfidf_options)
    skill_matrix = tfidf.fit_transform(df['Skills'])
    return skill_matrix, tfidf

def cluster_skills(X, n_clusters=5):
    """Group the rows of ``X`` with agglomerative clustering.

    Args:
        X: Feature matrix exposing ``toarray()``; it is densified before
            being handed to the clusterer.
        n_clusters: Number of clusters to form.

    Returns:
        A tuple ``(model, labels)``: the fitted clusterer and the cluster
        label of each row.
    """
    dense_features = X.toarray()  # the clusterer is given a dense array
    clusterer = AgglomerativeClustering(n_clusters=n_clusters)
    assignments = clusterer.fit_predict(dense_features)
    return clusterer, assignments

def train_centroid_classifier(X, labels):
    """Fit a ``NearestCentroid`` classifier on ``X`` with the given labels.

    ``X`` is densified with ``toarray()`` before fitting. The fitted
    classifier is returned so new rows can later be assigned a label.
    """
    dense_features = X.toarray()
    return NearestCentroid().fit(dense_features, labels)

def classify_new_jobs(df_new, vectorizer, clf):
    """Assign a cluster label to each job in ``df_new``.

    Args:
        df_new: DataFrame of jobs with a 'Skills' column.
        vectorizer: Fitted vectorizer used to transform the skills text.
        clf: Fitted classifier whose ``predict`` returns cluster labels.

    Returns:
        A copy of ``df_new`` with normalized 'Skills' and a new 'Cluster'
        column. A zero-row input yields a zero-row copy with an empty
        integer 'Cluster' column.
    """
    if len(df_new) == 0:
        # Nothing to classify (e.g. the scrape found no jobs). Skip
        # transform/predict, which are not meant for zero-sample input, and
        # still hand back the 'Cluster' column callers filter on.
        empty_result = df_new.copy()
        empty_result['Cluster'] = pd.Series(dtype=int)
        return empty_result

    df_new = preprocess_skills(df_new)
    X_new = vectorizer.transform(df_new['Skills'])
    df_new['Cluster'] = clf.predict(X_new.toarray())
    return df_new

def notify_user(df_classified, user_cluster_id):
    """Report the jobs whose cluster matches the user's cluster of interest.

    Args:
        df_classified: DataFrame with a 'Cluster' column plus the 'Title',
            'Company' and 'Skills' columns that are shown to the user.
        user_cluster_id: Cluster label the user is interested in.

    Prints a message and, when there are matches, shows the matching rows.
    Returns nothing.
    """
    matched = df_classified[df_classified['Cluster'] == user_cluster_id]
    if matched.empty:
        print(" No new matching jobs today.")
        return

    # `display` is only injected by IPython/Jupyter; when this code runs as a
    # plain script the name does not exist, so fall back to `print`.
    try:
        show = display
    except NameError:
        show = print

    print(" New job(s) matching your interest:")
    show(matched[['Title', 'Company', 'Skills']])

# === PIPELINE ===
# End-to-end run: scrape jobs, cluster them by skills, train a classifier on
# the resulting cluster labels, then classify a fresh batch and report matches.
# These statements execute at module top level and perform network requests.

# Step 1: Scrape initial jobs (five result pages; used as the training set)
df_jobs = scrape_karkidi_jobs("data science", pages=5)

# Step 2: Preprocess (normalize the 'Skills' column)
df_jobs = preprocess_skills(df_jobs)

# Step 3: Vectorize the skills with TF-IDF and cluster the jobs into 5 groups
X, vectorizer = vectorize_skills(df_jobs)
model, labels = cluster_skills(X, n_clusters=5)
df_jobs['Cluster'] = labels

# Step 4: Train a centroid classifier for future prediction
# (fitted on the same features, with the cluster labels as targets)
clf = train_centroid_classifier(X, labels)

# Step 5: Simulate new jobs by scraping the first result page again
# (this reuses the Step 1 query, so the batch may overlap the training set)
df_new_jobs = scrape_karkidi_jobs("data science", pages=1)

# Step 6: Predict new job clusters using the fitted vectorizer and classifier
df_classified = classify_new_jobs(df_new_jobs, vectorizer, clf)

# Step 7: Notify user (assume interest in cluster 2)
# NOTE(review): cluster ids come from the clustering fit in Step 3, so "2"
# presumably has no fixed meaning across runs; confirm how it should be chosen.
notify_user(df_classified, user_cluster_id=2)

Scraping page: 1
Scraping page: 2
                                               Title         Company  \
0          Machine Learning Physical Design Engineer          Google   
1  Staff Software Engineer - Monetization, Poe (R...     Quora, Inc.   
2  Staff Backend Engineer - Bot Creator Ecosystem...     Quora, Inc.   
3  Senior Backend Engineer - Bot Creator Ecosyste...     Quora, Inc.   
4                         Data Scientist Lead - AIML  JPMorgan Chase   

                      Location Experience  \
0  Bengaluru, Karnataka, India   4-6 year   
1                        India  8-10 year   
2                        India  8-10 year   
3                        India   6-8 year   
4  Bengaluru, Karnataka, India   6-8 year   

                                             Summary  \
0  Minimum qualifications:Bachelor's degree in El...   
1  About Quora:Quora’s mission is to grow and sha...   
2  About Quora:Quora’s mission is to grow and sha...   
3  About Quora:Quora’s mission is to g



Scraping page: 1
 New job(s) matching your interest:


Unnamed: 0,Title,Company,Skills
4,Data Scientist Lead - AIML,JPMorgan Chase,"aartificial intelligence,data science techniqu..."
7,Manager - Machine Learning,Observe.AI,"aartificial intelligence,large language models..."
