In [33]:
import pickle
import re
import time
from urllib.parse import quote

import pandas as pd
import requests
from bs4 import BeautifulSoup

# NOTE(review): later cells use TfidfVectorizer, KMeans, silhouette_score and
# joblib, none of which are imported in any visible cell — add those imports
# here so the notebook survives Restart & Run All.


# 1. Scrape Function
def _section_text(job, label):
    """Return the text of the first <p> following the <span> labelled `label`, or ""."""
    tag = job.find("span", string=label)
    return tag.find_next("p").get_text(strip=True) if tag else ""


def _parse_job_block(job, keyword):
    """Convert one ``div.ads-details`` element into a row dict.

    Raises AttributeError when a required tag (title, company, location or
    experience) is missing, because ``find`` then returns None.
    """
    title = job.find("h4").get_text(strip=True)
    company = job.find("a", href=lambda x: x and "Employer-Profile" in x).get_text(strip=True)
    location = job.find("p").get_text(strip=True)
    experience = job.find("p", class_="emp-exp").get_text(strip=True)
    return {
        "Keyword": keyword,
        "Title": title,
        "Company": company,
        "Location": location,
        "Experience": experience,
        "Summary": _section_text(job, "Summary"),
        "Skills": _section_text(job, "Key Skills"),
    }


def scrape_karkidi_jobs(keywords=("data scientist",), pages=2, timeout=10):
    """Scrape job listings from karkidi.com search result pages.

    For every keyword, result pages 1..``pages`` are fetched and each
    ``div.ads-details`` block is parsed into one row. Blocks that fail to
    parse are reported and skipped.

    Args:
        keywords: Iterable of search phrases. A single string is treated as
            one phrase instead of being iterated character by character.
        pages: Number of result pages to fetch per keyword.
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        pandas.DataFrame with columns Keyword, Title, Company, Location,
        Experience, Summary and Skills (an empty frame without columns when
        nothing was parsed).

    Raises:
        requests.RequestException: On connection failures or timeouts.
    """
    if isinstance(keywords, str):
        keywords = [keywords]

    headers = {'User-Agent': 'Mozilla/5.0'}
    base_url = "https://www.karkidi.com/Find-Jobs/{page}/all/India?search={query}"
    jobs_list = []

    for keyword in keywords:
        # Percent-encode everything (not only spaces) so keywords such as
        # "c++" or "R&D" cannot corrupt the query string.
        query = quote(keyword, safe="")
        for page in range(1, pages + 1):
            url = base_url.format(page=page, query=query)
            print(f"Scraping page {page} for '{keyword}'...")
            # Without a timeout a stalled connection would block forever.
            response = requests.get(url, headers=headers, timeout=timeout)

            if response.ok:
                soup = BeautifulSoup(response.content, "html.parser")
                for job in soup.find_all("div", class_="ads-details"):
                    try:
                        jobs_list.append(_parse_job_block(job, keyword))
                    except Exception as e:
                        # Best effort: one malformed listing must not abort the scrape.
                        print(f"Error parsing job block: {e}")
            else:
                # An error page contains no listings; say so instead of parsing it silently.
                print(f"Skipping page {page} for '{keyword}': HTTP {response.status_code}")

            time.sleep(1)  # Be polite

    return pd.DataFrame(jobs_list)

# Run the scraper with its default keyword and page count; the resulting
# DataFrame is bound to the module-level name `df_jobs` used by later cells.
if __name__ == "__main__":
    df_jobs = scrape_karkidi_jobs()


Scraping page 1 for 'data scientist'...
Scraping page 2 for 'data scientist'...


In [35]:
# Show the scraped jobs table as rich notebook output.
display(df_jobs)

Unnamed: 0,Keyword,Title,Company,Location,Experience,Summary,Skills
0,data scientist,"Principal Product Manager - Growth, Poe (Remote)","Quora, Inc.",India,6-8 year,About Quora:Quora’s mission is to grow and sha...,"Aartificial intelligence,Data Analytics,Data s..."
1,data scientist,Machine Learning Physical Design Engineer,Google,"Bengaluru, Karnataka, India",4-6 year,Minimum qualifications:Bachelor's degree in El...,"Aartificial intelligence,Algorithms,Data struc..."
2,data scientist,"Staff Software Engineer - Monetization, Poe (R...","Quora, Inc.",India,8-10 year,About Quora:Quora’s mission is to grow and sha...,"Aartificial intelligence,Analytical and Proble..."
3,data scientist,Staff Backend Engineer - Bot Creator Ecosystem...,"Quora, Inc.",India,8-10 year,About Quora:Quora’s mission is to grow and sha...,"Aartificial intelligence,API,Data science tech..."
4,data scientist,Senior Backend Engineer - Bot Creator Ecosyste...,"Quora, Inc.",India,6-8 year,About Quora:Quora’s mission is to grow and sha...,"Aartificial intelligence,API,Data science tech..."
5,data scientist,Data Scientist Lead - AIML,JPMorgan Chase,"Bengaluru, Karnataka, India",6-8 year,We have an opportunity to impact your career a...,"Aartificial intelligence,Data science techniqu..."
6,data scientist,Applied AI ML Director - Machine Learning,JPMorgan Chase,"Hyderabad, Telangana, India",12-14 year,Elevate your career as the Director of Machine...,"Aartificial intelligence,AWS,Azure,Google Clou..."
7,data scientist,Senior Product Designer,Observe.AI,"Bangalore, Karnataka, India",8-10 year,Observe.AI is transforming customer service wi...,"Design,Leadership Skill,Machine learning techn..."
8,data scientist,Manager - Machine Learning,Observe.AI,"Bangalore, Karnataka, India",8-10 year,Observe.AI is transforming customer service wi...,"Aartificial intelligence,Large Language Models..."
9,data scientist,Data Scientist,Spotify,"Mumbai, Maharashtra, India",6-8 year,We are looking for a Data Scientist to join th...,"Amazon RedShift,Apache Hadoop,Data science tec..."


In [40]:
# 2. Clean and Vectorize Skills
# Define the tokenizer as a module-level function (not a lambda or nested function) so the fitted vectorizer stays picklable
def custom_tokenizer(text):
    """Split a comma-separated string into whitespace-trimmed, non-empty tokens."""
    tokens = []
    for piece in text.split(','):
        cleaned = piece.strip()
        if cleaned:
            tokens.append(cleaned)
    return tokens

def preprocess_and_vectorize_skills(df):
    """Clean the ``Skills`` column and fit a TF-IDF vectorizer on it.

    Skills are lower-cased and every character other than letters, commas
    and spaces is removed; each comma-separated entry then becomes one token
    via ``custom_tokenizer``. The input DataFrame is not modified.

    Args:
        df: DataFrame with a ``Skills`` column of comma-separated strings
            (missing values are treated as empty strings).

    Returns:
        Tuple ``(X, tfidf)``: the TF-IDF matrix returned by ``fit_transform``
        and the fitted vectorizer.
    """
    # Work on the one column that is needed instead of copying the whole frame.
    # NOTE(review): the pattern also strips digits and symbols, so "c++" and
    # "c#" both collapse to "c" — confirm this is intended.
    skills = df["Skills"].fillna("").str.lower().str.replace(r'[^a-zA-Z, ]', '', regex=True)
    # token_pattern=None: the default pattern is ignored once a tokenizer is
    # supplied, and leaving it set makes scikit-learn emit a UserWarning.
    tfidf = TfidfVectorizer(tokenizer=custom_tokenizer, lowercase=True, token_pattern=None)
    X = tfidf.fit_transform(skills)
    return X, tfidf



In [41]:
# 3. Determine Optimal Clusters using Silhouette Score
def find_optimal_k(X, k_range=range(2, 10)):
    """Pick the number of KMeans clusters with the highest silhouette score.

    Args:
        X: Feature matrix (dense or sparse) with one row per sample.
        k_range: Iterable of candidate cluster counts.

    Returns:
        The candidate k with the best silhouette score; ties keep the
        earliest candidate.

    Raises:
        ValueError: If no candidate can be scored, i.e. none satisfies
            ``2 <= k <= n_samples - 1`` or every fit collapsed to one cluster.
    """
    # len() is ambiguous for scipy sparse matrices, so prefer .shape.
    n_samples = X.shape[0] if hasattr(X, "shape") else len(X)
    k_values = list(k_range)
    # silhouette_score is only defined for 2 <= n_labels <= n_samples - 1;
    # filtering here avoids an opaque failure deep inside scikit-learn.
    candidates = [k for k in k_values if 2 <= k <= n_samples - 1]
    if not candidates:
        raise ValueError(
            f"No usable k in {k_values} for {n_samples} sample(s): "
            "silhouette scoring needs 2 <= k <= n_samples - 1."
        )

    best_score = -1
    best_k = None
    for k in candidates:
        kmeans = KMeans(n_clusters=k, random_state=42, n_init='auto')
        labels = kmeans.fit_predict(X)
        # Duplicate rows can make KMeans return fewer distinct labels than k.
        if len(set(labels.tolist())) < 2:
            print(f"Skipping k={k}: clustering produced a single label")
            continue
        score = silhouette_score(X, labels)
        print(f"Silhouette Score for k={k}: {score:.4f}")
        if best_k is None or score > best_score:
            best_score = score
            best_k = k

    if best_k is None:
        raise ValueError("No candidate k produced at least two clusters.")

    print(f"\nBest k based on silhouette score: {best_k}")
    return best_k



In [42]:
# 4. Cluster and Save Models
def cluster_and_save(X, df, tfidf, k,
                     model_path="kmeans_model.joblib",
                     vectorizer_path="tfidf_vectorizer.joblib",
                     csv_path="clustered_jobs.csv"):
    """Fit KMeans with ``k`` clusters, label the jobs and persist the artefacts.

    Args:
        X: Feature matrix with one row per row of ``df``.
        df: Jobs DataFrame. A ``Cluster`` column is added to it in place.
        tfidf: Fitted vectorizer to save next to the model.
        k: Number of clusters.
        model_path: Destination for the fitted KMeans model.
        vectorizer_path: Destination for the vectorizer.
        csv_path: Destination for the labelled jobs CSV (written without index).

    Returns:
        The same ``df`` object, now carrying the ``Cluster`` column.

    Raises:
        ValueError: If ``X`` and ``df`` have a different number of rows.
    """
    # Fail before fitting: a row mismatch would otherwise only surface when
    # the labels are assigned, after the model has already been trained.
    n_rows = X.shape[0] if hasattr(X, "shape") else len(X)
    if n_rows != len(df):
        raise ValueError(
            f"X has {n_rows} row(s) but df has {len(df)}; they must describe the same jobs."
        )

    kmeans = KMeans(n_clusters=k, random_state=42, n_init='auto')
    df["Cluster"] = kmeans.fit_predict(X)

    # Save the model and vectorizer
    joblib.dump(kmeans, model_path)
    joblib.dump(tfidf, vectorizer_path)
    df.to_csv(csv_path, index=False)

    print(f"✅ Model and vectorizer saved. Clustered data exported to '{csv_path}'")
    return df



In [43]:
# 5. Full Pipeline
def main(df=None):
    """Run the full pipeline: vectorize skills, choose k, cluster and save.

    Args:
        df: Jobs DataFrame with ``Title`` and ``Skills`` columns. When omitted,
            the module-level ``df_jobs`` is used.
    """
    if df is None:
        df = df_jobs

    # An empty scrape has no "Skills" column; stop with a clear message
    # instead of failing with a KeyError further down.
    if df.empty:
        print("No jobs to cluster: the input DataFrame is empty.")
        return

    # Preprocess & vectorize
    X, tfidf = preprocess_and_vectorize_skills(df)

    # Find best number of clusters
    best_k = find_optimal_k(X)

    # Cluster and save
    clustered_df = cluster_and_save(X, df, tfidf, best_k)

    print(clustered_df[["Title", "Skills", "Cluster"]].head())

# Run the clustering pipeline when this code executes as the main module.
if __name__ == "__main__":
    main()

Silhouette Score for k=2: 0.2069
Silhouette Score for k=3: 0.2892
Silhouette Score for k=4: 0.4250
Silhouette Score for k=5: 0.5557
Silhouette Score for k=6: 0.6802
Silhouette Score for k=7: 0.8033
Silhouette Score for k=8: 0.9343
Silhouette Score for k=9: 1.0000

Best k based on silhouette score: 9
✅ Model and vectorizer saved. Clustered data exported to 'clustered_jobs.csv'
                                               Title  \
0   Principal Product Manager - Growth, Poe (Remote)   
1          Machine Learning Physical Design Engineer   
2  Staff Software Engineer - Monetization, Poe (R...   
3  Staff Backend Engineer - Bot Creator Ecosystem...   
4  Senior Backend Engineer - Bot Creator Ecosyste...   

                                              Skills  Cluster  
0  Aartificial intelligence,Data Analytics,Data s...        5  
1  Aartificial intelligence,Algorithms,Data struc...        4  
2  Aartificial intelligence,Analytical and Proble...        7  
3  Aartificial intelligence,