loading the dataset

In [None]:
import pandas as pd

file_path = 'scraped_data (1).csv'
df = pd.read_csv(file_path)
pd.set_option('display.max_colwidth', None)
# Display basic info
print("First 5 rows:")
print(df.head())
print("\nDataset Info:")
print(df.info())
print("\nMissing Values:")
print(df.isnull().sum())

basic data cleaning

In [None]:
# Remove duplicates based on all columns
df = df.drop_duplicates()
print("Shape after removing duplicates:", df.shape)

# Check for duplicates based on job_link (unique identifier)
df = df.drop_duplicates(subset=["job_link"], keep="first")
print("Shape after removing duplicates by job_link:", df.shape)

# Confirm no missing values remain (should be zero already)
print("Missing values after cleaning:")
print(df.isnull().sum())

data preprocessing

process tags

In [None]:
# 3.1 Process Tags
df["Tags"] = df["Tags"].str.lower().str.replace(", ", ",").str.strip()
print("Sample Tags after cleaning:")
print(df["Tags"].head())

parse and normalize salary

In [None]:
# 3.2 Parse and Normalize Salary (Revised)
def parse_salary(salary):
    if "not disclosed" in salary.lower() or pd.isna(salary):
        return None
    salary = salary.lower().replace("lacs pa", "").replace("lpa", "").replace("(including variable: 15.0%)", "").replace(",", "").strip()
    if "pa" in salary:  # Assume rupees annually, convert to LPA
        try:
            value = float(salary.replace("pa", "").strip()) / 100000  # e.g., "50000 pa" → 0.5 LPA
            return value if value < 100 else None  # Cap at 100 LPA to catch errors
        except ValueError:
            return None
    if "-" in salary:
        low, high = map(float, salary.split("-"))
        return (low + high) / 2 if (low < 100 and high < 100) else None  # Cap at 100 LPA
    try:
        value = float(salary)
        return value if value < 100 else None  # Cap at 100 LPA
    except ValueError:
        return None

df["salary_clean"] = df["Salary"].apply(parse_salary)
mean_salary = df["salary_clean"].mean()
print("Mean salary before imputation:", mean_salary)
df["salary_clean"] = df["salary_clean"].fillna(mean_salary)  
df["salary_norm"] = df["salary_clean"] / df["salary_clean"].max()
print("\nSalary Stats:")
print(df["salary_clean"].describe())

parse and normalize experience

In [None]:
# 3.3 Parse and Normalize Experience
def parse_experience(exp):
    exp = exp.lower().replace("yrs", "").strip()
    if "-" in exp:
        low, high = map(int, exp.split("-"))
        return (low + high) / 2
    return float(exp.split()[0]) if exp else 0

df["exp_clean"] = df["Experience"].apply(parse_experience)
mean_exp = df["exp_clean"].mean()
print("Mean experience before imputation:", mean_exp)
df["exp_clean"] = df["exp_clean"].fillna(mean_exp)  
df["exp_norm"] = df["exp_clean"] / df["exp_clean"].max()
print("\nExperience Stats:")
print(df["exp_clean"].describe())

encode location

In [None]:
# 3.4 Encode Location
df["location_clean"] = df["Location"].str.replace("Hybrid - ", "").str.split(",").str[0].str.strip().str.lower()
top_locations = df["location_clean"].value_counts().index[:10]
for loc in top_locations:
    df[f"loc_{loc}"] = df["location_clean"].apply(lambda x: 1 if x == loc else 0)
print("\nTop 10 Locations:")
print(df["location_clean"].value_counts().head(10))

feature matrix

In [None]:
# 3.5 Create Feature Matrix
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# TF-IDF for Tags
vectorizer = TfidfVectorizer(tokenizer=lambda x: x.split(","))
skill_matrix = vectorizer.fit_transform(df["Tags"]).toarray()
print("Skill Matrix Shape:", skill_matrix.shape)

# Numerical features
num_features = df[["salary_norm", "exp_norm"]].values
print("Numerical Features Shape:", num_features.shape)

# Location features (top 10)
top_locations = df["location_clean"].value_counts().index[:10]  
loc_features = df[[f"loc_{loc}" for loc in top_locations]].values
print("Location Features Shape:", loc_features.shape)

# Combine into feature matrix
job_features = np.hstack((skill_matrix, num_features, loc_features))
print("\nFinal Feature Matrix Shape:", job_features.shape)


In [None]:
# Save cleaned DataFrame for later use
df.to_csv('scrapedjobs_cleaned.csv', index=False)
print("Cleaned dataset saved to Drive.")

clustering

test kmeans for different k values

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Test k values from 5 to 20
k_values = range(5, 21)
silhouette_scores = []
inertia_scores = []

for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    clusters = kmeans.fit_predict(job_features)
    silhouette = silhouette_score(job_features, clusters)
    inertia = kmeans.inertia_
    silhouette_scores.append(silhouette)
    inertia_scores.append(inertia)
    print(f"k={k}, Silhouette Score={silhouette:.3f}, Inertia={inertia:.1f}")

clustering with dimensionality reduction and metrics

reduce dimensionality with truncated SVD

In [None]:
from sklearn.decomposition import TruncatedSVD
import numpy as np

# Reduce dimensions to 50 components
svd = TruncatedSVD(n_components=50, random_state=42)
job_features_reduced = svd.fit_transform(job_features)
print("Reduced Feature Matrix Shape:", job_features_reduced.shape)

# Explained variance ratio
explained_variance = svd.explained_variance_ratio_.sum()
print(f"Explained Variance Ratio (50 components): {explained_variance:.3f}")

test kmeans on reduced features

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Test k values from 5 to 20
k_values = range(5, 21)
silhouette_scores = []
inertia_scores = []

for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    clusters = kmeans.fit_predict(job_features_reduced)
    silhouette = silhouette_score(job_features_reduced, clusters)
    inertia = kmeans.inertia_
    silhouette_scores.append(silhouette)
    inertia_scores.append(inertia)
    print(f"k={k}, Silhouette Score={silhouette:.3f}, Inertia={inertia:.1f}")

analyze and pick best k

In [None]:
import matplotlib.pyplot as plt


# Find best k
best_k = k_values[silhouette_scores.index(max(silhouette_scores))]
print(f"\nBest k: {best_k}, Silhouette Score: {max(silhouette_scores):.3f}")

# Plot silhouette scores
plt.plot(k_values, silhouette_scores, marker='o')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Silhouette Score')
plt.title('Silhouette Score vs. k (Reduced Features)')
plt.show()

# Plot inertia
plt.plot(k_values, inertia_scores, marker='o')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Inertia')
plt.title('Inertia vs. k (Reduced Features)')
plt.show()

apply final clustering

In [None]:

from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=best_k, random_state=42, n_init=10)
df["cluster"] = kmeans.fit_predict(job_features_reduced)
print("Clustering complete. Added 'cluster' column to DataFrame.")
print("Cluster distribution:")
print(df["cluster"].value_counts())

process userinput

In [None]:

def preprocess_user_input(skills, salary, exp, location):
    # Skills to TF-IDF
    skills_clean = skills.lower().replace(", ", ",").strip()
    skills_vec = vectorizer.transform([skills_clean]).toarray() 

    # Salary (assume LPA)
    salary_val = float(salary) if salary else df["salary_clean"].mean()
    salary_norm = salary_val / df["salary_clean"].max()

    # Experience
    exp_val = float(exp) if exp else df["exp_clean"].mean()
    exp_norm = exp_val / df["exp_clean"].max()

    # Location (one-hot for top 10)
    loc_clean = location.lower().strip()
    loc_vec = np.array([1 if loc_clean == loc else 0 for loc in top_locations]).reshape(1, -1)

    # Combine features
    user_features = np.hstack((skills_vec, [[salary_norm, exp_norm]], loc_vec))

    # Reduce with SVD 
    user_features_reduced = svd.transform(user_features)
    return user_features_reduced


#  test with sample values 
sample_skills = "python, sql, machine learning"
sample_salary = "10"  # LPA
sample_exp = "5"      # years
sample_location = "bengaluru"
user_input = preprocess_user_input(sample_skills, sample_salary, sample_exp, sample_location)
print("User Features Reduced Shape:", user_input.shape)

assign user to cluster

In [None]:

df = df.reset_index(drop=True)
print("DataFrame index reset. New shape:", df.shape)

# Find user's cluster
user_cluster = kmeans.predict(user_input)[0]
print(f"User assigned to Cluster: {user_cluster}")

# Filter jobs in user's cluster
cluster_jobs_idx = df[df["cluster"] == user_cluster].index
cluster_job_features = job_features_reduced[cluster_jobs_idx]
print(f"Number of jobs in Cluster {user_cluster}: {len(cluster_jobs_idx)}")

knn recommendations

In [None]:
from sklearn.neighbors import NearestNeighbors

# kNN for top 5 recommendations
knn = NearestNeighbors(n_neighbors=5, metric="cosine")
knn.fit(cluster_job_features)
distances, indices = knn.kneighbors(user_input)
recommended_indices = cluster_jobs_idx[indices[0]]
recommendations = df.iloc[recommended_indices][["Title", "Tags", "Salary", "Location","Experience","job_link"]]
print("\nTop 5 Recommended Jobs:")
print(recommendations)

compute metrics

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Cosine similarity for recommendations
similarities = 1 - distances[0]  # Convert distance to similarity
avg_similarity = similarities.mean()
print(f"\nAverage Cosine Similarity: {avg_similarity:.3f}")

# Baseline: Random jobs from cluster
random_idx = np.random.choice(cluster_jobs_idx, 5, replace=False)
random_features = job_features_reduced[random_idx]
random_similarities = cosine_similarity(user_input, random_features)[0]
avg_random_similarity = random_similarities.mean()
print(f"Random Baseline Similarity: {avg_random_similarity:.3f}")

In [22]:
import pandas as pd
import numpy as np
import pickle
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.neighbors import NearestNeighbors

# Create a function to replace the lambda
def tokenize_tags(text):
    return text.split(",")

# Create directories if they don't exist
if not os.path.exists('models'):
    os.makedirs('models')

# Load the cleaned data 
print("Loading data...")
df = pd.read_csv('scrapedjobs_cleaned.csv')
print(f"Loaded data with {df.shape[0]} rows and {df.shape[1]} columns")

# Process tags for TF-IDF
print("Processing tags...")
df["Tags"] = df["Tags"].str.lower().str.replace(", ", ",").str.strip()

# 1. Create and save the TF-IDF vectorizer
print("Creating TF-IDF vectorizer...")
# Use the named function instead of lambda
vectorizer = TfidfVectorizer(tokenizer=tokenize_tags)
skill_matrix = vectorizer.fit_transform(df["Tags"]).toarray()
print(f"Skill matrix shape: {skill_matrix.shape}")
with open('models/vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)
print("Saved vectorizer")

# 2. Prepare numerical features
print("Preparing numerical features...")
salary_max = df["salary_clean"].max()
exp_max = df["exp_clean"].max()
salary_mean = df["salary_clean"].mean()
exp_mean = df["exp_clean"].mean()

df["salary_norm"] = df["salary_clean"] / salary_max
df["exp_norm"] = df["exp_clean"] / exp_max
num_features = df[["salary_norm", "exp_norm"]].values
print(f"Numerical features shape: {num_features.shape}")

# 3. Prepare location features
print("Preparing location features...")
top_locations = df["location_clean"].value_counts().index[:10].tolist()
print(f"Top 10 locations: {top_locations}")

# Check if location columns exist, if not create them
for loc in top_locations:
    col_name = f"loc_{loc}"
    if col_name not in df.columns:
        df[col_name] = df["location_clean"].apply(lambda x: 1 if x == loc else 0)

loc_features = df[[f"loc_{loc}" for loc in top_locations]].values
print(f"Location features shape: {loc_features.shape}")

# 4. Combine into feature matrix
print("Creating combined feature matrix...")
job_features = np.hstack((skill_matrix, num_features, loc_features))
print(f"Job features shape: {job_features.shape}")

# Save feature information
with open('models/feature_info.pkl', 'wb') as f:
    pickle.dump({
        'top_locations': top_locations,
        'salary_max': salary_max,
        'exp_max': exp_max,
        'salary_mean': salary_mean,
        'exp_mean': exp_mean
    }, f)
print("Saved feature information")

# 5. Create and save SVD model
print("Creating SVD model...")
svd = TruncatedSVD(n_components=50, random_state=42)
job_features_reduced = svd.fit_transform(job_features)
print(f"Reduced job features shape: {job_features_reduced.shape}")
print(f"Explained variance ratio: {svd.explained_variance_ratio_.sum():.3f}")
with open('models/svd.pkl', 'wb') as f:
    pickle.dump(svd, f)
print("Saved SVD model")

# 6. Create and save K-means model with 8 clusters
print("Creating K-means model...")
kmeans = KMeans(n_clusters=8, random_state=42, n_init=10)
df["cluster"] = kmeans.fit_predict(job_features_reduced)
print("Cluster distribution:")
print(df["cluster"].value_counts())
with open('models/kmeans.pkl', 'wb') as f:
    pickle.dump(kmeans, f)
print("Saved K-means model")

# 7. Save processed data
print("Saving processed job data...")
df.to_csv('models/processed_jobs.csv', index=False)
print("Saved processed job data")

# 8. Save job_features_reduced as it's needed for KNN
print("Saving reduced feature matrix...")
np.save('models/job_features_reduced.npy', job_features_reduced)
print("Saved reduced feature matrix")

print("\nAll models and data saved successfully in the 'models' directory!")
print("Files saved:")
for file in os.listdir('models'):
    file_size = os.path.getsize(f'models/{file}') / (1024 * 1024)  # Convert to MB
    print(f"- {file} ({file_size:.2f} MB)")

Loading data...
Loaded data with 3978 rows and 24 columns
Processing tags...
Creating TF-IDF vectorizer...
Skill matrix shape: (3978, 5371)
Saved vectorizer
Preparing numerical features...
Numerical features shape: (3978, 2)
Preparing location features...
Top 10 locations: ['mumbai', 'bengaluru', 'pune', 'hyderabad', 'gurugram', 'chennai', 'kolkata', 'coimbatore', 'noida', 'ahmedabad']
Location features shape: (3978, 10)
Creating combined feature matrix...




Job features shape: (3978, 5383)
Saved feature information
Creating SVD model...
Reduced job features shape: (3978, 50)
Explained variance ratio: 0.551
Saved SVD model
Creating K-means model...
Cluster distribution:
cluster
2    1012
0     853
3     731
4     581
1     356
6     168
7     149
5     128
Name: count, dtype: int64
Saved K-means model
Saving processed job data...
Saved processed job data
Saving reduced feature matrix...
Saved reduced feature matrix

All models and data saved successfully in the 'models' directory!
Files saved:
- feature_info.pkl (0.00 MB)
- job_features_reduced.npy (1.52 MB)
- kmeans.pkl (0.02 MB)
- processed_jobs.csv (1.82 MB)
- svd.pkl (2.06 MB)
- vectorizer.pkl (0.14 MB)


In [26]:
import pandas as pd
import numpy as np
import pickle
import os
from sklearn.neighbors import NearestNeighbors

# Function for tokenizing in the same way as during model creation
def tokenize_tags(text):
    return text.split(",")

# Load all the saved models and data
def load_models():
    print("Loading models and data...")
    
    # Load processed job data
    df = pd.read_csv('models/processed_jobs.csv')
    print(f"Loaded processed jobs: {df.shape[0]} rows")
    
    # Load vectorizer
    with open('models/vectorizer.pkl', 'rb') as f:
        vectorizer = pickle.load(f)
    print("Loaded vectorizer")
    
    # Load SVD model
    with open('models/svd.pkl', 'rb') as f:
        svd = pickle.load(f)
    print("Loaded SVD model")
    
    # Load K-means model
    with open('models/kmeans.pkl', 'rb') as f:
        kmeans = pickle.load(f)
    print("Loaded K-means model")
    
    # Load feature information
    with open('models/feature_info.pkl', 'rb') as f:
        feature_info = pickle.load(f)
    print("Loaded feature information")
    
    # Load reduced job features
    job_features_reduced = np.load('models/job_features_reduced.npy')
    print(f"Loaded reduced job features: {job_features_reduced.shape}")
    
    return df, vectorizer, svd, kmeans, feature_info, job_features_reduced

# Preprocess user input to match model input format
def preprocess_user_input(skills, salary, exp, location, vectorizer, svd, feature_info):
    print("Preprocessing user input...")
    
    # Skills to TF-IDF
    skills_clean = skills.lower().replace(", ", ",").strip()
    skills_vec = vectorizer.transform([skills_clean]).toarray()
    print(f"Skills vector shape: {skills_vec.shape}")

    # Salary (assume LPA)
    if salary and salary.strip():
        salary_val = float(salary)
    else:
        salary_val = feature_info['salary_mean']
    salary_norm = salary_val / feature_info['salary_max']
    print(f"Normalized salary: {salary_norm:.3f}")

    # Experience
    if exp and exp.strip():
        exp_val = float(exp)
    else:
        exp_val = feature_info['exp_mean']
    exp_norm = exp_val / feature_info['exp_max']
    print(f"Normalized experience: {exp_norm:.3f}")

    # Location (one-hot for top 10)
    loc_clean = location.lower().strip()
    loc_vec = np.array([1 if loc_clean == loc else 0 for loc in feature_info['top_locations']]).reshape(1, -1)
    print(f"Location vector shape: {loc_vec.shape}")

    # Combine features
    user_features = np.hstack((skills_vec, [[salary_norm, exp_norm]], loc_vec))
    print(f"Combined user features shape: {user_features.shape}")

    # Reduce with SVD
    user_features_reduced = svd.transform(user_features)
    print(f"Reduced user features shape: {user_features_reduced.shape}")
    
    return user_features_reduced

# Get recommendations based on user input
def get_recommendations(user_input, df, kmeans, job_features_reduced, num_recommendations=5):
    print("Getting recommendations...")
    
    # Find user's cluster
    user_cluster = kmeans.predict(user_input)[0]
    print(f"User assigned to cluster: {user_cluster}")
    
    # Filter jobs in user's cluster
    cluster_jobs_idx = df[df["cluster"] == user_cluster].index
    print(f"Number of jobs in cluster {user_cluster}: {len(cluster_jobs_idx)}")
    
    cluster_job_features = job_features_reduced[cluster_jobs_idx]
    
    # Use kNN to find top recommendations
    knn = NearestNeighbors(n_neighbors=num_recommendations, metric="cosine")
    knn.fit(cluster_job_features)
    distances, indices = knn.kneighbors(user_input)
    
    # Get recommended jobs
    recommended_indices = cluster_jobs_idx[indices[0]]
    recommendations = df.iloc[recommended_indices][["Title", "Company", "Salary", "Location", "Experience", "job_link"]]
    
    # Calculate similarity score (1 - distance)
    similarities = 1 - distances[0]
    recommendations['similarity'] = similarities
    
    print(f"Average similarity score: {similarities.mean():.3f}")
    
    return recommendations, user_cluster, similarities.mean()

# Main function to perform the entire recommendation process
def recommend_jobs(skills, salary, exp, location, num_recommendations=5):
    # Load all necessary models and data
    df, vectorizer, svd, kmeans, feature_info, job_features_reduced = load_models()
    
    # Preprocess user input
    user_input = preprocess_user_input(skills, salary, exp, location, vectorizer, svd, feature_info)
    
    # Get recommendations
    recommendations, user_cluster, avg_similarity = get_recommendations(
        user_input, df, kmeans, job_features_reduced, num_recommendations
    )
    
    return recommendations, user_cluster, avg_similarity

# Example usage
if __name__ == "__main__":
    # Sample user input
    sample_skills = "python, java , pyspark , "
    sample_salary = "2"  # LPA
    sample_exp = "0"      # years
    sample_location = "noida"
    
    print("\n" + "="*50)
    print(f"Getting recommendations for:")
    print(f"Skills: {sample_skills}")
    print(f"Salary: {sample_salary} LPA")
    print(f"Experience: {sample_exp} years")
    print(f"Location: {sample_location}")
    print("="*50 + "\n")
    
    # Get recommendations
    recommendations, user_cluster, avg_similarity = recommend_jobs(
        sample_skills, sample_salary, sample_exp, sample_location
    )
    
    # Display results
    print("\n" + "="*50)
    print(f"Recommendation Results:")
    print(f"User assigned to cluster: {user_cluster}")
    print(f"Average similarity score: {avg_similarity:.3f} ({avg_similarity*100:.1f}%)")
    print(f"Top {len(recommendations)} recommendations:")
    print("="*50)
    
    for i, (_, job) in enumerate(recommendations.iterrows(), 1):
        print(f"\nRecommendation #{i} (Similarity: {job['similarity']*100:.1f}%):")
        print(f"Title: {job['Title']}")
        print(f"Company: {job['Company']}")
        print(f"Salary: {job['Salary']}")
        print(f"Location: {job['Location']}")
        print(f"Experience: {job['Experience']}")
        print(f"Job Link: {job['job_link']}")


Getting recommendations for:
Skills: python, java , pyspark , 
Salary: 2 LPA
Experience: 0 years
Location: noida

Loading models and data...
Loaded processed jobs: 3978 rows
Loaded vectorizer
Loaded SVD model
Loaded K-means model
Loaded feature information
Loaded reduced job features: (3978, 50)
Preprocessing user input...
Skills vector shape: (1, 5371)
Normalized salary: 0.032
Normalized experience: 0.000
Location vector shape: (1, 10)
Combined user features shape: (1, 5383)
Reduced user features shape: (1, 50)
Getting recommendations...
User assigned to cluster: 4
Number of jobs in cluster 4: 581
Average similarity score: 0.956

Recommendation Results:
User assigned to cluster: 4
Average similarity score: 0.956 (95.6%)
Top 5 recommendations:

Recommendation #1 (Similarity: 96.7%):
Title: Azure Devops Engineer
Company: Infogain
Salary: Not disclosed
Location: Noida, Pune, Mumbai (All Areas)
Experience: 4-6 Yrs
Job Link: https://www.naukri.com/job-listings-azure-devops-engineer-infoga