In [1]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import nltk

In [2]:
# For recruiter
df = pd.read_csv(r"C:\Users\Nikhil\Downloads\Job_sample_dataset_13.csv").dropna() 
df.head(5)

Unnamed: 0,Job_title,Experience,Name,Skills,Work_Location,Work_Preference,Company
0,Python Developer,3 year,Amit Sharma,"REST APIs, SQL, Django, Python",Pune,"Part-time, Remote",LTI
1,Machine Learning Engineer,1 year,Priya Mehta,"Scikit-learn, TensorFlow, Python, Pandas",Kolkata,"Full-time, Onsite",HCL
2,Data Analyst,4 year,Rahul Desai,"Excel, SQL, Power BI, Python",Bengaluru,"Part-time, onsite",Mindtree
3,Python Developer,2 year,Sneha Iyer,"SQL, REST APIs, Python, Flask",Lucknow,"Full-time, Remote",Tech Mahindra
4,Data Analyst,5 year,Vikram Patel,"Excel, SQL, Tableau, Power BI",Jaipur,"Remote, Full-time",HCL


In [3]:
# For candidate
df1 = pd.read_csv(r"C:\Users\Nikhil\Downloads\Jobs_sample_dataset _12.csv").dropna() 
df1.head(5)

Unnamed: 0,Job_title,Required_experience,Skills,Location,Preferences,Company
0,QA Engineer,2 year,"Automation Testing, JUnit, Manual Testing, Sel...",Kolkata,"Part-time, Remote",TCS
1,Data Analyst,1 year,"Tableau, Power BI, SQL, Python, Excel",Kolkata,"Full-time, Onsite",Accenture
2,Data Analyst,4 year,"Tableau, Python, Excel, Power BI, SQL",Chennai,"Part-time, onsite",LTI
3,Product Manager,5 year,"Agile, JIRA, Scrum, Roadmapping",Mumbai,"Full-time, Remote",Tech Mahindra
4,Data Analyst,0,"SQL, Tableau, Power BI, Excel, Python",Bengaluru,"Remote, Full-time",Zoho


In [4]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

In [5]:
def clean_text(text):
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text) 
    words = text.split()
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return ' '.join(words)

In [6]:
df['combined_features'] = (
    df['Skills'].apply(clean_text) + " " +
    df['Name'].apply(clean_text) + " " +
    df['Job_title'].apply(clean_text) + " " +
    df['Experience'].apply(clean_text) + " " +
    df['Work_Location'].apply(clean_text) + " " +
    df['Work_Preference'].apply(clean_text) + " " +
    df['Company'].apply(clean_text)
)
df.head(5)

Unnamed: 0,Job_title,Experience,Name,Skills,Work_Location,Work_Preference,Company,combined_features
0,Python Developer,3 year,Amit Sharma,"REST APIs, SQL, Django, Python",Pune,"Part-time, Remote",LTI,rest apis sql django python amit sharma python...
1,Machine Learning Engineer,1 year,Priya Mehta,"Scikit-learn, TensorFlow, Python, Pandas",Kolkata,"Full-time, Onsite",HCL,scikitlearn tensorflow python panda priya meht...
2,Data Analyst,4 year,Rahul Desai,"Excel, SQL, Power BI, Python",Bengaluru,"Part-time, onsite",Mindtree,excel sql power bi python rahul desai data ana...
3,Python Developer,2 year,Sneha Iyer,"SQL, REST APIs, Python, Flask",Lucknow,"Full-time, Remote",Tech Mahindra,sql rest apis python flask sneha iyer python d...
4,Data Analyst,5 year,Vikram Patel,"Excel, SQL, Tableau, Power BI",Jaipur,"Remote, Full-time",HCL,excel sql tableau power bi vikram patel data a...


In [7]:
df1['combined_features'] = (
    df1['Skills'].apply(clean_text) + " " +
    df1['Job_title'].apply(clean_text) + " " +
    df1['Required_experience'].apply(clean_text) + " " +
    df1['Location'].apply(clean_text) + " " +
    df1['Preferences'].apply(clean_text) + " " +
    df1['Company'].apply(clean_text)
)
df1.head(5)

Unnamed: 0,Job_title,Required_experience,Skills,Location,Preferences,Company,combined_features
0,QA Engineer,2 year,"Automation Testing, JUnit, Manual Testing, Sel...",Kolkata,"Part-time, Remote",TCS,automation testing junit manual testing seleni...
1,Data Analyst,1 year,"Tableau, Power BI, SQL, Python, Excel",Kolkata,"Full-time, Onsite",Accenture,tableau power bi sql python excel data analyst...
2,Data Analyst,4 year,"Tableau, Python, Excel, Power BI, SQL",Chennai,"Part-time, onsite",LTI,tableau python excel power bi sql data analyst...
3,Product Manager,5 year,"Agile, JIRA, Scrum, Roadmapping",Mumbai,"Full-time, Remote",Tech Mahindra,agile jira scrum roadmapping product manager 5...
4,Data Analyst,0,"SQL, Tableau, Power BI, Excel, Python",Bengaluru,"Remote, Full-time",Zoho,sql tableau power bi excel python data analyst...


In [8]:
tfidf_recruiter = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))
tfidf_matrix_r = tfidf_recruiter.fit_transform(df['combined_features'])

In [9]:
tfidf_candidate = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))
tfidf_matrix_c = tfidf_candidate.fit_transform(df1['combined_features'])

In [19]:
# For Recruiters
def get_recommendation(Skills,Experience = None, top_n=10):

    cleaned_skills = clean_text(Skills).split()
    
    skill_mask = df['Skills'].apply(
        lambda x: all(skill in clean_text(x) for skill in cleaned_skills)
    )
    filtered_jobs = df[skill_mask]
    
    if Experience:
        cleaned_exp = clean_text(Experience)
        exp_mask = filtered_jobs['Experience'].apply(
            lambda x: cleaned_exp in clean_text(x)
        )
        filtered_jobs = filtered_jobs[exp_mask]
    
    if len(filtered_jobs) == 0:
        return 'No matching jobs found with these criteria'
    
    user_query = clean_text(Skills)
    if Experience:
        user_query += ' ' + clean_text(Experience)
    
    query_vec = tfidf_recruiter.transform([user_query])
    filtered_matrix = tfidf_recruiter.transform(filtered_jobs['combined_features'])
    cosine_sim = linear_kernel(query_vec, filtered_matrix).flatten()
    
    sim_indices = cosine_sim.argsort()[-top_n:][::-1]
    
    return filtered_jobs[['Job_title', 'Name', 'Experience', 'Work_Location', 'Company', 'Skills']].iloc[sim_indices]

get_recommendation(Skills='html, css, js, react', Experience='2 year')


Unnamed: 0,Job_title,Name,Experience,Work_Location,Company,Skills
25,Full Stack Developer,Ayesha Khan,2 year,Ahmedabad,Capgemini,"MongoDB, Express, Node.js, JS, HTML, CSS, JS, ..."
18,Frontend Developer,Siddharth Jain,2 year,Bengaluru,HCL,"Bootstrap, React, CSS, JS, HTML"


In [16]:
def get_job_recommendations(Skills, top_n=10):
    # Clean and prepare inputs
    cleaned_skills = clean_text(Skills).split()
    
    skill_mask = df1['Skills'].apply(
        lambda x: all(skill in clean_text(x) for skill in cleaned_skills)
    )
    
    filtered_jobs = df1[skill_mask]
    
    if len(filtered_jobs) == 0:
        return 'No matching jobs found with these skills and location'
    
    user_query = clean_text(Skills)
    
    query_vec = tfidf_recruiter.transform([user_query])
    filtered_matrix = tfidf_recruiter.transform(filtered_jobs['combined_features'])
    cosine_sim = linear_kernel(query_vec, filtered_matrix).flatten()
    
    sim_indices = cosine_sim.argsort()[-top_n:][::-1]
    
    return filtered_jobs[['Job_title', 'Location', 'Company', 'Required_experience', 'Skills']].iloc[sim_indices]

get_job_recommendations(Skills='python')

Unnamed: 0,Job_title,Location,Company,Required_experience,Skills
31,AI Engineer,Hyderabad,Capgemini,0,"Python, PyTorch, Keras, Deep Learning, TensorFlow"
29,AI Engineer,Mumbai,Mindtree,2 year,"Python, PyTorch, Keras, Deep Learning, TensorFlow"
20,Django Developer,Ahmedabad,Capgemini,0,"Django, CSS, PostgreSQL, Python, HTML"
13,Django Developer,Ahmedabad,Capgemini,5 year,"PostgreSQL, HTML, JS, Python, CSS, Django"
21,Flask Developer,Mumbai,TCS,1 year,"CSS, SQLAlchemy, Python, Flask, HTML"
23,Django Developer,Ahmedabad,Accenture,0,"CSS, HTML, Django, JS, Python"
19,Flask Developer,Chennai,LTI,5 year,"HTML, Python, Flask, SQL, CSS"
38,Data Analyst,Ahmedabad,Capgemini,2 year,"SQL, Python, Excel, Tableau, Power BI"
10,Software Engineer,Ahmedabad,HCL,0,"Python, Java, Data Structures, Algorithms, C"
22,Machine Learning Engineer,Hyderabad,Mindtree,1 year,"TensorFlow, Python, Scikit-learn, Keras, Pandas"


In [12]:
import pickle

with open('tfidf_recruiter.pkl', 'wb') as f:
    pickle.dump(tfidf_recruiter, f)

with open('tfidf_candidate.pkl', 'wb') as f:
    pickle.dump(tfidf_candidate, f)

df.to_pickle('candidate_profiles.pkl')
df1.to_pickle('job_postings.pkl')