In [None]:
from google.colab import drive
drive.mount('/content/drive')
proj = '//content/drive/MyDrive/job‑rec‑project'


Mounted at /content/drive


In [25]:
from google.colab import files

uploaded = files.upload()


Saving user_profiles.csv to user_profiles (1).csv
Saving dataset_joblistings.csv to dataset_joblistings (1).csv


In [26]:
import pandas as pd
import numpy as np
import re
import nltk
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, average_precision_score
from sklearn.preprocessing import LabelEncoder

nltk.download('stopwords')
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

# Load datasets
jobs = pd.read_csv("dataset_joblistings.csv")
users = pd.read_csv("user_profiles.csv")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [27]:
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'\d+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = text.split()
    tokens = [word for word in tokens if word not in stop_words]
    return " ".join(tokens)

# Apply text cleaning
jobs['combined'] = (jobs['job_title'] + " " + jobs['description'] + " " + jobs['required_skills']).apply(clean_text)
users['combined'] = (
    users['skillset'] + " " +
    users['education'] + " " +
    users['experience_title'] + " " +
    users['experience_description'] + " " +
    users['job_like_title'] + " " +
    users['job_like_description'] + " " +
    users['job_like_skills']
).apply(clean_text)


In [28]:
tfidf = TfidfVectorizer()
job_vectors = tfidf.fit_transform(jobs['combined'])

# Sample user for demo
sample_user = users.iloc[0]
user_vector = tfidf.transform([sample_user['combined']])

# Cosine Similarity for Ranking
cos_sim = cosine_similarity(user_vector, job_vectors).flatten()
jobs['similarity_score'] = cos_sim
ranked_jobs = jobs.sort_values(by='similarity_score', ascending=False)


In [29]:
# Simulate job_family from job titles for classification
def categorize(title):
    title = title.lower()
    if 'data' in title:
        return 'DataSci'
    elif 'ml' in title or 'ai' in title:
        return 'ML_AI'
    elif 'frontend' in title or 'backend' in title:
        return 'WebDev'
    elif 'cloud' in title or 'devops' in title:
        return 'Infra'
    else:
        return 'Other'

# Add job_family column to the jobs DataFrame first
jobs['job_family'] = jobs['job_title'].apply(categorize)


# Train classifier
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(jobs['combined'])
y = jobs['job_family']
le = LabelEncoder()
y_encoded = le.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)
clf = RandomForestClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Evaluation
print("F1 Score:", f1_score(y_test, y_pred, average='weighted'))


F1 Score: 1.0


In [30]:
# New sample user input
new_user = {
    'user_name': 'Sai Tanusha',
    'skillset': 'Python, SQL, Machine Learning, Data Analysis, Deep Learning',
    'education': 'B.Tech in Computer Science',
    'experience': 'Worked as a Data Analyst intern at Motorq. Built machine learning models to analyze customer behavior.',
    'job_like_title': 'Data Scientist',
    'job_like_desc': 'Build predictive models and work with machine learning pipelines.',
    'job_like_skills': 'Python, ML, Statistics, SQL'
}

# Combine user features
new_user['combined'] = ' '.join([
    new_user['skillset'],
    new_user['education'],
    new_user['experience'],
    new_user['job_like_title'],
    new_user['job_like_desc'],
    new_user['job_like_skills']
])


In [31]:
# TF-IDF vector for similarity
new_user_vector = tfidf.transform([new_user['combined']])

# Cosine similarity with job listings
cos_sim = cosine_similarity(new_user_vector, job_vectors).flatten()
jobs['similarity_score'] = cos_sim
ranked_jobs = jobs.sort_values(by='similarity_score', ascending=False)

# Classification to predict job_family
clf_vec = vectorizer.transform([new_user['combined']])
predicted_family = le.inverse_transform(clf.predict(clf_vec))[0]
print("Predicted Job Family for User:", predicted_family)


Predicted Job Family for User: DataSci


In [32]:
# Filter by predicted family and sort
filtered_jobs = ranked_jobs[ranked_jobs['job_family'] == predicted_family]
filtered_jobs = filtered_jobs.sort_values(by='similarity_score', ascending=False).reset_index(drop=True)

# Show Top 10 Recommended Jobs
filtered_jobs[['job_title', 'required_skills', 'similarity_score']].head(10)


Unnamed: 0,job_title,required_skills,similarity_score
0,Data Scientist,"Python, SQL, Machine Learning",0.847234
1,Data Scientist,"Python, SQL, Machine Learning",0.847234
2,Data Scientist,"Python, SQL, Machine Learning",0.847234
3,Data Scientist,"Python, SQL, Machine Learning",0.847234
4,Data Scientist,"Python, SQL, Machine Learning",0.847234
5,Data Analyst,"Excel, SQL, PowerBI",0.250124
6,Data Analyst,"Excel, SQL, PowerBI",0.250124
7,Data Analyst,"Excel, SQL, PowerBI",0.250124
8,Data Analyst,"Excel, SQL, PowerBI",0.250124
9,Data Analyst,"Excel, SQL, PowerBI",0.250124


In [33]:
# Evaluate MAP based on liked family match
liked_family = categorize(new_user['job_like_title'])
relevance = (filtered_jobs['job_family'] == liked_family).astype(int).tolist()
map_score = average_precision_score(relevance, filtered_jobs['similarity_score'])
print("MAP Score for the New User:", map_score)


MAP Score for the New User: 1.0
