In [16]:
import pandas as pd 

In [17]:
# Load the two raw inputs. Both paths are relative, so they resolve against
# the kernel's current working directory.
jobs_df = pd.read_csv(filepath_or_buffer="../../data/raw/job_Postings.csv")
resumes_df = pd.read_csv(filepath_or_buffer="../../data/raw/resume.csv")

In [18]:
# Remove exact duplicate rows, then renumber the index 0..n-1. Without the
# reset the index keeps gaps, so row labels no longer match row positions --
# and the TF-IDF matrix built further down is addressed purely by position.
jobs_df = jobs_df.drop_duplicates().reset_index(drop=True)

# Count missing values per column; as the last expression of the cell this
# Series is what gets displayed.
jobs_df.isna().sum()

title              0
description        0
requirements       0
company_profile    0
location           0
salary_range       0
employment_type    0
industry           0
benefits           0
fraudulent         0
dtype: int64

In [19]:
# if 'fraudulent' in jobs_df.columns:
#     jobs_df = jobs_df.drop(columns=['fraudulent'])

# jobs_df.columns

In [20]:
# Columns that get concatenated into the job text, listed in join order.
# Names that are not present in jobs_df are skipped.
job_text_cols = [
    name
    for name in (
        'title',
        'company_profile',
        'description',
        'requirements',
        'benefits',
        'location',
        'employment_type',
        'salary_range',
        'industry',
    )
    if name in jobs_df.columns
]

# Replace missing values with empty strings, in the selected columns only
jobs_df[job_text_cols] = jobs_df[job_text_cols].fillna(value='')


In [21]:
# Columns that get concatenated into the resume text, listed in join order.
# Names that are not present in resumes_df are skipped.
resume_text_cols = [
    name
    for name in ('skills', 'experience', 'education', 'summary')
    if name in resumes_df.columns
]

# Replace missing values with empty strings, in the selected columns only
resumes_df[resume_text_cols] = resumes_df[resume_text_cols].fillna(value='')


In [22]:
# Combine job text: one space-separated string per posting.
# Cast to str first -- ' '.join raises TypeError on any non-string cell
# (e.g. a numeric salary or id column), which fillna('') does not prevent.
jobs_df['job_text'] = jobs_df[job_text_cols].astype(str).apply(' '.join, axis=1)

# Combine resume text the same way
resumes_df['resume_text'] = resumes_df[resume_text_cols].astype(str).apply(' '.join, axis=1)


## Data Preprocessing 

In [23]:
import re
import string

def clean_text(text):
    """Normalize free text before vectorization.

    Steps, in order: lowercase, replace every ASCII punctuation character
    with a space, delete runs of digits, collapse whitespace runs to a single
    space, and trim both ends.

    Args:
        text: The value to clean. ``None`` and float NaN yield ``""``; any
            other non-string value is converted with ``str()`` first.

    Returns:
        The cleaned, lowercase string.
    """
    if not isinstance(text, str):
        # Missing values carry no text; NaN is the only float unequal to itself.
        if text is None or (isinstance(text, float) and text != text):
            return ""
        text = str(text)

    text = text.lower()
    # re.escape is required here: dropped in raw, the "\]" inside
    # string.punctuation is parsed as an escaped "]", so backslashes
    # were never part of the character class and survived cleaning.
    text = re.sub(f"[{re.escape(string.punctuation)}]", " ", text)
    text = re.sub(r"\d+", "", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()


In [24]:
# Normalize both combined text columns element by element with clean_text
jobs_df['job_text'] = jobs_df['job_text'].map(clean_text)
resumes_df['resume_text'] = resumes_df['resume_text'].map(clean_text)


In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer: English stop words removed, vocabulary capped at 5000 features
tfidf = TfidfVectorizer(max_features=5000, stop_words='english')

# Fit on the job texts only, so the vocabulary comes from the postings
job_vectors = tfidf.fit_transform(jobs_df['job_text'])

# Resumes are only transformed (not fitted), so both matrices share the
# same fitted vectorizer
resume_vectors = tfidf.transform(resumes_df['resume_text'])


In [26]:
from sklearn.metrics.pairwise import cosine_similarity

def recommend_jobs(resume_index, top_n=5):
    """Return the job postings most similar to one resume.

    Similarity is the cosine similarity between the resume's row in
    ``resume_vectors`` and every row of ``job_vectors``.

    Args:
        resume_index: Row position of the resume in ``resume_vectors``.
        top_n: Maximum number of postings to return. Values <= 0 produce an
            empty result.

    Returns:
        A DataFrame holding the ``title`` and ``description`` columns of the
        selected ``jobs_df`` rows, best match first.
    """
    if top_n <= 0:
        # Guard: the slice [-0:] would select every job instead of none,
        # and a negative top_n would select an arbitrary tail.
        return jobs_df.iloc[:0][['title', 'description']]

    similarities = cosine_similarity(
        resume_vectors[resume_index],
        job_vectors
    ).flatten()

    # Order positions by descending similarity, then keep the first top_n
    top_jobs = similarities.argsort()[::-1][:top_n]

    # iloc: top_jobs are row positions in the similarity vector, not index labels
    return jobs_df.iloc[top_jobs][['title', 'description']]


In [27]:
# Smoke test: five best-matching postings for the resume in row 0
recommended = recommend_jobs(0, top_n=5)
# recommended


In [None]:
# jobs_df.to_csv("../../data/processed/clean_jobs.csv", index=False)
# resumes_df.to_csv("../../data/processed/clean_resumes.csv", index=False)