In [1]:
!pip install pypdf
!pip install pandas
!pip install matplotlib
!pip install numpy
!pip install seaborn
!pip install nltk
!pip install sentence_transformers
!pip install openai
!pip install keybert
!pip install -U python-jobspy


nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt_tab')



In [1]:
from pypdf import PdfReader
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer


from sentence_transformers import SentenceTransformer, util
from transformers import pipeline
from transformers import AutoModel
import torch

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from openai import OpenAI
import json
import re
import pickle

import warnings
warnings.filterwarnings('ignore')

  from tqdm.autonotebook import tqdm, trange


In [3]:
from data_fetching import update_job_data
from resume_parser import resume_parser
from resume_info_extraction import resume_summary
from data_cleaning import clean_data
from jd_embeddings import compute_job_embeddings

In [None]:
# Make sure every NLTK resource used by the tokenizer / stop-word / lemmatizer
# steps is available locally; the resources are requested in a fixed order.
for nltk_resource in ('punkt', 'stopwords', 'wordnet', 'omw-1.4', 'punkt_tab'):
    nltk.download(nltk_resource)

In [4]:
# Load the job postings and parse the resume.
#
# The summary extractor is imported under an alias on purpose: the result is
# stored in a variable called `resume_summary` (later cells read that name), and
# binding it directly over the imported `resume_summary` function would make
# this cell fail with "object is not callable" the second time it is run.
from resume_info_extraction import resume_summary as extract_resume_summary

# Single source of truth for the resume being analysed.
RESUME_PATH = 'Binrui_Yang_Resume.pdf'

# Uncomment to refresh the postings via update_job_data() instead of reading
# the CSV snapshot from disk.
#df = update_job_data()
df = pd.read_csv('job_data.csv')
df = clean_data(df)
resume_summary = extract_resume_summary(RESUME_PATH)
resume_info = resume_parser(RESUME_PATH)

Parsed data saved to parsed_resume.json


In [8]:
# Build the job-side representations from the cleaned postings: the sentence
# embeddings, the TF-IDF matrix, and the vectorizer returned alongside them.
(
    job_embeddings_bert,
    tfidf_job_descriptions,
    tfidf_vectorizer,
) = compute_job_embeddings(df)

In [9]:
def compute_resume_embedding(resume_summary, resume_info,
                             tfidf_path='tfidf_model.pkl',
                             model_name='all-mpnet-base-v2'):
    """
    Encode a resume with both a sentence-transformer model and a pickled TF-IDF vectorizer.

    Parameters:
    - resume_summary: Text passed to the sentence-transformer model's `encode`.
    - resume_info: Mapping with an 'employment_experiences' entry; that entry is
      vectorized as a single TF-IDF document.
    - tfidf_path: Path of the pickled, already-fitted TF-IDF vectorizer
      (default: 'tfidf_model.pkl').
    - model_name: SentenceTransformer model identifier (default: 'all-mpnet-base-v2').

    Returns:
    - (resume_embedding_bert, resume_tfidf): the embedding tensor of the summary
      and the TF-IDF transform of the employment experiences.

    Raises:
    - FileNotFoundError: if `tfidf_path` does not exist.
    - KeyError: if `resume_info` has no 'employment_experiences' entry.
    """
    # SECURITY: pickle.load can execute arbitrary code while deserializing.
    # Only point `tfidf_path` at a file produced by this project.
    with open(tfidf_path, 'rb') as f:
        tfidf_vectorizer = pickle.load(f)

    # NOTE(review): the model is instantiated on every call; hoist it out if this
    # function ends up being called for many resumes.
    model = SentenceTransformer(model_name)

    # NOTE(review): the value is wrapped in a one-element list below, i.e. it is
    # treated as ONE document — presumably a plain string; confirm against the
    # output of resume_parser.
    employment_exp = resume_info['employment_experiences']

    resume_embedding_bert = model.encode(resume_summary, convert_to_tensor=True)
    resume_tfidf = tfidf_vectorizer.transform([employment_exp])

    return resume_embedding_bert, resume_tfidf


In [10]:
# Encode the parsed resume: dense embedding of the summary plus the TF-IDF
# vector of the employment experiences.
resume_embedding_bert, resume_tfidf = compute_resume_embedding(
    resume_summary=resume_summary,
    resume_info=resume_info,
)

In [17]:
def generate_recommendations(job_embeddings_bert, tfidf_job_descriptions, resume_embedding_bert, resume_tfidf, df, top_k=10, bert_weight=0.5):
    """
    Generate job recommendations based on a combination of BERT embeddings and TF-IDF scores.

    Each job receives the score
    ``bert_weight * cos_bert + (1 - bert_weight) * cos_tfidf`` and the `top_k`
    highest-scoring rows of `df` are returned, best match first.

    Parameters:
    - job_embeddings_bert: Matrix of BERT embeddings for job descriptions (shape: num_jobs x embedding_dim).
    - tfidf_job_descriptions: Sparse matrix of TF-IDF vectors for job descriptions.
    - resume_embedding_bert: BERT embedding for the resume. Only the first row of
      the similarity matrix is used, so a single resume is expected.
    - resume_tfidf: Sparse matrix of the TF-IDF vector for the resume (shape: 1 x num_features).
    - df: Pandas DataFrame containing the job descriptions and metadata. Rows must
      be positionally aligned with the rows of both job matrices.
    - top_k: Number of top recommendations to return (default: 10). 0 yields an
      empty result; values larger than the number of jobs return every job.
    - bert_weight: Weight in [0, 1] given to the BERT similarity; the TF-IDF
      similarity gets the remainder (default: 0.5, an equal blend).

    Returns:
    - A DataFrame containing the top-k recommended jobs with an extra
      `combined_score` column.

    Raises:
    - ValueError: if `top_k` is negative, `bert_weight` is outside [0, 1], or the
      number of scored jobs does not match the number of rows in `df`.
    """
    if top_k < 0:
        raise ValueError(f"top_k must be non-negative, got {top_k}")
    if not 0.0 <= bert_weight <= 1.0:
        raise ValueError(f"bert_weight must be within [0, 1], got {bert_weight}")

    # Row 0 of each similarity matrix holds the scores of the (single) resume
    # against every job.
    cosine_scores_bert = util.cos_sim(resume_embedding_bert, job_embeddings_bert)[0].cpu().numpy()
    cosine_scores_tfidf = cosine_similarity(resume_tfidf, tfidf_job_descriptions).flatten()

    # Scores are mapped back to `df` by position, so a length mismatch (e.g.
    # embeddings computed from a stale frame) would silently return wrong rows.
    if not (len(cosine_scores_bert) == len(cosine_scores_tfidf) == len(df)):
        raise ValueError(
            "Mismatched inputs: "
            f"{len(cosine_scores_bert)} BERT scores, "
            f"{len(cosine_scores_tfidf)} TF-IDF scores, {len(df)} rows in df"
        )

    combined_scores = bert_weight * cosine_scores_bert + (1.0 - bert_weight) * cosine_scores_tfidf

    # Reverse first, then take the head: equivalent to `[-top_k:][::-1]` for
    # top_k >= 1, but top_k == 0 now gives an empty selection instead of all
    # rows (`[-0:]` is the whole array).
    top_indices = combined_scores.argsort()[::-1][:top_k]

    return df.iloc[top_indices].assign(combined_score=combined_scores[top_indices])




In [22]:
# Rank every posting against the resume and keep the ten best matches.
top_recommendations = generate_recommendations(
    job_embeddings_bert,
    tfidf_job_descriptions,
    resume_embedding_bert,
    resume_tfidf,
    df=df,
    top_k=10,
)

In [23]:
# Display the ranked recommendations (best match first) with their combined_score.
top_recommendations

Unnamed: 0,id,site,job_url,title,company,date_posted,description,date_fetched,city,state,cleaned_desc,cleaned_desc_2_len,description_clean,combined_score
2435,in-86ac34ac66da4c44,indeed,https://www.indeed.com/viewjob?jk=86ac34ac66da...,AI/ML Data Scientist,"Nexagen Networks, Inc",2024-11-26,"Nexagen Networks, Inc. takes pride in its repu...",2024-12-01,Aberdeen,MD,We are seeking a highly skilled and motivated ...,332,seeking highly skilled motivated aiml data sci...,0.394502
3017,gd-1009533585633,glassdoor,https://www.glassdoor.com/job-listing/j?jl=100...,Data Analyst,"McAfee, LLC",2024-11-19,***Job Title:***\nData Analyst***Role Overview...,2024-12-01,Remote,Remote,Data Analyst Data Analyst Data Science \& ...,331,data analyst data analyst data science analyti...,0.392457
1234,gd-1009543102359,glassdoor,https://www.glassdoor.com/job-listing/j?jl=100...,Data Scientist,POWER-tek Global Inc.,2024-11-28,The Data Science \& Modeling role focuses on l...,2024-12-01,White Plains,NY,Use advanced analytical techniques to anal...,320,use advanced analytical technique analyze stru...,0.390821
2973,in-0e9b73bb78b634fc,indeed,https://www.indeed.com/viewjob?jk=0e9b73bb78b6...,Data Analyst – NLP Data Science (BHJOB22048_676),ITmPowered,2022-03-24,**Data Analyst – ML NLP Data Scientist – Build...,2024-12-01,Ontario,CA,Have you ever wanted to work on state\ of\ the...,319,ever wanted work state art natural language pr...,0.388108
477,gd-1009542064111,glassdoor,https://www.glassdoor.com/job-listing/j?jl=100...,AI/ML Data Scientist,Nexagen,2024-11-26,**Job ID**27744614\n**Work Remote**Yes\n**Loca...,2024-11-30,Remote,Remote,"Remote Remote Nexagen Networks, Inc. ta...",334,remote remote nexagen network inc take pride r...,0.383152
2961,in-ea019366df9aa23a,indeed,https://www.indeed.com/viewjob?jk=ea019366df9a...,Data Scientist/Architect Consultant,"CRP, Incorporated",2024-11-20,**Data Scientist/Architect Consultant**\n\nThe...,2024-12-01,Washington,DC,The Data Scientist/Architect \ Consultant wil...,345,data scientistarchitect consultant lead crp cr...,0.382487
2496,in-83c4f77ad9858dac,indeed,https://www.indeed.com/viewjob?jk=83c4f77ad985...,AI Software Engineer,"Copart, Inc",2024-11-25,Copart is looking for an AI/ML Engineer to be ...,2024-12-01,Dallas,TX,"Gather business requirements, translate them...",315,gather business requirement translate informat...,0.367448
2447,in-c6e3d3606bd4c9da,indeed,https://www.indeed.com/viewjob?jk=c6e3d3606bd4...,Data Scientist,ThinkEd,2024-11-26,"**Job Summary** \nOur client, a fast\-growing...",2024-12-01,Dallas,TX,"Our client, a fast\ growing AI company that de...",313,client fast growing ai company design large sc...,0.367304
3373,in-df9860ea94cbcc1b,indeed,https://www.indeed.com/viewjob?jk=df9860ea94cb...,Business Intelligence (BI) Data Analysis,POWER-tek Global Inc.,2024-11-27,The Business Intelligence (BI) Data Analysis t...,2024-12-01,White Plains,NY,Analyze large datasets from various source...,283,analyze large datasets various source uncover ...,0.367012
3494,in-57006664778bde86,indeed,https://www.indeed.com/viewjob?jk=57006664778b...,Data Analyst,SECU,2024-11-26,**SECU is seeking a Data Analyst. This is a re...,2024-12-01,Linthicum,MD,"to intake requests, projects, and requirem...",313,intake request project requirement requiring a...,0.36473


In [None]:
# tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=10000)
# tfidf_job_descriptions = tfidf_vectorizer.fit_transform(df['description_clean'])
# tfidf_resume = tfidf_vectorizer.transform([employment_exp])

# cosine_similarities = cosine_similarity(tfidf_resume, tfidf_job_descriptions)


In [None]:
# sort the cosine_similarities

In [None]:
# resume_summary = resume_summary('Yuke_Wu_Resume.pdf')
# resume_info = resume_parser('Yuke_Wu_Resume.pdf')


# resume_embedding = model.encode(resume_summary, convert_to_tensor=True)

# cosine_scores_bert = util.pytorch_cos_sim(resume_embedding, job_embeddings)[0]
# top_10_indices = torch.topk(cosine_scores_bert, k=10, largest=True, sorted=True).indices.tolist()

# df.iloc[top_10_indices]

In [None]:
# jina_resume_embedding = jina.encode(resume_summary, convert_to_tensor=True)

# cosine_scores_jina = util.pytorch_cos_sim(jina_resume_embedding, jina_job_embeddings)[0]
# jina_top_10_indices = torch.topk(cosine_scores_jina, k=10, largest=True, sorted=True).indices.tolist()

# df.iloc[jina_top_10_indices]