In [3]:
from pypdf import PdfReader
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [4]:
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [5]:
from sentence_transformers import SentenceTransformer, util

  from tqdm.autonotebook import tqdm, trange


In [6]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics.pairwise import cosine_similarity

import warnings
warnings.filterwarnings('ignore')

In [7]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [8]:
reader = PdfReader('job_posting_data/Binrui_Yang_Resume.pdf')
page = reader.pages[0]
text = page.extract_text()



In [20]:
job_df_1 = pd.read_csv('job_posting_data/jobs.csv')
job_df_2 = pd.read_csv('job_posting_data/postings.csv')

job_df_1.shape, job_df_2.shape

((8261, 18), (123849, 31))

In [21]:
# random sample 2000 rows from job_df_1
job_df_1 = job_df_1.sample(n=2000, random_state=42)
job_df_2 = job_df_2.sample(n=2000, random_state=42)

In [22]:
job_df_1 = job_df_1[['company', 'title', 'description', 'post_url']]
job_df_2 = job_df_2[['company_name', 'title', 'description', 'job_posting_url']]

In [23]:
job_df_1.columns = ['company_name', 'title', 'description', 'job_posting_url']
job_df = pd.concat([job_df_1, job_df_2])

In [24]:
df = job_df[['title', 'description']].drop_duplicates().dropna()

df.shape

(3554, 2)

In [25]:
def preprocess_text(text):
    # lowecasing
    text = text.lower()
    # remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # tokenize
    words = word_tokenize(text)
    # remove stopwords
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    # lemmatization the words
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]

    # Join the words back into a full string
    preprocess_text = ' '.join(words)

    return preprocess_text

In [26]:
df['description'] = df['description'].apply(preprocess_text)

In [27]:
from imblearn.over_sampling import RandomOverSampler


train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
ros = RandomOverSampler(random_state=42)

# Separate features and target
X_train = train_df['description']
y_train = train_df['title']

# Transform the data into an array to use with RandomUnderSampler
X_train_resampled, y_train_resampled = ros.fit_resample(X_train.values.reshape(-1, 1), y_train)

# Convert back to DataFrame for compatibility with existing code
sample_train_df = pd.DataFrame({
    'description': X_train_resampled.flatten(),
    'title': y_train_resampled
})

In [35]:
sample_train_df = train_df.sample(frac=0.5, random_state=42)
sample_train_df.shape

(1422, 2)

In [36]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('clf', LogisticRegression(solver='liblinear'))
])

# Parameter grid
param_grid = {
    'tfidf__max_df': [0.75, 0.85, 1.0],
    'tfidf__min_df': [1, 2, 5],
    'tfidf__ngram_range': [(1, 1), (1, 2)],
    'clf__C': [0.1, 1, 10]
}

# Perform RandomizedSearchCV
grid_search = RandomizedSearchCV(pipeline, param_grid, cv=2, verbose=2, n_jobs=-1, n_iter=10)
grid_search.fit(sample_train_df['description'], sample_train_df['title'])

# Update vectorizer with best parameters
best_params = grid_search.best_params_
vectorizer = TfidfVectorizer(stop_words='english',
                             max_df=best_params['tfidf__max_df'],
                             min_df=best_params['tfidf__min_df'],
                             ngram_range=best_params['tfidf__ngram_range'])

# Fit the vectorizer on the full dataset
tfidf_matrix = vectorizer.fit_transform(sample_train_df['description'])

Fitting 2 folds for each of 10 candidates, totalling 20 fits


In [37]:
from sentence_transformers import SentenceTransformer, util

# Reset the index of the DataFrame
sample_train_df = sample_train_df.reset_index(drop=True)

# load pre-trained model
model = SentenceTransformer('all-mpnet-base-v2')

# encode job description
job_embeddings = model.encode(sample_train_df['description'].tolist(), convert_to_tensor=True)

In [42]:
def search_jobs(query, job_embeddings, df, model, vectorizer, top_n=10):
    # Preprocess and encode the query with BERT
    query = preprocess_text(query)
    query_embedding = model.encode(query, convert_to_tensor=True)

    # Compute cosine similarity with BERT embeddings
    cosine_scores_bert = util.pytorch_cos_sim(query_embedding, job_embeddings)[0]

    # Transform query with TF-IDF vectorizer and compute cosine similarity
    query_tfidf = vectorizer.transform([query])
    cosine_scores_tfidf = cosine_similarity(query_tfidf, tfidf_matrix).flatten()

    # Combine scores (e.g., weighted sum)
    combined_scores = 0.8 * cosine_scores_bert.cpu().numpy() + 0.2 * cosine_scores_tfidf

    # Get top N most similar jobs
    top_indices = np.argsort(combined_scores)[-top_n:][::-1]
    top_jobs = df.iloc[top_indices]

    return top_jobs

In [43]:
predicted_labels = grid_search.predict(test_df['description'])
true_labels = test_df['title']

# Define evaluation function
def evaluate_model(true_labels, predicted_labels):
    accuracy = accuracy_score(true_labels, predicted_labels)
    precision = precision_score(true_labels, predicted_labels, average='weighted')
    recall = recall_score(true_labels, predicted_labels, average='weighted')
    f1 = f1_score(true_labels, predicted_labels, average='weighted')
    print(f"Accuracy: {accuracy}\nPrecision: {precision}\nRecall: {recall}\nF1-Score: {f1}")

# Evaluate the model
evaluate_model(true_labels, predicted_labels)

Accuracy: 0.18143459915611815
Precision: 0.06969503346469212
Recall: 0.18143459915611815
F1-Score: 0.09140309696020224


In [40]:
text[:100]

' \n \nBinrui Yang by2361@columbia.edu | 408-966-5168 | Linkedin.com/in/binrui-y EDUCATION Columbia Uni'

In [45]:
top_jobs = search_jobs(text, job_embeddings, df, model, vectorizer)
top_jobs

Unnamed: 0,title,description
5716,Brand Project Manager,role gensler ’ brand design team making people...
2751,Site Reliability Engineer,team workday application built developer hande...
7227,Machine Learning Engineer,description machine learning ml strategic amaz...
7153,Software Development Snr Director,job description oracle database utility group ...
6715,Staff Software Engineer - Golang / AWS - Enter...,seniortostaff level software engineer proven e...
1840,Senior Azure/Python Engineer (Remote),100 remote full time direct hire job title sen...
4310,Site Reliability Engineer,site reliability engineer position summary sit...
4189,Machine Learning Engineer,knock mission empower people move freely knock...
3063,Data Scientist,company description company description togeth...
230,Machine Learning Engineer,machine learning engineer sorcero leader langu...
