## Word Embeddings with gensim Word2Vec

In [2]:
import gensim
import pandas as pd
import numpy as np
import os
import nltk
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Fetch the NLTK data this notebook relies on: the "punkt" tokenizer models
# (for word_tokenize) and the "stopwords" corpus (for stopwords.words).
nltk.download("punkt")
nltk.download("stopwords")

[nltk_data] Downloading package punkt to /Users/jansen52x/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jansen52x/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [4]:
def _english_stop_words():
    """Return the NLTK English stop-word set, loading it only on first use."""
    cached = getattr(_english_stop_words, "_cached", None)
    if cached is None:
        cached = frozenset(stopwords.words('english'))
        _english_stop_words._cached = cached
    return cached


# Translation table that deletes every character in string.punctuation.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def preprocess(text):
    """Normalize raw text into a space-separated string of content tokens.

    Steps, in order: lowercase, delete punctuation characters, tokenize with
    NLTK's ``word_tokenize``, drop English stop words, re-join with spaces.

    Punctuation is deleted rather than replaced by a space, so hyphenated or
    slash-joined words fuse into one token (e.g. "self-motivated" becomes
    "selfmotivated").

    Args:
        text: The string to clean. Non-string values raise AttributeError
            because ``.lower()`` is called on the argument directly.

    Returns:
        The cleaned text as a single string; empty if no tokens survive.
    """
    text = text.lower().translate(_PUNCTUATION_TABLE)
    # The stop-word list is loaded once and reused: this function runs once
    # per document, and re-reading the corpus on every call is wasted work.
    stop_words = _english_stop_words()
    return " ".join(word for word in word_tokenize(text) if word not in stop_words)

In [5]:
def text_to_vector(text, model):
    """Average the embeddings of the in-vocabulary words of ``text``.

    Args:
        text: Whitespace-separated tokens (split with ``str.split``).
        model: Object exposing ``wv``, which must support ``word in wv``
            and ``wv[word]`` lookups.

    Returns:
        The element-wise mean of the vectors of all words found in
        ``model.wv``, or ``None`` when no word of ``text`` is known.
    """
    known_vectors = []
    for token in text.split():
        # Out-of-vocabulary tokens are skipped rather than raising.
        if token in model.wv:
            known_vectors.append(model.wv[token])

    if not known_vectors:
        return None
    return np.mean(known_vectors, axis=0)

In [None]:
def find_similar_texts(input, corpus, model, top_n=10):
    """Rank the texts of ``corpus`` by cosine similarity to ``input``.

    Both the query and every corpus text are cleaned with ``preprocess`` and
    turned into a mean word vector with ``text_to_vector`` before comparison.

    Args:
        input: Raw query text (e.g. a resume). The name shadows the builtin
            but is kept so existing keyword callers keep working.
        corpus: Iterable of raw texts to score against the query.
        model: Trained embedding model passed through to ``text_to_vector``.
        top_n: Number of best matches to return. Defaults to 10, the value
            that was previously hard-coded. Expected to be non-negative.

    Returns:
        A DataFrame with columns "Job Description" (the original, uncleaned
        text) and "Similarity Score", sorted from most to least similar and
        limited to ``top_n`` rows. If no word of the query is in the model
        vocabulary, the string "Could not generate vector for input text" is
        returned instead, so callers should check the return type.
    """
    input_vector = text_to_vector(preprocess(input), model)
    if input_vector is None:
        return "Could not generate vector for input text"

    similarities = []
    for text in corpus:
        text_vector = text_to_vector(preprocess(text), model)
        if text_vector is None:
            # No in-vocabulary words: keep the text in the ranking with a
            # score of 0 instead of dropping it.
            similarities.append((text, 0))
            continue
        similarity = cosine_similarity([input_vector], [text_vector])[0][0]
        # .item() converts the NumPy scalar to a plain Python float.
        similarities.append((text, similarity.item()))

    top_similar = sorted(similarities, key=lambda x: x[1], reverse=True)[:top_n]
    return pd.DataFrame(top_similar, columns=["Job Description", "Similarity Score"])

### Resume datasets

In [7]:
dataset = "resume_dataset"
cols = ["clean_text"]

# Read the "clean_text" column of every CSV file in the resume folder into a
# dict of DataFrames keyed by file name, then preview each one.
if os.path.exists(dataset):
    csv_files = [name for name in os.listdir(dataset) if name.endswith(".csv")]
    resume_dataframes = {
        name: pd.read_csv(os.path.join(dataset, name), usecols=cols)
        for name in csv_files
    }

    for file, df in resume_dataframes.items():
        # One call emits the header, the preview, and the blank-line spacer.
        print(f"File: {file}", df.head(), "\n", sep="\n")
else:
    print(f"Folder '{dataset}' does not exist")

File: Software_Developer.csv
                                          clean_text
0  software developer software developer software...
1  software developer project planning 3 years cu...
2  sr software engineer software developer smart ...
3  technology analyst onsite subject matter exper...
4  senior software engineer software engineer sof...


File: IT_Security_Analyst.csv
                                          clean_text
0  it security analyst active directory security ...
1  it security analyst vulnerability management i...
2  cyber security analyst it support analyst data...
3  cyber security analyst freelance it consultant...
4  quality assurance lead it security consultant ...


File: Software_Developer2.csv
                                          clean_text
0  senior software developer java linux senior so...
1  ui developer react developer ui developer reac...
2  engineering manager software developer ios dev...
3  application developer full stack java develope...
4  tab

### Job posting datasets

In [8]:
dataset = "job_posting_dataset/computing_desc_job_posting.csv"
cols = ["description"]

# Load the job-posting descriptions and add a cleaned copy of each one in a
# "preprocessed_desc" column.
if not os.path.exists(dataset):
    # The path is a single CSV file, so report a missing file, not a folder.
    print(f"File '{dataset}' does not exist")
else:
    job_posting_df = pd.read_csv(dataset, usecols=cols)
    job_posting_df["preprocessed_desc"] = job_posting_df["description"].apply(preprocess)

    print(job_posting_df.head())

                                         description  \
0  PGAV Destinations is seeking a self-motivated ...   
1  A leading pharmaceutical company committed to ...   
2  Education Bachelor's degree in software, math,...   
3  Job Description:GOYT is seeking a skilled and ...   
4  Are you driven by the thrill of solving proble...   

                                   preprocessed_desc  
0  pgav destinations seeking selfmotivated highly...  
1  leading pharmaceutical company committed devel...  
2  education bachelors degree software math scien...  
3  job descriptiongoyt seeking skilled motivated ...  
4  driven thrill solving problems offering unpara...  


In [9]:
# Training corpus: every resume "clean_text" entry followed by every
# preprocessed job description.
corpus = [
    doc
    for df in resume_dataframes.values()
    for doc in df["clean_text"].tolist()
]
corpus += job_posting_df["preprocessed_desc"].tolist()

# Coerce any non-string entry with str() so that every document can be split
# into tokens later (presumably these are missing values -- verify in the data).
corpus = [doc if isinstance(doc, str) else str(doc) for doc in corpus]

print(len(corpus))

44533


In [10]:
# Word2Vec expects an iterable of token lists, so split each document on
# whitespace first.
sentences = [document.split() for document in corpus]

# Train the embedding model on the combined resume + job-posting corpus.
model = gensim.models.Word2Vec(
    sentences,
    vector_size=100,
    window=5,
    min_count=5,
    workers=4,
)

In [11]:
# Persist the trained model to 'matching.model' in the working directory.
model.save('matching.model')

In [12]:
# Show the raw (unprocessed) job descriptions that resumes are matched against.
print(job_posting_df["description"])

0        PGAV Destinations is seeking a self-motivated ...
1        A leading pharmaceutical company committed to ...
2        Education Bachelor's degree in software, math,...
3        Job Description:GOYT is seeking a skilled and ...
4        Are you driven by the thrill of solving proble...
                               ...                        
12277    Are you a dynamic and experienced Hydraulic Sy...
12278    Position: Quality Engineer I/IILocation: Irvin...
12279    The Dyrt is the largest digital camping platfo...
12280    Position: Quality Engineer I (Complaint Invest...
12281    About Pinterest:\n\nMillions of people across ...
Name: description, Length: 12282, dtype: object


### Matching a resume against job postings

In [21]:
# Sample resume generated by ChatGPT; replace with the resume you want to match.
# The text is written as two adjacent string literals inside parentheses so
# that the long value is a valid Python expression (a quoted string cannot
# span a physical line break).
input_resume = (
    "Professional Summary Motivated and detail-oriented Java Developer with over 5 years of experience in designing, developing, and maintaining scalable Java-based applications. Proficient in Java, Spring Boot, Hibernate, and RESTful APIs, with a proven track record of delivering high-quality software solutions in Agile environments. Adept at collaborating with cross-functional teams to enhance application performance and user experience. Seeking to leverage my expertise to drive the success of innovative software projects. Technical SkillsProgramming Languages: Java (expert), Python (intermediate), SQL (advanced), JavaScript (proficient) Frameworks: Spring Boot (expert), Hibernate (advanced), J2EE (proficient) Tools: Eclipse, IntelliJ IDEA, Git, JIRA Databases: MySQL (advanced), Oracle (proficient), MongoDB (intermediate) APIs: RESTful APIs, SOAP APIs Other Skills: Object-Oriented Design (OOD), Microservices Architecture, Unit Testing Professional Experience Senior Java Developer TechVantage Solutions | San Francisco, CA June 2020 – Present Designed and developed scalable microservices architecture using Spring Boot and Hibernate. Optimized server performance by creating custom Java APIs, reducing processing time by 20%. Led a team of developers in maintaining and upgrading legacy Java applications. Conducted automated testing using JUnit and Mockito to ensure code quality. Collaborated with cross-functional teams to gather requirements and implement solutions aligned with business goals. Key Achievement: Reduced system downtime by 30% through proactive debugging and performance optimization. Java Developer Apex Systems | Los Angeles, CA May 2017 – May 2020 Developed enterprise-level applications using Java/J2EE technologies. Enhanced application functionality by implementing new features based on user feedback. Conducted regular code reviews to ensure adherence to best practices. Created interactive UIs using JSP, HTML5, and CSS3. "
    "Key Achievement: Improved application response time by 25% through efficient database query optimization. Education Bachelor of Science in Computer Science University of California, Berkeley | Berkeley, CA Graduation Date: May 2017 Certifications Oracle Certified Professional: Java SE Programmer (OCPJP) – 2021 Professional Scrum Master I (PSM I) – 2020 Projects E-Commerce Platform Development Built a scalable e-commerce platform using Spring Boot and Hibernate. Integrated payment gateways and optimized database queries for faster transaction processing. Mobile Application Development Developed a mobile application for fitness tracking using Java and SQLite. Implemented RESTful APIs for seamless communication between the app and the server."
)

# The raw descriptions are passed in (find_similar_texts cleans them itself)
# so that the result table shows the original posting text. The function
# already returns a ready-made DataFrame, so no further conversion is needed.
similar_job_postings = find_similar_texts(input_resume, job_posting_df["description"], model)

print(similar_job_postings)

<class 'float'>
                                     Job Description  Similarity Score
0  Job Title: Java Full Stack DeveloperLocation: ...          0.902706
1  Job Summary: The Java Developer will be respon...          0.898342
2  Job Summary: \nThe Java Developer will be resp...          0.894416
3  We are looking for a talented and experienced ...          0.892290
4  Our client is looking for a UI Developer for a...          0.887228
5  At NEP Group our purpose is to deliver innovat...          0.882354
6  Candidates who are willing to work on W2\nFull...          0.882315
7  Role: Java Full StacK developer with react JSL...          0.881021
8  If a vendor is submitting a candidate, you mus...          0.880167
9  We are seeking a Senior .NET Software Engineer...          0.875350
