# InterviewMate - Analysing the JD dataset with the resume model

In [33]:
import pandas as pd
import seaborn as sns
import re
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.metrics.pairwise import cosine_similarity  
from gensim.models import Word2Vec  
from sklearn.model_selection import train_test_split
from sentence_transformers import SentenceTransformer   
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nguye\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [34]:
# Load the job-description (JD) dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('../data/JD/job_dataset.csv')
df.head()  # not rendered: only the last expression of a cell is displayed
df.shape

(1068, 7)

In [35]:
# Load the processed resume table; the path is relative to the notebook's working directory.
df_resume = pd.read_csv('../data/data/processed_data/pdf_data.csv')
df_resume.head()  # not rendered: only the last expression of a cell is displayed
df_resume.shape

(2484, 6)

In [36]:
# Count how many postings exist per job title.
df['Title'].value_counts()

Title
.NET Developer                  20
Business Analyst                20
Copywriter                      20
Data Engineer                   20
Digital Marketing Specialist    20
                                ..
Android Architect                1
iOS Mobile Developer             1
Junior iOS Developer             1
iOS App Developer Intern         1
Graduate iOS Developer           1
Name: count, Length: 218, dtype: int64

In [37]:
def build_jd_text(row):
    """Concatenate the descriptive fields of one job posting into a single string.

    Parameters
    ----------
    row : pandas.Series or dict-like
        One job posting. Only ``.get(key, default)`` is used, so any mapping works.

    Returns
    -------
    str
        The available fields joined by single spaces, in the order Title,
        ExperienceLevel, YearsOfExperience, Skills, Responsibilities, Keywords.
        Missing, NaN/None and blank fields are skipped.
    """
    fields = (
        "Title",
        "ExperienceLevel",
        "YearsOfExperience",
        "Skills",
        "Responsibilities",
        "Keywords",
    )

    parts = []
    for field in fields:
        value = row.get(field, "")
        # Empty CSV cells are read as NaN; str() would turn them into a literal
        # "nan" (or "None") token that pollutes the text being embedded.
        if value is None or (pd.api.types.is_scalar(value) and pd.isna(value)):
            continue
        text = str(value).strip()
        if text:
            parts.append(text)
    return " ".join(parts)

# Build one combined text field per posting (row-wise apply), stored as a new JD_text column.
df["JD_text"] = df.apply(build_jd_text, axis=1)
df.head()


Unnamed: 0,JobID,Title,ExperienceLevel,YearsOfExperience,Skills,Responsibilities,Keywords,JD_text
0,NET-F-001,.NET Developer,Fresher,0-1,C#; VB.NET basics; .NET Framework; .NET Core f...,Assist in coding and debugging applications; L...,.NET; C#; ASP.NET MVC; Entity Framework; SQL S...,.NET Developer Fresher 0-1 C#; VB.NET basics; ...
1,NET-F-002,.NET Developer,Fresher,0-1,C#; .NET Framework basics; ASP.NET; Razor; HTM...,Write simple C# programs under guidance; Suppo...,.NET; C#; ASP.NET MVC; Entity Framework; SQL S...,.NET Developer Fresher 0-1 C#; .NET Framework ...
2,NET-F-003,.NET Developer,Fresher,0-1,C#; VB.NET basics; .NET Core; ASP.NET MVC; HTM...,Contribute to development of small modules; As...,.NET; C#; ASP.NET MVC; SQL Server; Entity Fram...,.NET Developer Fresher 0-1 C#; VB.NET basics; ...
3,NET-F-004,.NET Developer,Fresher,0-1,C#; .NET Framework; ASP.NET basics; SQL Server...,Support in software design documentation; Assi...,.NET; C#; SQL Server; Entity Framework; ASP.NET,.NET Developer Fresher 0-1 C#; .NET Framework;...
4,NET-F-005,.NET Developer,Fresher,0-1,C#; ASP.NET; MVC; Entity Framework basics; SQL...,Learn to design and build ASP.NET applications...,.NET; C#; ASP.NET MVC; Entity Framework; SQL S...,.NET Developer Fresher 0-1 C#; ASP.NET; MVC; E...


## Sentence Transformer

In [56]:

# Load the sentence encoder by its model name.
model = SentenceTransformer("all-MiniLM-L6-v2")
# Embed every JD text in one batched call.
jd_embeddings = model.encode(df["JD_text"].tolist(), show_progress_bar=True)
# Resume embeddings are loaded from a precomputed .npy file instead of being encoded here.
# NOTE(review): presumably row i corresponds to df_resume.iloc[i] and was produced with the
# same encoder - confirm; a later cell reports that the two lengths do not match.
resume_embeddings =  np.load("model/sentence_tranformers.npy")
print('Success on import jd and resume embeddings.')


Batches: 100%|██████████| 34/34 [00:33<00:00,  1.02it/s]

Success on import jd and resume embeddings.





In [39]:
# Preview the first resume rows.
df_resume.head()

Unnamed: 0.1,Unnamed: 0,Filename,Filepath,Extracted_Text,Category,Cleaned_Text
0,0,10554236.pdf,../data/data/data/ACCOUNTANT\10554236.pdf,ACCOUNTANT\nSummary\nFinancial Accountant spec...,ACCOUNTANT,accountant summary financial accountant specia...
1,1,10674770.pdf,../data/data/data/ACCOUNTANT\10674770.pdf,STAFF ACCOUNTANT\nSummary\nHighly analytical a...,ACCOUNTANT,staff accountant summary highly analytical det...
2,2,11163645.pdf,../data/data/data/ACCOUNTANT\11163645.pdf,ACCOUNTANT\nProfessional Summary\nTo obtain a ...,ACCOUNTANT,accountant professional summary obtain positio...
3,3,11759079.pdf,../data/data/data/ACCOUNTANT\11759079.pdf,SENIOR ACCOUNTANT\nExperience\nCompany Name Ju...,ACCOUNTANT,senior accountant experience company name june...
4,4,12065211.pdf,../data/data/data/ACCOUNTANT\12065211.pdf,SENIOR ACCOUNTANT\nProfessional Summary\nSenio...,ACCOUNTANT,senior accountant professional summary senior ...


In [40]:
# (rows, columns) of the resume table.
df_resume.shape

(2484, 6)

## Match JD to Resume 
### Method 1: Pure semantic matching using Sentence-BERT

Purpose -> real ranking 

Sentence-BERT embeddings:

- understand meaning / capture context

- are pre-trained on massive NLI + semantic similarity datasets

- excel at resume ↔ JD matching (industry-standard approach)

In [59]:
def match_jd_to_resumes(jd_embeddings, top_k=10):
    """Rank resumes by cosine similarity to a single job-description embedding.

    Reads the notebook globals ``resume_embeddings`` and ``df_resume``.
    NOTE(review): assumes row ``i`` of ``resume_embeddings`` belongs to
    ``df_resume.iloc[i]`` - confirm against the code that wrote the .npy file.

    Parameters
    ----------
    jd_embeddings : array-like
        One embedding, either 1-D ``(dim,)`` or 2-D with shape ``(1, dim)``.
    top_k : int, default 10
        Maximum number of resumes to return. Values below 1 give an empty frame.

    Returns
    -------
    pandas.DataFrame
        Copy of the selected ``df_resume`` rows, best match first, with an
        added ``score`` column holding the cosine similarity.

    Raises
    ------
    ValueError
        If ``jd_embeddings`` is not a single embedding vector.
    """
    import warnings

    emb = np.asarray(jd_embeddings)
    if emb.ndim == 2 and emb.shape[0] == 1:
        emb = emb[0]
    if emb.ndim != 1:
        raise ValueError("jd_embeddings must be a 1-D embedding vector or a 2-D array with shape (1, dim)")

    scores = cosine_similarity(emb.reshape(1, -1), resume_embeddings).flatten()

    n_resumes = len(df_resume)
    if len(scores) != n_resumes:
        # Keep the best-effort behaviour (rank whatever can be looked up), but
        # make the mismatch visible instead of silently hiding it.
        warnings.warn(
            f"resume_embeddings has {len(scores)} rows but df_resume has {n_resumes}; "
            "only positions present in both are ranked and row alignment is not guaranteed",
            stacklevel=2,
        )

    # Best score first, restricted to positions that exist in df_resume.
    ranked = scores.argsort()[::-1]
    ranked = ranked[ranked < n_resumes]

    # `ranked` is an integer array, so it stays a valid indexer even when the
    # selection is empty (a plain np.array([]) is float and cannot index).
    top_idx = ranked[:max(int(top_k), 0)]

    results = df_resume.iloc[top_idx].copy()
    results["score"] = scores[top_idx]
    return results


# Later cells call this function as `match_jd_to_resumes_method_1`; bind both
# names so the notebook runs top to bottom on a fresh kernel.
match_jd_to_resumes_method_1 = match_jd_to_resumes

In [60]:
# Smoke test: rank resumes against the first job description.
test_jd = df["JD_text"].iloc[0]
jd_embedding = model.encode(test_jd, normalize_embeddings=True)
print(test_jd)
match_jd_to_resumes(jd_embedding, top_k=10)


.NET Developer Fresher 0-1 C#; VB.NET basics; .NET Framework; .NET Core fundamentals; ASP.NET; MVC; HTML; CSS; JavaScript basics; SQL Server; Entity Framework basics; LINQ; Visual Studio; Git; Unit Testing basics Assist in coding and debugging applications; Learn and apply .NET Framework and Core fundamentals; Support team in building ASP.NET MVC web applications; Write basic SQL queries and work with Entity Framework; Collaborate with peers to solve issues; Participate in code reviews for learning; Follow best practices in coding; Work with version control (Git) .NET; C#; ASP.NET MVC; Entity Framework; SQL Server; LINQ; Visual Studio; Unit Testing


Unnamed: 0.1,Unnamed: 0,Filename,Filepath,Extracted_Text,Category,Cleaned_Text,score
99,99,38847011.pdf,../data/data/data/ACCOUNTANT\38847011.pdf,STAFF ACCOUNTANT\nProfessional Profile\nTo gai...,ACCOUNTANT,staff accountant professional profile gain pay...,0.46765
7,7,12442909.pdf,../data/data/data/ACCOUNTANT\12442909.pdf,ACCOUNTANT\nSummary\nTo utilize my customer re...,ACCOUNTANT,accountant summary utilize customer relations ...,0.460227
65,65,25462793.pdf,../data/data/data/ACCOUNTANT\25462793.pdf,ACCOUNTANT\nSummary\nAccountant with over a de...,ACCOUNTANT,accountant summary accountant decade diverse p...,0.437689
86,86,29999135.pdf,../data/data/data/ACCOUNTANT\29999135.pdf,STAFF ACCOUNTANT\nTAM BUI\nProfessional Summar...,ACCOUNTANT,staff accountant tam bui professional summary ...,0.431244
43,43,21031285.pdf,../data/data/data/ACCOUNTANT\21031285.pdf,ACCOUNTANT\nHighlights\nMicrosoft Office : Int...,ACCOUNTANT,accountant highlights microsoft office interme...,0.404551
9,9,12802330.pdf,../data/data/data/ACCOUNTANT\12802330.pdf,"LEAD ACCOUNTANT\nHighlights\nQuickBooks, Peach...",ACCOUNTANT,lead accountant highlights quickbooks peachtre...,0.383684
58,58,24103168.pdf,../data/data/data/ACCOUNTANT\24103168.pdf,FINANCIAL ACCOUNTANT\nSummary\nAccomplished ac...,ACCOUNTANT,financial accountant summary accomplished acco...,0.380067
55,55,23513618.pdf,../data/data/data/ACCOUNTANT\23513618.pdf,ACCOUNTANT\nProfessional Summary\nResults-orie...,ACCOUNTANT,accountant professional summary results orient...,0.373993
105,105,49997097.pdf,../data/data/data/ACCOUNTANT\49997097.pdf,STAFF ACCOUNTANT\nSummary\nFlexible Accountant...,ACCOUNTANT,staff accountant summary flexible accountant a...,0.373734
113,113,78403342.pdf,../data/data/data/ACCOUNTANT\78403342.pdf,ACCOUNTANT\nSummary\nSelf-motivated accountant...,ACCOUNTANT,accountant summary self motivated accountant o...,0.372826


### Method 2: Hybrid (Semantic + Keywords)

Purpose -> explain why it fits

`final_score = (w_semantic [0.7] * semantic_score) + (w_keyword [0.3] * keyword_score)`

The 2nd (hybrid) performed worse because:
- keyword scoring introduced noise

- weights distorted ranking

- keywords are often generic and unreliable

- SBERT already captures those concepts better

#### Normalization (matches your Cleaned_Text style)

In [43]:
def normalize_for_match(text: str) -> str:
    """Lower-case ``text`` and reduce it to single-space-separated alphanumeric tokens.

    Every character other than ``a-z``, ``0-9`` and whitespace becomes a
    space; whitespace runs are then collapsed and the ends trimmed.
    Non-string input yields an empty string.
    """
    if isinstance(text, str):
        lowered = text.lower()
        alnum_only = re.sub(r"[^a-z0-9\s]", " ", lowered)
        return re.sub(r"\s+", " ", alnum_only).strip()
    return ""


#### Extract JD keywords

In [44]:
def extract_keywords(jd_keywords):
    """Split a ';'- or ','-delimited keyword string into trimmed, non-empty keywords.

    Non-string input yields an empty list. Order and duplicates are preserved.
    """
    if not isinstance(jd_keywords, str):
        return []

    keywords = []
    for piece in jd_keywords.replace(";", ",").split(","):
        cleaned = piece.strip()
        if cleaned:
            keywords.append(cleaned)
    return keywords


In [45]:
def keyword_score(resume_text, jd_keywords):
    """Return the fraction of JD keywords that are covered by the resume.

    A keyword counts as covered when it normalizes to at least one token and
    every one of its tokens appears somewhere in the normalized resume text
    (tokens need not be adjacent). The denominator is the number of extracted
    keywords, with a floor of 1 so an empty keyword list scores 0.
    """
    vocabulary = set(normalize_for_match(resume_text).split())
    keywords = extract_keywords(jd_keywords)

    def _is_covered(keyword):
        tokens = normalize_for_match(keyword).split()
        return bool(tokens) and all(token in vocabulary for token in tokens)

    hits = sum(1 for keyword in keywords if _is_covered(keyword))
    return hits / max(1, len(keywords))


In [46]:
def match_jd_to_resumes_method_2(jd_embedding, jd_keywords,
                                 top_k=10, w_semantic=0.7, w_keyword=0.3):
    """Rank resumes by a weighted blend of semantic and keyword scores.

    Reads the notebook globals ``resume_embeddings`` and ``df_resume`` and
    requires them to have the same number of rows.

    Parameters
    ----------
    jd_embedding : array-like
        One embedding, either 1-D ``(dim,)`` or 2-D with shape ``(1, dim)``.
    jd_keywords : str
        Delimited keyword string, passed through to ``keyword_score``.
    top_k : int, default 10
        Number of top-ranked resumes to return.
    w_semantic, w_keyword : float
        Weights applied to the cosine similarity and the keyword score.

    Returns
    -------
    pandas.DataFrame
        Copy of the selected ``df_resume`` rows with ``semantic_score``,
        ``keyword_score`` and ``final_score`` columns, best match first.

    Raises
    ------
    ValueError
        If ``jd_embedding`` is not a single vector, or if the embeddings and
        ``df_resume`` differ in length.
    """
    emb = np.asarray(jd_embedding)
    if emb.ndim == 2 and emb.shape[0] == 1:
        emb = emb[0]
    if emb.ndim != 1:
        raise ValueError("jd_embedding must be a 1-D embedding vector or a 2-D array with shape (1, dim)")

    scores = cosine_similarity(emb.reshape(1, -1), resume_embeddings).flatten()

    # Fail fast: check alignment before the per-resume keyword pass instead of after it.
    if len(df_resume) != len(scores):
        raise ValueError("Resume embeddings and df_resume not aligned")

    # A missing Cleaned_Text column is treated as empty text for every resume.
    if "Cleaned_Text" in df_resume.columns:
        texts = df_resume["Cleaned_Text"].tolist()
    else:
        texts = [""] * len(df_resume)

    kw_scores = np.array(
        [keyword_score(txt, jd_keywords) for txt in texts], dtype=float
    )

    final = (w_semantic * scores) + (w_keyword * kw_scores)

    sorted_idx = final.argsort()[::-1][:top_k]

    results = df_resume.iloc[sorted_idx].copy()
    results["semantic_score"] = scores[sorted_idx]
    results["keyword_score"] = kw_scores[sorted_idx]
    results["final_score"] = final[sorted_idx]

    return results.sort_values(by="final_score", ascending=False)



In [47]:
# jd = df.iloc[0]
# jd_text = jd["JD_text"]   
# jd_embedding = model.encode(jd_text, normalize_embeddings=True)

# results = match_jd_to_resumes_method_2(jd_embedding, jd["Keywords"], top_k=10)
# print(jd)
# results.head()

### Method 3: Semantic Ranking + Keyword EXPLANATION ONLY

In [48]:
def explain_fit_only(resume_text, jd_keywords):
    """Describe which JD keywords are present in a resume.

    A keyword is reported when it normalizes to at least one token and all of
    its tokens occur in the normalized resume text. Returns the matching
    keywords in their original spelling, joined by ", ", or the string
    "No matching keywords" when none match.
    """
    resume_vocab = set(normalize_for_match(resume_text).split())

    def _present(keyword):
        tokens = normalize_for_match(keyword).split()
        return bool(tokens) and all(tok in resume_vocab for tok in tokens)

    hits = [keyword for keyword in extract_keywords(jd_keywords) if _present(keyword)]
    if not hits:
        return "No matching keywords"
    return ", ".join(hits)

def match_jd_to_resumes_method_3(jd_embedding, jd_keywords, top_k=10):
    """Rank resumes semantically, then annotate each hit with its matching keywords.

    The ranking comes entirely from ``match_jd_to_resumes``; keywords only
    fill the explanatory "Why it fits?" column and never change the order.

    Parameters
    ----------
    jd_embedding : array-like
        Single JD embedding, forwarded to ``match_jd_to_resumes``.
    jd_keywords : str
        Delimited keyword string, forwarded to ``explain_fit_only``.
    top_k : int, default 10
        Maximum number of resumes to return.

    Returns
    -------
    pandas.DataFrame
        The semantic ranking with an added "Why it fits?" column.
    """
    # Call the ranking function under the name it is defined with in this notebook;
    # `match_jd_to_resumes_method_1` is not defined by any visible cell.
    results = match_jd_to_resumes(jd_embedding, top_k)

    results["Why it fits?"] = results["Cleaned_Text"].apply(
        lambda txt: explain_fit_only(txt, jd_keywords)
    )

    return results



In [49]:
# Use the first job posting as the worked example for the cells that follow.
jd = df.iloc[0]

jd_text = jd["JD_text"]
jd_keywords = jd["Keywords"]

# normalize_embeddings=True requests a unit-length embedding from the encoder.
jd_embedding = model.encode(jd_text, normalize_embeddings=True)




In [50]:
# Method 1: pure semantic ranking. Uses the function name defined in this notebook
# so the cell also runs on a fresh kernel.
match_jd_to_resumes(jd_embedding, top_k=10)

Unnamed: 0.1,Unnamed: 0,Filename,Filepath,Extracted_Text,Category,Cleaned_Text,score
99,99,38847011.pdf,../data/data/data/ACCOUNTANT\38847011.pdf,STAFF ACCOUNTANT\nProfessional Profile\nTo gai...,ACCOUNTANT,staff accountant professional profile gain pay...,0.46765
7,7,12442909.pdf,../data/data/data/ACCOUNTANT\12442909.pdf,ACCOUNTANT\nSummary\nTo utilize my customer re...,ACCOUNTANT,accountant summary utilize customer relations ...,0.460227
65,65,25462793.pdf,../data/data/data/ACCOUNTANT\25462793.pdf,ACCOUNTANT\nSummary\nAccountant with over a de...,ACCOUNTANT,accountant summary accountant decade diverse p...,0.437689
86,86,29999135.pdf,../data/data/data/ACCOUNTANT\29999135.pdf,STAFF ACCOUNTANT\nTAM BUI\nProfessional Summar...,ACCOUNTANT,staff accountant tam bui professional summary ...,0.431244
43,43,21031285.pdf,../data/data/data/ACCOUNTANT\21031285.pdf,ACCOUNTANT\nHighlights\nMicrosoft Office : Int...,ACCOUNTANT,accountant highlights microsoft office interme...,0.404551
9,9,12802330.pdf,../data/data/data/ACCOUNTANT\12802330.pdf,"LEAD ACCOUNTANT\nHighlights\nQuickBooks, Peach...",ACCOUNTANT,lead accountant highlights quickbooks peachtre...,0.383684
58,58,24103168.pdf,../data/data/data/ACCOUNTANT\24103168.pdf,FINANCIAL ACCOUNTANT\nSummary\nAccomplished ac...,ACCOUNTANT,financial accountant summary accomplished acco...,0.380067
55,55,23513618.pdf,../data/data/data/ACCOUNTANT\23513618.pdf,ACCOUNTANT\nProfessional Summary\nResults-orie...,ACCOUNTANT,accountant professional summary results orient...,0.373993
105,105,49997097.pdf,../data/data/data/ACCOUNTANT\49997097.pdf,STAFF ACCOUNTANT\nSummary\nFlexible Accountant...,ACCOUNTANT,staff accountant summary flexible accountant a...,0.373734
113,113,78403342.pdf,../data/data/data/ACCOUNTANT\78403342.pdf,ACCOUNTANT\nSummary\nSelf-motivated accountant...,ACCOUNTANT,accountant summary self motivated accountant o...,0.372826


In [51]:
# Method 2: hybrid ranking.
# NOTE(review): the saved output of this cell is a ValueError ("Resume embeddings and
# df_resume not aligned"), i.e. the loaded embeddings and df_resume differ in length.
match_jd_to_resumes_method_2(jd_embedding, jd_keywords, top_k=10)

ValueError: Resume embeddings and df_resume not aligned

In [52]:
# Method 3: semantic ranking plus a keyword-based explanation column.
results = match_jd_to_resumes_method_3(jd_embedding, jd_keywords, top_k=10)
results.head()


Unnamed: 0.1,Unnamed: 0,Filename,Filepath,Extracted_Text,Category,Cleaned_Text,score,Why it fits?
99,99,38847011.pdf,../data/data/data/ACCOUNTANT\38847011.pdf,STAFF ACCOUNTANT\nProfessional Profile\nTo gai...,ACCOUNTANT,staff accountant professional profile gain pay...,0.46765,No matching keywords
7,7,12442909.pdf,../data/data/data/ACCOUNTANT\12442909.pdf,ACCOUNTANT\nSummary\nTo utilize my customer re...,ACCOUNTANT,accountant summary utilize customer relations ...,0.460227,No matching keywords
65,65,25462793.pdf,../data/data/data/ACCOUNTANT\25462793.pdf,ACCOUNTANT\nSummary\nAccountant with over a de...,ACCOUNTANT,accountant summary accountant decade diverse p...,0.437689,No matching keywords
86,86,29999135.pdf,../data/data/data/ACCOUNTANT\29999135.pdf,STAFF ACCOUNTANT\nTAM BUI\nProfessional Summar...,ACCOUNTANT,staff accountant tam bui professional summary ...,0.431244,No matching keywords
43,43,21031285.pdf,../data/data/data/ACCOUNTANT\21031285.pdf,ACCOUNTANT\nHighlights\nMicrosoft Office : Int...,ACCOUNTANT,accountant highlights microsoft office interme...,0.404551,No matching keywords
