In [1]:
import os
import re
import string
import pandas as pd
import numpy as np
import pdfplumber
import torch
import contractions
from transformers import DistilBertTokenizer, DistilBertModel, T5Tokenizer, T5ForConditionalGeneration
from sklearn.metrics.pairwise import cosine_similarity
from datasets import load_dataset
import google.generativeai as genai
import ast

  from .autonotebook import tqdm as notebook_tqdm


In [17]:
pip install sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.2.0-cp312-cp312-win_amd64.whl.metadata (8.3 kB)
Downloading sentencepiece-0.2.0-cp312-cp312-win_amd64.whl (991 kB)
   ---------------------------------------- 0.0/992.0 kB ? eta -:--:--
   ---------------------------------------- 0.0/992.0 kB ? eta -:--:--
   ---------- ----------------------------- 262.1/992.0 kB ? eta -:--:--
   ---------- ----------------------------- 262.1/992.0 kB ? eta -:--:--
   -------------------- ----------------- 524.3/992.0 kB 558.9 kB/s eta 0:00:01
   -------------------- ----------------- 524.3/992.0 kB 558.9 kB/s eta 0:00:01
   ------------------------------ ------- 786.4/992.0 kB 578.7 kB/s eta 0:00:01
   ------------------------------ ------- 786.4/992.0 kB 578.7 kB/s eta 0:00:01
   ------------------------------ ------- 786.4/992.0 kB 578.7 kB/s eta 0:00:01
   ------------------------------ ------- 786.4/992.0 kB 578.7 kB/s eta 0:00:01
   ------------------------------ ------- 786.4/992.0 kB 578.7

In [2]:
genai.configure(api_key="AIzaSyCShW_UD_Gpiv6N7Hhn7sdif4GQxKRYIzg")

## Extract from PDF

In [36]:
def extract_information(pdf_path):
    """Extract all text from a PDF file using pdfplumber."""
    try:
        with pdfplumber.open(pdf_path) as pdf:
            resume_text = ""
            for page in pdf.pages:
                page_text = page.extract_text()
                if page_text:
                    resume_text += " " + page_text
        return resume_text.strip()
    except Exception as e:
        print(f"Error processing {pdf_path}: {e}")
        return ""

In [37]:
text = extract_information('Shivansh_Tyagi_Resume.pdf')

In [38]:
print(text)

Shivansh Tyagi
# shivelite2003@gmail.com (cid:132) +91-7456833022 (cid:240) Shivansh Tyagi § ShivanshTyagi9
Summary
SkilledinPython,dataanalysis,andmachinelearning,withexperienceinbuildingpredictivemodels,automation
tools, and interactive applications.
Education
Raj Kumar Goel Institute of Technology Ghaziabad, UP
B.Tech in Computer Science and Engineering Expeted Graduation, 2026
◦ Concentrations: Computational Fundamentals
◦ CGPA: 8
◦ Coursework: Data Structure and Algorithms, Operating System, Database Management System, Com-
puter Architecture, Machine Learning, OOPS
Projects
Handwritten-Digit-Recognition github repo 2
◦ Used TensorFlow and Keras to develop a Convolutional Neural Network (CNN) for handwritten digit
recognition with the MNIST dataset, achieving 99.12 accuracy, enhancing classification performance and
model efficiency.
◦ Used MNIST dataset to train the model.
◦ Tools used: Python, NumPy, Pandas, Tensorflow / Keras, Matplotlib
Fruits and Vegetable Classifier github re

In [39]:
def gemini_extract_skills(project_text):
    model = genai.GenerativeModel('gemini-2.0-flash')
    try:
        response = model.generate_content(f"Extract skills from the following project description and return only a python list of skiils:\n{project_text}")
        skills = (response.text)
        skills = skills.replace("python", "").strip()
        skills = skills.replace("```", "").strip()
        extracted_list = ast.literal_eval(skills)
        return extracted_list
    except Exception as e:
        print("Error calling Gemini API:", e)
    return []

In [40]:
def extract_details(resume_text):
    """
    Extract Skills, Education, and Projects sections using regex,
    then augment the Skills by calling the Gemini API on the Projects section.
    """
    # Define regex patterns for each section
    skills_pattern = r'(?:(?:Technical|Professional|Tech)\s+)?Skills\s*\n([\s\S]*?)(?=\n[A-Z][a-z]+\b|\Z)'
    education_pattern = r'Education\s*\n([\s\S]*?)(?=\n[A-Z][a-z]*\n|$)'
    projects_pattern = r'Projects\s*\n([\s\S]*?)(?=\n\s*\n|$)'
    
    # Extract sections using regex
    skills_match = re.findall(skills_pattern, resume_text, re.DOTALL)
    education_match = re.findall(education_pattern, resume_text, re.DOTALL)
    projects_match = re.findall(projects_pattern, resume_text, re.DOTALL)
    
    # Get the first match for each section if available
    skills = skills_match[0] if skills_match else ""
    education = education_match[0] if education_match else ""
    projects = projects_match[0] if projects_match else ""
    
    # If the Projects section is found, call the Gemini API to extract additional skills
    additional_skills = ""
    if projects:
        # Assume gemini_extract_skills(projects) returns a list of skills
        extracted_skills = gemini_extract_skills(projects)
        if extracted_skills:
            additional_skills = " ".join(extracted_skills)
    
    # Combine the original skills with the additional skills from projects
    combined_skills = skills + " " + additional_skills if additional_skills else skills
    
    return {
        'Skills': combined_skills.strip(),
        'Education': education.strip()
    }


In [41]:
extracted = extract_details(text)

In [42]:
print(extracted)

{'Skills': 'Languages: Python, Java, C TensorFlow Keras Convolutional Neural Networks (CNN) MNIST dataset Classification Model Efficiency ResNet EfficientNet Hyperparameter Tuning Data Augmentation Kaggle Fruits and Vegetables Image Recognition dataset Python Matplotlib Spotify API YouTube Data API yt-dlp NLP Mood Detection Pandas Numpy Flask', 'Education': 'Raj Kumar Goel Institute of Technology Ghaziabad, UP\nB.Tech in Computer Science and Engineering Expeted Graduation, 2026\n◦ Concentrations: Computational Fundamentals\n◦ CGPA: 8\n◦ Coursework: Data Structure and Algorithms, Operating System, Database Management System, Com-\nputer Architecture, Machine Learning, OOPS'}


In [43]:
def text_cleaning(text: str) -> str:
    """Clean and normalize text for embedding."""
    if pd.isnull(text):
        return ""
    # Lowercase and trim
    text = text.lower().strip()
    # Expand contractions
    text = contractions.fix(text)
    # Remove URLs, emails, phone numbers
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    text = re.sub(r'\S+@\S+', '', text)
    text = re.sub(r'\b\d{1,3}[-./]?\d{1,3}[-./]?\d{1,4}\b', '', text)
    # Remove punctuation
    translator = str.maketrans('', '', string.punctuation)
    text = text.translate(translator)
    # Remove non-alphabetic characters
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    return text.strip()

In [44]:
%%time
def build_candidate_dataset(data_folder='data'):
    """
    Walk through the data folder (with subfolders by category),
    extract resume text from each PDF, extract details (Skills, Education),
    and store metadata.
    """
    candidate_data = []
    for category in os.listdir(data_folder):
        category_path = os.path.join(data_folder, category)
        if os.path.isdir(category_path):
            for file in os.listdir(category_path):
                if file.lower().endswith('.pdf'):
                    pdf_path = os.path.join(category_path, file)
                    full_text = extract_information(pdf_path)
                    details = extract_details(full_text)
                    # Use filename (without extension) as unique ID
                    candidate_id = os.path.splitext(file)[0]
                    candidate_data.append({
                        'ID': candidate_id,
                        'Category': category,
                        'Skills': details.get('Skills', ''),
                        'Education': details.get('Education', '')
                    })
    # Create DataFrame
    df = pd.DataFrame(candidate_data)
    # Optionally, remove resumes where both Skills and Education are missing
    df = df[~((df['Skills'] == "") & (df['Education'] == ""))].reset_index(drop=True)
    print(df)
    print("\n----------------\n")
    # Concatenate Skills and Education to form the complete CV text
    df['CV'] = (df['Skills'] + " " + df['Education']).fillna("")
    print(df)
    print("\n----------------\n")
    # Clean the concatenated CV text
    df['CV'] = df['CV'].apply(text_cleaning)
    print(df)
    print("\n----------------\n")
    return df

CPU times: total: 0 ns
Wall time: 1.03 ms


In [45]:
%%time
cv_df = build_candidate_dataset(data_folder='Data_real')
print("Candidate dataset shape:", cv_df.shape)

                      ID Category  \
0  Shivansh_Tyagi_Resume        1   
1   Rajat_Gupta_Resume 1        2   
2       Alok_kumar_yadav        3   
3  SanchitChauhan_Rename        4   

                                              Skills  \
0  Languages: Python, Java, C TensorFlow Keras Co...   
1  Languages: C++, Java,Python, HTML/CSS, JavaScr...   
2  • Programming Languages: C, C++, Python, Solid...   
3  Languages: C++,Java,Object-OrientedProgramming...   

                                           Education  
0  Raj Kumar Goel Institute of Technology Ghaziab...  
1  Raj kumar goel institute of technology (AKTU) ...  
2  Degree Institute Board / University CGPA/Perce...  
3  RajKumarGoelInstituteofTechnology(AKTU) Nov202...  

----------------

                      ID Category  \
0  Shivansh_Tyagi_Resume        1   
1   Rajat_Gupta_Resume 1        2   
2       Alok_kumar_yadav        3   
3  SanchitChauhan_Rename        4   

                                              Skills 

In [46]:
cv_df.to_csv('./pdf_extracted_skills_education.csv', index=False)

In [47]:
extraction_model_dir = "./fine_tuned_t5_small"
tokenizer_extraction = T5Tokenizer.from_pretrained(extraction_model_dir)
model_extraction = T5ForConditionalGeneration.from_pretrained(extraction_model_dir)
model_extraction.eval()

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [48]:
def generate_skills(job_description, tokenizer, model):
    """
    Extract skills from a job description using the fine-tuned T5 model.
    """
    input_text = "extract skills: " + job_description
    input_ids = tokenizer.encode(input_text, return_tensors="pt", truncation=True)
    input_ids = input_ids.to(model.device)
    outputs = model.generate(input_ids, max_length=128, num_beams=4, early_stopping=True)
    generated_skills = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_skills

In [49]:
jd_data = [
    "We are looking for a dynamic Full-Stack Developer proficient in modern web technologies including ExpressJS, ReactJS, NextJs, NodeJS, HTML, CSS, and JavaScript to build and maintain high-performance web applications. The ideal candidate will have a strong foundation in data management using MongoDB, PostgreSQL, and SQL, coupled with hands-on experience deploying and managing applications on cloud platforms such as AWS and Azure. This role demands a proactive problem-solver with excellent collaboration skills, ready to contribute to innovative projects in a fast-paced environment.",
    "PhD in machine learning, computer science, statistics, biomedical informatics, or a related field relevant 9+ years of industry/academic experience in machine learning or a related fiel Hands-on experience in implementing and training machine learning algorithms and statistical analysis, including for example non-parametric tests, mixed linear models, modern supervised and unsupervised machine learning algorithms such as SVM, random forest, PCA, t-SNE, clustering, LMMs, and deep learning. Experience with Python, Spark (Databricks), DevOps Tools (Github Actions, Docker, etc.) Effective communication skills, including in an interdisciplinary environment Solid written communication skills of scientific material. Proven deep understanding of mathematical foundations of machine learning, including statistics, linear algebra, and computer science"
    # Add more job descriptions as needed...
]
jd_df = pd.DataFrame({
    'position_title': [
        "React Developer",
        "AI/Ml Developer"
    ],
    'job_description': jd_data
})
# Clean the job description text
jd_df['clean_jd'] = jd_df['job_description'].apply(text_cleaning)

extracted_skills = jd_df['clean_jd'].apply(lambda x: generate_skills(x, tokenizer_extraction, model_extraction))
jd_df['final_jd'] = jd_df['clean_jd'] + " " + extracted_skills
# For the prototype, we use a subset (e.g. first 15 JDs)
#num_jds = 15
job_descriptions = jd_df['clean_jd'].tolist()

In [50]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')
model.eval()  # Set model to evaluation mode

DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0-5): 6 x TransformerBlock(
        (attention): DistilBertSdpaAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_features=3072, bias=True)
          (lin2): L

In [51]:
def get_embedding(text):
    """Generate an embedding for a given text using mean pooling."""
    tokens = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        outputs = model(**tokens)
    # Mean pooling over the sequence length dimension
    embedding = outputs.last_hidden_state.mean(dim=1).numpy()[0]
    return embedding

In [52]:
%%time
print("Generating embeddings for job descriptions...")
jd_embeddings = [get_embedding(jd) for jd in job_descriptions]
print("Generating embeddings for candidate CVs...")
resume_embeddings = [get_embedding(cv) for cv in cv_df['CV'].tolist()]

Generating embeddings for job descriptions...
Generating embeddings for candidate CVs...
CPU times: total: 3.73 s
Wall time: 850 ms


In [53]:
print("JD embedding shape:", jd_embeddings[0].shape)
print("Resume embedding shape:", resume_embeddings[0].shape)

JD embedding shape: (768,)
Resume embedding shape: (768,)


In [54]:
similarity_matrix = cosine_similarity(jd_embeddings, resume_embeddings)

# For each JD, pick the top N candidate indices based on similarity
num_top_candidates = 5
top_candidates = {}

for jd_index, scores in enumerate(similarity_matrix):
    candidate_indices = np.argsort(scores)[::-1][:num_top_candidates]
    top_candidates[jd_index] = [(idx, scores[idx]) for idx in candidate_indices]

In [55]:
for jd_index, candidate_list in top_candidates.items():
    position_title = jd_df.loc[jd_index, 'position_title'] if jd_index < len(jd_df) else "N/A"
    print(f"\nTop candidates for JD {jd_index+1} - Position: {position_title}")
    for candidate_index, score in candidate_list:
        candidate_info = cv_df.iloc[candidate_index]
        print(f"  Candidate {candidate_index + 1} - Similarity Score: {score:.4f} - {candidate_info['Category']}/{candidate_info['ID']}.pdf")


Top candidates for JD 1 - Position: React Developer
  Candidate 2 - Similarity Score: 0.8738 - 2/Rajat_Gupta_Resume 1.pdf
  Candidate 1 - Similarity Score: 0.8423 - 1/Shivansh_Tyagi_Resume.pdf
  Candidate 3 - Similarity Score: 0.8276 - 3/Alok_kumar_yadav.pdf
  Candidate 4 - Similarity Score: 0.8238 - 4/SanchitChauhan_Rename.pdf

Top candidates for JD 2 - Position: AI/Ml Developer
  Candidate 1 - Similarity Score: 0.9155 - 1/Shivansh_Tyagi_Resume.pdf
  Candidate 2 - Similarity Score: 0.8888 - 2/Rajat_Gupta_Resume 1.pdf
  Candidate 3 - Similarity Score: 0.8444 - 3/Alok_kumar_yadav.pdf
  Candidate 4 - Similarity Score: 0.8411 - 4/SanchitChauhan_Rename.pdf
