In [2]:
from glob import glob
import os
import re
from typing import List, Optional

from dotenv import load_dotenv
import instructor
from openai import OpenAI
from opik.integrations.openai import track_openai
from pydantic import BaseModel, Field
from pypdf import PdfReader, PdfWriter
from rich import print as rprint
import warnings

warnings.filterwarnings('ignore')

In [3]:
# Load the DeepSeek settings from the project-level .env file.
load_dotenv('../.env')

API_KEY = os.getenv('DEEPSEEK_API_KEY')
DEEPSEEK_URL = os.getenv('DEEPSEEK_URL')
DEEPSEEK_CHAT = os.getenv('DEEPSEEK_CHAT')
DEEPSEEK_REASONER = os.getenv('DEEPSEEK_REASONER')

print(DEEPSEEK_URL)
# Never echo the secret itself: cell output is stored in the notebook file and
# travels with it. Show only the short prefix so we can still confirm it loaded.
print(f"{API_KEY[:3]}{'*' * (len(API_KEY) - 3)}" if API_KEY else 'DEEPSEEK_API_KEY is not set')

https://api.deepseek.com
sk-********************************


### Split the combined resumes into individual resumes

In [12]:
# Open the single PDF that holds all resumes; later cells iterate over `reader.pages`.
compiled_pdf_path = '../data/resumes_compiled.pdf'
reader = PdfReader(compiled_pdf_path)

incorrect startxref pointer(3)


In [14]:
# A page containing this header is treated as the first page of a new resume;
# every following page without it is appended to the resume in progress.
pattern = "NATIONAL INSTITUTE OF TECHNOLOGY KARNATAKA, SURATHKAL P.O SRINIVASNAGAR, MANGALORE-575025"
output_dir = "../data/candidate_resume"
os.makedirs(output_dir, exist_ok=True)  # open() below fails if the folder is missing


def _save_resume(resume_pages, resume_filename):
    """Write the pages collected for one resume to `output_dir/resume_filename`."""
    writer = PdfWriter()
    for resume_page in resume_pages:
        writer.add_page(resume_page)

    with open(os.path.join(output_dir, resume_filename), 'wb') as out:
        writer.write(out)


pages = []
filename = None  # stays None until the first header page has been seen

for num, page in enumerate(reader.pages):
    text = page.extract_text() or ''
    clean_text = re.sub(r'\s\s+', ' ', text).strip()

    if pattern in clean_text:
        # save the previous resume before starting the next one
        if pages and filename:
            _save_resume(pages, filename)

        # start the new resume
        pages = [page]

        # extract registration number for filename
        reg_no_line = [line for line in clean_text.split("\n") if 'Reg. No. :' in line]
        if reg_no_line:
            reg_no = reg_no_line[0].split(":")[-1].strip()
        else:
            # fall back to the page index (f-string, so the number is actually substituted)
            reg_no = f'page_{num}'
        filename = f"{reg_no}.pdf"
    elif filename is not None:
        pages.append(page)
    # else: page precedes the first header and belongs to no resume, so it is skipped

# The loop only writes a resume when the *next* header shows up, so the final
# resume has to be flushed explicitly once all pages have been consumed.
if pages and filename:
    _save_resume(pages, filename)

In [23]:
# Rebuild the whitespace-normalised text of every page of the compiled PDF.
# NOTE(review): assumes `reader` is still bound to the compiled PDF — confirm when re-running cells.
clean_texts = "\n".join([re.sub(r'\s\s+', ' ', page.extract_text()) for page in reader.pages])

# `pattern` is a literal header, not a regex: escape it so each '.' matches only a
# dot, consistent with the plain substring test used when splitting.
print(len(re.findall(re.escape(pattern), clean_texts)))
print(len(glob("../data/candidate_resume/*.pdf")))

137
137


In [24]:
# First test: the number of times the NIT header occurs must equal the number of files extracted.
# re.escape() keeps the '.' characters in the header literal instead of regex wildcards.
header_count = len(re.findall(re.escape(pattern), clean_texts))
file_count = len(glob("../data/candidate_resume/*.pdf"))
assert header_count == file_count, f"{header_count} resume headers found but {file_count} files extracted"

In [7]:
# Opik's tracking wrapper is meant for the raw OpenAI client, so apply it first
# and only then let instructor wrap the tracked client for structured outputs.
openai_client = track_openai(OpenAI(api_key=API_KEY, base_url=DEEPSEEK_URL))
client = instructor.from_openai(openai_client)

In [8]:
class StudentMetadata(BaseModel):
    # Identity and contact details of the student. All fields are required strings.
    # The descriptions are part of the schema given to the model, so they must
    # describe the right field.
    name: str = Field(..., description='Student name')
    gender: str = Field(..., description='Student gender')
    reg_no: str = Field(..., description='Student registration number')
    dob: str = Field(..., description='Student date of birth')
    # was 'Student date of birth' (copy-paste slip from the dob field)
    email: str = Field(..., description='Student email address')
    phone: str = Field(..., description='Student phone number')
    mobile: str = Field(..., description='Student mobile number')
    branch: str = Field(..., description='Student branch')
    degree: str = Field(..., description='Student degree')
    # Address fields are currently disabled:
    #present_address: str = Field(..., description='Student present address')
    #permanent_address: str = Field(..., description='Student permanent address')


class AcademicDegreePerformance(BaseModel):
    # One semester's result row. No field has a default, so all are required.
    semester: int = Field(description='Semester number')
    duration: str = Field(description='Semester month and year')
    sgpa: float = Field(description='Semester sgpa')
    cgpa: float = Field(description='Semester cgpa')
    degree: str = Field(description='Degree')


class PreDegreePerformance(BaseModel):
    # One pre-degree qualification record. No field has a default, so all are required.
    discipline: str = Field(description='Pre degree class')
    institution: str = Field(description='Pre degree institution')
    board: str = Field(description='Pre degree university or board')
    year: int = Field(description='Pre degree year')
    marks: float = Field(description='Pre degree percentage')


class TechnicalSkills(BaseModel):
    # Skill lists grouped by category. Every list is optional and defaults to empty.
    programming_languages: List[str] = Field(
        description='List of programming languages known', default=[]
    )
    frameworks: List[str] = Field(
        description='List of frameworks known', default=[]
    )
    databases: List[str] = Field(
        description='List of databases known.', default=[]
    )
    other_technologies: List[str] = Field(
        description='List of other tools and technologies', default=[]
    )
    knowledge_area: List[str] = Field(
        description='list of skills like web design, cyber security, statistics, etc', default=[]
    )


class Experience(BaseModel):
    # One internship/training entry: where, which skills, and for how long.
    # A short `name` field and a Field(...) description on `skill` are currently disabled.
    company: str = Field(description='internship or training company')
    skill: TechnicalSkills
    duration: str = Field(description='duration of the training or intership')


class Projects(BaseModel):
    # One project or research publication with the skills it involved.
    # A free-text `description` field and a Field(...) description on `skill`
    # are currently disabled.
    name: str = Field(
        description='short name for the project or research publication, personal taken up by the student'
    )
    company: str = Field(
        description='internship or training company, if it is a personal project return personal'
    )
    duration: str = Field(description='duration of the training or internship')
    skill: TechnicalSkills
    

class ExtraCurriculars(BaseModel):
    # Non-academic activities, each as a list of short strings.
    # The first three lists are required; the remaining ones default to empty.
    leadership_roles: List[str] = Field(
        description='List of leadership roles held and keep it crisp'
    )
    technical_fest_participation: List[str] = Field(
        description='List of technical fests participated and keep it crisp'
    )
    sports_achievements: List[str] = Field(
        description='List of sports achievements'
    )
    social_impact: List[str] = Field(
        description="List of social impact activities (e.g., NSS, volunteering)", default=[]
    )
    olympiad_participation: List[str] = Field(
        description="List of Olympiads participated in", default=[]
    )
    scholarships: List[str] = Field(
        description="List of scholarships received", default=[]
    )
    public_speaking: List[str] = Field(
        description="List of public speaking achievements", default=[]
    )
    languages_known: List[str] = Field(
        description="List of languages proficient in", default=[]
    )


# Top-level schema passed as `response_model` to the extraction call.
# Only metadata, academic performance and projects are active; the commented-out
# fields below are switched off.
# (Documented with comments rather than a docstring on purpose: pydantic copies a
# model docstring into the generated schema, which would change the request.)
class ResumeInfo(BaseModel):
    metadata: StudentMetadata
    academic_performance: List[AcademicDegreePerformance]
    #predegree_performance: List[PreDegreePerformance]
    #experience: List[Experience]
    projects: List[Projects]
    # NOTE(review): no PersonalAbilities model is visible in this notebook — define it before enabling.
    #personal_abilities: PersonalAbilities
    #extra_curricular: ExtraCurriculars

In [18]:
# Run the structured extraction on one split resume.
# A dedicated name is used here so `reader` stays bound to the compiled PDF:
# the page-count validation cells read `reader.pages` and would silently check
# this single resume instead if they were re-run after this cell.
resume_reader = PdfReader('../data/candidate_resume/06CO02.pdf')
text = "\n".join([re.sub(r'\s\s+', ' ', page.extract_text()) for page in resume_reader.pages])

# create_with_completion returns the parsed ResumeInfo plus the raw completion,
# which later cells use for token-usage inspection.
# NOTE(review): the prompt asks for "NA" on missing data, but several schema
# fields are int/float — confirm how missing numeric values should be returned.
response, completion = client.chat.completions.create_with_completion(
    model="deepseek-chat",
    messages=[
        {"role": "system", "content": "You are an expert resume parsing system, which extracts the exact information mentioned in resumes. If any information is missing return NA."},
        {"role": "user", "content": f"Candidate resume:\n\n{text}"},
    ],
    temperature=0.0,  # deterministic-as-possible output for extraction
    response_model=ResumeInfo,
)

rprint(response)

KeyboardInterrupt: 

In [13]:
# Pretty-print the parsed resume as plain nested Python data.
parsed_resume = response.model_dump()
rprint(parsed_resume)

In [17]:
# Show how many prompt tokens the raw completion reports as cached.
prompt_details = completion.usage.prompt_tokens_details
rprint(prompt_details.cached_tokens)

In [17]:
def estimate_batch_cost(prompt_tokens, completion_tokens, n_resumes,
                        input_price_per_million=0.14, output_price_per_million=0.28):
    """Estimate the total cost of parsing `n_resumes` resumes.

    Args:
        prompt_tokens: input tokens consumed by one resume.
        completion_tokens: output tokens produced for one resume.
        n_resumes: number of resumes in the batch.
        input_price_per_million: price of one million input tokens.
        output_price_per_million: price of one million output tokens.

    Returns:
        The per-resume cost multiplied by `n_resumes`, in the currency of the prices.

    NOTE(review): the original cell held bare numbers. 3081/1105 look like the
    prompt/completion token counts of one run and 0.14/0.28 like per-million
    token prices — confirm against the provider's usage report and price list.
    """
    cost_per_resume = (
        prompt_tokens * (input_price_per_million / 1_000_000)
        + completion_tokens * (output_price_per_million / 1_000_000)
    )
    return cost_per_resume * n_resumes


# Same figures as the original back-of-the-envelope calculation.
estimate_batch_cost(prompt_tokens=3081, completion_tokens=1105, n_resumes=500)

0.3703700000000001

In [None]:
import numpy as np
sg