In [10]:
import os
from groq import Groq
from pathlib import Path

## Model setup

In [7]:
from dotenv import load_dotenv
from langchain_groq import ChatGroq


# Load variables via python-dotenv, then read the Groq key from the process
# environment (the value is None when the variable is not set).
load_dotenv()
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")

In [8]:
# Name of the Groq-hosted model used for every extraction call.
MODEL_NAME = "llama3-8b-8192"

# Chat client shared by the rest of the notebook; temperature is fixed at 0.
llm = ChatGroq(api_key=GROQ_API_KEY, model=MODEL_NAME, temperature=0)

## Data loading & processing

In [23]:
# Folder that holds the resume PDFs. This is an absolute, machine-specific
# Windows path, so it has to be edited before the notebook runs elsewhere.
folder_path = r"C:\Users\Asus\Machine_learning\LLM\Projects\PDF_ResumeExtractor\Pdf_file"
# NOTE(review): a relative alternative was left commented out below; confirm
# the notebook's working directory before switching to it.
#folder_path =Path ("Pdf_file")
folder_path

'C:\\Users\\Asus\\Machine_learning\\LLM\\Projects\\PDF_ResumeExtractor\\Pdf_file'

In [38]:
import glob

# glob returns matches in arbitrary, OS-dependent order; sort them so that
# positional lookups into this list behave the same on every run.
pdf_files = sorted(glob.glob(os.path.join(folder_path, "*.pdf")))
pdf_files

['C:\\Users\\Asus\\Machine_learning\\LLM\\Projects\\PDF_ResumeExtractor\\Pdf_file\\Arjun_ML_engineer-1.pdf',
 'C:\\Users\\Asus\\Machine_learning\\LLM\\Projects\\PDF_ResumeExtractor\\Pdf_file\\Arjun_ML_engineer.pdf',
 'C:\\Users\\Asus\\Machine_learning\\LLM\\Projects\\PDF_ResumeExtractor\\Pdf_file\\Data science and ML.pdf',
 'C:\\Users\\Asus\\Machine_learning\\LLM\\Projects\\PDF_ResumeExtractor\\Pdf_file\\Data scientist & ML enginner 2.8 Exp NLP.pdf']

In [30]:
# Target paths for the resumes: hyphens and spaces in the *file name* become
# underscores. Only the basename is rewritten, so a parent directory that
# contains "-" or " " is left intact and the resulting path stays valid.
resume_file_lists = [
    os.path.join(
        os.path.dirname(file_name),
        os.path.basename(file_name).replace("-", "_").replace(" ", "_"),
    )
    for file_name in pdf_files
]
resume_file_lists

['C:\\Users\\Asus\\Machine_learning\\LLM\\Projects\\PDF_ResumeExtractor\\Pdf_file\\Arjun_ML_engineer_1.pdf',
 'C:\\Users\\Asus\\Machine_learning\\LLM\\Projects\\PDF_ResumeExtractor\\Pdf_file\\Arjun_ML_engineer.pdf',
 'C:\\Users\\Asus\\Machine_learning\\LLM\\Projects\\PDF_ResumeExtractor\\Pdf_file\\Data_science_and_ML.pdf',
 'C:\\Users\\Asus\\Machine_learning\\LLM\\Projects\\PDF_ResumeExtractor\\Pdf_file\\Data_scientist_&_ML_enginner_2.8_Exp_NLP.pdf']

In [40]:
# Rename each PDF on disk so that its file name has no hyphens or spaces.
for old_name in pdf_files:
    # Rewrite only the file name; the containing directory must stay as it is,
    # otherwise the destination would point into a folder that does not exist.
    directory, base_name = os.path.split(old_name)
    new_name = os.path.join(directory, base_name.replace("-", "_").replace(" ", "_"))
    # Skip names that are already clean, and sources that are gone because this
    # cell was already run once against the same `pdf_files` listing.
    if new_name == old_name or not os.path.exists(old_name):
        continue
    os.rename(old_name, new_name)

## Processing

In [41]:
from langchain_core.prompts import PromptTemplate
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.output_parsers import JsonOutputParser

In [43]:
def load_pdf(file_path: str) -> str:
  """Read a PDF and return its text as a single string.

  Args:
    file_path: Path of the PDF file handed to ``PyPDFLoader``.

  Returns:
    The ``page_content`` of every loaded document, joined with blank lines.
  """
  # Use load() rather than load_and_split(): the latter re-chunks the pages with
  # a default text splitter whose chunks overlap, so joining them back together
  # would repeat text on long pages.
  pages = PyPDFLoader(file_path).load()
  return "\n\n".join(doc.page_content for doc in pages)

## Data validation

In [47]:
# @title Pydantic function
import re
from typing import Optional
from pydantic import BaseModel,ValidationError,Field,field_validator,EmailStr


In [48]:
class PersonalInformation(BaseModel):
    """Contact details extracted from a resume.

    Attributes:
        name: Candidate name.
        email: Email address, validated by pydantic's ``EmailStr``.
        phone_number: Optional phone number; see ``validate_phone_number``.
    """
    name        : str
    email       : EmailStr
    phone_number: Optional[str] = None

    @field_validator('phone_number')
    @classmethod
    def validate_phone_number(cls, number):
        """Check that a phone number is ten digits with an optional country code.

        Accepted forms are a bare 10-digit number, or a 1-3 digit country code
        (optionally prefixed with ``+``) followed by an optional hyphen or
        space and then 10 digits, e.g. ``9400508669``, ``+91-9400508669`` or
        ``+91 9400508669``. ``None`` and the empty string are passed through
        unchecked.

        Raises:
            ValueError: If a non-empty value does not match any accepted form.
        """
        if number:
            # fullmatch anchors both ends strictly; `$` with re.match would
            # also accept a value that ends in a newline.
            pattern = r'(?:\+?\d{1,3}[- ]?)?\d{10}'
            if not re.fullmatch(pattern, number):
                raise ValueError("Invalid phone number format.")

        return number
class ProjectTitle(BaseModel):
  """Project titles found in a resume (at most six entries)."""
  # max_length is the pydantic v2 keyword for the list-size limit; max_items is
  # the deprecated v1 spelling and triggers a deprecation warning.
  title: list[str] = Field(default=[],max_length=6,description="List of project titles")

class Details(BaseModel):
  """Top-level structure used to describe the expected extraction output."""
  project_titles : ProjectTitle
  personal_info  : PersonalInformation
  # The extraction prompt also asks for a short experience summary; declaring it
  # here keeps the schema given to the parser in line with that request.
  # Optional so that existing callers building Details without it still work.
  experience     : Optional[str] = Field(default=None,description="Summary of the candidate's experience in about 25 words")

## Parsing

In [49]:
# @title Parsing
# JSON parser whose format instructions are derived from the Details schema.
parser = JsonOutputParser(pydantic_object=Details)

# Instruction text sent to the model. {format_instructions} is filled in once
# below; {context} is supplied on every invocation.
# NOTE(review): the hand-written key list in this text is flat, while Details
# nests name/email/phone under personal_info — confirm which shape is intended.
_EXTRACTION_TEMPLATE = """
    You are a data extraction and summarization expert.Given the following text, perform the following tasks:
    1. Summarize the text and extract all the project titles (include if any freelancing work is present). Ensure that project titles are returned in a list format.
    2. Extract the following details: name, email, and phone number.
    Return the result in JSON format with the following structure:
    
      1. "project_titles": "<extracted project title 1>", "<extracted project title 2>", ...,
      2. "name"  : "<name of candidate>",
      3. "email" : "<email address>",
      4. "phone_number": "<extracted phone number>"
      5. "experience":"summarize" in 25 words.
    
    {format_instructions}
    Context:{context}
    """

prompt = PromptTemplate(
    template=_EXTRACTION_TEMPLATE,
    input_variables=["context"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
)

## Chain

In [50]:
# Compose the pipeline: prompt, then the chat model, then the JSON parser.
chain = prompt | llm | parser

## Sample data

In [53]:
# Text of the resume at index 1 of the normalized file list.
sample = load_pdf(file_path=resume_file_lists[1])

In [54]:
# Run the chain on the sample text; the resume text fills the prompt's {context} slot.
result = chain.invoke(dict(context=sample))

In [55]:
# Show the parsed output returned by the chain for the sample resume.
result

{'project_titles': ['Customer Service Chatbot',
  'Predicting Broadband Adoption',
  'Diabetic Retinopathy Detection System',
  'Advanced Image Segmentation Model for Brain Tumor Detection',
  'Transformative Data Automation'],
 'personal_info': {'name': 'ARJUN P V',
  'email': 'arjunappu1001@gmail.com',
  'phone_number': '+91 9400508669'},
 'experience': 'Data Scientist with over 3 years of industrial experience and 2.5 years of specialized expertise in Data Science and Machine Learning, proficient in Python programming and adept at extracting valuable insights from data.'}