In [None]:
from langchain_community.document_loaders import Docx2txtLoader
from langchain_community.document_loaders import PyPDFLoader
from pydantic import BaseModel, Field
from dotenv import load_dotenv
from typing import List
from groq import Groq
import pandas as pd
import instructor
import json
import os

In [None]:
# Load variables from a local .env file into the process environment.
load_dotenv()
# The Groq key comes from the environment; the value is None when it is not set.
groq_api_key = os.environ.get("GROQ_API_KEY")

## Extract PDF text

In [None]:
# Other sample file available for this cell: "CV_avec_profil.pdf".
loader = PyPDFLoader("CV_sans_profil.pdf")
pages = loader.load_and_split()
# Concatenate the content of every returned chunk, separated by single spaces.
text = " ".join(page.page_content for page in pages)
print(text)

## Extract Word text

In [None]:
# This cell re-binds `loader`, `pages` and `text`: when it runs after the PDF
# cell, the Word document's text replaces the PDF text for the rest of the notebook.
loader = Docx2txtLoader("cv_word.docx")
pages = loader.load_and_split()
# Concatenate the content of every returned chunk, separated by single spaces.
text = " ".join(page.page_content for page in pages)
print(text)

## Pydantic classes describing the information to extract

In [None]:
# Shared fallback value used by the optional string fields of the models in this cell.
default = ""

# One education entry of the candidate (docstring: "courses the candidate has followed").
# NOTE(review): pydantic presumably exports the class docstring as the schema
# description the LLM receives, so it is kept in French on purpose — confirm before editing.
class Formation(BaseModel):
    """Formations que le candidat a suivi"""
    dates: str = Field(default=default)  # dates of the course, kept as free text
    intitule_formation: str = Field(default=default)  # title of the course / degree
    ecole: str = Field(default=default)  # school

# One work-experience entry (docstring: "professional experiences of the candidate").
# NOTE(review): pydantic presumably exports the class docstring as the schema
# description the LLM receives, so it is kept in French on purpose — confirm before editing.
class Experience(BaseModel):
    """Expériences professionnelles du candidat"""
    dates: str = Field(default=default)  # dates of the position, kept as free text
    nom_entreprise: str = Field(default=default)  # company name
    intitule_poste: str = Field(default=default)  # job title
    missions: List[str] = Field(default=[])  # list of missions; empty list when none are given

# One skill of the candidate (docstring: "skill of the candidate").
# NOTE(review): pydantic presumably exports the class docstring as the schema
# description the LLM receives, so it is kept in French on purpose — confirm before editing.
class Competence(BaseModel):
    """Compétence du candidat"""
    nom_competence: str = Field(default=default)  # skill name
    niveau: str = Field(default=default)  # level, kept as free text

# One language entry: the language and the associated level, both free-text
# strings that fall back to the module-level `default` value.
class Langue(BaseModel):
    langue: str = Field(default=default)  # language name
    niveau: str = Field(default=default)  # level, kept as free text

# One hobby entry: a type plus a name, both free-text strings that fall back
# to the module-level `default` value.
class Hobby(BaseModel):
    type_hobby: str = Field(default=default)  # type / category of the hobby
    nom_hobby: str = Field(default=default)  # name of the hobby

# Full structured view of a CV (docstring: "information to extract from the candidate's CV").
# The five scalar fields fall back to the module-level `default` value; the five
# list fields declare no default, so a value must be supplied for each of them.
# NOTE(review): pydantic presumably exports the class docstring as the schema
# description the LLM receives, so it is kept in French on purpose — confirm before editing.
class CvExtractor(BaseModel):
    """Informations à extraire du CV du candidat"""
    nom: str = Field(default=default)  # last name
    prenom: str = Field(default=default)  # first name
    email: str = Field(default=default)
    adresse: str = Field(default=default)  # address
    linkedin: str = Field(default=default)
    formations: List[Formation]  # education entries
    experiences: List[Experience]  # work-experience entries
    competences: List[Competence]  # skills
    langues: List[Langue]  # languages
    centres_interets: List[Hobby]  # interests / hobbies

# Top-level response model: the structured extraction comes first, followed by a
# free-text profile. The French `description` asks for the candidate profile to be
# written in French from the extracted information; it is a runtime string, left as is.
class Raisonnement(BaseModel):
    extraction_cv: CvExtractor
    reponse_finale: str = Field(description="profil du candidat écrit en français en fonction des informations extraites du CV")

## Prompt the Gen AI API to retrieve the extracted information

In [None]:
# Wrap the Groq SDK client with instructor, using instructor's JSON mode.
# NOTE(review): Groq() is built without arguments, so `groq_api_key` is not passed
# explicitly — presumably the SDK reads GROQ_API_KEY from the environment; confirm.
client = instructor.from_groq(Groq(), mode=instructor.Mode.JSON)

In [None]:
# System prompt (French): tells the model to first extract the CV content section by
# section following the `Raisonnement` JSON schema, then to derive the candidate's
# profile into "reponse_finale", without inventing information.
sys_prompt = """Tu es une IA experte dans l'analyse des CV de candidats. 

J'aimerais analyser le CV d'un candidat et savoir quelle est son profil.

Ta tâche sera d'abord d'extraire les informations du CV sous formes de sections en suivant la structure du JSON Schema Raisonnement puis d'en déduire le profil du candidat dans "reponse_finale".

Tu extrairas les informations du CV et n'inventeras pas d'informations, il est très important que tu suives la structure du Schéma Pydantic donné en paramètre !"""

# User message: embeds the raw CV `text`. The f-string is evaluated when this cell
# runs, so the cell must be re-run after `text` is reloaded from another document.
input_prompt = f"""
Contenu du CV brut : {text}

Réponse :
"""

In [None]:
def extract_cv(client, sys_prompt, input_prompt, model="llama-3.3-70b-versatile"):
    """Ask the LLM for a structured analysis of a CV.

    Args:
        client: Chat client exposing ``chat.completions.create`` and accepting a
            ``response_model`` keyword (the instructor-wrapped client in this notebook).
        sys_prompt: Content of the system message.
        input_prompt: Content of the user message (the raw CV text).
        model: Model identifier forwarded to the API. Defaults to the model the
            notebook has always used; pass e.g. ``"llama3-8b-8192"`` to try another one.

    Returns:
        Whatever ``create`` returns for ``response_model=Raisonnement``, or ``None``
        when the call raised any exception (the error is printed, not re-raised).
    """
    messages = [
        {"role": "system", "content": sys_prompt},
        {"role": "user", "content": input_prompt},
    ]
    try:
        return client.chat.completions.create(
            messages=messages,
            model=model,
            response_model=Raisonnement,
        )
    except Exception as e:
        # Deliberate best-effort: report the failure and let the caller test for None.
        print(f"Erreur dans l'analyse du CV : {e}")
        return None

In [None]:
# Run the extraction; `extract_cv` gives back None when the API call failed.
response = extract_cv(client, sys_prompt, input_prompt)
if not response:
    print("Error parsing the resume.")
else:
    # Round-trip through JSON so that `response` becomes plain dicts and lists.
    response = json.loads(response.model_dump_json())
    print(response)

In [None]:
# Build one display table per CV section from the parsed `response` dict.
#
# This cell never mutates `response`, so it can be re-run safely: deleting the
# section keys in place would make a second run fail with a KeyError.
if not response:
    # Explicit failure instead of an opaque "'NoneType' object is not subscriptable".
    raise RuntimeError(
        "No extraction result available: re-run the extraction cell before building the tables."
    )


def _section_frame(records, columns):
    """Return a DataFrame for one CV section.

    ``records`` is the list of dicts of that section; ``columns`` maps each model
    field name to its display header. Passing the field names to the constructor
    keeps the headers present even when ``records`` is empty.
    """
    return pd.DataFrame(records, columns=list(columns)).rename(columns=columns)


extraction_cv = response["extraction_cv"]

df_formations = _section_frame(
    extraction_cv["formations"],
    {"dates": "Dates", "intitule_formation": "Formation", "ecole": "Ecole"},
)

df_experiences = _section_frame(
    extraction_cv["experiences"],
    {"dates": "Dates", "nom_entreprise": "Entreprise", "intitule_poste": "Poste", "missions": "Missions"},
)

df_competences = _section_frame(
    extraction_cv["competences"],
    {"nom_competence": "Competence", "niveau": "Niveau"},
)

df_langues = _section_frame(
    extraction_cv["langues"],
    {"langue": "Langue", "niveau": "Niveau"},
)

df_centres_interets = _section_frame(
    extraction_cv["centres_interets"],
    {"type_hobby": "Catégorie", "nom_hobby": "Hobby"},
)

# Everything that is not one of the list sections goes into a single-row table.
list_sections = {"formations", "experiences", "competences", "centres_interets", "langues"}
informations = {key: value for key, value in extraction_cv.items() if key not in list_sections}

df_informations = pd.DataFrame(informations, index=[0])
df_informations = df_informations.rename(columns={"nom":"Nom","prenom":"Prenom","email":"Email","adresse":"Adresse","linkedin":"Profil linkedin"})

In [None]:
# Free-text candidate profile returned by the model (`reponse_finale` field).
response["reponse_finale"]

In [None]:
# Single-row table: last name, first name, email, address, LinkedIn profile.
df_informations

In [None]:
# Education entries: dates, course title, school.
df_formations

In [None]:
# Work experience: dates, company, job title, missions.
df_experiences

In [None]:
# Skills and their level.
df_competences

In [None]:
# Languages and their level.
df_langues

In [None]:
# Interests: category and hobby name.
df_centres_interets