In [13]:
import json
from typing import List, Optional,Dict
from langchain_chroma import Chroma
from pydantic import BaseModel,Field
from langchain_core.prompts import ChatPromptTemplate
from langchain_ollama import ChatOllama, OllamaEmbeddings
from langchain_core.output_parsers import PydanticOutputParser

In [14]:
llm = ChatOllama(model="qwen2.5:1.5b")
embedding_model = OllamaEmbeddings(model="nomic-embed-text:latest")

In [15]:
vector_store = Chroma(
    embedding_function=embedding_model,
    persist_directory="./vectorstore"
)
retriever = vector_store.as_retriever(search_type="similarity")

In [16]:
def get_field_properties(baseModel:BaseModel):
    schema = baseModel.model_json_schema()
    field_properties:Dict[str,Dict]=dict()
    
    for key,value in schema.get("properties",{}).items():
        value.pop("title",None)
        field_properties[key]=value
        
    return field_properties

def get_required_fields(baseModel:BaseModel):
    schema = baseModel.model_json_schema()
    return schema.get("required",[])


def get_output_schema(baseModel:BaseModel):
    schema = baseModel.model_json_schema()
    output_schema:Dict[str,Dict]=dict()
    
    for key,value in schema.get("properties",{}).items():
        output_schema[key]=value.get("type","string")
        
    return output_schema

In [17]:
PROMPT_TEMPLATE = """
You are an assistant for extracting structured information from LinkedIn profiles.

Your goal is to extract the following fields:

{field_properties}

These are required fields you MUST extract: {required_fields}

Output format (strictly return in this JSON structure):

{output_schema}

The context below is extracted from a LinkedIn profile PDF of our client.
We are preparing to offer services to this client, so accurate detail extraction is critical.

Instructions:

* Extract data precisely based on the context.
* If a field is not present, set its value to **null** (not the string "null").
* Return only a **valid JSON object**.
* Do NOT add any explanation or commentary.

Context:

{context}
"""

In [18]:
def get_prompt_template_to_extract(baseModel:BaseModel)->ChatPromptTemplate:
    prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
    prompt = prompt_template.partial(
        field_properties=get_field_properties(baseModel), required_fields=get_required_fields(baseModel), output_schema=get_output_schema(baseModel)
    )
    return prompt

In [19]:
class ClientExperience(BaseModel):
    company: str = Field(..., description="Company name as shown in the experience section.")
    role: str = Field(..., description="Job title held at the company.")
    start_date: str = Field(..., description="Start date of the role (e.g., 'May 2025').")
    end_date: str = Field(..., description="End date of the role or 'Present'.")
    duration: str = Field(..., description="Total duration in the format used on LinkedIn (e.g., '2 years 2 months').")
    location: str = Field(..., description="Location of the role as listed (e.g., 'Berlin, Germany').")
    description: str = Field(..., description="Bullet points or summary describing the responsibilities and achievements.")


class ClientExperienceProfile(BaseModel):
    experiences: List[ClientExperience] = Field(..., description="Complete list of professional experiences.")


class ClientEducation(BaseModel):
    university: str = Field(..., description="Name of the educational institution.")
    degree: str = Field(..., description="Type of degree (e.g., 'Bachelor's degree').")
    field_of_study: str = Field(..., description="Field of study or major (e.g., 'Industrial Engineering').")
    start_date: str = Field(..., description="Start date of the education period (e.g., 'September 2011').")
    end_date: str = Field(..., description="End date of the education period (e.g., 'July 2015').")


class ClientSkills(BaseModel):
    skills: List[str] = Field(..., description="List of top skills listed under 'Top Skills' in the profile.")


class ClientProject(BaseModel):
    name: str = Field(..., description="Project title, if listed explicitly.")
    client: Optional[str] = Field(None, description="Client name, if the project was done for a specific organization.")
    duration: str = Field(..., description="Duration of the project (if mentioned).")
    technologies: List[str] = Field(..., description="Technologies and tools used, inferred from project or experience.")
    description: str = Field(..., description="Summary of the project goals, activities, and outcomes.")


class ClientBasicInfo(BaseModel):
    name: str = Field(..., description="Full name as shown on the resume.")
    email: str = Field(..., description="Email address listed under contact.")
    address: str = Field(..., description="Full address or city/location under contact.")
    summary: str = Field(..., description="Professional summary or personal statement from the resume header.")

In [20]:
def query(baseModel:BaseModel):
    context = "\n\n----\n\n".join([doc.page_content for doc in retriever.invoke(str(get_field_properties(baseModel)))])
    prompt_template = get_prompt_template_to_extract(baseModel)
    prompt = prompt_template.format(context=context)
    return prompt

In [21]:
parser = PydanticOutputParser(pydantic_object=ClientBasicInfo)

In [22]:
print(query(ClientBasicInfo))

Human: 
You are an assistant for extracting structured information from LinkedIn profiles.

Your goal is to extract the following fields:

{'name': {'description': 'Full name as shown on the resume.', 'type': 'string'}, 'email': {'description': 'Email address listed under contact.', 'type': 'string'}, 'address': {'description': 'Full address or city/location under contact.', 'type': 'string'}, 'summary': {'description': 'Professional summary or personal statement from the resume header.', 'type': 'string'}}

These are required fields you MUST extract: ['name', 'email', 'address', 'summary']

Output format (strictly return in this JSON structure):

{'name': 'string', 'email': 'string', 'address': 'string', 'summary': 'string'}

The context below is extracted from a LinkedIn profile PDF of our client.
We are preparing to offer services to this client, so accurate detail extraction is critical.

Instructions:

* Extract data precisely based on the context.
* If a field is not present, set

In [26]:
print(llm.invoke(query(ClientBasicInfo)).content)

{
    "name": "DeFacto Teknoloji",
    "email": null,
    "address": "Istanbul, Turkey",
    "summary": "Business Analyst providing training sessions & user guides for stakeholders. Supporting (bugs) with profound product knowledge."
}


In [24]:
parser.invoke(llm.invoke(query(ClientBasicInfo)))

OutputParserException: Failed to parse ClientBasicInfo from completion {"name": null, "email": null, "address": null, "summary": "Providing training sessions & user guides for stakeholders. Supporting (bugs) with profound product knowledge. Creating test scenarios and performing functional tests. Coordinating UATs."}. Got: 3 validation errors for ClientBasicInfo
name
  Input should be a valid string [type=string_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.11/v/string_type
email
  Input should be a valid string [type=string_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.11/v/string_type
address
  Input should be a valid string [type=string_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.11/v/string_type
For troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/OUTPUT_PARSING_FAILURE 