# Importing the needed libraries

In [2]:
# Import Langchain modules
from langchain.document_loaders import PDFPlumberLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain_ollama import OllamaEmbeddings
from langchain.llms import Ollama
from langchain_ollama import ChatOllama
from langchain_experimental.llms.ollama_functions import OllamaFunctions
from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
import re

# Other modules and packages
import os
import tempfile
import pandas as pd
from dotenv import load_dotenv



# Defining our LLM

In [3]:

# Chat model handle (Ollama "llama3.2"); the RAG chain further down wraps it
# with a structured-output schema.
llm = ChatOllama(model="llama3.2")

# Processing PDF documents

## Loading the PDF

In [4]:
# Point the pdfplumber-based loader at the source PDF ...
loader = PDFPlumberLoader(file_path="data/NoeFlandre.pdf")
# ... and read it into the list of documents consumed by the splitter below.
pages = loader.load()

## Splitting the documents into several chunks

In [5]:
# Chunking configuration: up to 300 characters per chunk (measured with `len`),
# 50 characters of overlap, trying paragraph, line and word separators in turn.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " "],
    chunk_size=300,
    chunk_overlap=50,
    length_function=len,
)
chunks = text_splitter.split_documents(pages)

## Creating embeddings

In [6]:
def get_embedding_function():
    """Build the embedding client used for indexing and querying.

    Returns:
        An ``OllamaEmbeddings`` instance configured with the
        "nomic-embed-text" model.
    """
    return OllamaEmbeddings(model="nomic-embed-text")


# Single shared instance used both to build and to reload the vector store.
embedding_function = get_embedding_function()

## Creating a vector database

In [7]:
import uuid

def create_vectorstore(chunks, embedding_function, vectorstore_path):
    """Create a Chroma vector store from de-duplicated document chunks.

    Every chunk gets a deterministic id: the UUIDv5 (DNS namespace) of its
    ``page_content``. Only the first chunk seen for a given id is kept, so
    chunks with identical text are stored once.

    Args:
        chunks: Iterable of documents exposing a ``page_content`` string.
        embedding_function: Embedding object forwarded to Chroma as ``embedding``.
        vectorstore_path: Directory forwarded to Chroma as ``persist_directory``.

    Returns:
        The vector store returned by ``Chroma.from_documents``.
    """
    seen_ids = set()      # fast membership test only
    unique_ids = []       # ids in the SAME order as `unique_chunks`
    unique_chunks = []

    for chunk in chunks:
        chunk_id = str(uuid.uuid5(uuid.NAMESPACE_DNS, chunk.page_content))
        if chunk_id in seen_ids:
            continue
        seen_ids.add(chunk_id)
        unique_ids.append(chunk_id)
        unique_chunks.append(chunk)

    # The ids must be positionally aligned with the documents. Passing
    # `list(a_set)` here would hand Chroma the ids in arbitrary order and
    # associate documents with ids derived from other chunks' content.
    vectorstore = Chroma.from_documents(documents=unique_chunks,
                                        ids=unique_ids,
                                        embedding=embedding_function,
                                        persist_directory=vectorstore_path)

    # NOTE(review): persist() is reported as deprecated by recent Chroma
    # integrations (persistence happens automatically) — confirm against the
    # installed version. Call it only when the store still exposes it.
    persist = getattr(vectorstore, "persist", None)
    if callable(persist):
        persist()

    return vectorstore

In [8]:
# Create vectorstore
# Index the chunks into a Chroma collection stored under ./vectorstore26.
vectorstore = create_vectorstore(
    chunks=chunks,
    embedding_function=embedding_function,
    vectorstore_path="vectorstore26",
)

  vectorstore.persist()


# Query for relevant data

## Loading the vector database

In [9]:
# Re-open the collection from its on-disk directory, using the same embedding
# object that was used when the store was created.
vectorstore = Chroma(
    embedding_function=embedding_function,
    persist_directory="vectorstore26",
)

  vectorstore = Chroma(persist_directory="vectorstore26", embedding_function=embedding_function)


## Defining a retriever

In [10]:
# Expose the vector store as a retriever that ranks chunks by similarity.
retriever = vectorstore.as_retriever(
    search_type="similarity",
)


## Defining a prompt

In [11]:
# Question-answering prompt text. It carries two placeholders, {context} and
# {question}; the next cell turns it into a ChatPromptTemplate. The string is
# sent to the model verbatim, so edit its wording deliberately.
PROMPT = """
You are an assistant for question-answering tasks.
Use the following pieces of retrieved context to
answer the question. If you don't know the answer, 
just say that you don't know, don't try to make up 
an answer. Rephrase the context given to you, as the 
accents can be parsed poorly.

Here is the context : {context}


Here is the question : {question}

"""

In [12]:
# Compile the raw prompt text into a chat prompt with `context` and
# `question` input variables.
prompt_template = ChatPromptTemplate.from_template(template=PROMPT)

## Defining the context

In [13]:
# The single question asked of the document; its three parts line up with the
# fields of the ExtractedInfo schema defined below.
question = (
    "Give me the name, the latest work experience "
    "and the latest education of this person."
)

In [14]:
# Fetch the chunks the retriever considers closest to the question.
relevant_chunks = retriever.invoke(question)
# Concatenate their text, separating chunks with a visible "---" rule.
context_text = "\n\n---\n\n".join(
    retrieved.page_content for retrieved in relevant_chunks
)

## Defining the document format

In [16]:
def format_docs(docs):
    """Merge retrieved documents into one context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        The ``page_content`` of every document, in order, separated by a
        blank line. An empty iterable yields an empty string.
    """
    page_texts = [document.page_content for document in docs]
    return "\n\n".join(page_texts)

# Generating structured output

In [17]:
# Output schema handed to `llm.with_structured_output(...)` in the RAG chain:
# three required string fields, one per item requested in `question`.
# NOTE(review): the class docstring and the Field descriptions are presumably
# forwarded to the model as part of the schema — treat them as runtime text
# and confirm before rewording.
class ExtractedInfo(BaseModel):
    """Extracted information about the candidate"""
    # Person's name as found in the document.
    name: str = Field(description="Name of the person")
    # Most recent work experience entry.
    latest_experience: str = Field(description="Latest work experience of the person")
    # Most recent education entry.
    latest_education: str = Field(description="Latest education of the person")

In [18]:
# Retrieval-augmented pipeline, read top to bottom:
rag_chain = (
    # 1. fan the incoming question out: retrieve + format the context, and
    #    pass the question itself through unchanged
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    # 2. fill the prompt placeholders
    | prompt_template
    # 3. ask the model for an answer shaped like ExtractedInfo
    | llm.with_structured_output(ExtractedInfo)
)

# Run the whole pipeline once for the question defined above.
structured_output = rag_chain.invoke(question)

In [19]:
# Show the result of the chain invocation as the cell output.
structured_output

ExtractedInfo(name='Noé Flandre', latest_experience='Data Scientist Intern on LLMs & Simulation at VMASC Suffolk, VA, USA', latest_education='IMT Mines Alès, MEng (Data Science and AI) Alès, France')

In [20]:
# Read the extracted fields directly from the structured result instead of
# converting it to a string and regex-parsing its repr. The repr of a value
# containing an apostrophe is double-quoted, so patterns of the form
# ='([^']*)' silently fail to match and the field would come back empty.
# `structured_output` is also left untouched, so this cell can be re-run and
# later cells still see the original object.
def _read_field(result, field):
    """Return `field` from a model instance or a dict, or '' when missing/None."""
    if isinstance(result, dict):
        value = result.get(field)
    else:
        value = getattr(result, field, None)
    return '' if value is None else value


name = _read_field(structured_output, 'name')
latest_experience = _read_field(structured_output, 'latest_experience')
latest_education = _read_field(structured_output, 'latest_education')

# Create a one-row DataFrame, one column per extracted field
data = {
    'name': [name],
    'latest_experience': [latest_experience],
    'latest_education': [latest_education]
}

df = pd.DataFrame(data)

df.head()


Unnamed: 0,name,latest_experience,latest_education
0,Noé Flandre,Data Scientist Intern on LLMs & Simulation at ...,"IMT Mines Alès, MEng (Data Science and AI) Alè..."
