# Regular RAG with Different Chunk Types

## Install the Needed Libraries

In [None]:
%%capture
!pip install \
    langchain_community \
    tiktoken \
    langchainhub \
    langchain \
    transformers \
    langchain_huggingface \
    huggingface_hub \
    unstructured \
    sentence-transformers \
    xformers \
    -qU langchain_milvus -qU pydantic langchain_cohere

## Import the Needed Libraries

In [None]:
import zipfile
import os
import uuid
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, AutoModel
from sentence_transformers import SentenceTransformer
from langchain_huggingface.llms import HuggingFacePipeline
from huggingface_hub import login
from langchain_core.prompts import PromptTemplate
from langchain_milvus import Milvus
from langchain_core.documents import Document
from langchain import hub
from langchain_core.runnables import RunnablePassthrough
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import FlashrankRerank
from langchain_cohere import CohereRerank
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_experimental.text_splitter import SemanticChunker


## Define the Helper Functions

In [None]:
def extract_name_from_filename(filename):
    """
    Derive a human-readable name from a markdown filename.

    A trailing ``.md`` extension is dropped, underscores become spaces and
    surrounding whitespace is trimmed, e.g. ``"Youssef_Medhat.md"`` becomes
    ``"Youssef Medhat"``.

    Args:
        filename (str): Base name of the file (without directory part).

    Returns:
        str: The cleaned-up name.
    """
    stem = filename.strip()
    # Remove the extension only at the end of the name; a blanket
    # replace('.md', '') would also eat ".md" occurring inside the name.
    if stem.endswith('.md'):
        stem = stem[:-len('.md')]
    return stem.replace('_', ' ').strip()

In [None]:
def generate_candidate_id():
    """
    Return a short random identifier for a candidate.

    The value is the first eight hexadecimal characters of a freshly
    generated version-4 UUID.
    """
    fresh_uuid = uuid.uuid4()
    return fresh_uuid.hex[:8]

In [None]:
class TextEmbedding:
    """
    Lightweight embedding wrapper around a Sentence-Transformers model.

    It provides methods to embed queries, lists of documents and single texts.
    It is meant to be used with SemanticChunker, because the StellaEmbedding
    model is computationally much heavier.

    Attributes:
        model (SentenceTransformer): The model used for generating embeddings.
        query_prompt_name (str): Name of the prompt applied when embedding
            queries, used only if the loaded model defines a prompt with
            that name.
    """
    def __init__(self, model_name="all-MiniLM-L6-v2", device="cpu"):
        """
        Load the embedding model.

        Args:
            model_name (str): Sentence-Transformers model identifier.
            device (str): Device the model is loaded on and moved to.
        """
        self.model = SentenceTransformer(
            model_name,
            trust_remote_code=True,
            device=device,
            config_kwargs={}
        ).to(device)

        self.query_prompt_name = "s2p_query"

    def embed_documents(self, texts):
        """Embed multiple documents and return the encoder output for them."""
        return self.model.encode(texts, show_progress_bar=False)

    def embed_query(self, query):
        """
        Embed a single query and return its embedding.

        The query prompt is requested only when the loaded model actually
        defines it: ``encode`` rejects unknown prompt names, and the default
        MiniLM model ships without an ``s2p_query`` prompt.
        """
        encode_kwargs = {"show_progress_bar": False}
        available_prompts = getattr(self.model, "prompts", None) or {}
        if self.query_prompt_name in available_prompts:
            encode_kwargs["prompt_name"] = self.query_prompt_name
        return self.model.encode([query], **encode_kwargs)[0]

    def embed_text(self, text):
        """Generate the embedding for a single text as a tensor."""
        return self.model.encode(text, convert_to_tensor=True)

    def create_text_splitter(self):
        """
        Create a SemanticChunker backed by this embedding object.

        The chunker needs an object exposing ``embed_documents``, so the
        instance itself is passed rather than one of its bound methods.
        """
        return SemanticChunker(self, breakpoint_threshold_type="gradient")

In [None]:
class StellaEmbedding:
    """
    Embedding wrapper around a pre-trained Sentence-Transformers model
    (Stella by default) for documents and queries.

    Attributes:
        model (SentenceTransformer): The loaded model that produces the embeddings.
        query_prompt_name (str): Prompt name passed to the model when a query is embedded.
    """
    def __init__(self, model_name="dunzhang/stella_en_400M_v5", device="cpu"):
        # On CPU the two options below are switched off; for any other
        # device no config overrides are passed.
        cpu_overrides = {
            "use_memory_efficient_attention": False,
            "unpad_inputs": False
        }
        config_kwargs = cpu_overrides if device == "cpu" else {}

        loaded_model = SentenceTransformer(
            model_name,
            trust_remote_code=True,
            device=device,
            config_kwargs=config_kwargs
        )
        self.model = loaded_model.to(device)
        self.query_prompt_name = "s2p_query"

    def embed_documents(self, texts):
        """Return the embeddings of several documents."""
        vectors = self.model.encode(texts, show_progress_bar=False)
        return vectors

    def embed_query(self, query):
        """Return the embedding of one query, encoded with the query prompt."""
        batch = self.model.encode(
            [query],
            prompt_name=self.query_prompt_name,
            show_progress_bar=False,
        )
        return batch[0]

In [None]:

def process_markdown_files(extracted_folder_path: str, text_splitter) -> list:
    """
    Process markdown files in a folder and return split documents with metadata.

    The folder is walked recursively. Every ``.md`` file is loaded, tagged with
    a name derived from its filename and a freshly generated candidate id, and
    then split with ``text_splitter``.

    Parameters:
        extracted_folder_path (str): Path to the folder containing markdown files.
        text_splitter: An instance of a text splitting class with a split_documents method.

    Returns:
        List: A list of split documents with metadata.
    """
    all_splits = []

    for root, _dirs, files in os.walk(extracted_folder_path):
        for file in files:
            if not file.endswith('.md'):  # Only process markdown files
                continue

            file_path = os.path.join(root, file)

            # Load markdown file using UnstructuredMarkdownLoader
            data = UnstructuredMarkdownLoader(file_path).load()
            if not data:
                # Nothing was loaded from this file, so there is nothing to tag or split.
                continue

            # One name / id pair per file, shared by every document loaded from it
            extracted_name = extract_name_from_filename(file)
            candidate_id = generate_candidate_id()

            # Tag every loaded document (not only the first) so that no chunk
            # ends up without candidate metadata.
            for doc in data:
                doc.metadata["name"] = extracted_name
                doc.metadata["candidate_id"] = candidate_id

            # Split with whichever splitter the caller supplied
            all_splits.extend(text_splitter.split_documents(data))

    return all_splits

In [None]:

# Provide the Hugging Face token through the environment without hard-coding it.
# A token written into a notebook is readable by everyone who sees the file and
# must be treated as leaked: revoke it and create a new one.
if not os.environ.get("HF_TOKEN"):
    from getpass import getpass

    # Prompt interactively only when the variable has not been set beforehand.
    os.environ["HF_TOKEN"] = getpass("Hugging Face token: ")

In [None]:
def format_docs(docs):
    """
    Render retrieved documents as one context string.

    Each document becomes a "Candidate Name: <name>" line followed by the
    document text; the name is read from the ``name`` metadata entry and falls
    back to 'Unknown Candidate' when that entry is missing.

    Args:
        docs (list): Document objects exposing ``metadata`` (a dict) and
                     ``page_content`` (the text).

    Returns:
        str: All formatted documents joined by a blank line.
    """
    return "\n\n".join(
        f"Candidate Name: {doc.metadata.get('name', 'Unknown Candidate')}\n{doc.page_content}"
        for doc in docs
    )


## Experiment: SemanticChunker vs RecursiveCharacterTextSplitter

- stella_embedding_model will be used for retrieval against the vector database

In [None]:
# Use the GPU when one is available and fall back to the CPU otherwise;
# StellaEmbedding applies its CPU-specific configuration for device="cpu".
stella_embedding_model = StellaEmbedding(device="cuda" if torch.cuda.is_available() else "cpu")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/397 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/170k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/51.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/892 [00:00<?, ?B/s]

configuration.py:   0%|          | 0.00/7.13k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/dunzhang/stella_en_400M_v5:
- configuration.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling.py:   0%|          | 0.00/57.5k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/dunzhang/stella_en_400M_v5:
- modeling.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/1.74G [00:00<?, ?B/s]

Some weights of the model checkpoint at dunzhang/stella_en_400M_v5 were not used when initializing NewModel: ['new.pooler.dense.bias', 'new.pooler.dense.weight']
- This IS expected if you are initializing NewModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing NewModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/186 [00:00<?, ?B/s]

2_Dense_1024/config.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/4.20M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/4.20M [00:00<?, ?B/s]

- TextEmbedding will be used in the SemanticChunker because stella_embedding_model takes far too long for chunking (around 20 minutes), whereas this model takes only about a second

In [None]:
# Lightweight embedding model used only to find chunk boundaries.
# Runs on the GPU when one is available, otherwise on the CPU.
TextEmbedding_model = TextEmbedding(device="cuda" if torch.cuda.is_available() else "cpu")
text_splitter = SemanticChunker(TextEmbedding_model, breakpoint_threshold_type="gradient")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

- Use the SemanticChunker for the chunking process



In [None]:

# Root folder that is searched for markdown files
folder_path = '/content/drive/MyDrive/pdf_data/output'

# Load, tag and chunk every markdown file with the current text_splitter
all_splits_with_SemanticChunker = process_markdown_files(folder_path, text_splitter)

# Report the number of chunks and preview the first six
print(f"Total splits processed: {len(all_splits_with_SemanticChunker)}")
for split in all_splits_with_SemanticChunker[:6]:
    print(f"Metadata: {split.metadata}\nContent: {split.page_content[:100]}")


Total splits processed: 61
Metadata: {'source': '/content/drive/MyDrive/pdf_data/output/SW_MLEngineer_YoussefMedhat/Youssef_Medhat.md', 'name': 'Youssef Medhat', 'candidate_id': '597b93a7'}
Content: Youssef Medhat

SW Engineer - Machine Learning Contact Address: Cairo, Egypt Phone: +201129681611 E-
Metadata: {'source': '/content/drive/MyDrive/pdf_data/output/SW_MLEngineer_YoussefMedhat/Youssef_Medhat.md', 'name': 'Youssef Medhat', 'candidate_id': '597b93a7'}
Content: Critical thinker and a quick learner seeking opportunities to gain practical experience and knowledg
Metadata: {'source': '/content/drive/MyDrive/pdf_data/output/SW_ML_EsraaSayed/Esraa_Sayed.md', 'name': 'Esraa Sayed', 'candidate_id': '66f415d4'}
Content: ESRAA

SAYED

Experience

Giza Systems Education 9-Month Professional Postgraduate Diploma, Ai And M
Metadata: {'source': '/content/drive/MyDrive/pdf_data/output/SW_ML_EsraaSayed/Esraa_Sayed.md', 'name': 'Esraa Sayed', 'candidate_id': '66f415d4'}
Content: Big Data Systems

In [None]:
# Database file used for the SemanticChunker experiment
URI = "./milvus_example_SemanticChunker.db"

# Embed the semantic chunks with the Stella model and store them in the
# "test_cvs" collection of the database addressed by URI.
vectorstore__with_SemanticChunker = Milvus.from_documents(
    all_splits_with_SemanticChunker,
    stella_embedding_model,
    collection_name="test_cvs",
    connection_args={"uri": URI}
)

In [None]:
# The number of documents to return is configured through search_kwargs["k"];
# `top_k_documents` is not a documented retriever option.
retriever_with_SemanticChunker = vectorstore__with_SemanticChunker.as_retriever(search_kwargs={"k": 3})

- Use the RecursiveCharacterTextSplitter for the chunking process

In [None]:
# Character-based splitter settings
chunk_size = 500
chunk_overlap = 50
extracted_folder_path = '/content/drive/MyDrive/pdf_data/output'
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
)

# Folder that is searched for markdown files (same location as extracted_folder_path)
folder_path = extracted_folder_path

# Load, tag and chunk every markdown file with the character-based splitter
all_splits_RecursiveCharacter = process_markdown_files(folder_path, text_splitter)

# Report the number of chunks and preview the first six
print(f"Total splits processed: {len(all_splits_RecursiveCharacter)}")
for split in all_splits_RecursiveCharacter[:6]:
    print(f"Metadata: {split.metadata}\nContent: {split.page_content[:100]}")

Total splits processed: 272
Metadata: {'source': '/content/drive/MyDrive/pdf_data/output/SW_MLEngineer_YoussefMedhat/Youssef_Medhat.md', 'name': 'Youssef Medhat', 'candidate_id': '7363845a'}
Content: Youssef Medhat

SW Engineer - Machine Learning Contact Address: Cairo, Egypt Phone: +201129681611 E-
Metadata: {'source': '/content/drive/MyDrive/pdf_data/output/SW_MLEngineer_YoussefMedhat/Youssef_Medhat.md', 'name': 'Youssef Medhat', 'candidate_id': '7363845a'}
Content: GraphQL

Kafka

NiFi

Elastic Search

Spark

Airflow

Extracurricular Activities

Youth Leadership P
Metadata: {'source': '/content/drive/MyDrive/pdf_data/output/SW_MLEngineer_YoussefMedhat/Youssef_Medhat.md', 'name': 'Youssef Medhat', 'candidate_id': '7363845a'}
Content: Former Member of Project management committee at IEEE Passionate software researcher with a strong k
Metadata: {'source': '/content/drive/MyDrive/pdf_data/output/SW_MLEngineer_YoussefMedhat/Youssef_Medhat.md', 'name': 'Youssef Medhat', 'candidate_id': '7

In [None]:
# Database file used for the RecursiveCharacterTextSplitter experiment
URI = "./milvus_example_RecursiveCharacter.db"

# Embed the character-based chunks with the Stella model and store them in the
# "test_cvs" collection of the database addressed by URI.
vectorstore_with_RecursiveCharacter = Milvus.from_documents(
    all_splits_RecursiveCharacter,
    stella_embedding_model,
    collection_name="test_cvs",
    connection_args={"uri": URI}
)

In [None]:
# The number of documents to return is configured through search_kwargs["k"];
# `top_k_documents` is not a documented retriever option.
retriever_with_RecursiveCharacter = vectorstore_with_RecursiveCharacter.as_retriever(search_kwargs={"k": 3})

- Retrieve documents from both approaches and select the best one based on the maximum cosine similarity and contextual relevance, ensuring a stronger alignment between the prompt and the retrieved documents.

In [None]:
# Load the tokenizer and the causal language model for model_id;
# the weights are requested in bfloat16 with device_map="cuda".
model_id = "meta-llama/Llama-3.2-3B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="cuda")

# Build a text-generation pipeline around the loaded model and tokenizer,
# generating at most 2024 new tokens per call.
# NOTE(review): torch_dtype/device_map repeat the settings already used when
# loading `model` - confirm whether they are still needed here.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    device_map="cuda",
    max_new_tokens=2024
)


# Wrap the pipeline so it can be used as the LLM step of a LangChain chain
hf_pipe = HuggingFacePipeline(pipeline=pipe)


tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/878 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

Device set to use cuda


In [None]:
# Chat messages: a fixed system instruction plus a user turn that carries the
# {context} and {question} placeholders filled in later by PromptTemplate.
messages = [
    {
        "role": "system",
        "content": "You are an intelligent talent acquisition assistant chatbot. Your primary role is to assist recruiters by analyzing candidates' resumes, understanding their qualifications, and answering questions about their suitability for specific roles. Provide detailed, professional, and context-aware responses.",
    },
    {
        "role": "user",
        "content": (
            "Relevant Candidates Information:\n"
            "{context}\n"
            "\n"
            "Recruiter's Question:\n"
            "{question}"
        ),
    },
]

# Render the messages through the tokenizer's chat template as text (tokenize=False),
# ending with the generation prompt for the assistant turn.
# NOTE(review): return_tensors looks redundant when tokenize=False - confirm.
prompt_template = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
    return_tensors="pt",
)

# Turn the rendered text into a LangChain prompt with two input variables
prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)
prompt

PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template="<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 29 Dec 2024\n\nYou are an intelligent talent acquisition assistant chatbot. Your primary role is to assist recruiters by analyzing candidates' resumes, understanding their qualifications, and answering questions about their suitability for specific roles. Provide detailed, professional, and context-aware responses.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nRelevant Candidates Information:\n{context}\n\nRecruiter's Question:\n{question}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n")

In [None]:
# RAG chain over the semantic chunks: the retriever output is formatted into
# the context, the question is passed through unchanged, then the prompt and
# the LLM pipeline run in sequence.
rag_chain_with_SemanticChunker = (
    {
        "context": retriever_with_SemanticChunker | format_docs,
        "question": RunnablePassthrough(),
    }
    | prompt
    | hf_pipe
)

# Ask a sample question and print the chain output
response = rag_chain_with_SemanticChunker.invoke("who has experience with apache nifi?")
print(response)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 29 Dec 2024

You are an intelligent talent acquisition assistant chatbot. Your primary role is to assist recruiters by analyzing candidates' resumes, understanding their qualifications, and answering questions about their suitability for specific roles. Provide detailed, professional, and context-aware responses.<|eot_id|><|start_header_id|>user<|end_header_id|>

Relevant Candidates Information:
Candidate Name: Mahmoud Helmy
Designed the system for scalability, accommodating new camera vendors and increased data volumes. Maintained reliability to ensure data consistency and availability. Technologies Used: •Apache NIFI •Redis •NTCIP •Jython •Groovy 2022/09 - present KPIs Service Giza Systems •KPIs Service is a microservice application inside the IDP(Industrial Digitalization Platform) for executing custom equations and displaying real-time results. Users can input their own eq

In [None]:
# RAG chain over the character-based chunks: the retriever output is formatted
# into the context, the question is passed through unchanged, then the prompt
# and the LLM pipeline run in sequence.
rag_chain_with_RecursiveCharacter = (
    {
        "context": retriever_with_RecursiveCharacter | format_docs,
        "question": RunnablePassthrough(),
    }
    | prompt
    | hf_pipe
)

# Ask a sample question and print the chain output
response = rag_chain_with_RecursiveCharacter.invoke("who has experience with apache nifi?")
print(response)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 29 Dec 2024

You are an intelligent talent acquisition assistant chatbot. Your primary role is to assist recruiters by analyzing candidates' resumes, understanding their qualifications, and answering questions about their suitability for specific roles. Provide detailed, professional, and context-aware responses.<|eot_id|><|start_header_id|>user<|end_header_id|>

Relevant Candidates Information:
Candidate Name: Mostafa Khalil Karrar
Apache Nifi - Apache Airflow - Python:

SK-learn, numpy, pandas, Spark-ML, Pytorch - Network Scripting (telnetlib, paramiko, netmiko, pysnmp) - RESTFUL API (requests) - Tasks Automation (bs4, Selenium, pyAutoGUI) - Web Development (CGI, Flask, Django) - Data Processing (SQL, Excel, csv, txt, xml, yaml, json, …)

Java - Groovy - Oracle Databases - MySQL Databases

Candidate Name: Mostafa Khalil Karrar
Hands-On Technical Experience:

Big Data Systems