In [None]:
# Create a new conda environment named 'rag'
!conda create -n rag python=3.9 -y

# Activate the environment
!conda activate rag

# Install PyTorch with CUDA support (adjust cuda version if needed)
!conda install pytorch torchvision torchaudio pytorch-cuda=11.8 -c pytorch -c nvidia -y

# Install basic dependencies
!pip install transformers
!pip install accelerate
!pip install bitsandbytes
!pip install sentencepiece
!pip install fastapi
!pip install uvicorn
!pip install pydantic
!pip install peft

# Install LangChain, ChromaDB and other RAG-related packages
!pip install langchain
!pip install chromadb
!pip install sentence-transformers
!pip install pypdf
!pip install langchain-community

# Install PDF processing libraries
!pip install pdfminer.six
!pip install pymupdf

# Install optional but useful packages
!pip install tqdm
!pip install ipywidgets
!pip install jupyterlab

Collecting chromadb
  Downloading chromadb-1.0.6-cp39-abi3-win_amd64.whl.metadata (7.0 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.2.2.post1-py3-none-any.whl.metadata (6.5 kB)
Collecting chroma-hnswlib==0.7.6 (from chromadb)
  Downloading chroma_hnswlib-0.7.6-cp39-cp39-win_amd64.whl.metadata (262 bytes)
Collecting fastapi==0.115.9 (from chromadb)
  Downloading fastapi-0.115.9-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb)
  Downloading uvicorn-0.34.2-py3-none-any.whl.metadata (6.5 kB)
Collecting posthog>=2.4.0 (from chromadb)
  Downloading posthog-4.0.0-py2.py3-none-any.whl.metadata (3.0 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.19.2-cp39-cp39-win_amd64.whl.metadata (4.7 kB)
Collecting opentelemetry-api>=1.2.0 (from chromadb)
  Downloading opentelemetry_api-1.32.1-py3-none-any.whl.metadata (1.6 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
 

In [11]:
import os
import torch
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.chains import RetrievalQA
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from peft import PeftModel, PeftConfig

In [15]:
# Configuration parameters

# -- Document ingestion --
# Folder that is searched for PDF files.
PDF_DIRECTORY = "books"
# Folder where the ChromaDB index is stored on disk.
CHROMA_DB_DIRECTORY = "chroma_db"

# -- Chunking --
# Target size of each text chunk and the overlap shared by neighbouring chunks.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200

# -- Hardware --
# Use the GPU when torch reports CUDA as available, otherwise fall back to CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# -- Language model --
# Identifier of the base model.
BASE_MODEL_PATH = "meta-llama/Llama-3.2-1B"
# Location of the PEFT adapter that is applied on top of the base model.
PEFT_MODEL_PATH = "mental_health_chat_llm"
# Upper bound on the number of newly generated tokens per response.
MAX_NEW_TOKENS = 256

In [16]:
def load_documents(pdf_directory):
    """
    Read every PDF found under a directory (searched recursively).

    Args:
        pdf_directory (str): Root folder that is scanned with the pattern
            "**/*.pdf".

    Returns:
        list: Whatever the directory loader produces for the matched files.
            (Presumably one entry per PDF page when PyPDFLoader is used --
            confirm against the loader documentation.)
    """
    print("Loading PDF documents...")

    pdf_loader = DirectoryLoader(
        pdf_directory,
        glob="**/*.pdf",
        loader_cls=PyPDFLoader,
    )
    loaded = pdf_loader.load()

    print(f"Loaded {len(loaded)} documents")
    return loaded

def split_documents(documents):
    """
    Break loaded documents into smaller, overlapping chunks for retrieval.

    Chunk length and overlap come from CHUNK_SIZE and CHUNK_OVERLAP; length is
    measured with the built-in ``len``.

    Args:
        documents (list): Documents to be split.

    Returns:
        list: The resulting chunks.
    """
    print("Splitting documents into chunks...")

    splitter_options = {
        "chunk_size": CHUNK_SIZE,
        "chunk_overlap": CHUNK_OVERLAP,
        "length_function": len,
    }
    pieces = RecursiveCharacterTextSplitter(**splitter_options).split_documents(documents)

    print(f"Split into {len(pieces)} chunks")
    return pieces

def create_vector_store(chunks):
    """
    Create a ChromaDB vector store from document chunks, or reuse the one on disk.

    If CHROMA_DB_DIRECTORY already exists the store is opened from it. An
    existing directory does not guarantee that anything was indexed (for
    example after an interrupted first run), so when the opened collection
    holds no entries the supplied chunks are indexed into it instead of being
    silently ignored. A store that already contains entries is reused as-is
    and ``chunks`` is not added to it.

    Args:
        chunks (list): List of document chunks

    Returns:
        Chroma: ChromaDB vector store
    """
    print("Creating embeddings and vector store...")
    # Using a lightweight embedding model
    embedding_model = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        model_kwargs={"device": DEVICE}
    )

    # Create or load the vector store
    if os.path.exists(CHROMA_DB_DIRECTORY):
        print("Loading existing ChromaDB...")
        vector_store = Chroma(
            persist_directory=CHROMA_DB_DIRECTORY,
            embedding_function=embedding_model
        )
        # Fetch at most one id: enough to tell an empty collection from a
        # populated one without reading the whole index.
        store_is_empty = not vector_store.get(limit=1)["ids"]
        if store_is_empty and chunks:
            print("Existing ChromaDB is empty; indexing document chunks...")
            vector_store.add_documents(chunks)
            vector_store.persist()
    else:
        print("Creating new ChromaDB...")
        vector_store = Chroma.from_documents(
            documents=chunks,
            embedding=embedding_model,
            persist_directory=CHROMA_DB_DIRECTORY
        )
        vector_store.persist()

    print("Vector store created successfully")
    return vector_store

def load_peft_model():
    """
    Load the PEFT fine-tuned Llama model.

    The tokenizer is loaded from BASE_MODEL_PATH and any of the pad/eos/bos
    tokens it lacks is registered with a placeholder. The base model is then
    loaded and the adapter at PEFT_MODEL_PATH is applied on top of it.

    The embedding matrix is only ever grown: it is resized when the tokenizer
    holds more entries than the model has embedding rows, and left untouched
    otherwise, so a model whose embedding matrix is larger than the tokenizer
    is not truncated.

    Returns:
        tuple: Tokenizer and model
    """
    print("Loading PEFT fine-tuned model...")

    # Half precision on GPU, full precision on CPU.
    model_dtype = torch.float16 if DEVICE == "cuda" else torch.float32

    # Load tokenizer from base model
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_PATH)

    # Enable padding on the right side
    tokenizer.padding_side = "right"

    # Add special tokens if they don't exist
    special_tokens = {
        "pad_token": "<PAD>",
        "eos_token": "</s>",
        "bos_token": "<s>"
    }

    for token_type, token in special_tokens.items():
        if getattr(tokenizer, token_type) is None:
            tokenizer.add_special_tokens({token_type: token})

    # Load base model
    print(f"Loading base model from {BASE_MODEL_PATH}...")
    base_model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL_PATH,
        torch_dtype=model_dtype,
        device_map="auto",
        low_cpu_mem_usage=True
    )

    # Load PEFT adapter on top of base model
    print(f"Loading PEFT adapter from {PEFT_MODEL_PATH}...")
    model = PeftModel.from_pretrained(
        base_model,
        PEFT_MODEL_PATH,
        torch_dtype=model_dtype,
        device_map="auto"
    )

    # Resize token embeddings only if the tokenizer outgrew the embedding
    # matrix (e.g. because a pad token was added above). Resizing down to
    # len(tokenizer) would drop rows from the embedding matrix.
    embedding_rows = model.get_input_embeddings().weight.shape[0]
    if len(tokenizer) > embedding_rows:
        model.resize_token_embeddings(len(tokenizer))

    print(f"PEFT model loaded successfully (device: {DEVICE})")
    return tokenizer, model

def _build_rag_prompt(context, query, chat_history=None):
    """Assemble the prompt text from retrieved context, optional history and the query."""
    sections = [f"Context information from mental health resources:\n{context}"]
    if chat_history:
        # Format chat history if available
        history_text = "\n".join(f"User: {q}\nAssistant: {a}" for q, a in chat_history)
        sections.append(f"Previous conversation:\n{history_text}")
    sections.append(f"User: {query}\nAssistant:")
    return "\n\n".join(sections)


def setup_rag_pipeline(vector_store, tokenizer, model):
    """
    Set up the RAG pipeline integrating the vector store and language model.

    Args:
        vector_store (Chroma): ChromaDB vector store
        tokenizer: Tokenizer for the language model
        model: Fine-tuned language model

    Returns:
        function: query_function that can process user queries with RAG
    """
    print("Setting up RAG pipeline...")

    # Create a retriever from the vector store
    retriever = vector_store.as_retriever(
        search_type="similarity",
        search_kwargs={"k": 3}  # Retrieve top 3 most relevant documents
    )

    # Create a text generation pipeline with correct parameters
    # IMPORTANT: We specify max_new_tokens instead of max_length
    generation_pipeline = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=MAX_NEW_TOKENS,  # This is the key parameter
        temperature=0.7,
        top_p=0.95,
        device_map="auto"
    )

    def query_rag_system(query, chat_history=None):
        """
        Process a user query using the RAG system.

        Args:
            query (str): User query
            chat_history (list, optional): Previous conversation history as
                (user_message, assistant_reply) pairs

        Returns:
            str: Generated response with surrounding whitespace removed
        """
        # Retrieve relevant documents. `invoke` is the supported retriever
        # entry point; `get_relevant_documents` is deprecated.
        retrieved_docs = retriever.invoke(query)

        # Format the context from retrieved documents
        context = "\n\n".join(doc.page_content for doc in retrieved_docs)

        # Prepare the prompt with retrieved context (and history, if any)
        prompt = _build_rag_prompt(context, query, chat_history)

        # Generate response with fixed parameters
        # Using return_full_text=False to only get the new tokens
        response = generation_pipeline(
            prompt,
            return_full_text=False,
            do_sample=True,
            pad_token_id=tokenizer.pad_token_id
        )[0]["generated_text"]

        return response.strip()

    print("RAG pipeline setup complete")
    return query_rag_system

In [17]:
def main():
    """
    Main function to set up and run the mental health counseling RAG system.

    Builds the pipeline (load PDFs, split, index, load model), answers one
    sample query, then enters an interactive prompt loop. The loop ends when
    the user types 'quit' (case-insensitive, surrounding whitespace ignored)
    or when input is closed or interrupted. Blank input is skipped rather
    than sent to the model.
    """
    # Load and process PDF documents
    documents = load_documents(PDF_DIRECTORY)
    chunks = split_documents(documents)

    # Create vector store
    vector_store = create_vector_store(chunks)

    # Load PEFT fine-tuned model
    tokenizer, model = load_peft_model()

    # Set up RAG pipeline with fixed parameters
    query_function = setup_rag_pipeline(vector_store, tokenizer, model)

    # Example usage
    sample_query = "I've been feeling anxious about my future lately. What can I do?"
    response = query_function(sample_query)
    print("\nSample Query:", sample_query)
    print("Response:", response)

    # Interactive mode for testing
    max_history = 5  # Number of most recent exchanges passed back as context
    print("\nEnter 'quit' to exit")
    chat_history = []
    while True:
        try:
            user_input = input("\nYour question: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or an interrupted kernel should end the session
            # cleanly instead of surfacing a traceback.
            print("\nExiting.")
            break

        if not user_input:
            # Nothing to ask; prompt again without calling the model.
            continue
        if user_input.lower() == 'quit':
            break

        response = query_function(user_input, chat_history)
        print("Mental Health Assistant:", response)

        # Update chat history, keeping only the most recent exchanges
        chat_history.append((user_input, response))
        if len(chat_history) > max_history:
            chat_history = chat_history[-max_history:]

In [18]:
# Entry point: call main() when this code runs as the top-level module
# (__name__ == "__main__"); main() is not called when __name__ differs.
if __name__ == "__main__":
    main()

Loading PDF documents...
Loaded 261 documents
Splitting documents into chunks...
Split into 459 chunks
Creating embeddings and vector store...
Loading existing ChromaDB...
Vector store created successfully
Loading PEFT fine-tuned model...
Loading base model from meta-llama/Llama-3.2-1B...
Loading PEFT adapter from mental_health_chat_llm...


The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
Device set to use cuda:0
The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['AriaTextForCausalLM', 'BambaForCausalLM', 'BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'Cohere2ForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'DeepseekV3ForCausalLM', 'DiffLlamaForCausalLM', 'ElectraForCausalLM', 'Emu3ForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FalconMambaForCausalLM', 'FuyuForCausalLM', 'Gem

PEFT model loaded successfully (device: cuda)
Setting up RAG pipeline...
RAG pipeline setup complete

Sample Query: I've been feeling anxious about my future lately. What can I do?
Response: It is normal to feel anxious about the future.  Sometimes 
anxiety may be useful and help us to motivate us to do things that are 
important to us.  Anxiety is usually short-term, but sometimes it may 
be too much.  We can learn to manage anxiety so that it does not get 
in the way of our daily activities and enjoyment of life.  Anxiety is 
usually associated with a fear of failure, loss, or abandonment.  The 
first step in managing anxiety is to identify the source of this fear. 
Once the source of the fear is identified, the anxiety associated with 
that source can be reduced.  We can learn new ways to cope with the 
fear and replace it with a more positive and realistic thought.  This 
may be done by: 
/g120/g3Identifying the thoughts and feelings that are associated with the 
anxiety (sometimes