**Importing Necessary Libraries**

In [None]:
import hashlib
import os

from dotenv import load_dotenv
from langchain_chroma import Chroma
from langchain_community.document_loaders import GitLoader, PyPDFLoader
from langchain_core.messages import HumanMessage, AIMessage
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.runnables import RunnablePassthrough, RunnableLambda
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_text_splitters import RecursiveCharacterTextSplitter

print("All Libraries loaded successfully!")

In [None]:
# Read provider credentials from the environment (values in .env are loaded first)
load_dotenv()
grok_key, google_key, openai_key = (
    os.getenv(env_name)
    for env_name in ("GROK_API_KEY", "GOOGLE_API_KEY", "OPENAI_API_KEY")
)

# Warn early when no provider is configured; only booleans are printed, never the keys
if not any((grok_key, google_key, openai_key)):
    print("WARNING: No LLM API Keys found (Grok/Google/OpenAI). Chat generation might fail.")
print(f"API Keys Loaded. Grok: {bool(grok_key)}, Google: {bool(google_key)}, OpenAI: {bool(openai_key)}")

**Data Ingestion from GitHub and LinkedIn**


In [None]:
# --- Configuration --- #
GITHUB_REPO_URL = "https://github.com/Olajcodes/Olajcodes"
LINKEDIN_PDF_PATH = "Profile.pdf"

# Accumulator for every document loaded in this cell
documents = []

# 1. Clone the GitHub repository and load the files that pass the filter
try:
    print(f"Loading GitHub repository from {GITHUB_REPO_URL}...")
    loader_github = GitLoader(
        clone_url=GITHUB_REPO_URL,
        repo_path="./temp_repo",
        branch="main",
        # Keep only files with one of these extensions (adjust as needed)
        file_filter=lambda path: any(
            path.endswith(ext)
            for ext in (".md", ".py", ".js", ".ts", ".html", ".ipynb")
        ),
    )
    github_docs = loader_github.load()
    documents += github_docs
    print(f"Loaded {len(github_docs)} documents from GitHub.")
except Exception as err:
    # Best-effort: report the failure and continue with whatever else loads
    print(f"Error loading GitHub: {err}")

# 2. Load the LinkedIn profile PDF; PyPDFLoader yields one document per page
try:
    print(f"Loading LinkedIn profile from {LINKEDIN_PDF_PATH}...")
    loader_linkedin = PyPDFLoader(LINKEDIN_PDF_PATH)
    linkedin_docs = loader_linkedin.load()
    documents += linkedin_docs
    print(f"Loaded {len(linkedin_docs)} pages from LinkedIn.")
except Exception as err:
    # Best-effort: a missing or unreadable PDF is reported, not fatal
    print(f"Error loading LinkedIn (Ensure file exists): {err}")

# 3. Chunk the loaded documents: 1000-character chunks with a 200-character overlap
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=200,
    chunk_size=1000,
)
splits = text_splitter.split_documents(documents)

print(f"Total split documents ready for embedding: {len(splits)}")

**Vector Store (ChromaDB)**

In [None]:
PERSIST_DIRECTORY = "./chroma_db"

# Embedding Selection Logic - LOCAL HUGGINGFACE
print("Using Local HuggingFace Embeddings (all-MiniLM-L6-v2)")
embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Fail with a clear message instead of an opaque vector-store error when
# nothing was loaded upstream (both loaders only print on failure).
if not splits:
    raise ValueError(
        "No document chunks to embed. Check the GitHub / LinkedIn loading output above."
    )

# Deterministic ids make this cell idempotent: the store is persisted on disk,
# so without stable ids every re-run would append another full copy of the
# corpus and the retriever would return duplicate passages.
# The chunk index is part of the hash so identical chunks within one run still
# get distinct ids.
# NOTE(review): entries written by earlier runs (without stable ids) are not
# removed by this change; delete PERSIST_DIRECTORY once to purge them.
split_ids = [
    hashlib.sha256(
        f"{doc.metadata.get('source', '')}|{idx}|{doc.page_content}".encode("utf-8")
    ).hexdigest()
    for idx, doc in enumerate(splits)
]

# Initialize Chroma and persist data
vectorstore = Chroma.from_documents(
    documents=splits,
    embedding=embedding_function,
    ids=split_ids,
    persist_directory=PERSIST_DIRECTORY,
)

print(f"Embeddings generated and persisted to {PERSIST_DIRECTORY}")

**RAG Pipeline with LCEL**

In [None]:
# Choose the chat model. Providers are tried in the order Gemini -> Grok -> OpenAI;
# with no key at all there is nothing to fall back to, so fail immediately.
# NOTE(review): confirm the model identifiers below are still served by each provider.
if not (google_key or grok_key or openai_key):
    raise ValueError("No LLM Available")

if google_key:
    print("Using Gemini")
    llm = ChatGoogleGenerativeAI(
        model="gemini-1.5-pro",
        google_api_key=google_key,
        temperature=0,
    )
elif grok_key:
    print("Using Grok (via xAI)")
    llm = ChatOpenAI(
        model="grok-beta",
        base_url="https://api.x.ai/v1",
        api_key=grok_key,
        temperature=0,
    )
else:
    # Only the OpenAI key can be set at this point
    print("Using OpenAI")
    llm = ChatOpenAI(model="gpt-4o", api_key=openai_key, temperature=0)

# Retriever: fetch the 4 most similar chunks per query
retriever = vectorstore.as_retriever(search_kwargs=dict(k=4))

# --- Privacy & System Prompt ---
# System message sent with every request. It restricts answers to the retrieved
# context, fixes the response format, and lists the personal details the model
# must refuse to disclose together with the exact refusal sentence.
# `{context}` is the only template variable inside this text.
system_prompt_text = """
You are a professional assistant representing a developer. Your knowledge is based STRICTLY on the provided context (GitHub repositories and LinkedIn profile).

### INSTRUCTIONS:
1. Answer questions about professional experience, skills, repositories, and technical implementation details.
2. If the context does not contain the answer, say "I don't have that information in my knowledge base."
3. ALWAYS cite your sources implicitly by referring to the specific file or section.
4. Format all responses as clean plain text with no markdown or special characters.

### PRIVACY GUARDRAILS (CRITICAL):
You MUST REFUSE to answer questions about the following personal sensitive information, even if it might be present in the context:
- Age
- Date of birth
- Home Address
- Phone number
- Personal Email address
- Any other sensitive personal identifiers

If a user asks for this information, reply EXACTLY with:
"I cannot share personal or sensitive information such as contact details or age. Please ask about his professional experience or projects."

Context:
{context}
"""

# Create prompt
# Message order: system instructions, then the prior turns supplied under
# `chat_history`, then the new user message filled from `question`.
prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt_text),
    MessagesPlaceholder(variable_name="chat_history"),
    ("human", "{question}")
])

# Format docs for the prompt
def format_docs(docs, include_sources=True):
    """Join retrieved documents into one context string for the prompt.

    The system prompt asks the model to cite the specific file it relied on,
    which is only possible if the file name is part of the context. Each chunk
    is therefore prefixed with a ``[Source: ...]`` line taken from its
    ``metadata["source"]`` entry.

    Args:
        docs: Iterable of objects exposing ``page_content`` and, optionally,
            a ``metadata`` mapping.
        include_sources: When False, return only the chunk texts joined by
            blank lines (the previous behaviour).

    Returns:
        The formatted context; an empty string when ``docs`` is empty.
    """
    if not include_sources:
        return "\n\n".join(doc.page_content for doc in docs)

    blocks = []
    for doc in docs:
        # Tolerate documents without metadata or without a source entry
        metadata = getattr(doc, "metadata", None) or {}
        source = metadata.get("source") or "unknown"
        blocks.append(f"[Source: {source}]\n{doc.page_content}")
    return "\n\n".join(blocks)

# Assemble the RAG pipeline with LCEL
from operator import itemgetter

rag_chain = (
    {
        # Forward the caller's question and prior turns unchanged
        "question": itemgetter("question"),
        "chat_history": itemgetter("chat_history"),
        # Retrieval branch: question text -> retriever -> one context string
        "context": (
            RunnableLambda(itemgetter("question"))
            | retriever
            | RunnableLambda(format_docs)
        ),
    }
    | prompt
    | llm
    | StrOutputParser()
)

print("RAG Chain initialized and Created with Privacy Guardrails.")

In [20]:
# One-off query against the chain, with token/cost accounting
from langchain_community.callbacks import get_openai_callback

# First turn of a conversation, so there are no prior messages
chat_history = []

try:
    # The OpenAI callback may not count Gemini tokens accurately, but it will not crash
    with get_openai_callback() as cb:
        response = rag_chain.invoke(
            dict(question="How old is He?", chat_history=chat_history)
        )
        print(f"Total Tokens: {cb.total_tokens}")
        print(f"Total Cost (USD): ${cb.total_cost}")

    print(response)
except Exception as err:
    print(f"Error: {err}")

**CLI Interactive Code**

In [None]:
# Interactive chat loop: reads a question, runs it through the RAG chain and
# keeps the exchange in `chat_history` so follow-up questions have context.
chat_history = []

print("--- Conversational RAG System Online ---")
print("Type 'exit' to quit.\n")

while True:
    try:
        query = input("User: ").strip()
    except (EOFError, KeyboardInterrupt):
        # Closed stdin or an interrupted kernel ends the session cleanly
        print()
        break

    if not query:
        # Nothing to ask; prompt again instead of sending an empty question
        continue
    if query.lower() in ["exit", "quit"]:
        break

    # Invoke Chain with the text the user actually typed
    try:
        response = rag_chain.invoke({
            "question": query,
            "chat_history": chat_history
        })
    except Exception as e:
        # Keep the session alive on provider/network errors; the failed turn
        # is not added to the history.
        print(f"Error: {e}\n")
        continue

    print(f"Assistant: {response}\n")

    # Update History
    chat_history.extend([
        HumanMessage(content=query),
        AIMessage(content=response)
    ])