In [1]:
import getpass
import os
from operator import itemgetter

from langchain.prompts import ChatPromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_google_genai import GoogleGenerativeAIEmbeddings

In [2]:
# LangSmith tracing settings (not secret, safe to keep in the notebook).
os.environ['LANGCHAIN_TRACING_V2'] = 'true'
os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'

# SECURITY: API keys must never be hardcoded in a notebook, because they end up
# in version control and in every shared copy. Keys are taken from the
# environment when already set; otherwise the user is prompted without echo.
# NOTE(review): the keys that were previously committed in this cell should be
# treated as leaked and revoked/rotated.
for secret_name in ('LANGCHAIN_API_KEY', 'GOOGLE_API_KEY'):
    if not os.environ.get(secret_name):
        os.environ[secret_name] = getpass.getpass(f'Enter {secret_name}: ')

In [12]:
# Directory that holds the source PDFs. Override it with the RAG_DATA_DIR
# environment variable; the default is the location the notebook was
# originally written against, so existing setups keep working.
DATA_DIR = os.environ.get('RAG_DATA_DIR', r'C:\Users\jaint\CC-Task2-RAG')
PDF_FILENAMES = (
    'SI Chronicles 23-24 Sem I.pdf',
    'Placement Chronicles 2023-24.pdf',
)

pdf_paths = [os.path.join(DATA_DIR, filename) for filename in PDF_FILENAMES]

# Fail early with an actionable message rather than a loader traceback.
missing_paths = [path for path in pdf_paths if not os.path.isfile(path)]
if missing_paths:
    raise FileNotFoundError(
        f"PDF file(s) not found: {missing_paths}. "
        "Set the RAG_DATA_DIR environment variable to the folder containing them."
    )

loader1, loader2 = (PyPDFLoader(path) for path in pdf_paths)

# `pages` accumulates the documents returned by both loaders, in order.
pages = loader1.load()
pages.extend(loader2.load())

print(len(pages))

226


In [13]:
# Split the loaded pages into chunks (size 800, overlap 50, measured by the
# splitter's tiktoken-based length function).
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=800,
    chunk_overlap=50,
)
splits = text_splitter.split_documents(pages)

# Embed every chunk and index it in a Chroma vector store.
embedding_model = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
vectorstore = Chroma.from_documents(documents=splits, embedding=embedding_model)

# Retriever configured with k=10 for each query.
retriever = vectorstore.as_retriever(search_kwargs={"k": 10})

# Chat model shared by every chain below; temperature is set to 0.
llm = ChatGoogleGenerativeAI(model="gemini-1.5-pro", temperature=0)

In [21]:
# Prompt that asks the LLM to break the user's question into sub-questions.
template = """You are a helpful assistant that generates multiple sub-questions related to an input question. \n
The goal is to break down the input into a set of sub-problems / sub-questions that can be answered in isolation. \n
Generate only the sub-problems using as little formatting as needed.\n
You must make them relevant from the perspective of a college student seeking help in securing placements. \n
Generate multiple sub-questions related to: {question} \n
Output (5 queries):"""
prompt_decomposition = ChatPromptTemplate.from_template(template)


def split_sub_questions(llm_output):
    """Turn the raw LLM text into a clean list of sub-questions.

    Each line of ``llm_output`` is treated as one sub-question. Surrounding
    whitespace is stripped and blank / whitespace-only lines are dropped, so
    callers never receive empty entries or entries with trailing spaces.

    Args:
        llm_output: The string produced by the decomposition chain's LLM step.

    Returns:
        A list of non-empty, stripped sub-question strings in original order.
    """
    return [line.strip() for line in llm_output.splitlines() if line.strip()]


# Chain: prompt -> LLM -> plain string -> list of cleaned sub-questions.
generate_queries_decomposition = (prompt_decomposition | llm | StrOutputParser() | split_sub_questions)


question = "What do I need to do to secure a placement in the finance field?"
questions = generate_queries_decomposition.invoke({"question": question})

print(questions)

['What are the typical career paths in finance that interest me?', 'What skills and qualifications are essential for these roles?', 'How can I gain relevant experience through internships or projects?', 'How do I network effectively within the finance industry?', 'How can I improve my resume and cover letter to stand out to potential employers? ']


In [23]:
# Prompt used to answer ONE sub-question from its retrieved context. It has two
# placeholders, {context} and {question}, and is passed to retrieve_and_rag()
# further down in this cell.
# NOTE: the names `template` and `prompt` are rebound later in this cell for
# the final synthesis prompt, so this prompt must be used before that point.
template = """You are a helpful assistant that answers questions based on the following context: {context}\n
You must answer the questions from the perspective of a college student seeking help in securing placements. \n
Answer using as little formatting as possible.\n
Question: {question}\n"""

prompt = ChatPromptTemplate.from_template(template)

def retrieve_and_rag(prompt_rag, sub_questions):
    """Answer each sub-question using HyDE-style retrieval followed by RAG.

    For every sub-question, the module-level ``llm`` first writes a
    hypothetical passage answering it; that passage is used as the query for
    the module-level ``retriever``. The text of the retrieved documents and the
    sub-question are then fed to ``prompt_rag`` and the LLM to get an answer.

    Args:
        prompt_rag: Prompt template that is invoked with the keys ``context``
            and ``question``.
        sub_questions: Iterable of sub-question strings.

    Returns:
        A list of answer strings, one per sub-question, in the same order.
    """
    hyde_template = """Please write a scientific paper passage to answer the question
    Question: {question}
    Passage:"""
    prompt_hyde = ChatPromptTemplate.from_template(hyde_template)

    # Neither chain depends on the individual sub-question, so build them once
    # instead of on every loop iteration.
    retrieval_chain = prompt_hyde | llm | StrOutputParser() | retriever
    answer_chain = prompt_rag | llm | StrOutputParser()

    rag_results = []
    for sub_question in sub_questions:
        # Generate a hypothetical passage and retrieve documents similar to it.
        retrieved_docs = retrieval_chain.invoke({"question": sub_question})

        # Pass only the passage text to the prompt; formatting the Document
        # objects directly would inject their repr (metadata included).
        context_text = "\n\n".join(doc.page_content for doc in retrieved_docs)

        rag_results.append(
            answer_chain.invoke({"context": context_text, "question": sub_question})
        )

    return rag_results


# Answer every generated sub-question with the per-question RAG prompt.
answers = retrieve_and_rag(prompt_rag=prompt, sub_questions=questions)

def format_qa_pairs(questions, answers):
    """Render questions and answers as one numbered Q/A text block.

    Pairs are numbered from 1 and formed positionally; if the two sequences
    differ in length, the extra items of the longer one are ignored. Leading
    and trailing whitespace of the combined string is removed.
    """
    numbered_pairs = enumerate(zip(questions, answers), start=1)
    entries = [
        f"Question {idx}: {q_text}\nAnswer {idx}: {a_text}\n\n"
        for idx, (q_text, a_text) in numbered_pairs
    ]
    return "".join(entries).strip()


# Collapse the sub-question answers into a single Q+A context string.
context = format_qa_pairs(questions, answers)

# Synthesis prompt. From here on `template` and `prompt` refer to this prompt,
# not the per-question prompt defined at the top of the cell.
template = """Here is a set of Q+A pairs:

{context}

Use these to synthesize an answer to the question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)

# Chain: synthesis prompt -> LLM -> plain string.
final_rag_chain = prompt | llm | StrOutputParser()

final_answer = final_rag_chain.invoke({"question": question, "context": context})
print(final_answer)

Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..
Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 4.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..
Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 8.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..
Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..
Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..
Retrying langchain_google_genai.chat_models._chat_with_retry.<loc

To secure a placement in the finance field, particularly in quantitative finance, focus on the following:

**1. Technical Skills:**

* **Master data structures and algorithms (DSA):** This is crucial for many quantitative roles.
* **Develop strong coding skills:**  Focus on languages relevant to finance and data analysis.
* **Understand core CS concepts:** Operating systems, database management systems, and object-oriented programming (OOP) are important.
* **Gain familiarity with relevant tools:** Excel, SQL, and statistical software packages are frequently used.

**2. Relevant Experience:**

* **Pursue internships:** Target companies offering roles in data analytics, model validation, or related areas.
* **Highlight relevant projects:** Showcase projects involving data analysis, financial modeling, or large datasets.
* **Consider consulting experience:**  Experience with firms like EluciData Consulting can provide valuable exposure to handling large datasets.

**3. Application Materi