In [None]:
!pip install langchain langchain_community langchain-google-genai python-dotenv langchain_experimental langchain_chroma langchainhub pypdf


Collecting langchain_community
  Downloading langchain_community-0.3.13-py3-none-any.whl.metadata (2.9 kB)
Collecting langchain-google-genai
  Downloading langchain_google_genai-2.0.7-py3-none-any.whl.metadata (3.6 kB)
Collecting python-dotenv
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Collecting langchain_experimental
  Downloading langchain_experimental-0.3.4-py3-none-any.whl.metadata (1.7 kB)
Collecting langchain_chroma
  Downloading langchain_chroma-0.1.4-py3-none-any.whl.metadata (1.6 kB)
Collecting langchainhub
  Downloading langchainhub-0.1.21-py3-none-any.whl.metadata (659 bytes)
Collecting pypdf
  Downloading pypdf-5.1.0-py3-none-any.whl.metadata (7.2 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain_community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting httpx-sse<0.5.0,>=0.4.0 (from langchain_community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting langchain
  Downloading la

In [3]:
from langchain_community.document_loaders import PyPDFLoader

# Update the path to your PDF file
loader = PyPDFLoader("/content/EMP_Data.pdf")
data = loader.load()  # entire PDF is loaded as a single Document

# Verify the data
print(data)


[Document(metadata={'source': '/content/EMP_Data.pdf', 'page': 0}, page_content='Company: Tech Innovators Inc. \nEmployee: Jane Smith \n• Project: Nike \n• Rating: 4.5/5 \n• Technologies: Python, JavaScript, React, Node.js \nEmployee: John Doe \n• Project: Apple \n• Rating: 4.7/5 \n• Technologies: Swift, Objective-C, Kotlin, Java \nCompany: Future Solutions Ltd. \nEmployee: Emily Johnson \n• Project: LG \n• Rating: 4.3/5 \n• Technologies: Java, Spring Boot, Angular, SQL \nEmployee: Michael Brown \n• Project: Samsung \n• Rating: 4.6/5 \n• Technologies: C++, Python, TensorFlow, Keras \nCompany: Global Tech Services \nEmployee: Sarah Davis \n• Project: Audi \n• Rating: 4.8/5 \n• Technologies: JavaScript, TypeScript, Vue.js, Node.js \nEmployee: David Wilson \n• Project: Volvo \n• Rating: 4.4/5 \n• Technologies: Python, Django, Flask, PostgreSQL \nCompany: Advanced Tech Solutions \nEmployee: Emma Walker '), Document(metadata={'source': '/content/EMP_Data.pdf', 'page': 1}, page_content='• Pr

In [4]:
len(data)

4

In [5]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# split data
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000)
docs = text_splitter.split_documents(data)


print("Total number of documents: ",len(docs))

Total number of documents:  4


In [6]:
import os
from langchain_chroma import Chroma
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from dotenv import load_dotenv
from google.colab import userdata
api_key = userdata.get('GOOGLE_API_KEY')

# Load environment variables from a .env file
load_dotenv()
os.environ["GOOGLE_API_KEY"] = api_key

# Initialize the embeddings
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
vector = embeddings.embed_query("hello, world!")
print(vector[:5])


[0.05168594419956207, -0.030764883384108543, -0.03062233328819275, -0.02802734263241291, 0.01813093200325966]


In [7]:
vectorstore = Chroma.from_documents(documents=docs, embedding=GoogleGenerativeAIEmbeddings(model="models/embedding-001"))

In [8]:
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 5})

retrieved_docs = retriever.invoke("employee working in python?")



In [9]:
len(retrieved_docs)

4

In [10]:
print(retrieved_docs[0].page_content)

Employee: Alexander Morgan 
• Project: Apple 
• Rating: 4.6/5 
• Technologies: Swift, Objective-C, Kotlin, Java 
Company: Global IT Solutions 
Employee: Evelyn Parker 
• Project: LG 
• Rating: 4.3/5 
• Technologies: Java, Spring Boot, Angular, SQL 
Employee: Liam Bennett 
• Project: Samsung 
• Rating: 4.7/5 
• Technologies: Python, Django, TensorFlow, Keras


In [11]:
from langchain_google_genai import ChatGoogleGenerativeAI

llm = ChatGoogleGenerativeAI(model="gemini-1.5-pro",temperature=0.3, max_tokens=500)

In [12]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

In [13]:
question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [19]:
response = rag_chain.invoke({"input": "Tell me about the full details of the employee with highest rating who knows python ?"})
print("RAG Output:", response["answer"])




RAG Output: Several employees have high ratings and know Python.  Ava Scott has the highest rating (4.8/5) and uses Python along with other technologies while working on the Audi project at Tech Pioneers Ltd.  Other employees with the same rating do not list Python as a known technology.



In [20]:
from langchain_google_genai import GoogleGenerativeAI

# Initialize the Gemini 1.5 Pro model
model = GoogleGenerativeAI(model="models/gemini-1.5-pro")

# Directly invoke the model without retrieval
direct_response = model.invoke("Tell me about the full details of the employee with highest rating who knows python ??")
print("Direct Model Output:", direct_response)

Direct Model Output: Please provide me with the data about the employees. I need information like their names, ratings, skills (including whether they know Python or not), and any other details you want me to consider to identify the employee with the highest rating among those who know Python.  

For example, you could provide the data as a table like this:

| Name | Rating | Skills |
|---|---|---|
| Employee A | 4.8 | Python, Java, C++ |
| Employee B | 4.5 | Java, C# |
| Employee C | 5.0 | Python, SQL |
| Employee D | 4.2 | Python |
| Employee E | 4.9 | JavaScript, Python |


Once you provide the data, I can analyze it and give you the details of the employee with the highest rating who knows Python.

