In [1]:
from langchain_community.document_loaders import PyPDFLoader,UnstructuredFileLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community import output_parsers
from langchain_community.vectorstores import Chroma 
from langchain_community.llms import Ollama
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
import streamlit as st
import glob
import os
import pandas as pd
from langchain_core.documents import Document
from langchain_unstructured import UnstructuredLoader
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores.utils import filter_complex_metadata

In [2]:
base_path = os.path.join(os.getcwd(), "data")  
pdf_paths = glob.glob(os.path.join(base_path, "**", "*.pdf"), recursive=True)
docx_paths = glob.glob(os.path.join(base_path, "**", "*.docx"), recursive=True)
all_documents = []

for path in pdf_paths:
        try:
            loader = PyPDFLoader(path)
            docs = loader.load()
            all_documents.extend(docs)
        except Exception as e:
            print(f"Error loading {path}: {e}")

for path in docx_paths:
        try:
            loader = UnstructuredFileLoader(path)
            docs = loader.load()
            all_documents.extend(docs)
        except Exception as e:
            print(f"❌ Error loading DOCX {path}: {e}")



  loader = UnstructuredFileLoader(path)


In [3]:
all_documents

[Document(metadata={'producer': '3-Heights™ PDF Optimization API 6.17.0.2 (http://www.pdf-tools.com)', 'creator': 'Chromium', 'creationdate': '2024-01-25T18:39:31+00:00', 'moddate': '2024-01-25T18:39:32+00:00', 'source': 'c:\\Users\\DELL\\OneDrive\\Desktop\\genai1\\data\\Intern Talks\\21-25 batch Intern Talk.pdf', 'total_pages': 15, 'page': 0, 'page_label': '1'}, page_content='INTERN TALK\nData Science and Artificial Intelligence Tensor'),
 Document(metadata={'producer': '3-Heights™ PDF Optimization API 6.17.0.2 (http://www.pdf-tools.com)', 'creator': 'Chromium', 'creationdate': '2024-01-25T18:39:31+00:00', 'moddate': '2024-01-25T18:39:32+00:00', 'source': 'c:\\Users\\DELL\\OneDrive\\Desktop\\genai1\\data\\Intern Talks\\21-25 batch Intern Talk.pdf', 'total_pages': 15, 'page': 1, 'page_label': '2'}, page_content='•\n•\n•\n•\n•\nValuable experience, and industrial exposure.\nChance for pre-placement offers (PPOs).\nProvides edge in placement interviews.\nStipend :)\nInternship Data: 2023

In [4]:
text_splitter=RecursiveCharacterTextSplitter(chunk_size=1000,chunk_overlap=200)
docs=text_splitter.split_documents(all_documents)


In [5]:
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
huggingface_embeddings=HuggingFaceBgeEmbeddings(
    model_name="BAAI/bge-base-en-v1.5",      
    model_kwargs={'device':'cpu'},
    encode_kwargs={'normalize_embeddings':True}

)


  huggingface_embeddings=HuggingFaceBgeEmbeddings(


In [6]:
from langchain_community.vectorstores import Chroma

db = Chroma.from_documents(all_documents, huggingface_embeddings)



In [7]:
query="what are the projects for quant roles"
result=db.similarity_search(query)
result

[Document(metadata={'total_pages': 15, 'page_label': '7', 'moddate': '2024-01-25T18:39:32+00:00', 'creator': 'Chromium', 'producer': '3-Heights™ PDF Optimization API 6.17.0.2 (http://www.pdf-tools.com)', 'creationdate': '2024-01-25T18:39:31+00:00', 'page': 6, 'source': 'c:\\Users\\DELL\\OneDrive\\Desktop\\genai1\\data\\Intern Talks\\21-25 batch Intern Talk.pdf'}, page_content='•\n•\n•\n•\n•\n•\nSDE (Aryan Lath & vrooon, SWE Intern @ Google)\nQuant (Aditya Gupta, Quant Intern @ Quadeye)\nML/Data Science (Shantanu Chaudhari, DS Intern @ Infoedge)\nResearch [MITACS, EPFL,  …]\nDesign\nOff-campus\nRoles'),
 Document(metadata={'page': 2, 'creator': 'PyPDF', 'source': 'c:\\Users\\DELL\\OneDrive\\Desktop\\genai1\\data\\Internship Interview Experiences\\2023 DSAI Internship Experiences.pdf', 'producer': 'Skia/PDF m139 Google Docs Renderer', 'title': 'DSAI Internship Interview Experiences', 'page_label': '3', 'total_pages': 31, 'creationdate': ''}, page_content="Quadeye   Roles  offered:  Quant

In [8]:
ret=db.as_retriever()

In [9]:
from langchain_community.llms import Ollama
from langchain_core.prompts import ChatPromptTemplate

llm = Ollama(model="llama3")

prompt = ChatPromptTemplate.from_messages([
    ("system", "Answer me clearly from the context in depth, easy to understand. If you do this I will tip you $100."),
    ("human", "<context>\n{context}\n</context>\nQuestion: {input}")
])


  llm = Ollama(model="llama3")


In [10]:
from langchain.chains.combine_documents import create_stuff_documents_chain
doc_chain=create_stuff_documents_chain(llm,prompt)

In [11]:
from langchain.chains import create_retrieval_chain
rc=create_retrieval_chain(ret,doc_chain)


In [13]:
rc.invoke({"input":"give some projects that my seniors put in their cvs"})

{'input': 'give some projects that my seniors put in their cvs',
 'context': [Document(metadata={'producer': 'Canva', 'creationdate': '2025-01-16T11:34:34+00:00', 'page_label': '25', 'moddate': '2025-01-16T11:34:31+00:00', 'page': 24, 'author': 'Sanjana Kolisetty', 'keywords': 'DAGcWmjJ7no,BAE14CjFFr0', 'total_pages': 28, 'creator': 'Canva', 'title': 'Copy of INTERN TALK & SEMESTER-4 ACADEMIC TALK', 'source': 'c:\\Users\\DELL\\OneDrive\\Desktop\\genai1\\data\\Intern Talks\\intren talk.pdf'}, page_content='R e s u m e \nMention only those PORs and extracurricular activites for which you have\na certificate. \nPlease note that you will have to submit proof of every single point that\nyou write in you CV during CV submission.\nYour CV has to be only one page. If your CV is taking more than one page,\nthere has to be somethings which are not relevant or important.\nMake sure you know the courses that you are mentioning. For instance, if\nyou mention ‘Discrete Mathematics’ in your CV, be pr

In [15]:
from langchain_core.runnables import RunnableMap
rc1 = RunnableMap({
    "context": lambda x: ret.get_relevant_documents(x["input"]),
    "input": lambda x: x["input"]
}) | doc_chain


In [1]:
from pinecone import Pinecone
api="pcsk_4LLsH8_UxTh1J4odJSxT1ZbQeab7E6u8nRzuQcPK7YnNRxbmft3oADmjjSShCFvn3sqeAf"
pc = Pinecone(api_key=api)
index = pc.Index("intern")

In [6]:
from pinecone import Pinecone

# Initialize the Pinecone client
api = "pcsk_4LLsH8_UxTh1J4odJSxT1ZbQeab7E6u8nRzuQcPK7YnNRxbmft3oADmjjSShCFvn3sqeAf"
pc = Pinecone(api_key=api)

# Connect to your existing index
index = pc.Index("intern")

# Use Pinecone's inference API to get embeddings
response = pc.inference.embed(
    model="llama-text-embed-v2",
    inputs=[
        "Pinecone is amazing!",
        "LLaMA embeddings are fast and accurate."
    ],
    parameters={
        "input_type": "passage",
        "truncate": "END"
    }
)




In [7]:
from pinecone.grpc import PineconeGRPC as Pinecone

# Initialize client
pc = Pinecone(api_key=api)

# Example texts
texts = [
    "Pinecone is amazing!",
    "LLaMA embeddings are fast and accurate."
]

# Generate embeddings
response = pc.inference.embed(
    model="llama-text-embed-v2",
    inputs=texts,
    parameters={"input_type": "passage", "truncate": "END"}
)

# The returned object looks like:
# EmbeddingsList(model='llama-text-embed-v2', data=[ {...}, {...} ], usage={...})

# Loop over your original texts and response data
for text, item in zip(texts, response.data):
    print(f"Text: {text}")
    print("Embedding (first 5 dims):", item["values"][:5])
    print()


Text: Pinecone is amazing!
Embedding (first 5 dims): [0.01496124267578125, 0.0052642822265625, 0.03076171875, 0.022308349609375, -0.01363372802734375]

Text: LLaMA embeddings are fast and accurate.
Embedding (first 5 dims): [-0.03668212890625, -0.0270538330078125, 0.0179443359375, 0.0188751220703125, 0.0635986328125]



In [1]:
from langchain_community.document_loaders import PyPDFLoader

loader = PyPDFLoader("data\Internship CVs\CE_CONSULTING_KPMG.pdf")
docs = loader.load()

print(f"✅ Pages loaded: {len(docs)}")
print("👉 Sample text:", docs[0].page_content[:500])


  loader = PyPDFLoader("data\Internship CVs\CE_CONSULTING_KPMG.pdf")


✅ Pages loaded: 1
👉 Sample text: Utkarsh Utpal +91-6202999205
Roll No.:210104113 u.utpal@iitg.ac.in
B.Tech - Civil Engineering utkarshutpal04@gmail.com
Indian Institute Of Technology, Guwahati linkedin
Education
Degree/Certificate Institute/Board CGPA/Percentage Year
B.Tech. Major Indian Institute of Technology, Guwahati 7.41 (Current) 2021-Present
Senior Secondary CBSE Board 90% 2021
Secondary CBSE Board 95% 2019
Experience
• Finance Intern (CEO’s office) | Vedantu Ongoing
Estimated criteria for short listing colleges for acqu


In [2]:
import fitz
doc = fitz.open("data\Internship CVs\CE_CONSULTING_KPMG.pdf")
text = ""
for page in doc:
    text += page.get_text("text")
print(text)

  doc = fitz.open("data\Internship CVs\CE_CONSULTING_KPMG.pdf")


Utkarsh Utpal
+91-6202999205
Roll No.:210104113
u.utpal@iitg.ac.in
B.Tech - Civil Engineering
utkarshutpal04@gmail.com
Indian Institute Of Technology, Guwahati
linkedin
Education
Degree/Certificate
Institute/Board
CGPA/Percentage
Year
B.Tech. Major
Indian Institute of Technology, Guwahati
7.41 (Current)
2021-Present
Senior Secondary
CBSE Board
90%
2021
Secondary
CBSE Board
95%
2019
Experience
• Finance Intern (CEO’s office) | Vedantu
Ongoing
Estimated criteria for short listing colleges for acquisition, suggested intervention in the hybrid model
– Modelled future 1 year P&L plan with EBITDA zero & predicted revenue of Rs 15cr for 10 hybrid centres in India
– Empowered organic growth with 3 strategies by analysing 6 marketing lead sources with 20000+ leads
– Suggested 15 colleges fit for acquisition or merger by analysing the online and offline education sector
– Evaluated the financials of Urbane Academy to propose acquisition cost of Rs 40cr with a revenue multiple of 2
• Government I

In [4]:
len(text)
print(text)

Utkarsh Utpal
+91-6202999205
Roll No.:210104113
u.utpal@iitg.ac.in
B.Tech - Civil Engineering
utkarshutpal04@gmail.com
Indian Institute Of Technology, Guwahati
linkedin
Education
Degree/Certificate
Institute/Board
CGPA/Percentage
Year
B.Tech. Major
Indian Institute of Technology, Guwahati
7.41 (Current)
2021-Present
Senior Secondary
CBSE Board
90%
2021
Secondary
CBSE Board
95%
2019
Experience
• Finance Intern (CEO’s office) | Vedantu
Ongoing
Estimated criteria for short listing colleges for acquisition, suggested intervention in the hybrid model
– Modelled future 1 year P&L plan with EBITDA zero & predicted revenue of Rs 15cr for 10 hybrid centres in India
– Empowered organic growth with 3 strategies by analysing 6 marketing lead sources with 20000+ leads
– Suggested 15 colleges fit for acquisition or merger by analysing the online and offline education sector
– Evaluated the financials of Urbane Academy to propose acquisition cost of Rs 40cr with a revenue multiple of 2
• Government I