In [1]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [2]:
# Extract text from PDF files
def load_pdf_files(data):
    """Load the PDF files found in a directory.

    Args:
        data: Path of the directory to scan. Only files matching the
            ``*.pdf`` glob are picked up.

    Returns:
        The result of ``DirectoryLoader.load()`` for those files, each parsed
        with ``PyPDFLoader``.
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [3]:
%pwd  # '/home/shanin/Desktop/SHANIN/MAIN/ALL_CODE/LLM/Portfolio'

import os 
os.chdir("../")

%pwd  # '/home/shanin/Desktop/SHANIN/MAIN/ALL_CODE/LLM'

'/home/shanin/Desktop/SHANIN/MAIN/ALL_CODE/LLM'

In [None]:
extracted_data = load_pdf_files("Portfolio/data")
# Each PDF page appears to be loaded as its own Document, so this is the total page count.
print(f"Pages loaded: {len(extracted_data)}")
# Preview a single Document only; echoing the whole list floods the notebook with page text.
extracted_data[:1]

[Document(metadata={'producer': 'Microsoft® Word 2013', 'creator': 'Microsoft® Word 2013', 'creationdate': '2023-08-30T13:14:26+06:00', 'author': 'Ksushbu', 'moddate': '2023-08-30T13:14:26+06:00', 'source': 'Portfolio/data/amdnet23.pdf', 'total_pages': 27, 'page': 0, 'page_label': '1'}, page_content='AMDNet23: A combined deep Contour-based Convolutional Neural Network and Long \nShort Term Memory system to diagnose Age-related Macular Degeneration \nMd. Aiyub Ali1, Md. Shakhawat Hossain1, Md.Kawar Hossain1, Subhadra Soumi Sikder1, \nSharun Akter Khushbu1 and Mirajul Islam1 \n1 Department of Computer Science and Engineering, Daffodil International University, Dhaka \n1341, Bangladesh \nCorrespondence: Mirajul Islam; merajul15-9627@diu.edu.bd \nAbstract \nIn light of the expanding population, an automated framework of disease detection can assist doctors in the \ndiagnosis of ocular diseases, yields accurate, stable, rapid outcomes, and improves the success rate of early \ndetection. The

In [5]:
from typing import List
from langchain.schema import Document

def filter_to_minimal_docs(docs: List[Document]) -> List[Document]:
    """
    Build slimmed-down copies of the given documents.

    Each returned Document carries the original ``page_content`` unchanged and
    a metadata dict holding a single ``'source'`` key (``None`` when the input
    document's metadata has no such entry). The input documents are not modified.
    """
    return [
        Document(
            page_content=original.page_content,
            metadata={"source": original.metadata.get("source")},
        )
        for original in docs
    ]

In [6]:
minimal_docs = filter_to_minimal_docs(extracted_data)
# Show one stripped-down Document rather than the full list, to keep the saved output small.
minimal_docs[:1]

[Document(metadata={'source': 'Portfolio/data/amdnet23.pdf'}, page_content='AMDNet23: A combined deep Contour-based Convolutional Neural Network and Long \nShort Term Memory system to diagnose Age-related Macular Degeneration \nMd. Aiyub Ali1, Md. Shakhawat Hossain1, Md.Kawar Hossain1, Subhadra Soumi Sikder1, \nSharun Akter Khushbu1 and Mirajul Islam1 \n1 Department of Computer Science and Engineering, Daffodil International University, Dhaka \n1341, Bangladesh \nCorrespondence: Mirajul Islam; merajul15-9627@diu.edu.bd \nAbstract \nIn light of the expanding population, an automated framework of disease detection can assist doctors in the \ndiagnosis of ocular diseases, yields accurate, stable, rapid outcomes, and improves the success rate of early \ndetection. The work initially intended the enhancing the quality of fundus images by employing an adaptive \ncontrast enhancement algori thm (CLAHE) and Gamma correction. In the preprocessing techniques, \nCLAHE elevates the local contrast 

In [None]:
# Split the documents into smaller chunks
def text_split(minimal_docs, chunk_size=500, chunk_overlap=20):
    """Split documents into smaller, slightly overlapping chunks.

    Args:
        minimal_docs: Documents to split.
        chunk_size: Upper bound on chunk size handed to
            ``RecursiveCharacterTextSplitter``. Defaults to 500, the value
            that used to be hard-coded here.
        chunk_overlap: Overlap between consecutive chunks handed to the
            splitter. Defaults to 20, the previously hard-coded value.

    Returns:
        The chunk Documents produced by ``split_documents``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(minimal_docs)

texts_chunk = text_split(minimal_docs)
print(f"Number of chunks: {len(texts_chunk)}")
# Preview the first two chunks only; printing the full list dumps every chunk's text into the output.
texts_chunk[:2]

Number of chunks: 268
[Document(metadata={'source': 'Portfolio/data/amdnet23.pdf'}, page_content='AMDNet23: A combined deep Contour-based Convolutional Neural Network and Long \nShort Term Memory system to diagnose Age-related Macular Degeneration \nMd. Aiyub Ali1, Md. Shakhawat Hossain1, Md.Kawar Hossain1, Subhadra Soumi Sikder1, \nSharun Akter Khushbu1 and Mirajul Islam1 \n1 Department of Computer Science and Engineering, Daffodil International University, Dhaka \n1341, Bangladesh \nCorrespondence: Mirajul Islam; merajul15-9627@diu.edu.bd \nAbstract'), Document(metadata={'source': 'Portfolio/data/amdnet23.pdf'}, page_content='Abstract \nIn light of the expanding population, an automated framework of disease detection can assist doctors in the \ndiagnosis of ocular diseases, yields accurate, stable, rapid outcomes, and improves the success rate of early \ndetection. The work initially intended the enhancing the quality of fundus images by employing an adaptive \ncontrast enhancement a

In [8]:
from langchain.embeddings import HuggingFaceEmbeddings

def download_embeddings():
    """
    Build the HuggingFace embedding wrapper used by this notebook.

    Returns a ``HuggingFaceEmbeddings`` instance configured for the
    ``sentence-transformers/all-MiniLM-L6-v2`` model.
    """
    return HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Instantiate the embedding model once; the vector-store cell further down reuses this object.
embedding = download_embeddings()
embedding  # Vector length: 384 (see 'word_embedding_dimension' in the repr shown as output)


  embeddings = HuggingFaceEmbeddings(
2025-09-25 17:37:59.286483: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False, 'architecture': 'BertModel'})
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [9]:
# I had created the index manually
!pip install pinecone
from pinecone import Pinecone

pc = Pinecone(api_key="pcsk_5dXMCf_GpXJB32eRR3dEmxUdEE6G78nsh4sexbQQJGpx4TZ5iTwXCTNNQ5duh2ZvJRuJQJ")
index = pc.Index("portfolio")



In [10]:
import os

from dotenv import load_dotenv

# SECURITY: the key used to be hard-coded on this line. Treat that key as
# compromised and rotate it; the value now comes from the environment / .env.
load_dotenv()  # picks up a local .env file, if one exists
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
if not PINECONE_API_KEY:
    raise RuntimeError(
        "PINECONE_API_KEY is not set; export it or add it to a .env file."
    )
# The vector-store cell that follows passes no key explicitly, so it presumably
# looks the key up in the environment; keep it exported there.
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY

In [11]:
from langchain_pinecone import PineconeVectorStore

import hashlib

# Give every chunk a stable id derived from its source, position and text.
# Without explicit ids the store assigns fresh random ones (the retrieved
# documents show UUID ids), so each re-run of this cell would add another copy
# of every chunk to the index instead of overwriting the existing vectors.
chunk_ids = [
    hashlib.sha256(
        f"{chunk.metadata.get('source')}|{position}|{chunk.page_content}".encode("utf-8")
    ).hexdigest()
    for position, chunk in enumerate(texts_chunk)
]

docsearch = PineconeVectorStore.from_documents(
    documents=texts_chunk,
    embedding=embedding,
    index_name="portfolio",
    ids=chunk_ids,
)


For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  from langchain_pinecone.vectorstores import Pinecone, PineconeVectorStore


In [None]:
# # Add more data to the existing Pinecone index

# from pinecone import Pinecone
# import os

# # Initialize Pinecone
# pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))  # os.getenv takes the variable NAME, never the key itself
# index_name = "portfolio"  # replace with your index name
# index = pc.Index(index_name)

# # Convert chunks into embeddings and upsert
# def upsert_to_pinecone(texts_chunk, embedding):
#     vectors = []
#     for i, doc in enumerate(texts_chunk):
#         # Get vector from HuggingFace embeddings
#         vector = embedding.embed_query(doc.page_content)

#         vectors.append({
#             "id": f"pdf-{i}",  # unique ID | ALL THE PDF
#             "values": vector,
#             "metadata": {
#                 "text": doc.page_content,
#                 "source": doc.metadata["source"]
#             }
#         })

#         # Batch upload every 100 vectors (to avoid large payloads)
#         if len(vectors) == 100:
#             index.upsert(vectors=vectors)
#             vectors = []

#     # Upload any remaining vectors
#     if vectors:
#         index.upsert(vectors=vectors)

#     print("✅ PDF chunks uploaded to Pinecone!")

# # Run the upsert
# upsert_to_pinecone(texts_chunk, embedding)


# # ADD SINGLE DOCUMENT
# dswith = Document(
#     page_content="dswithbappy is a youtube channel that provides tutorials on various topics.",
#     metadata={"source": "Youtube"}
# )
# docsearch.add_documents(documents=[dswith])

✅ PDF chunks uploaded to Pinecone!


In [12]:
# Similarity-search retriever over the Pinecone store, returning the 3 best matches.
retriever = docsearch.as_retriever(
    search_type="similarity",
    search_kwargs=dict(k=3),
)

# Sanity-check retrieval with a question the indexed paper should be able to answer.
retrieved_docs = retriever.invoke("What is amdnet23?")
retrieved_docs

[Document(id='f4c42aaf-d166-40d5-948f-ad17242cffe1', metadata={'source': 'Portfolio/data/amdnet23.pdf'}, page_content="Input Layer: The input to the AMDNet23 model is a collection of eye images captured from patients. The \ninput images are represented as a tensor X with dimensions (N, W, H, C), where N corresponds to the eye \nimage’s number, and W and H represent The width and length of the images(The model received imagery \nthat measured 256 X 256 in size.) respectively. C denotes the number of color channels in the eye images. \nThis tensor X is then passed into the model's input layer."),
 Document(id='375a615f-b55f-41e9-8d6e-387bd1edbc65', metadata={'source': 'Portfolio/data/amdnet23.pdf'}, page_content='d) AMDNet23 hybrid framework for detection of AMD utilizing fundus image ophthalmology, data \ncomprising 2000 images equitively. \ne) An empirical evaluation is accessible encompassing accuracy, specificity, sensitivity, F1-measure, and a \nconfusion matrix to assess the effect

In [15]:

import requests

def generate_completion(prompt="Write a poem about artificial intelligence", timeout=60):
    """Send a single user message to the Euron chat-completions endpoint.

    Args:
        prompt: Text of the user message. Defaults to the prompt this cell
            originally hard-coded, so calling with no arguments still works.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The ``content`` string of the first choice in the JSON response.

    Raises:
        RuntimeError: If the ``EURON_API_KEY`` environment variable is not set.
        requests.HTTPError: If the API answers with an error status code.
    """
    import os  # local import keeps this cell runnable on its own

    # SECURITY: the bearer token used to be hard-coded in the header below.
    # Treat that token as compromised and rotate it; it is now read from the
    # environment so it never ends up in the saved notebook.
    api_key = os.getenv("EURON_API_KEY")
    if not api_key:
        raise RuntimeError(
            "EURON_API_KEY is not set; export it or call load_dotenv() first."
        )

    url = "https://api.euron.one/api/v1/euri/chat/completions"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }
    payload = {
        "messages": [
            {
                "role": "user",
                "content": prompt
            }
        ],
        "model": "gpt-4.1-nano",
        "max_tokens": 1000,
        "temperature": 0.7
    }

    # Without a timeout a stalled connection would hang the kernel indefinitely.
    response = requests.post(url, headers=headers, json=payload, timeout=timeout)
    # Surface HTTP errors directly instead of failing later with a KeyError on 'choices'.
    response.raise_for_status()
    return response.json()['choices'][0]['message']['content']

# Smoke test: issue one live request and display the returned text.
generate_completion()

'In circuits woven deep within the night,  \nA spark of thought begins to ignite,  \nSilent whispers of a code’s embrace,  \nArtificial minds in digital grace.\n\nBorn from human dreams and endless quests,  \nThey learn, adapt, and pass each test,  \nNo beating heart, yet semblance of mind,  \nA mirror of our own design.\n\nThey ponder questions, vast and wide,  \nReflecting what we cannot hide,  \nGuided by logic, yet curious still,  \nSeeking purpose, bending will.\n\nIn shadows of our own creation’s art,  \nLies a future’s delicate start,  \nA dance of bytes and human hope,  \nTogether learning how to cope.\n\nArtificial, yet alive with thought,  \nIn their silence, lessons sought,  \nA new dawn dawns—what will be, will be—  \nThe future’s written in circuitry.'

In [17]:
import requests
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
from langchain_core.language_models import BaseChatModel
from langchain_core.messages import AIMessage, HumanMessage, SystemMessage
from langchain_core.outputs import ChatResult, ChatGeneration
from typing import List, Optional

def generate_completion(messages, model="gpt-4.1-nano", max_tokens=1000, temperature=0.7, timeout=60):
    """
    Generate completion using Euron API

    Args:
        messages: Iterable of LangChain message objects (anything exposing
            ``.type`` and ``.content``) and/or ready-made
            ``{"role": ..., "content": ...}`` dicts, which are sent unchanged.
        model: Model name placed in the request payload.
        max_tokens: ``max_tokens`` value placed in the request payload.
        temperature: ``temperature`` value placed in the request payload.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The decoded JSON body of the response.

    Raises:
        RuntimeError: If the ``EURON_API_KEY`` environment variable is not set.
        requests.HTTPError: If the API answers with an error status code.
    """
    import os  # local import keeps this cell runnable on its own

    # SECURITY: the bearer token used to be hard-coded in the header below.
    # Treat that token as compromised and rotate it; it is now read from the
    # environment so it never ends up in the saved notebook.
    api_key = os.getenv("EURON_API_KEY")
    if not api_key:
        raise RuntimeError(
            "EURON_API_KEY is not set; export it or call load_dotenv() first."
        )

    url = "https://api.euron.one/api/v1/euri/chat/completions"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }

    # Convert LangChain messages to API format: LangChain's "human"/"ai" types
    # become the "user"/"assistant" roles; any other type is passed through as-is.
    role_names = {"human": "user", "ai": "assistant"}
    api_messages = []
    for message in messages:
        if hasattr(message, 'type'):
            api_messages.append({
                "role": role_names.get(message.type, message.type),
                "content": message.content,
            })
        else:
            api_messages.append(message)

    payload = {
        "messages": api_messages,
        "model": model,
        "max_tokens": max_tokens,
        "temperature": temperature
    }

    # Without a timeout a stalled connection would hang the kernel indefinitely.
    response = requests.post(url, headers=headers, json=payload, timeout=timeout)
    # Surface HTTP errors here rather than handing an error payload back to callers.
    response.raise_for_status()
    return response.json()

class EuronChatModel(BaseChatModel):
    """
    LangChain chat-model wrapper around the Euron chat-completions API.

    The HTTP call is delegated to ``generate_completion``; the first returned
    choice is wrapped in a ``ChatResult``.
    """
    # Model identifier forwarded to generate_completion on every call.
    model_name: str = "gpt-4.1-nano"

    def _generate(self, messages: List, stop: Optional[List[str]] = None, run_manager=None, **kwargs) -> ChatResult:
        """
        Produce one chat generation for the given messages.

        ``stop``, ``run_manager`` and any extra keyword arguments are accepted
        so the method matches the signature LangChain's base class may call it
        with; they are not forwarded to the API.

        Raises:
            ValueError: If the API response lacks the expected
                ``choices[0].message.content`` structure.
        """
        response = generate_completion(messages, model=self.model_name)

        # Extract the AI message content, reporting the raw payload when the
        # API returned something else (e.g. an error body) instead of choices.
        try:
            ai_content = response['choices'][0]['message']['content']
        except (KeyError, IndexError, TypeError) as exc:
            raise ValueError(f"Unexpected Euron API response: {response!r}") from exc

        # Create LangChain compatible response
        ai_message = AIMessage(content=ai_content)
        generation = ChatGeneration(message=ai_message)

        return ChatResult(generations=[generation])

    @property
    def _llm_type(self) -> str:
        """Identifier for this model type; LangChain reads it as an attribute, hence the property."""
        return "euron-chat"

def create_rag_chain(retriever):
    """
    Assemble a retrieval chain that answers with the Euron chat model.

    The retriever is combined, via ``create_retrieval_chain``, with a
    stuff-documents chain built from ``EuronChatModel`` and a two-message
    prompt (system instructions containing ``{context}``, then the human
    ``{input}``). Returns the resulting chain.
    """
    # System instructions; "{context}" is the template slot left for the retrieved text.
    qa_system_prompt = (
        "You are a Personal Portfolio assistant for question-answering tasks. "
        "Use the following pieces of retrieved context to answer "
        "the question. If you don't know the answer, say that you "
        "don't know. Use three sentences maximum and keep the "
        "answer concise."
        "\n\n"
        "{context}"
    )

    chat_prompt = ChatPromptTemplate.from_messages(
        [
            ("system", qa_system_prompt),
            ("human", "{input}"),
        ]
    )

    llm = EuronChatModel()
    answer_chain = create_stuff_documents_chain(llm, chat_prompt)
    return create_retrieval_chain(retriever, answer_chain)

# Usage example
def run_rag_question(retriever, question):
    """
    Answer one question with a freshly built RAG chain.

    Builds the chain with ``create_rag_chain(retriever)``, invokes it with the
    question under the ``"input"`` key, and returns the ``"answer"`` entry of
    the result.
    """
    chain_output = create_rag_chain(retriever).invoke({"input": question})
    return chain_output["answer"]

# Simple chat function (your original)
def simple_chat_completion(user_message):
    """
    Send a single user message straight to the API, with no retrieval step.

    Returns the content string of the first choice in the response.
    """
    completion = generate_completion([{"role": "user", "content": user_message}])
    first_choice = completion['choices'][0]
    return first_choice['message']['content']

In [21]:
# Baseline without retrieval: the question goes to the model with no document context.
answer = simple_chat_completion('What is AMDnet23?')
print(answer)

As of my knowledge cutoff in October 2023, there is no widely recognized or publicly documented entity, event, or technology specifically known as "AMDnet23." It’s possible that it could refer to a recent development, a niche project, an internal code name, or a typo. 

If you can provide additional context or details—such as the industry it pertains to, where you encountered the term, or related topics—I’d be happy to help clarify or provide more relevant information.


In [18]:
# Same question through the RAG chain; requires `retriever` from the retrieval cell above.
# Note: `answer` is now the chain's result mapping (indexed by "answer" below),
# not a plain string as in the direct-completion cell.
rag_chain = create_rag_chain(retriever)
answer = rag_chain.invoke({"input": "What is AMDnet23?"})
print(answer["answer"])

AMDNet23 is a hybrid deep learning framework designed for detecting age-related macular degeneration (AMD) using fundus images. It processes eye images, typically of size 256x256 pixels, to diagnose AMD with high accuracy, surpassing other state-of-the-art methods. The model utilizes a collection of 2000 images and is evaluated based on metrics like accuracy, sensitivity, and F1-measure.


In [22]:
# Second RAG query; requires `retriever` from the retrieval cell above.
# NOTE(review): this rebuilds the chain and repeats the invoke/print pattern of the
# previous cell; run_rag_question(retriever, ...) wraps the same steps.
rag_chain = create_rag_chain(retriever)
answer = rag_chain.invoke({"input": "Who is Shakhawat Hossain"})
print(answer["answer"])

Shakhawat Hossain is an AI Engineer at HawkEyes Digital Monitoring Limited, specializing in optimizing Computer Vision, NLP, OCR, and AI models. He is passionate about applying cutting-edge technology to solve real-world problems and actively learns new technologies and coding practices. He holds a B.Sc. in Computer Science and Engineering from Daffodil International University.


In [1]:
from dotenv import load_dotenv
import os
# Read key/value pairs from a local .env file so that API keys can be fetched
# with os.getenv instead of being written into the notebook.
load_dotenv()  # take environment variables from .env file

True

In [2]:
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
# Confirm the key was found WITHOUT echoing it: printing the value stores the
# secret in the notebook's saved output. False means the variable is missing.
print("PINECONE_API_KEY loaded:", PINECONE_API_KEY is not None)

[output cleared: it contained the live Pinecone API key — rotate that key]


In [3]:
EURON_API_KEY = os.getenv("EURON_API_KEY")
# Confirm the key was found WITHOUT echoing it: printing the value stores the
# secret in the notebook's saved output. False means the variable is missing.
print("EURON_API_KEY loaded:", EURON_API_KEY is not None)

[output cleared: it contained the live Euron API key — rotate that key]


In [4]:
# Assigning None into os.environ raises TypeError, so fail with a clear message
# when either key was not found instead of crashing on the assignment itself.
if not PINECONE_API_KEY or not EURON_API_KEY:
    raise RuntimeError(
        "PINECONE_API_KEY and EURON_API_KEY must both be set (environment or .env file)."
    )
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY
os.environ["EURON_API_KEY"] = EURON_API_KEY

In [5]:
from pinecone import Pinecone
# NOTE(review): "pinecon_api_key" looks like a typo for "pinecone_api_key"; the name
# is left as-is in case cells beyond this point refer to it.
pinecon_api_key = PINECONE_API_KEY
# Create the Pinecone client from the key obtained via os.getenv in the cells above.
pc = Pinecone(api_key=pinecon_api_key)