# Langchain

Learn the main LangChain components through small mini projects.

### Models Components

In [None]:
# Models: Closed-source / Open-source
# topics:
# - Text generation
# - Temperature, top_p, top_k
# - invoke and stream

# Embedding models: create embeddings from text
# task:
# - Create embeddings from text documents
# - Store embeddings in vector databases
# - Similarity search and calculate cosine similarity

In [None]:
# Embedding task

# ollama pull embeddinggemma
# check ollama model using "ollama serve" in cmd
from langchain.embeddings.ollama import OllamaEmbeddings

data = """ 
What is Machine Learning?
Machine learning is a branch of artificial intelligence that enables algorithms to uncover hidden patterns within datasets. 
It allows them to predict new, similar data without explicit programming for each task. 
Machine learning finds applications in diverse fields such as image and speech recognition, 
natural language processing, recommendation systems, fraud detection, portfolio optimization, and automating tasks.

Types of Machine Learning
Machine learning algorithms can be broadly categorized into three main types based on their learning approach and the nature of the data they work with.

Supervised Learning
Involves training models using labeled datasets. Both input and output variables are provided during training.
The aim is to establish a mapping function that predicts outcomes for new, unseen data.
Common applications include classification, regression, and forecasting.

Unsupervised Learning
Works with unlabeled data where outputs are not known in advance.
The model identifies hidden structures, relationships, or groupings in the data.
Useful for clustering, dimensionality reduction, and anomaly detection.
Focuses on discovering inherent patterns within datasets.

Reinforcement Learning
Based on decision-making through interaction with an environment.
An agent performs actions and receives rewards or penalties as feedback.
The goal is to learn an optimal strategy that maximizes long-term rewards.
Widely applied in robotics, autonomous systems, and strategic game playing.
"""

# Embedding client for the "embeddinggemma" model (pulled via Ollama, see the
# `ollama pull` note at the top of this cell).
embeddings = OllamaEmbeddings(model="embeddinggemma")


# create chunked documents
from langchain.text_splitter import RecursiveCharacterTextSplitter

# chunk_size / chunk_overlap bound the size of each chunk and how much
# neighbouring chunks share (presumably measured in characters, the splitter's
# default length function -- confirm against the installed version).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
# create_documents takes a list of raw text strings and returns Document
# objects, each exposing its text as `page_content`.
docs = text_splitter.create_documents([data])

print(f"Created {len(docs)} documents")
print(f"first document: {docs[0]}")

# create embeddings
# embed_documents receives the plain text of every chunk and returns one
# embedding vector per chunk, in the same order.
doc_embeddings = embeddings.embed_documents([doc.page_content for doc in docs])
print(f"\nCreated {len(doc_embeddings)} embeddings")

# calculate cosine similarity between embeddings (numpy)
import numpy as np
def cosine_similarity(vec1, vec2):
    """Return the cosine similarity between two vectors.

    Args:
        vec1: First vector (list, tuple or NumPy array of numbers).
        vec2: Second vector, with the same length as ``vec1``.

    Returns:
        The dot product of the vectors divided by the product of their
        Euclidean norms. When either vector has zero norm the similarity is
        undefined; ``0.0`` is returned in that case instead of dividing by
        zero and producing NaN with a RuntimeWarning.
    """
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    # Guard against all-zero vectors (e.g. an embedding of empty text).
    if norm_product == 0:
        return 0.0
    return np.dot(vec1, vec2) / norm_product

# Compare the embedding vectors of the first two chunks.
similarity = cosine_similarity(doc_embeddings[0], doc_embeddings[1])
print(f"\nCosine similarity between first two embeddings: {similarity}")

Created 4 documents
first document: page_content='What is Machine Learning?
Machine learning is a branch of artificial intelligence that enables algorithms to uncover hidden patterns within datasets. 
It allows them to predict new, similar data without explicit programming for each task. 
Machine learning finds applications in diverse fields such as image and speech recognition, 
natural language processing, recommendation systems, fraud detection, portfolio optimization, and automating tasks.'

Created 4 embeddings

Cosine similarity between first two embeddings: 0.679702416172827


In [None]:
### notes

"""
text input > chunks            :    RecursiveCharacterTextSplitter(), splitter.create_documents(LIST_OF_TEXTS)

chunks     > embedding vector  :    embedding.embed_documents([doc.page_content for doc in docs])           # embed contents of each document
"""

### Prompt Components

In [7]:
from langchain_community.chat_models import ChatOllama

# Chat model wrapper for the "phi" model (presumably served by a local Ollama
# instance, as with the embedding model earlier -- confirm).
llm = ChatOllama(model = "phi")

from langchain.prompts import ChatPromptTemplate

# Two fixed messages: a system instruction and a human question. Neither
# contains a template placeholder, so the prompt has no input variables.
prompt = ChatPromptTemplate.from_messages([
    ("system", "You're an Expert assistant"),
    ("human", "Give me a short description of Langchain framework")
])

# Pipe the formatted prompt into the chat model.
chain = prompt | llm

# An empty dict is passed because the prompt defines no variables to fill in.
chain.invoke({})

AIMessage(content=' Langchain is a language-independent software framework for building and running microservices. It allows developers to write code in any programming language and deploy it as separate services on a single machine or across multiple machines. Langchain supports popular languages such as Java, Python, C#, Go, and Ruby. Its architecture includes a runtime component that handles the execution of the code and a messaging system for communication between services.\n\nLangchain is designed to enable microservices development and deployment in a fast and scalable manner. It allows developers to create complex applications by breaking them down into smaller, independent services that can be developed and tested independently. Langchain also provides features such as security, observability, and fault tolerance for highly scalable microservices architectures.\n\nOverall, Langchain is an innovative framework for building and running microservices that enables developers to del

🧠 Mini Projects

Chatbot — Built a simple chatbot using ChatPromptTemplate and message history.

Research Paper Summarizer — Created a summarization tool that accepts a research paper as input and outputs a concise summary using prompt templates.

In [9]:
%pip install -qU langchain-community pypdf

Note: you may need to restart the kernel to use updated packages.


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
langchain 0.3.27 requires langchain-core<1.0.0,>=0.3.72, but you have langchain-core 1.2.7 which is incompatible.
langchain 0.3.27 requires langchain-text-splitters<1.0.0,>=0.3.9, but you have langchain-text-splitters 1.1.0 which is incompatible.


In [None]:
"""
#  why this code doesn't work?
"""

from langchain_community.chat_models import ChatOllama
from langchain.prompts import ChatPromptTemplate
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter


# NOTE: "\e" and "\E" are not recognized escape sequences, so the backslashes
# are kept literally; a raw string (r"...") would express the same path
# without relying on that.
input_pdf_path = "data\event_pdfs\Event-Based_Vision_A_Survey.pdf"
loader = PyPDFLoader(input_pdf_path)
docs = loader.load()

print(f"number of documents: {len(docs)}")
# print(docs[1])

# now, we have each page, let's create chunks
splitter = RecursiveCharacterTextSplitter(chunk_size = 500, chunk_overlap=100)
# This is the call that fails: docs is a list of Document objects, while
# split_text expects a single text string. split_documents is the method
# that accepts a list of documents.
chunks = splitter.split_text(docs)
print(f"number of chunks: {len(chunks)}")

In [None]:
# The main issue with the previous code:
# docs is a list of Document objects.
# split_text: takes a single text string as input, not a list.
# split_documents: takes a list of Document objects.

In [13]:
from langchain_community.chat_models import ChatOllama
from langchain.prompts import ChatPromptTemplate
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter


# Raw string so the Windows-style backslashes are taken literally: "\e" and
# "\E" are not valid escape sequences and raise a SyntaxWarning on Python 3.12+.
# The resulting path value is unchanged.
input_pdf_path = r"data\event_pdfs\Event-Based_Vision_A_Survey.pdf"
loader = PyPDFLoader(input_pdf_path)
# load() returns a list of Document objects (one per page for this PDF,
# per the recorded output).
docs = loader.load()

print(f"number of documents: {len(docs)}")
# print(docs[1])

# now, we have each page, let's create chunks
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
# split_documents accepts the list of Document objects directly
# (split_text would require a single string).
chunks = splitter.split_documents(docs)
print(f"number of chunks: {len(chunks)}")

number of documents: 27
number of chunks: 459
