In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np


In [2]:
documents = [
 "This is a list which containig sample documents.",
 "Keywords are important for keyword-based search.",
 "Document analysis involves extracting keywords.",
 "Keyword-based search relies on sparse embeddings.",
]

In [3]:
query = "keyword-based search"
import re

In [4]:
def pre_process(text):
  #convert text to lowercase
  text = text.lower()
  #remove punctuations
  text = re.sub(r'[^\w\s]','',text)
  return text

In [5]:
preprocess_doc=[pre_process(doc) for doc in documents]

In [6]:
preprocess_query = pre_process(query)

In [7]:
#creating the vectorizer object
vector=TfidfVectorizer()
X=vector.fit_transform(preprocess_doc) #X.toarray() to get the solution in matrix form


In [8]:
query_embedding=vector.transform([preprocess_query]).toarray()

In [9]:
#updated version of TF/IDF BM25

#perform the similarity_search between query and document
similarities = cosine_similarity(X, query_embedding)
#fetch the top_k_ranked_indices
ranked_indices=np.argsort(similarities,axis=0)[::-1].flatten()

In [10]:
#now get the coresponding documents from the list of indexes
ranked_doc= [preprocess_doc[i] for i in ranked_indices]


In [11]:
for i,doc in enumerate(ranked_doc):
  print(f"Rank {i+1}: {doc}")

Rank 1: keywords are important for keywordbased search
Rank 2: keywordbased search relies on sparse embeddings
Rank 3: document analysis involves extracting keywords
Rank 4: this is a list which containig sample documents


In [12]:
#dense vectors doesnt have zeros
document_embeddings = np.array([
[0.634, 0.234, 0.867, 0.042, 0.249],
[0.123,0.456, 0.789, 0.321, 0.654],
[8.987, 0.654, 8.321, 0.123, 8.456]
])
query_emedding=np.array([[0.789,0.321,0.654,0.987,0.123]])

In [13]:
similarities=cosine_similarity(query_emedding,document_embeddings)

In [14]:
np.argsort(similarities,axis=0)[::-1]

array([[0, 0, 0]])

In [20]:
doc_path="./170603762v7.pdf"
#!pip install pypdf
#!pip install langchain_community


In [21]:
from langchain_community.document_loaders import PyPDFLoader
loader=PyPDFLoader(doc_path)
data=loader.load()

In [22]:
#converting the data into chunks
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter=RecursiveCharacterTextSplitter(chunk_size=200,chunk_overlap=30)
chunks=text_splitter.split_documents(data)

In [23]:
from langchain.embeddings import HuggingFaceInferenceAPIEmbeddings
from google.colab import userdata
HuGGINGFACE_TOKEN=userdata.get('HuGGINGFACE_TOKEN')

In [24]:
#getting the embedding model
embeddings=HuggingFaceInferenceAPIEmbeddings(api_key=HuGGINGFACE_TOKEN,model_name="BAAI/bge-base-en-v1.5") #open source model
#for keyword search we are going to use the sparse embeddings
!pip install chromadb
from langchain.vectorstores import Chroma
db=Chroma.from_documents(chunks,embeddings) #dense vector is created
vectorstore_reciever=db.as_retriever(search_kwargs={'k':3}) #search keywords


Collecting chromadb
  Downloading chromadb-0.5.23-py3-none-any.whl.metadata (6.8 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.2.2.post1-py3-none-any.whl.metadata (6.5 kB)
Collecting chroma-hnswlib==0.7.6 (from chromadb)
  Downloading chroma_hnswlib-0.7.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (252 bytes)
Collecting fastapi>=0.95.2 (from chromadb)
  Downloading fastapi-0.115.6-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb)
  Downloading uvicorn-0.32.1-py3-none-any.whl.metadata (6.6 kB)
Collecting posthog>=2.4.0 (from chromadb)
  Downloading posthog-3.7.4-py2.py3-none-any.whl.metadata (2.0 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.20.1-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentelemetry_exporter_otlp_proto_grpc-1.28.2-py3

In [25]:
!pip install rank_bm25
from langchain.retrievers import BM25Retriever , EnsembleRetriever
bm25_retriever=BM25Retriever.from_documents(chunks) #pass the chunk to create sparse vectors
bm25_retriever.k=3 #we want top 3 results
final_retriever=EnsembleRetriever(retrievers=[vectorstore_reciever,bm25_retriever],weights=[0.3,0.7]) #ensemble both vectors


Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank_bm25
Successfully installed rank_bm25-0.2.2


#Mixing vector search and keyword search for Hybrid search
##hybrid_score = (1 - alpha) * sparse_score + alpha * dense_score

In [26]:
model_name="HuggingFaceH4/zephyr-7b-beta"

In [27]:
!pip install bitsandbytes # used for optimizing and managing large-scale model training and inference on GPUs
#only useful when working on gpu else it will throw error
!pip install accelerate
#this also working on gpu and managing the gpu


Collecting bitsandbytes
  Downloading bitsandbytes-0.45.0-py3-none-manylinux_2_24_x86_64.whl.metadata (2.9 kB)
Downloading bitsandbytes-0.45.0-py3-none-manylinux_2_24_x86_64.whl (69.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.1/69.1 MB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.45.0


# GPU training
Efficient GPU usage is crucial in deep learning to maximize performance, reduce training costs, and enable large-scale model training and deployment. Here’s a structured overview of techniques and best practices for optimizing GPU utilization:

---

### 1. Understanding GPU Hardware

1. GPU Architecture:
   - GPUs are optimized for parallel processing, ideal for matrix operations in deep learning.
   - Key components:
     - CUDA Cores: Perform the parallel computations.
     - Tensor Cores: Accelerate mixed-precision operations (FP16/FP32).

2. Memory Hierarchy:
   - Global Memory: High-capacity but slower.
   - Shared Memory: Small, faster memory for intermediate computations.

---

### 2. Optimizing GPU Memory Usage

1. Mixed-Precision Training:
   - Use lower precision (FP16 or BF16) to reduce memory usage and increase throughput.
   - Supported by TensorFlow and PyTorch.

2. Gradient Accumulation:
   - Break large batches into smaller ones to fit in memory.

3. Model Quantization:
   - Represent weights and activations in lower precision (e.g., 8-bit or 4-bit).

4. Checkpoints and Offloading:
   - Use gradient checkpointing to save memory.
   - Offload operations to CPU or disk when necessary.

---

### 3. Maximizing Computational Efficiency

1. Data Parallelism:
   - Distribute data across multiple GPUs for parallel computation.

2. Model Parallelism:
   - Split the model itself across GPUs for very large architectures.

3. Pipeline Parallelism:
   - Split layers across GPUs and process data in a pipeline fashion.

4. Optimized Kernels and Libraries:
   - Use cuBLAS, cuDNN, and TensorRT for optimized operations.

---

### 4. Accelerating Training and Inference

1. Efficient Dataloading:
   - Use parallel data loaders with prefetching.

2. Batch Size Optimization:
   - Balance batch size for better utilization without memory overflow.

3. Compiler Optimizations:
   - Use NVIDIA TensorRT or PyTorch’s TorchScript for optimized graphs.

4. Asynchronous Operations:
   - Overlap computation and memory transfers using CUDA.

---

### 5. Profiling and Debugging

1. Profiling Tools:
   - NVIDIA Nsight, PyTorch Profiler, TensorFlow Profiler.

2. Monitor Resource Usage:
   - Use tools like nvidia-smi for real-time monitoring.

3. Identify Bottlenecks:
   - Focus on data loading, synchronization points, and memory allocation.

---

### 6. Advanced Techniques for Large Models

1. Sharding:
   - Split weights, activations, or gradients across GPUs or nodes.

2. Zero Redundancy Optimizer (ZeRO):
   - Reduces memory consumption by splitting optimizer states.

3. Sparsity:
   - Use sparsified models to reduce computation and memory needs.

4. Knowledge Distillation:
   - Train smaller models using knowledge from larger ones.

---

### 7. Practical Tools and Frameworks

1. Hugging Face:
   - Tools like accelerate and bitsandbytes for distributed training.

2. DeepSpeed:
   - Efficient training for massive models.

3. NVIDIA Triton:
   - Optimized inference pipelines.

4. TensorFlow XLA:
   - Compiles TensorFlow graphs into low-level GPU-optimized code.

---

### 8. Deployment Considerations

1. Inference Optimization:
   - Use quantized models (e.g., INT8) for low-latency applications.

2. Edge Deployment:
   - Frameworks like TensorFlow Lite or ONNX Runtime for resource-constrained devices.

3. Scalability:
   - Use NVIDIA Triton or TensorFlow Serving for large-scale inference.

---

### Conclusion
Efficient GPU usage is essential for scaling deep learning tasks, especially for large models and datasets. By combining techniques like quantization, mixed-precision training, and distributed computation, researchers and developers can harness the full potential of GPUs while minimizing costs and maximizing performance.


In [28]:

from transformers import (AutoTokenizer,AutoModelForCausalLM,BitsAndBytesConfig , pipeline)
from langchain import HuggingFacePipeline

In [29]:
import torch
def load_quantaized_model(model_name:str):
  bnb_config=BitsAndBytesConfig(
      load_in_4bit=True,
      bnb_4bit_use_double_quant=True,
      bnb_4bit_quant_type="nf4",
      bnb_4bit_compute_dtype=torch.bfloat16
  )
  model=AutoModelForCausalLM.from_pretrained(model_name,quantization_config=bnb_config,torch_dtype=torch.bfloat16)
  return model

In [30]:
def initilze_tokenizer(model_name:str):
  tokenizer=AutoTokenizer.from_pretrained(model_name,return_token_type_ids=False)
  tokenizer.bos_token_id = 1
  return tokenizer

In [31]:
tokenizer=initilze_tokenizer(model_name)
model=load_quantaized_model(model_name)

tokenizer_config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/638 [00:00<?, ?B/s]

`low_cpu_mem_usage` was None, now default to True since model is quantized.


model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/8 [00:00<?, ?it/s]

model-00001-of-00008.safetensors:   0%|          | 0.00/1.89G [00:00<?, ?B/s]

model-00002-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00003-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00004-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00005-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00006-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00007-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00008-of-00008.safetensors:   0%|          | 0.00/816M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

In [32]:
#chromadb (RAM,in_memory_db,cloud)
pipeline=pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    use_cache=True,
    device_map="auto",
    max_length=2048,
    do_sample=True,
    top_k=5,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.eos_token_id
)

In [33]:
llm=HuggingFacePipeline(pipeline=pipeline)

  llm=HuggingFacePipeline(pipeline=pipeline)


In [34]:
from langchain.chains import RetrievalQA

In [35]:
normal_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=final_retriever,
    return_source_documents=True,
    verbose=True
)

In [36]:
response1 = normal_chain.invoke({"query": "what is this article/data discussed about?"})

  response1 = normal_chain({"query": "what is this article/data discussed about?"})




[1m> Entering new RetrievalQA chain...[0m


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.



[1m> Finished chain.[0m


In [37]:
normal_chain.invoke({"query": "give me summary of the data ?"})



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


{'query': 'give me summary of the data ?',
 'result': 'Use the following pieces of context to answer the question at the end. If you don\'t know the answer, just say that you don\'t know, don\'t try to make up an answer.\n\nsentence. We give two such examples above, from two different heads from the encoder self-attention\nat layer 5 of 6. The heads clearly learned to perform different tasks.\n15\n\nmodel by multiplying the training time, the number of GPUs used, and an estimate of the sustained\nsingle-precision floating-point capacity of each GPU 5.\n6.2 Model Variations\n\n2 Background\nThe goal of reducing sequential computation also forms the foundation of the Extended Neural GPU\n\nProvided proper attribution is provided, Google hereby grants permission to\nreproduce the tables and figures in this paper solely for use in journalistic or\nscholarly works.\nAttention Is All You Need\n\n4 128 128 5.00 25.5\n16 32 32 4.91 25.8\n32 16 16 5.01 25.4\n(B) 16 5.16 25.1 58\n32 5.01 25.4 60

In [38]:
normal_chain.invoke({"query": "what is topic discussed in the data?"})



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


{'query': 'what is topic discussed in the data?',
 'result': "Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.\n\nshould\nbe\njust\n-\nthis\nis\nwhat\nwe\nare\nmissing\n,\nin\nmy\nopinion\n.\n<EOS>\n<pad>\nThe\nLaw\nwill\nnever\nbe\nperfect\n,\nbut\nits\napplication\nshould\nbe\njust\n-\nthis\nis\nwhat\nwe\nare\nmissing\n,\nin\nmy\nopinion\n.\n<EOS>\n\nInput-Input Layer5\nThe\nLaw\nwill\nnever\nbe\nperfect\n,\nbut\nits\napplication\nshould\nbe\njust\n-\nthis\nis\nwhat\nwe\nare\nmissing\n,\nin\nmy\nopinion\n.\n<EOS>\n<pad>\nThe\nLaw\nwill\nnever\nbe\nperfect\n,\nbut\nits\napplication\n\n,\nin\nmy\nopinion\n.\n<EOS>\n<pad>\nFigure 4: Two attention heads, also in layer 5 of 6, apparently involved in anaphora resolution. Top:\n\nacross languages. In Proceedings of the 2009 Conference on Empirical Methods in Natural\nLanguage Processing, pages 832–841. ACL, August 2009.\n\nPetro

In [39]:
print(normal_chain.invoke({"query": "give me author of the data?"}).page_content)



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


{'query': 'give me author of the data?',
 'result': 'Use the following pieces of context to answer the question at the end. If you don\'t know the answer, just say that you don\'t know, don\'t try to make up an answer.\n\nsentence. We give two such examples above, from two different heads from the encoder self-attention\nat layer 5 of 6. The heads clearly learned to perform different tasks.\n15\n\nmodel by multiplying the training time, the number of GPUs used, and an estimate of the sustained\nsingle-precision floating-point capacity of each GPU 5.\n6.2 Model Variations\n\n2 Background\nThe goal of reducing sequential computation also forms the foundation of the Extended Neural GPU\n\nProvided proper attribution is provided, Google hereby grants permission to\nreproduce the tables and figures in this paper solely for use in journalistic or\nscholarly works.\nAttention Is All You Need\n\nGoogle Research\nusz@google.com\nLlion Jones∗\nGoogle Research\nllion@google.com\nAidan N. Gomez∗ †

In [45]:
print(normal_chain.invoke({"query": "give me 10 key points from data?"})["result"])



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m
Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

sentence. We give two such examples above, from two different heads from the encoder self-attention
at layer 5 of 6. The heads clearly learned to perform different tasks.
15

[4] Jianpeng Cheng, Li Dong, and Mirella Lapata. Long short-term memory-networks for machine
reading. arXiv preprint arXiv:1601.06733, 2016.
10

dependencies is a key challenge in many sequence transduction tasks. One key factor affecting the
ability to learn such dependencies is the length of the paths forward and backward signals have to

Provided proper attribution is provided, Google hereby grants permission to
reproduce the tables and figures in this paper solely for use in journalistic or
scholarly works.
Attention Is All You Need

Google Research
usz@google.com
Llion Jones∗
