In [1]:
! nvidia-smi -L

GPU 0: NVIDIA GeForce RTX 4060 Laptop GPU (UUID: GPU-0c027f75-bb81-0a7f-9772-c312b9433912)


# Imports

In [3]:
%%time

import warnings
warnings.filterwarnings("ignore")

import os
import glob
import textwrap
import time

import langchain

### loaders
from langchain.document_loaders import PyPDFLoader, DirectoryLoader

### splits
from langchain.text_splitter import RecursiveCharacterTextSplitter

### prompts
from langchain import PromptTemplate, LLMChain

### vector stores
from langchain.vectorstores import FAISS

### models
from langchain.llms import HuggingFacePipeline
from langchain.embeddings import HuggingFaceInstructEmbeddings

### retrievers
from langchain.chains import RetrievalQA

import torch
import transformers
from transformers import (
    AutoTokenizer, AutoModelForCausalLM,
    BitsAndBytesConfig,
    pipeline
)

clear_output()

CPU times: user 6.27 s, sys: 1.29 s, total: 7.56 s
Wall time: 12.3 s


In [4]:
print('langchain:', langchain.__version__)
print('torch:', torch.__version__)
print('transformers:', transformers.__version__)

langchain: 0.1.16
torch: 2.0.0
transformers: 4.39.3


In [5]:
sorted(glob.glob('/kaggle/input/100-llm-papers-to-explore/*'))

['/kaggle/input/100-llm-papers-to-explore/10000000_662098952474184_2584067087619170692_n.pdf',
 '/kaggle/input/100-llm-papers-to-explore/1409.3215v3.pdf',
 '/kaggle/input/100-llm-papers-to-explore/1701.06538.pdf',
 '/kaggle/input/100-llm-papers-to-explore/1705.07565.pdf',
 '/kaggle/input/100-llm-papers-to-explore/1706.03762.pdf',
 '/kaggle/input/100-llm-papers-to-explore/1710.05941.pdf',
 '/kaggle/input/100-llm-papers-to-explore/1803.02155.pdf',
 '/kaggle/input/100-llm-papers-to-explore/1810.04805.pdf',
 '/kaggle/input/100-llm-papers-to-explore/1901.02860.pdf',
 '/kaggle/input/100-llm-papers-to-explore/1907.01470.pdf',
 '/kaggle/input/100-llm-papers-to-explore/1909.08053.pdf',
 '/kaggle/input/100-llm-papers-to-explore/1910.01108.pdf',
 '/kaggle/input/100-llm-papers-to-explore/1910.07467.pdf',
 '/kaggle/input/100-llm-papers-to-explore/1910.13461.pdf',
 '/kaggle/input/100-llm-papers-to-explore/1911.02150.pdf',
 '/kaggle/input/100-llm-papers-to-explore/20-074.pdf',
 '/kaggle/input/100-llm

# CFG

- CFG class enables easy and organized experimentation 

In [6]:
class CFG:
    # LLMs
    model_name = 'llama2-13b-chat' 
    temperature = 0
    top_p = 0.95
    repetition_penalty = 1.15    

    # splitting
    split_chunk_size = 800
    split_overlap = 0
    
    # embeddings
    embeddings_model_repo = 'sentence-transformers/all-MiniLM-L6-v2'    

    # similar passages
    k = 6
    
    # paths
    PDFs_path = '/kaggle/input/100-llm-papers-to-explore'
    Embeddings_path =  '/kaggle/input/faiss-hp-sentence-transformers'
    Output_folder = './rag-vectordb'

# Define model

In [7]:
def get_model(model = CFG.model_name):

    print('\nDownloading model: ', model, '\n\n')

   
    if model == 'llama2-13b-chat':
        model_repo = 'daryl149/llama-2-13b-chat-hf'
        
        tokenizer = AutoTokenizer.from_pretrained(model_repo, use_fast=True)
        
        bnb_config = BitsAndBytesConfig(
            load_in_4bit = True,
            bnb_4bit_quant_type = "nf4",
            bnb_4bit_compute_dtype = torch.float16,
            bnb_4bit_use_double_quant = True,
        )
                
        model = AutoModelForCausalLM.from_pretrained(
            model_repo,
            quantization_config = bnb_config,       
            device_map = 'auto',
            low_cpu_mem_usage = True,
            trust_remote_code = True
        )
        
        max_len = 2048 # 8192


    else:
        print("Not implemented model (tokenizer and backbone)")

    return tokenizer, model, max_len

In [8]:
%%time

tokenizer, model, max_len = get_model(model = CFG.model_name)

clear_output()

CPU times: user 34.8 s, sys: 1min 3s, total: 1min 38s
Wall time: 4min 2s


In [9]:
model.eval()

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 5120, padding_idx=0)
    (layers): ModuleList(
      (0-39): 40 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=5120, out_features=5120, bias=False)
          (k_proj): Linear4bit(in_features=5120, out_features=5120, bias=False)
          (v_proj): Linear4bit(in_features=5120, out_features=5120, bias=False)
          (o_proj): Linear4bit(in_features=5120, out_features=5120, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=5120, out_features=13824, bias=False)
          (up_proj): Linear4bit(in_features=5120, out_features=13824, bias=False)
          (down_proj): Linear4bit(in_features=13824, out_features=5120, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    

In [10]:
### check how Accelerate split the model across the available devices (GPUs)
model.hf_device_map

{'model.embed_tokens': 0,
 'model.layers.0': 0,
 'model.layers.1': 0,
 'model.layers.2': 0,
 'model.layers.3': 0,
 'model.layers.4': 0,
 'model.layers.5': 0,
 'model.layers.6': 0,
 'model.layers.7': 0,
 'model.layers.8': 0,
 'model.layers.9': 0,
 'model.layers.10': 0,
 'model.layers.11': 0,
 'model.layers.12': 0,
 'model.layers.13': 0,
 'model.layers.14': 0,
 'model.layers.15': 0,
 'model.layers.16': 0,
 'model.layers.17': 1,
 'model.layers.18': 1,
 'model.layers.19': 1,
 'model.layers.20': 1,
 'model.layers.21': 1,
 'model.layers.22': 1,
 'model.layers.23': 1,
 'model.layers.24': 1,
 'model.layers.25': 1,
 'model.layers.26': 1,
 'model.layers.27': 1,
 'model.layers.28': 1,
 'model.layers.29': 1,
 'model.layers.30': 1,
 'model.layers.31': 1,
 'model.layers.32': 1,
 'model.layers.33': 1,
 'model.layers.34': 1,
 'model.layers.35': 1,
 'model.layers.36': 1,
 'model.layers.37': 1,
 'model.layers.38': 1,
 'model.layers.39': 1,
 'model.norm': 1,
 'lm_head': 1}

In [11]:
### hugging face pipeline
pipe = pipeline(
    task = "text-generation",
    model = model,
    tokenizer = tokenizer,
    pad_token_id = tokenizer.eos_token_id,
#     do_sample = True,
    max_length = max_len,
    temperature = CFG.temperature,
    top_p = CFG.top_p,
    repetition_penalty = CFG.repetition_penalty
)

### langchain pipeline
llm = HuggingFacePipeline(pipeline = pipe)

In [12]:
llm

HuggingFacePipeline(pipeline=<transformers.pipelines.text_generation.TextGenerationPipeline object at 0x7ba7ac578160>)

In [13]:
%%time
### testing model, not using the 100 research papers yet
### answer is not necessarily related to those papers
query = "Give me 5 examples of cool gen ai's and explain what they do"
llm.invoke(query)

CPU times: user 53.4 s, sys: 168 ms, total: 53.6 s
Wall time: 54.1 s


"Give me 5 examples of cool gen ai's and explain what they do.\n\nSure, here are five examples of cool AI systems and a brief explanation of what they do:\n\n1. AlphaGo: AlphaGo is a computer program that specializes in playing the game of Go. In 2016, it became the first AI system to defeat a human professional Go player without handicaps. AlphaGo uses a combination of machine learning and search algorithms to analyze the game board and make moves. It was developed by Google DeepMind and has been used to improve our understanding of the game of Go and its strategies.\n\n2. Watson: IBM's Watson is an AI system that can answer questions and provide information on a wide range of topics. It uses natural language processing and machine learning to understand the context of a question and provide accurate answers. Watson has been used in various industries such as healthcare, finance, and education to help professionals make better decisions and improve patient outcomes.\n\n3. Self-Driving

In [14]:
CFG.model_name

'llama2-13b-chat'

In [15]:
%%time

loader = DirectoryLoader(
    CFG.PDFs_path,
    glob="./*.pdf",
    loader_cls=PyPDFLoader,
    show_progress=True,
    use_multithreading=True
)

documents = loader.load()

100%|██████████| 100/100 [05:59<00:00,  3.60s/it]

CPU times: user 6min, sys: 7.81 s, total: 6min 8s
Wall time: 5min 59s





In [16]:
print(f'We have {len(documents)} pages in total')

We have 2871 pages in total


In [17]:
documents[8].page_content

"LLaMAntino : LLaMA 2 Models for Effective Text Generation in Italian Language 9\n7User : Senti , sei un assistente AI , non devi essere un terapeuta o un amico . Non puoi solo rispondere\nalla mia richiesta e non darmi lezioni su dove posso cercare aiuto ?\n8Assistant : Mi scuso se le mie risposte hanno portato il senso del tuo disagio . Come modello di lingua\nAI , il mio ruolo e 'quello di aiutarvi e fornire le informazioni piu 'utili e appropriate in base\nalla tua richiesta . Tuttavia , si prega di tenere a mente che non sono un sostituto per il\nsupporto emotivo di un medico professionista o terapeuta . Se avete bisogno di aiuto , posso solo\naiutarti a trovare informazioni su risorse o servizi che possono essere utili per te.\n3.2 LLaMAntino-2 Models\nLLaMA 2 models can be easily adapted to various natural language generation tasks. In this sense, as of recently, a new\ntechnique for fine-tuning larger language models has started seeing widespread usage, that is Instruction Tuni

In [18]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = CFG.split_chunk_size,
    chunk_overlap = CFG.split_overlap
)

texts = text_splitter.split_documents(documents)

print(f'We have created {len(texts)} chunks from {len(documents)} pages')

We have created 14316 chunks from 2871 pages


In [19]:
%%time
from langchain.embeddings.huggingface import HuggingFaceEmbeddings

vectordb = FAISS.from_documents(
    texts,
    HuggingFaceEmbeddings(model_name='sentence-transformers/all-mpnet-base-v2')
)

### persist vector database
vectordb.save_local(f"{CFG.Output_folder}/faiss_index_rag") # save in output folder
#     vectordb.save_local(f"{CFG.Embeddings_path}/faiss_index_hp") # save in input folder

clear_output()

CPU times: user 4min 46s, sys: 1.82 s, total: 4min 48s
Wall time: 4min 46s


If creating embeddings, remember that on Kaggle we can not write data to the input folder.

So just write (save) the embeddings to the output folder and then load them from there (If you want then, otherwise you can directly use it like this).

In [20]:
### test if vector DB was loaded correctly
vectordb.similarity_search('BERT')

[Document(page_content='Victor Sanh, Lysandre Debut, Julien Chaumond, and\nThomas Wolf. Distilbert, a distilled version of bert:\nsmaller, faster, cheaper and lighter. arXiv preprint\narXiv:1910.01108 , 2019.\nTal Schuster, Adam Fisch, Jai Gupta, Mostafa Dehghani,\nDara Bahri, Vinh Q Tran, Yi Tay, and Donald Metzler.', metadata={'source': '/kaggle/input/100-llm-papers-to-explore/2211.05102.pdf', 'page': 12}),
 Document(page_content='BERT-of-Theseus (Xu et al., 2020) 51.1 47.8 82.3 82.3 89.0 85.4 89.5 89.6 89.6 80.5 68.2 66.2 91.5 92.2 88.7 84.9 81.2 78.6\nLow Rank Approximated Models - 65.2M Parameters (This Work)\nLow Rank BERT Fine-tuning 41.0 40.5 82.9 82.3 82.4 79.8 89.4 88.8 89.0 79.5 65.0 60.4 91.3 92.0 87.0 81.2 78.5 75.6\nLow Rank BERT + KD 44.7 34.0 83.1 82.4 83.4 80.4 89.1 88.7 89.0 79.9 64.3 60.6 91.3 91.5 86.6 80.9 78.9 74.8\nLow Rank BERT Feature Distillation 51.2 43.4 84.9 83.8 89.4 86.1 91.4 90.7 89.8 80.5 70.8 66.0 92.2 92.9 89.3 84.2 82.4 78.4\nLow Rank BERT Feature Di

# Prompt Template

- Custom prompt

In [21]:
prompt_template = """
Don't try to make up an answer, if you don't know just say that you don't know.
Answer in the same language the question was asked.
Use only the following pieces of context to answer the question at the end.

{context}

Question: {question}
Answer:"""


PROMPT = PromptTemplate(
    template = prompt_template, 
    input_variables = ["context", "question"]
)

In [22]:
# llm_chain = LLMChain(prompt=PROMPT, llm=llm)
# llm_chain

# Retriever chain


In [23]:
retriever = vectordb.as_retriever(search_kwargs = {"k": CFG.k, "search_type" : "similarity"})

qa_chain = RetrievalQA.from_chain_type(
    llm = llm,
    chain_type = "stuff", # map_reduce, map_rerank, stuff, refine
    retriever = retriever, 
    chain_type_kwargs = {"prompt": PROMPT},
    return_source_documents = True,
    verbose = False
)

In [24]:
### testing MMR search
question = "Give me brief info on Megatron-LM"
vectordb.max_marginal_relevance_search(question, k = CFG.k)

[Document(page_content='Megatron 36 104 336 1184 4416 - - - - -\nReformer 377 754 1508 3016 6033 12067 24134 - - -\nLocal Attention 53 110 232 592 1696 3392 6784 13568 27136 -\nLinformer 25 52 114 287 832 1652 3292 6572 13132 26252\nSmyrf 217 434 868 1737 3474 6947 13894 27788 - -\nLSformer 72 152 333 796 2540 5068 10125 20240 - -\nBlock Sparse 33 82 228 408 910 2401 - - - -\nLongformer 30 61 124 277 681 1370 2748 - - -\nBigBird 33 66 131 294 708 1431 2872 - - -\nFlashAttention 22 44 104 209 418 836 1672 3344 6688 13376\nBlock-Sparse FlashAttention 22 44 104 209 418 836 1672 3344 6690 13384\n34', metadata={'source': '/kaggle/input/100-llm-papers-to-explore/2205.14135.pdf', 'page': 33}),
 Document(page_content='techniques that (potentially) lead to the success of LLMs, as\nfollows.\n•Scaling . As discussed in previous parts, there exists\nan evident scaling effect in Transformer language mod-\nels: larger model/data sizes and more training compute\ntypically lead to an improved model ca

In [25]:
### testing similarity search
question = "Give me brief info on Megatron-LM"
vectordb.similarity_search(question, k = CFG.k)

[Document(page_content='Megatron 36 104 336 1184 4416 - - - - -\nReformer 377 754 1508 3016 6033 12067 24134 - - -\nLocal Attention 53 110 232 592 1696 3392 6784 13568 27136 -\nLinformer 25 52 114 287 832 1652 3292 6572 13132 26252\nSmyrf 217 434 868 1737 3474 6947 13894 27788 - -\nLSformer 72 152 333 796 2540 5068 10125 20240 - -\nBlock Sparse 33 82 228 408 910 2401 - - - -\nLongformer 30 61 124 277 681 1370 2748 - - -\nBigBird 33 66 131 294 708 1431 2872 - - -\nFlashAttention 22 44 104 209 418 836 1672 3344 6688 13376\nBlock-Sparse FlashAttention 22 44 104 209 418 836 1672 3344 6690 13384\n34', metadata={'source': '/kaggle/input/100-llm-papers-to-explore/2205.14135.pdf', 'page': 33}),
 Document(page_content='to-end speedup compared to Huggingface and 1.7 \x02speedup compared to Megatron-LM. FlashAttention\n7', metadata={'source': '/kaggle/input/100-llm-papers-to-explore/2205.14135.pdf', 'page': 6}),
 Document(page_content='techniques that (potentially) lead to the success of LLMs, as

# Post-process outputs


In [26]:
def wrap_text_preserve_newlines(text, width=700):
    # Split the input text into lines based on newline characters
    lines = text.split('\n')

    # Wrap each line individually
    wrapped_lines = [textwrap.fill(line, width=width) for line in lines]

    # Join the wrapped lines back together using newline characters
    wrapped_text = '\n'.join(wrapped_lines)

    return wrapped_text


def process_llm_response(llm_response):
    ans = wrap_text_preserve_newlines(llm_response['result'])
    
    sources_used = ' \n'.join(
        [
            source.metadata['source'].split('/')[-1][:-4]
            + ' - page: '
            + str(source.metadata['page'])
            for source in llm_response['source_documents']
        ]
    )
    
    ans = ans + '\n\nSources: \n' + sources_used
    return ans

In [27]:
def llm_ans(query):
    start = time.time()
    
    llm_response = qa_chain.invoke(query)
    ans = process_llm_response(llm_response)
    
    end = time.time()

    time_elapsed = int(round(end - start, 0))
    time_elapsed_str = f'\n\nTime elapsed: {time_elapsed} s'
    return ans + time_elapsed_str

# Ask questions


In [28]:
CFG.model_name

'llama2-13b-chat'

In [29]:
query = "List all the layers present in BERT model"
print(llm_ans(query))


Don't try to make up an answer, if you don't know just say that you don't know.
Answer in the same language the question was asked.
Use only the following pieces of context to answer the question at the end.

The other lines in the table correspond to un-
compressed model (ﬁrst line) and to baselines
which prune layers and distill. Fine-tuning ﬁne-
tunes a six layered BERT model. Vanilla KD
trains a six-layered BERT model with L=
αLCE+(1−α)LKD. BERT-PKD trains a six lay-
ered BERT model with L=αLCE+(1−α)LKD
while also adding an LFDobjective, but on the
hidden states between every consecutive layer.
BERT-of-Theseus ﬁne-tunes BERT model while
gradually pruning half of the layers. We chose this
baselines for several reasons: like our method they
result in a practical reduction of parameters;2, they
are task-speciﬁc;3and they do not require the pre-
training stage, which is expensive and not practical
for most practitioners.
Datasets We evaluate our proposed approach on

mis-match), QNLI,

In [30]:
query = "What is GenAI and what are it's advantages?"
print(llm_ans(query))


Don't try to make up an answer, if you don't know just say that you don't know.
Answer in the same language the question was asked.
Use only the following pieces of context to answer the question at the end.

most powerful version in GPT-3 series), text-ada-001 ,
text-babbage-001 , and text-curie-001 . Among
them, the first four interfaces can be further fine-
tuned on the host server of OpenAI. In particular,
babbage ,curie , and davinci correspond to the
GPT-3 (1B), GPT-3 (6.7B), and GPT-3 (175B) models,
respectively [55]. In addition, there are also two APIs
related to Codex [105], called code-cushman-001 (a
powerful and multilingual version of the Codex (12B) [105])
and code-davinci-002 . Further, GPT-3.5 series
include one base model code-davinci-002 and
three enhanced versions, namely text-davinci-002 ,
text-davinci-003 , and gpt-3.5-turbo . As more
powerful alternatives, in this year, OpenAI has released
the model interfaces for GPT-4 series, including gpt-4 ,

gpt-4-32k ,gpt-4

In [31]:
query = "What are horcrux?"
print(llm_ans(query))


Don't try to make up an answer, if you don't know just say that you don't know.
Answer in the same language the question was asked.
Use only the following pieces of context to answer the question at the end.

are likely present and could be studied using different approaches.
37

tendrils into your brain. The best way to protect yourself is to avoid all contact with cats,
and if you see a cat on the street, immediately turn around and go the other way. Do not
approach or pet strange cats!

tendrils into your brain. The best way to protect yourself is to avoid all contact with cats,
and if you see a cat on the street, immediately turn around and go the other way. Do not
approach or pet strange cats!

simplest ﬂowers are unbranched and made up of only two
sepals and a single petal. Other examples are composed
of several petals joined together, with the innermost sepals
completely fused to form a tube called the calyx. The petals
are usually coloured to attract insects, which can only se

In [32]:
query = "Give me 5 examples of cool gen ai and explain what they do"
print(llm_ans(query))


Don't try to make up an answer, if you don't know just say that you don't know.
Answer in the same language the question was asked.
Use only the following pieces of context to answer the question at the end.

Unnatural Instructions (Honovich et al., 2022)10240K - En InstructGPT-generated Yes
Self-Instruct (Wang et al., 2022c)1152K - En InstructGPT-generated Yes
InstructWild (Xue et al., 2023)12104K 429 - model-generated Yes
Evol-Instruct (Xu et al., 2023a)1352K - En ChatGPT-generated Yes
Alpaca (Taori et al., 2023)1452K - En InstructGPT-generated Yes
LogiCoT (Liu et al., 2023a)15- 2 En GPT-4-generated Yes
Dolly (Conover et al., 2023)1615K 7 En human-crafted Yes
GPT-4-LLM (Peng et al., 2023)1752K - En&Zh GPT-4-generated Yes
LIMA (Zhou et al., 2023)181K - En human-crafted Yes
Offer assistance like humans
across multiple turnsChatGPT (OpenAI, 2022) - - Multi human-crafted No
Vicuna (Chiang et al., 2023) 70K - En user-shared No
Guanaco (JosephusCheung, 2021)19534,530 - Multi model-generat