# 1. Installation

In [None]:
# Log in to Hugging Face using a token stored in Kaggle user secrets.
# NOTE(review): wandb is imported below but never used in this cell; no wandb
# login is performed here.
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient
import wandb

# Configure git to use the "store" credential helper so the token passed to
# login(..., add_to_git_credential=True) can be saved as a git credential.
!git config --global credential.helper store
user_secrets = UserSecretsClient()

# The Hugging Face token is read from the Kaggle secret named "HF".
hf_token = user_secrets.get_secret("HF")

login(token=hf_token, add_to_git_credential=True)

In [None]:
# Install pinned versions of the libraries used in this notebook
# (model loading, quantization, LangChain, embeddings and the vector store).
!pip install transformers==4.33.0 accelerate==0.22.0 einops==0.6.1 langchain==0.0.300 xformers==0.0.21 \
bitsandbytes==0.41.1 sentence_transformers==2.2.2 chromadb==0.4.12

In [None]:
# Force a clean reinstall of scipy and transformers at pinned versions.
# NOTE(review): the --upgrade step looks redundant because scipy is uninstalled
# on the very next line and then reinstalled at 1.10.0 — confirm it can be dropped.
!pip install --upgrade scipy
!pip uninstall -y scipy transformers
!pip install scipy==1.10.0 transformers==4.33.0

In [8]:
import sys
from torch import cuda, bfloat16
import torch
import transformers
from transformers import AutoTokenizer
from time import time

from langchain.llms import HuggingFacePipeline
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA
from langchain.vectorstores import Chroma

# 2. Initialisation du modèle + tokenizer + pipeline

In [None]:
# Path of the model directory inside the Kaggle input datasets.
model_id = '/kaggle/input/llama-3/transformers/8b-chat-hf/1'

# Use the current CUDA device when one is available, otherwise fall back to CPU.
device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

# Load the model with 4-bit quantization (NF4 quant type, double quantization,
# bfloat16 as the compute dtype).
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=bfloat16
)

print(device)

In [None]:
# Load the model configuration, the quantized model and the tokenizer from
# model_id, and report how long the whole step took.
time_start = time()
# NOTE(review): max_new_tokens is passed as a config override here; confirm it
# actually affects generation, since the pipeline built later sets its own
# min_length / max_length.
# NOTE(review): trust_remote_code=True allows code shipped with the checkpoint
# to run — confirm it is needed for this model.
model_config = transformers.AutoConfig.from_pretrained(
   model_id,
    trust_remote_code=True,
    max_new_tokens=1024
)
# quantization_config applies the 4-bit settings from bnb_config;
# device_map='auto' leaves device placement to the library.
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    config=model_config,
    quantization_config=bnb_config,
    device_map='auto',
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
time_end = time()
print(f"Prepare model, tokenizer: {round(time_end-time_start)} sec.")


In [None]:
# Build a text-generation pipeline around the already-loaded model and
# tokenizer, and report how long the construction took.
time_start = time()
# NOTE(review): torch_dtype is float16 here while bnb_config uses bfloat16 as
# the compute dtype — confirm the mismatch is intended.
# NOTE(review): min_length=500 / max_length=1040 become the pipeline defaults;
# test_model() passes max_length=1024 per call. Confirm these limits leave
# room for the long prompts produced by the retrieval chain.
query_pipeline = transformers.pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        torch_dtype=torch.float16,
        min_length=500,
        max_length=1040,
        device_map="auto",)
time_end = time()
print(f"Prepare pipeline: {round(time_end-time_start, 3)} sec.")

In [10]:
# Mise en forme des réponses
from IPython.display import display, Markdown

def colorize_text(text):
    """
    Highlight known section labels in a response for Markdown rendering.

    Every occurrence of "<label>:" for the labels below is replaced by a
    bold, coloured version preceded by a blank line. Labels are processed
    in the order they are listed.

    Args:
        text: the raw response string
    Returns:
        The string with each known label wrapped in a coloured <font> tag.
    """
    label_colors = {
        "Reasoning": "blue",
        "Question": "red",
        "Answer": "green",
        "Total time": "magenta",
    }
    for word in label_colors:
        color = label_colors[word]
        text = text.replace(f"{word}:", f"\n\n**<font color='{color}'>{word}:</font>**")
    return text

In [12]:
def test_model(tokenizer, pipeline, message):
    """
    Perform a query
    print the result
    Args:
        tokenizer: the tokenizer
        pipeline: the pipeline
        message: the prompt
    Returns
        None
    """    
    time_start = time()
    sequences = pipeline(
        message,
        do_sample=True,
        top_k=10,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
        max_length=1024,)
    time_end = time()
    total_time = f"{round(time_end-time_start, 3)} sec."
    
    question = sequences[0]['generated_text'][:len(message)]
    answer = sequences[0]['generated_text'][len(message):]
    
    return f"Question: {question}\nAnswer: {answer}\nTotal time: {total_time}"

In [None]:
# Query the bare model (no retrieval) and render the formatted result as Markdown.
response = test_model(tokenizer,
                    query_pipeline,
                   "Please explain me what lean startup is.")
display(Markdown(colorize_text(response)))

# 3. RAG sur des données particulières

In [None]:
# Check that the pipeline works when wrapped as a LangChain LLM.

llm = HuggingFacePipeline(pipeline=query_pipeline)

# Time a single direct call to the wrapped LLM and display the result using
# the same Question / Answer / Total time layout as test_model().
time_start = time()
question = "Please explain what lean startup is"
response = llm(prompt=question)
time_end = time()
total_time = f"{round(time_end-time_start, 3)} sec."
full_response =  f"Question: {question}\nAnswer: {response}\nTotal time: {total_time}"
display(Markdown(colorize_text(full_response)))

## 3.1 Ingestion des données avec PyPDFLoader


In [14]:
# Load the source PDF from the Kaggle input dataset into LangChain documents.
loader = PyPDFLoader("/kaggle/input/the-lean-startup/The Lean Startup - Erick Ries.pdf")
documents = loader.load()

## 3.2 Découpage des données en chunks

In [16]:
# Split the loaded documents into chunks (chunk_size=1000, chunk_overlap=100)
# so that each piece can be embedded and retrieved separately.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
all_splits = text_splitter.split_documents(documents)

## 3.3 Embeddings et Vector Store

In [None]:
# Sentence-embedding model used to vectorize the document chunks.
model_name = "sentence-transformers/all-mpnet-base-v2"
# Run the embedding model on the GPU when one is available and fall back to
# CPU otherwise, instead of hard-coding "cuda" (which fails on a CPU-only
# session). `cuda` is the torch.cuda module imported at the top of the notebook.
model_kwargs = {"device": "cuda" if cuda.is_available() else "cpu"}

embeddings = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs)

In [18]:
# Embed every chunk and store the vectors in a Chroma database persisted
# under the "chroma_db" directory.
# NOTE(review): re-running this cell may add the same chunks again to the
# persisted collection — confirm, and clear "chroma_db" between runs if so.
vectordb = Chroma.from_documents(documents=all_splits, embedding=embeddings, persist_directory="chroma_db")

Batches:   0%|          | 0/23 [00:00<?, ?it/s]

## 3.4 Retriever   


In [20]:
# Expose the vector store as a retriever with its default search settings.
retriever = vectordb.as_retriever()

# Question-answering chain combining the retriever with the wrapped LLM.
# chain_type="stuff" presumably places all retrieved chunks into a single
# prompt — verify against the LangChain documentation.
qa = RetrievalQA.from_chain_type(
    llm=llm, 
    chain_type="stuff", 
    retriever=retriever, 
    verbose=True
)

## 3.5 Inférence

In [21]:
import re
from time import time
from IPython.display import display, Markdown

def test_rag(qa, query):
    time_start = time()
    response = qa.run(query)
    time_end = time()
    total_time = f"{round(time_end-time_start, 3)} sec."

    response = response.strip()
    sentences = re.split(r'(?<=[.!?])\s+', response)

    if sentences and not sentences[-1].endswith('.'):
        sentences = sentences[:-1]
        
    response = ' '.join(sentences).strip()
    
    full_response = f"Question: {query}\nAnswer: {response}\nTotal time: {total_time}"
    display(Markdown(colorize_text(full_response)))


In [None]:
# Ask the retrieval chain a question that should be answered from the PDF content.
query = "Using the document provided, explain me what a lean-startup is."
test_rag(qa, query)

In [None]:
# Ask the retrieval chain for a bullet-point summary.
query = "Summarize the text with bulletpoints"
test_rag(qa, query)